from bs4 import BeautifulSoup
from datetime import datetime, timezone
from io import BytesIO
from json import loads
from pathlib import Path
from pydub import AudioSegment
from requests import get
from time import sleep
from typing import Union
from zeitsprung.base import Base
from zeitsprung.database import SQLiteEngine
class Scraper(Base):
"""Class for scraping and preprocessing the data from the 'www.zeitsprung.fm' website."""
def __init__(self, data_folder: str, update_interval: int = 24*60*60,
             reset: bool = False, verbose: bool = True) -> None:
    """
    Set up the scraper together with its on-disk storage.

    Parameters
    ----------
    data_folder : str
        Folder holding the database file 'zeitsprung.db' and the 'audio' sub-folder. It is created when missing;
        an existing database inside it is reused unless `reset` is set.
    update_interval : int, default 24*60*60
        Number of seconds `run` sleeps when the next episode is not available yet.
    reset : bool, default False
        If True and a database file already exists, the schema setup of the database engine is run again
        instead of binding to the existing database.
    verbose : bool, default True
        Print messages about the activities conducted by a class instance.
    """
    super().__init__(verbose)
    self.created_at = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
    self.data_folder = Path(data_folder)
    db_path = self.data_folder / 'zeitsprung.db'
    self.db = SQLiteEngine(db_path)
    self.update_interval = update_interval
    self.verbose = verbose

    db_exists = db_path.exists()
    if db_exists and not reset:
        # Nothing to create: keep working with what is already on disk.
        self._print(f"Binding to existing directory structure in '{data_folder}'.")
    else:
        # Either a requested reset of an existing database or a first-time setup;
        # both need the folders and the schema, only the message differs.
        if db_exists:
            self._print(f"Overwriting existing directory structure in '{data_folder}'.")
        else:
            self._print(f"Creating directory structure in '{data_folder}'.")
        self.data_folder.mkdir(parents=True, exist_ok=True)
        (self.data_folder / 'audio').mkdir(parents=True, exist_ok=True)
        self.db.setup_schema()

    # Scraping resumes after the last episode known to the database.
    self.current_episode = self.db.query_last_episode_id()
def __str__(self) -> str:
"""
Print function of the class.
Returns
-------
str
A string, which describes the class instance.
"""
return f"Scraper created at '{self.created_at}' with db connection to " \
f"'{self.db.db_file}', current episode is 'ZS{self.current_episode}'."
[docs] @staticmethod
def search_key(key, dict_obj):
for entry in dict_obj:
if key in entry:
return entry[key]
[docs] def get_episode_audio(self, url: str) -> Union[AudioSegment, None]:
"""
Downloads the audio of a specified episode.
Parameters
----------
url : str
URL to download the audio from.
Returns
-------
AudioSegment:
The audio of the episode.
"""
if url is not None:
self._print(f"Fetching audio file from {url}")
audio_mp3 = BytesIO(get(url, allow_redirects=True).content)
audio = AudioSegment.from_file(audio_mp3)
return audio
else:
self._print('No audio file available for this episode.')
return None
[docs] def save_episode_audio(self, audio: AudioSegment, file_name: str) -> None:
"""
Save the audio file of an episode and as '.wav' file.
Parameters
----------
audio : AudioSegment
Audio file to save.
file_name : str
File name with path, where the file should be saved to.
Returns
-------
None
"""
self._print(f"Exporting audio sequence to file '{file_name}'")
audio.export(file_name, format="wav")
def run(self) -> None:
    """
    Run the scraper loop: fetch meta data and audio of every episode that is not in the database yet.

    The loop never terminates on its own. Whenever the next episode cannot be fetched, it sleeps for
    `update_interval` seconds and then tries again.

    Returns
    -------
    None
    """
    while True:
        next_id = self.current_episode + 1
        meta_row = self.get_episode_meta(next_id)

        if meta_row is None:
            # Caught up with the website: wait before asking for the same episode again.
            self._print(f"Episode not yet published, pausing for {int(self.update_interval/(60*60))} hours.")
            sleep(self.update_interval)
            continue

        self.db.insert_meta_row(meta_row)
        # Position 7 of the meta row is passed on as the audio URL.
        audio = self.get_episode_audio(meta_row[7])
        if audio is not None:
            wav_path = self.data_folder / 'audio' / f'{str(self.current_episode + 1).zfill(3)}.wav'
            audio_row = [
                next_id,
                wav_path,
                round(audio.duration_seconds),
                audio.frame_rate,
                audio.frame_width,
            ]
            self.save_episode_audio(audio, wav_path)
            self.db.insert_audio_row(audio_row)

        # The episode counts as processed even when it has no audio.
        self.current_episode += 1