Source code for proboards_scraper.scraper_manager

import asyncio
import logging
import pathlib

import aiohttp
import bs4
import selenium.webdriver

from proboards_scraper.database import Database
from .http_requests import get_source, download_image


logger = logging.getLogger(__name__)


class ScraperManager:
    def __init__(
        self,
        db: Database,
        client_session: aiohttp.ClientSession,
        content_queue: asyncio.Queue = None,
        driver: selenium.webdriver.Chrome = None,
        image_dir: pathlib.Path = None,
        user_queue: asyncio.Queue = None,
        request_threshold: int = 15,
        short_delay_time: float = 1.5,
        long_delay_time: float = 20.0
    ):
        """
        This class has three purposes: 1) to store references to objects
        that will be used in the process of scraping, 2) to serve as an
        abstraction layer between the scraper functionality and the
        database, and 3) to handle HTTP requests (adding delays between
        requests as needed to avoid throttling) and process the queues
        (popping items from the queues in the necessary order and
        inserting them into the database).

        Args:
            db: Database handle.
            client_session: ``aiohttp`` session.
            content_queue: Queue to which all content (excluding users)
                should be added for insertion into the database.
            driver: Selenium Chrome driver.
            image_dir: Directory to which downloaded images should be
                saved.
            user_queue: Queue to which users should be added for
                insertion into the database.
            request_threshold: After every :attr:`request_threshold`
                calls to :meth:`ScraperManager.get_source`, wait
                :attr:`long_delay_time` seconds before continuing. This
                is to prevent request throttling due to a large number
                of consecutive requests.
            short_delay_time: Number of seconds to wait after each call
                to :meth:`ScraperManager.get_source` (to help prevent
                request throttling).
            long_delay_time: See :attr:`request_threshold`.
        """
        self.db = db
        self.client_session = client_session

        if driver is None:
            # A Selenium driver is required to scrape poll content.
            logger.warning(
                "Polls cannot be scraped without setting a Chrome webdriver"
            )
        self.driver = driver

        if image_dir is None:
            image_dir = pathlib.Path("./images").expanduser().resolve()
        image_dir.mkdir(exist_ok=True)
        self.image_dir = image_dir

        if content_queue is None:
            content_queue = asyncio.Queue()
        self.content_queue = content_queue

        if user_queue is None:
            user_queue = asyncio.Queue()
        self.user_queue = user_queue

        # TODO: include Selenium webdriver requests in the request count?
        self.request_threshold = request_threshold
        self.short_delay_time = short_delay_time
        self.long_delay_time = long_delay_time
        self.request_count = 0

    async def _delay(self) -> None:
        """
        Asynchronously sleep for an amount of time based on the number of
        requests made so far, the request threshold, and the short/long
        delay times.
        """
        if not self.short_delay_time and not self.long_delay_time:
            return

        delay = self.short_delay_time

        if self.request_threshold is not None and self.long_delay_time:
            # Use the long delay on every ``request_threshold``-th request.
            mod = self.request_threshold - 1
            if self.request_count % self.request_threshold == mod:
                delay = self.long_delay_time

        logger.debug(
            f"Request count = {self.request_count + 1}, "
            f"sleeping {delay} s"
        )
        await asyncio.sleep(delay)
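
    # A minimal sketch of the resulting request cadence, assuming the
    # default ``request_threshold=15``, ``short_delay_time=1.5``, and
    # ``long_delay_time=20.0``: requests 1 through 14 are each followed
    # by a 1.5 s sleep, request 15 by a 20 s sleep, after which the
    # pattern repeats. For example (URL and database handle are
    # hypothetical):
    #
    #     manager = ScraperManager(db, client_session)
    #     for _ in range(15):
    #         # 14 short delays, then one long delay on the 15th call.
    #         await manager.get_source("https://example.proboards.com/")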

    async def download_image(self, url: str) -> dict:
        """
        Download an image to :attr:`image_dir`.

        Args:
            url: URL of the image to be downloaded.

        Returns:
            Image download status and metadata; see
            :func:`proboards_scraper.download_image`.
        """
        if "proboards.com" in url:
            # Only rate-limit requests made to the forum's own server.
            await self._delay()
            self.request_count += 1
        return await download_image(url, self.client_session, self.image_dir)

    async def get_source(self, url: str) -> bs4.BeautifulSoup:
        """
        Wrapper around :func:`proboards_scraper.get_source` with an added
        short delay (via a call to :func:`asyncio.sleep`) before each
        request, and a longer delay after every ``self.request_threshold``
        calls to :meth:`ScraperManager.get_source`. This rate-limiting is
        performed to help avoid request throttling by the server, which
        may result from a large number of requests in a short period of
        time.

        Args:
            url: URL whose page source to retrieve.

        Returns:
            BeautifulSoup page source object.
        """
        await self._delay()
        self.request_count += 1
        return await get_source(url, self.client_session)
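
    # A minimal usage sketch for the two request wrappers above, from
    # inside an async scraper task (the URLs are hypothetical):
    #
    #     soup = await manager.get_source("https://example.proboards.com/")
    #     title = soup.find("title")
    #     result = await manager.download_image(
    #         "https://example.proboards.com/avatar.png"
    #     )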

    def insert_guest(self, name: str) -> int:
        """
        Insert a guest user into the database.

        Args:
            name: The guest's username.

        Returns:
            The user ID of the guest returned by
            :meth:`proboards_scraper.database.Database.insert_guest`.
        """
        guest = {
            "id": -1,
            "name": name,
        }

        # Get the guest's user id.
        guest_db_obj = self.db.insert_guest(guest)
        guest_id = guest_db_obj.id
        return guest_id
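
    # For example, a post by an unregistered user might be attributed to
    # a guest record like so (a sketch; the ``post`` dict and its
    # ``"user"`` key are hypothetical):
    #
    #     user_id = manager.insert_guest("Anonymous")
    #     post["user"] = user_id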

    def insert_image(self, image: dict) -> int:
        """
        Insert an image entry into the database.

        Args:
            image: A dict representing the image entry.

        Returns:
            The image ID of the image returned by
            :meth:`proboards_scraper.database.Database.insert_image`.
        """
        image_db_obj = self.db.insert_image(image)
        image_id = image_db_obj.id
        return image_id

    async def run(self) -> None:
        """
        Run the scraper, first processing the user queue and then
        processing the content queue, calling the appropriate database
        insert/query methods as needed, and closing the Selenium and
        aiohttp sessions upon completion.

        Because all content (threads, posts, etc.) is associated with
        users, the content queue is not processed until all users have
        been added from the user queue (the end of which is marked by a
        sentinel value). Guest users are an exception, since they are not
        present in the site's member list; instead, guests are
        added/queried as they are encountered by calling
        :meth:`ScraperManager.insert_guest`.
        """
        if self.user_queue is not None:
            all_users_added = False

            while not all_users_added:
                user = await self.user_queue.get()

                if user is None:
                    # A ``None`` sentinel marks the end of the queue.
                    all_users_added = True
                else:
                    self.db.insert_user(user)

        all_content_added = False

        while not all_content_added:
            content = await self.content_queue.get()

            if content is None:
                all_content_added = True
            else:
                # The item's "type" key selects the database insert
                # function; the rest of the dict is the record itself.
                type_ = content["type"]
                del content["type"]

                type_to_insert_func = {
                    "board": self.db.insert_board,
                    "category": self.db.insert_category,
                    "image": self.db.insert_image,
                    "moderator": self.db.insert_moderator,
                    "poll": self.db.insert_poll,
                    "poll_option": self.db.insert_poll_option,
                    "poll_voter": self.db.insert_poll_voter,
                    "post": self.db.insert_post,
                    "shoutbox_post": self.db.insert_shoutbox_post,
                    "thread": self.db.insert_thread,
                }

                insert_func = type_to_insert_func[type_]
                insert_func(content)

        await self.client_session.close()

        # The driver is optional (see ``__init__``), so only quit it if
        # one was actually provided.
        if self.driver is not None:
            self.driver.quit()
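
# A minimal end-to-end sketch of how a scraper task might drive this
# manager. The database construction, field names, and URL are
# assumptions for illustration; only the queue/sentinel protocol and the
# "type" key are taken from ``run()`` above:
#
#     import asyncio
#
#     import aiohttp
#
#     from proboards_scraper.database import Database
#
#     async def main():
#         db = Database(...)  # construct per the Database API
#         session = aiohttp.ClientSession()
#         manager = ScraperManager(db, session)
#
#         # Producers enqueue users first, terminated by a None sentinel
#         # (field names here are hypothetical)...
#         await manager.user_queue.put({"id": 1, "name": "alice"})
#         await manager.user_queue.put(None)
#
#         # ...then content items, each tagged with a "type" key that
#         # selects the insert function, followed by another sentinel.
#         await manager.content_queue.put(
#             {"type": "category", "id": 1, "name": "General"}
#         )
#         await manager.content_queue.put(None)
#
#         # run() drains both queues in order, then closes the aiohttp
#         # session (and the Selenium driver, if one was provided).
#         await manager.run()
#
#     asyncio.run(main())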