import asyncio
import logging
import pathlib
import time
import aiohttp
import bs4
import selenium.webdriver
from proboards_scraper.database import Database
from .http_requests import get_source, download_image
logger = logging.getLogger(__name__)
class ScraperManager:
def __init__(
self,
db: Database,
client_session: aiohttp.ClientSession,
content_queue: asyncio.Queue = None,
driver: selenium.webdriver.Chrome = None,
image_dir: pathlib.Path = None,
user_queue: asyncio.Queue = None,
request_threshold: int = 15,
short_delay_time: float = 1.5,
long_delay_time: float = 20.0
):
"""
This class has three purposes:

1. to store references to objects that will be used in the process of
   scraping,
2. to serve as an abstraction layer between the scraper functionality
   and the database, and
3. to handle HTTP requests (adding delays between requests as needed
   to avoid throttling) and to process the queues (popping items from
   the queues in the required order and inserting them into the
   database).
Args:
db: Database handle.
client_session: ``aiohttp`` session.
content_queue: Queue to which all content (excluding users) should
be added for insertion into the database.
driver: Selenium Chrome driver.
image_dir: Directory to which downloaded images should be saved.
user_queue: Queue to which users should be added for insertion
into the database.
request_threshold: After every :attr:`request_threshold` calls to
:meth:`ScraperManager.get_source`, wait :attr:`long_delay_time`
seconds before continuing. This is to prevent request
throttling due to a large number of consecutive requests.
short_delay_time: Number of seconds to wait after each call to
:meth:`ScraperManager.get_source` (to help prevent request
throttling).
long_delay_time: See :attr:`request_threshold`.
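
Example (a minimal construction sketch; the ``Database`` setup and the
keyword values shown here are illustrative)::

    import aiohttp
    from proboards_scraper.database import Database

    async def make_manager():
        session = aiohttp.ClientSession()
        db = Database("forum.db")  # hypothetical constructor argument
        return ScraperManager(
            db, session, request_threshold=10, long_delay_time=30.0
        )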
"""
self.db = db
self.client_session = client_session
if driver is None:
# A Selenium driver is required to scrape poll content.
logger.warning(
"Polls cannot be scraped without setting a Chrome webdriver"
)
self.driver = driver
if image_dir is None:
image_dir = pathlib.Path("./images").expanduser().resolve()
image_dir.mkdir(exist_ok=True)
self.image_dir = image_dir
if content_queue is None:
content_queue = asyncio.Queue()
self.content_queue = content_queue
if user_queue is None:
user_queue = asyncio.Queue()
self.user_queue = user_queue
# TODO: include selenium webdriver in request count?
self.request_threshold = request_threshold
self.short_delay_time = short_delay_time
self.long_delay_time = long_delay_time
self.request_count = 0
async def _delay(self) -> None:
"""
Asynchronously sleep for an amount of time based on the number of
requests, the request threshold, and the short/long delay times.
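For example, with the defaults (``request_threshold=15``,
``short_delay_time=1.5``, ``long_delay_time=20.0``), every 15th request
sleeps for 20 seconds and every other request sleeps for 1.5 seconds.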
"""
if not self.short_delay_time and not self.long_delay_time:
return
delay = self.short_delay_time
if self.request_threshold is not None and self.long_delay_time:
mod = self.request_threshold - 1
if self.request_count % self.request_threshold == mod:
delay = self.long_delay_time
logger.debug(
f"Request count = {self.request_count + 1}, "
f"sleeping {delay} s"
)
await asyncio.sleep(delay)
async def download_image(self, url: str) -> dict:
"""
Download an image to :attr:`image_dir`.
Args:
url: URL of the image to be downloaded.
Returns:
Image download status and metadata; see
:func:`proboards_scraper.download_image`.
"""
if "proboards.com" in url:
await self._delay()
self.request_count += 1
return await download_image(url, self.client_session, self.image_dir)
async def get_source(self, url: str) -> bs4.BeautifulSoup:
"""
Wrapper around :func:`proboards_scraper.get_source` that adds a short
delay (via :func:`asyncio.sleep`) before each request, and a longer
delay after every ``self.request_threshold`` calls to
:meth:`ScraperManager.get_source`. This rate-limiting helps avoid
request throttling by the server, which may result from a large number
of requests in a short period of time.
Args:
url: URL whose page source to retrieve.
Returns:
BeautifulSoup page source object.
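
Example (the URL is illustrative)::

    soup = await manager.get_source("https://example.proboards.com")
    title_tag = soup.find("title")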
"""
await self._delay()
self.request_count += 1
return await get_source(url, self.client_session)
def insert_guest(self, name: str) -> int:
"""
Insert a guest user into the database.
Args:
name: The guest's username.
Returns:
The user ID of the guest returned by
:meth:`proboards_scraper.database.Database.insert_guest`.
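
Example (the guest name is illustrative)::

    guest_user_id = manager.insert_guest("SomeGuest")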
"""
guest = {
"id": -1,
"name": name,
}
# Get guest user id.
guest_db_obj = self.db.insert_guest(guest)
guest_id = guest_db_obj.id
return guest_id
def insert_image(self, image: dict) -> int:
"""
Insert an image entry into the database.
Args:
image: A dict representing the image entry.
Returns:
The image ID of the image returned by
:meth:`proboards_scraper.database.Database.insert_image`.
"""
image_db_obj = self.db.insert_image(image)
image_id = image_db_obj.id
return image_id
async def run(self) -> None:
"""
Run the scraper, first processing the user queue and then processing
the content queue, calling the appropriate database insert/query
methods as needed, and closing the Selenium and aiohttp sessions upon
completion.
Because all content (threads, posts, etc.) is associated with users,
the content queue is not processed until all users have been added
from the user queue (the end of which is marked by a ``None`` sentinel).
Guest users are an exception, since they are not present in the site's
member list; instead, guests are added/queried as they are encountered
by calling :meth:`ScraperManager.insert_guest`.
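
Example (a sketch of how the queues might be fed before running; the
content fields other than ``"type"`` are illustrative)::

    manager.user_queue.put_nowait({"id": 1, "name": "alice"})
    manager.user_queue.put_nowait(None)  # end-of-users sentinel

    manager.content_queue.put_nowait(
        {"type": "category", "id": 1, "title": "General"}
    )
    manager.content_queue.put_nowait(None)  # end-of-content sentinel

    await manager.run()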
"""
if self.user_queue is not None:
all_users_added = False
while not all_users_added:
user = await self.user_queue.get()
if user is None:
all_users_added = True
else:
self.db.insert_user(user)
all_content_added = False
while not all_content_added:
content = await self.content_queue.get()
if content is None:
all_content_added = True
else:
type_ = content["type"]
del content["type"]
type_to_insert_func = {
"board": self.db.insert_board,
"category": self.db.insert_category,
"image": self.db.insert_image,
"moderator": self.db.insert_moderator,
"poll": self.db.insert_poll,
"poll_option": self.db.insert_poll_option,
"poll_voter": self.db.insert_poll_voter,
"post": self.db.insert_post,
"shoutbox_post": self.db.insert_shoutbox_post,
"thread": self.db.insert_thread,
}
insert_func = type_to_insert_func[type_]
insert_func(content)
await self.client_session.close()
# The driver may be None (polls not scraped), so guard before quitting.
if self.driver is not None:
    self.driver.quit()