"""Source code for proboards_scraper.core."""

import asyncio
import logging
import pathlib
from typing import Callable, Literal, Optional, Union

import aiohttp

from .http_requests import (
    get_chrome_driver, get_login_cookies, get_login_session
)
from .scraper_manager import ScraperManager
from proboards_scraper.database import Database
from proboards_scraper.scraper import (
    split_url, scrape_board, scrape_forum, scrape_thread, scrape_user,
    scrape_users,
)


logger = logging.getLogger(__name__)


async def _task_wrapper(
    func: Callable,
    queue_name: Literal["user", "content", "both"],
    url: str,
    manager: ScraperManager
):
    """
    Args:
        func: The async function to be called for scraping user(s) or content.
        queue_name: The queue(s) in which ``None`` should be put after ``func``
            completes, signaling to :meth:`ScraperManager.run` that that
            queue's task is complete.
        url: The URL to be passed to ``func``.
        manager: The ``ScraperManager`` instance to be passed to ``func``.
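
    Example (a minimal sketch; assumes ``base_url`` and ``manager`` have
    already been constructed, as they are inside :func:`run_scraper`)::

        # Scrape every user from the members page, then put the ``None``
        # sentinel in both queues so ScraperManager.run knows that no
        # further items will arrive.
        await _task_wrapper(
            scrape_users, "both", f"{base_url}/members", manager
        )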
    """
    await func(url, manager)

    if queue_name == "both" or queue_name == "user":
        await manager.user_queue.put(None)

    if queue_name == "both" or queue_name == "content":
        await manager.content_queue.put(None)


def run_scraper(
    url: str,
    dst_dir: Union[str, pathlib.Path] = "site",
    username: Optional[str] = None,
    password: Optional[str] = None,
    skip_users: bool = False,
    no_delay: bool = False
) -> None:
    """
    Main function that runs the scraper and calls the appropriate `async`
    functions/methods. This is the only function that needs to be called
    to actually run the scraper (with all the default settings).

    Args:
        url: URL of the page to scrape.

            * If the URL is that of the forum homepage (e.g.,
              `https://yoursite.proboards.com/`), the entire site
              (including users, shoutbox, category/board/thread/post
              content, etc.) will be scraped.
            * If it is the URL for the members page (e.g.,
              `https://yoursite.proboards.com/members`), only the users
              will be scraped.
            * If it is the URL for a specific user profile (e.g.,
              `https://yoursite.proboards.com/user/10`), only that
              particular user will be scraped.
            * If it is the URL for a board (e.g.,
              `https://yoursite.proboards.com/board/3/board-name`), only
              that particular board and its threads/posts/sub-boards will
              be scraped.
            * If it is the URL for a thread (e.g.,
              `https://yoursite.proboards.com/thread/1234/thread-title`),
              only that particular thread and its posts will be scraped.
        dst_dir: Directory in which to place the resulting files. The
            database file is written to ``<dst_dir>/forum.db`` and image
            files are saved to ``<dst_dir>/images``.
        username: Username for login.
        password: Password for login.
        skip_users: Skip scraping/adding users from the forum members page
            (only applies if the forum homepage is provided for ``url``).
        no_delay: Do not add a delay between subsequent requests (see
            :class:`ScraperManager` for more information). Note that this
            may result in request throttling.
    """
    # Accept either a string or a Path for the destination directory.
    dst_dir = pathlib.Path(dst_dir).expanduser().resolve()
    dst_dir.mkdir(parents=True, exist_ok=True)

    image_dir = dst_dir / "images"
    image_dir.mkdir(exist_ok=True)

    db_path = dst_dir / "forum.db"
    db = Database(db_path)

    chrome_driver = get_chrome_driver()
    base_url, url_path = split_url(url)

    # Get cookies for parts of the site requiring login authentication.
    if username and password:
        logger.info(f"Logging in to {base_url}")
        cookies = get_login_cookies(
            base_url, username, password, chrome_driver
        )

        # Create a persistent aiohttp login session from the cookies.
        client_session = get_login_session(cookies)
        logger.info("Login successful")
    else:
        logger.info(
            "Username and/or password not provided; proceeding without login"
        )
        client_session = aiohttp.ClientSession()

    manager_kwargs = {
        "driver": chrome_driver,
        "image_dir": image_dir,
    }

    # Disabling the delay removes all request throttling safeguards.
    if no_delay:
        manager_kwargs["request_threshold"] = None
        manager_kwargs["short_delay_time"] = None
        manager_kwargs["long_delay_time"] = None

    manager = ScraperManager(db, client_session, **manager_kwargs)

    tasks = []

    users_task = None
    content_task = None

    if url_path is None:
        # This represents the case where the forum homepage URL was
        # provided, i.e., we scrape the entire site.
        logger.info("Scraping entire forum")
        content_task = _task_wrapper(
            scrape_forum, "content", base_url, manager
        )

        if skip_users:
            logger.info("Skipping user profiles")
        else:
            users_page_url = f"{base_url}/members"
            users_task = _task_wrapper(
                scrape_users, "user", users_page_url, manager
            )
    elif url_path.startswith("/members"):
        users_task = _task_wrapper(scrape_users, "both", url, manager)
    elif url_path.startswith("/user"):
        users_task = _task_wrapper(scrape_user, "both", url, manager)
    elif url_path.startswith("/board"):
        content_task = _task_wrapper(scrape_board, "content", url, manager)
    elif url_path.startswith("/thread"):
        content_task = _task_wrapper(scrape_thread, "content", url, manager)

    if users_task is not None:
        tasks.append(users_task)
    else:
        # No task will feed the user queue (or put its ``None`` sentinel),
        # so disable it for ScraperManager.run.
        manager.user_queue = None

    if content_task is not None:
        tasks.append(content_task)

    database_task = manager.run()
    tasks.append(database_task)

    # Run the scraping coroutines and the database task concurrently.
    task_group = asyncio.gather(*tasks)
    asyncio.get_event_loop().run_until_complete(task_group)
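

# Example usage (a minimal sketch; the forum URL and credentials below are
# placeholders, not real values):
#
#     from proboards_scraper.core import run_scraper
#
#     # Scrape the entire site, writing the database to site/forum.db and
#     # downloaded images to site/images/.
#     run_scraper(
#         "https://yoursite.proboards.com/",
#         dst_dir="site",
#         username="your_username",
#         password="your_password",
#     )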