import asyncio
import logging
import pathlib
from typing import Callable, Literal, Optional

import aiohttp

from .http_requests import (
    get_chrome_driver, get_login_cookies, get_login_session
)
from .scraper_manager import ScraperManager
from proboards_scraper.database import Database
from proboards_scraper.scraper import (
    split_url, scrape_board, scrape_forum, scrape_thread, scrape_user,
    scrape_users,
)

logger = logging.getLogger(__name__)


async def _task_wrapper(
func: Callable,
queue_name: Literal["user", "content", "both"],
url: str,
manager: ScraperManager
):
"""
Args:
func: The async function to be called for scraping user(s) or content.
queue_name: The queue(s) in which ``None`` should be put after ``func``
completes, signaling to :meth:`ScraperManager.run` that that
queue's task is complete.
url: The URL to be passed to ``func``.
manager: The ``ScraperManager`` instance to be passed to ``func``.
"""
    await func(url, manager)

    if queue_name in ("both", "user"):
        await manager.user_queue.put(None)
    if queue_name in ("both", "content"):
        await manager.content_queue.put(None)


def run_scraper(
    url: str,
    dst_dir: pathlib.Path = pathlib.Path("site"),
    username: Optional[str] = None,
    password: Optional[str] = None,
    skip_users: bool = False,
    no_delay: bool = False
) -> None:
"""
Main function that runs the scraper and calls the appropriate `async`
functions/methods. This is the only function that needs to be called to
actually run the scraper (with all the default settings).
Args:
url: URL of the the page to scrape.
* If the URL is that of the forum homepage (e.g.,
`https://yoursite.proboards.com/`), the entire site
(including users, shoutbox, category/board/thread/post content,
etc.) will be scraped.
* If it is the URL for the members page
(e.g., `https://yoursite.proboards.com/members`), only the users
will be scraped.
* If it is the URL for a specific user profile
(e.g., `https://yoursite.proboards.com/user/10`), only that
particular user will be scraped.
* If it is the URL for a board
(e.g., `https://yoursite.proboards.com/board/3/board-name`),
only that particular board and its threads/posts/sub-boards
will be scraped.
        * If it is the URL for a thread
          (e.g., `https://yoursite.proboards.com/thread/1234/thread-title`),
          only that particular thread and its posts will be scraped.

dst_dir: Directory in which to place the resulting files. The database
file is written to ``<dst_dir>/forum.db`` and image files are
saved to ``<dst_dir>/images``.
username: Username for login.
password: Password for login.
        skip_users: Skip scraping/adding users from the forum members page
            (only applies if the forum homepage is provided for ``url``).
no_delay: Do not add a delay between subsequent requests (see
:class:`ScraperManager` for more information). Note that this may
result in request throttling.
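
    Example (illustrative sketch; assumes a ProBoards forum exists at
    ``https://yoursite.proboards.com`` and that :func:`run_scraper` is
    importable from the ``proboards_scraper`` package)::

        import pathlib
        from proboards_scraper import run_scraper

        # Scrape the entire forum, logging in so that members-only content
        # is accessible; output goes to ./site/forum.db and ./site/images.
        run_scraper(
            "https://yoursite.proboards.com/",
            dst_dir=pathlib.Path("site"),
            username="your_username",
            password="your_password",
        )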
"""
    # Accept either a string or a pathlib.Path for the destination directory.
    dst_dir = pathlib.Path(dst_dir).expanduser().resolve()
    dst_dir.mkdir(parents=True, exist_ok=True)

    image_dir = dst_dir / "images"
    image_dir.mkdir(exist_ok=True)

    db_path = dst_dir / "forum.db"
    db = Database(db_path)
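    # The Chrome web driver is used for the login flow below and is also
    # passed to the ScraperManager.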
chrome_driver = get_chrome_driver()
base_url, url_path = split_url(url)
# Get cookies for parts of the site requiring login authentication.
if username and password:
logger.info(f"Logging in to {base_url}")
cookies = get_login_cookies(
base_url, username, password, chrome_driver
)
# Create a persistent aiohttp login session from the cookies.
client_session = get_login_session(cookies)
logger.info("Login successful")
else:
logger.info(
"Username and/or password not provided; proceeding without login"
)
client_session = aiohttp.ClientSession()
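    # Construct the ScraperManager; its request-delay settings are only
    # overridden (disabled) when ``no_delay`` is requested.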
manager_kwargs = {
"driver": chrome_driver,
"image_dir": image_dir,
}
if no_delay:
manager_kwargs["request_threshold"] = None
manager_kwargs["short_delay_time"] = None
manager_kwargs["long_delay_time"] = None
manager = ScraperManager(
db, client_session, **manager_kwargs
)
tasks = []
users_task = None
content_task = None
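    # Decide which scrape coroutine(s) to schedule based on the URL path.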
if url_path is None:
# This represents the case where the forum homepage URL was provided,
# i.e., we scrape the entire site.
logger.info("Scraping entire forum")
content_task = _task_wrapper(
scrape_forum, "content", base_url, manager
)
if skip_users:
logger.info("Skipping user profiles")
else:
users_page_url = f"{base_url}/members"
users_task = _task_wrapper(
scrape_users, "user", users_page_url, manager
)
elif url_path.startswith("/members"):
users_task = _task_wrapper(scrape_users, "both", url, manager)
elif url_path.startswith("/user"):
users_task = _task_wrapper(scrape_user, "both", url, manager)
elif url_path.startswith("/board"):
content_task = _task_wrapper(
scrape_board, "content", url, manager
)
elif url_path.startswith("/thread"):
content_task = _task_wrapper(
scrape_thread, "content", url, manager
)
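    # Only schedule tasks that were actually created; the unused user queue is
    # cleared so the ScraperManager does not wait on it.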
if users_task is not None:
tasks.append(users_task)
else:
manager.user_queue = None
if content_task is not None:
tasks.append(content_task)
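    # ScraperManager.run() consumes the queues and writes results to the
    # database, running concurrently with the scraping coroutine(s).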
database_task = manager.run()
tasks.append(database_task)
task_group = asyncio.gather(*tasks)
asyncio.get_event_loop().run_until_complete(task_group)