Source code for proboards_scraper.http_requests

import hashlib
import http
import imghdr
import logging
import pathlib
import time
from typing import List

import aiofiles
import aiohttp
import bs4
import selenium.webdriver


logger = logging.getLogger(__name__)


def test_ico(h: bytes, f):
    """
    Test for .ico files to be added to the ``imghdr`` module tests.
    See `ICO file format`_ and `imghdr.tests`_.

    .. _`ICO file format`: https://en.wikipedia.org/wiki/ICO_(file_format)
    .. _`imghdr.tests`:
        https://docs.python.org/3/library/imghdr.html#imghdr.tests
    """
    if h.startswith(b"\x00\x00") and (h[2:4] in (b"\x01\x00", b"\x02\x00")):
        return "ico"
imghdr.tests.append(test_ico)


[docs]def get_chrome_driver() -> selenium.webdriver.Chrome:
    """
    Returns an instance of a Selenium Chrome driver with the headless
    option set to ``True``.

    Returns:
        Headless Chrome driver.
    """
    chrome_opts = selenium.webdriver.ChromeOptions()
    chrome_opts.headless = True
    driver = selenium.webdriver.Chrome(options=chrome_opts)
    return driver


[docs]def get_login_cookies(
    home_url: str, username: str, password: str,
    driver: selenium.webdriver.Chrome = None, page_load_wait: int = 1
) -> List[dict]:
    """
    Logs in to a Proboards account using Selenium and returns the cookies from
    the authenticated login session.

    Args:
        home_url: URL for the Proboards forum homepage.
        username: Login username.
        password: Login password.
        driver: Selenium Chrome driver (optional).
        page_load_wait: Time (in seconds) to wait to allow the page to load.

    Returns:
        A list of dicts, where each dict corresponds to a cookie, from the
        Selenium Chrome driver.
    """
    if driver is None:
        driver = get_chrome_driver()

    driver.get(home_url)
    time.sleep(page_load_wait)

    links = driver.find_elements_by_tag_name("a")
    login_url = None

    for link in links:
        href = link.get_attribute("href")
        if href.startswith("https://login.proboards.com/login"):
            login_url = href
            break

    # Navigate to login page and fill in username/password fields.
    driver.get(login_url)
    time.sleep(page_load_wait)

    email_input = None
    password_input = None
    submit_input = None

    inputs = driver.find_elements_by_tag_name("input")
    for input_ in inputs:
        try:
            input_name = input_.get_attribute("name")
            if input_name == "email":
                email_input = input_
            elif input_name == "password":
                password_input = input_
            elif input_name == "continue":
                submit_input = input_
        except Exception:
            pass

    email_input.send_keys(username)
    password_input.send_keys(password)
    submit_input.click()
    time.sleep(page_load_wait)

    cookies = driver.get_cookies()
    return cookies


[docs]def get_login_session(cookies: List[dict]) -> aiohttp.ClientSession:
    """
    Get an authenticated ``aiohttp`` session using the cookies provided.

    This is achieved by converting cookies from a Selenium driver session
    to ``http`` module Morsels (see `http.cookies.Morsel`_), which can be
    added to the ``aiohttp`` session's cookie jar.

    Args:
        cookies: A list of dicts as returned by :func:`get_login_cookies`,
            i.e., from a Selenium driver session.

    Returns:
        An ``aiohttp`` session with the given cookies in its cookie jar.

    .. _`http.cookies.Morsel`:
        https://docs.python.org/3/library/http.cookies.html#morsel-objects
    """
    logger.debug("Creating aiohttp login session from cookies")
    session = aiohttp.ClientSession()

    morsels = {}
    for cookie in cookies:
        # https://docs.python.org/3/library/http.cookies.html#morsel-objects
        morsel = http.cookies.Morsel()
        morsel.set(cookie["name"], cookie["value"], cookie["value"])
        morsel["domain"] = cookie["domain"]
        morsel["httponly"] = cookie["httpOnly"]
        morsel["path"] = cookie["path"]
        morsel["secure"] = cookie["secure"]

        # NOTE: ignore expires field; if it's absent, the cookie remains
        # valid for the duration of the session.
        # if "expiry" in cookie:
        #     morsel["expires"] = cookie["expiry"]

        morsels[cookie["name"]] = morsel

    session.cookie_jar.update_cookies(morsels)

    logger.debug("Added cookies to aiohttp session")
    return session


[docs]async def get_source(
    url: str, session: aiohttp.ClientSession
) -> bs4.BeautifulSoup:
    """
    Get page source of a URL.

    Args:
        url: URL to visit.
        session: ``aiohttp`` session.

    Returns:
        Page source.
    """
    logger.debug(f"Getting page source for {url}")
    # TODO: check response HTTP status code
    resp = await session.get(url)
    text = await resp.text()
    return bs4.BeautifulSoup(text, "html.parser")


[docs]async def download_image(
    url: str, session: aiohttp.ClientSession, dst_dir: pathlib.Path
) -> dict:
    """
    Attempt to download the image at ``url`` to the directory specified by
    ``dst_dir``. The downloaded file is named after its MD5 hash to ensure
    uniqueness. If a file already exists on disk (i.e., has been previously
    downloaded), it is not rewritten.

    Args:
        url: Image URL.
        session: ``aiohttp`` session.
        dst_dir: Directory to which the image should be downloaded.

    Returns:
        A dict containing information on the download attempt and, if download
        was successful, image metadata::

            {
                "status": {
                    "get": HTTP response code,
                    "exists": whether the image already exists on disk (bool),
                    "valid": whether the file is a valid image file,
                },
                "image": {
                    "url": image download URL,
                    "filename": downloaded image filename,
                    "md5_hash": file MD5 hash,
                    "size": filesize on disk,
                },
            }
    """
    if url.startswith("//"):
        url = f"https:{url}"

    logger.debug(f"Downloading image: {url}")

    ret = {
        "status": {
            "get": None,
            "exists": None,
            "valid": None
        },
        "image": {
            "url": url,
            "filename": None,
            "md5_hash": None,
            "size": None,
        },
    }

    try:
        response = await session.get(url, timeout=45)
    except aiohttp.client_exceptions.ClientConnectorError as e:
        logger.warning(
            f"Failed to download image at {url}: {str(e)} "
            "(it is likely the image or server no longer exists)"
        )
    else:
        ret["status"]["get"] = response.status

        if response.status == 200:
            img = await response.read()

            # The file extension doesn't necessarily match the filetype, so we
            # manually check the file header and set the correct extension. If
            # the file doesn't correspond to a supported image filetype, we
            # assume the downloaded file is invalid and skip it.
            ret["status"]["valid"] = False

            filetype = imghdr.what(None, h=img)

            if filetype == "jpeg":
                filetype = "jpg"

            if filetype is not None:
                ret["status"]["valid"] = True

                # Set the filestem to the md5 hash of the image.
                img_md5 = hashlib.md5(img).hexdigest()

                new_fname = f"{img_md5}.{filetype}"

                ret["image"]["filename"] = new_fname
                ret["image"]["size"] = len(img)
                ret["image"]["md5_hash"] = img_md5

                img_fpath = dst_dir / new_fname

                if not img_fpath.exists():
                    ret["status"]["exists"] = False
                    async with aiofiles.open(img_fpath, "wb") as f:
                        await f.write(img)
                else:
                    ret["status"]["exists"] = True
    finally:
        return ret