#! /usr/bin/env -S pipx run
# /// script
# dependencies = [
#     "BeautifulSoup4==4.*",
#     "diskcache==5.*",
#     "html5lib==1.*",
#     "httpx<2",
#     "ollama==0.2.1",
#     "platformdirs==4.*",
#     "tqdm==4.*",
# ]
# requires-python = ">=3.10"
# ///
#
# Copyright (c) 2024 D. Bohdan
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Build a JSON catalog of tilde.institute user pages.  An Ollama language
# model is used to skip pages that look empty or like placeholders.

from __future__ import annotations

import json
import re
import sys
import textwrap
from dataclasses import dataclass

import httpx
import ollama
from bs4 import BeautifulSoup, Tag
from diskcache import Cache, JSONDisk
from platformdirs import PlatformDirs
from tqdm import tqdm

APP_NAME = "tilde.institute-catalog"
APP_AUTHOR = "dbohdan"
DIRS = PlatformDirs(APP_NAME, APP_AUTHOR)

CACHE_COMPRESSION = 6
CACHE_DIR = DIRS.user_cache_dir
CACHE_EXPIRE = 24 * 60 * 60

JSON_INDENT = 4
STATS_PAGE_URL = "https://tilde.institute/stats"
TIMEOUT = 10

# Pages that embed any of these tags are listed without asking the model.
INSTANT_INCLUDE_TAGS = ("audio", "iframe", "img", "script", "video")

MODEL = "llama3"
MODEL_THREADS = 4
MODEL_CUSTOM = f"llama3-{MODEL_THREADS}t"
MODELFILE = f"""
FROM {MODEL}
PARAMETER num_thread {MODEL_THREADS}
"""
MODEL_PROMPT = (
    "Is a webpage with the following text likely empty, a placeholder, "
    "a test page, or otherwise not worth listing in a catalog because of "
    'sparse content? Truncated text is okay. Answer "YES" or "NO".'
    "\n\n```\n{text}\n```"
)
MODEL_TEXT_CUTOFF = 6000
MODEL_YES = "YES"


@dataclass
class Page:
    empty: bool
    title: str


@dataclass
class Response:
    pass


@dataclass
class FailureResponse(Response):
    cause: str


@dataclass
class SuccessResponse(Response):
    content_type: str
    text: str


def default_soup(markup: str) -> BeautifulSoup:
    # Parse markup with the lenient html5lib parser.
    return BeautifulSoup(markup, "html5lib")


def compress_whitespace(s: str) -> str:
    # Collapse runs of two or more whitespace characters into a single space.
    return re.sub(r"\s{2,}", " ", s)


def model_says_empty(body_text: str) -> bool:
    # Ask the model whether the page text looks empty or like a placeholder.
    # The text is compacted and truncated to keep the prompt short.
    prepared_text = textwrap.shorten(
        compress_whitespace(body_text),
        width=MODEL_TEXT_CUTOFF,
    )

    msg = {
        "role": "user",
        "content": MODEL_PROMPT.format(text=prepared_text),
    }
    response = ollama.chat(model=MODEL_CUSTOM, messages=[msg])

    return response["message"]["content"].strip() == MODEL_YES


def process_page(contents: str, *, html: bool) -> Page:
    # Extract the title and body text from a page and decide whether it is
    # empty.  Pages with embedded media or scripts are never marked empty;
    # everything else is judged by the model.
    body_text = ""
    instant_include = False
    title = ""

    if html:
        soup = default_soup(contents)

        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.text

        body_tag = soup.find("body")
        if body_tag:
            body_text = body_tag.text

        instant_include = any(soup.find(tag) for tag in INSTANT_INCLUDE_TAGS)
    else:
        body_text = contents

    return Page(
        empty=not instant_include and model_says_empty(body_text),
        title=title,
    )


def user_site_urls() -> list[str]:
    # Scrape the links to user sites from the stats page.
    soup = default_soup(httpx.get(STATS_PAGE_URL, timeout=TIMEOUT).text)

    user_list = soup.find(class_="userlist")
    if not isinstance(user_list, Tag):
        msg = "expected '.userlist' to be a Tag"
        raise TypeError(msg)

    return [elem.attrs["href"] for elem in user_list.find_all("a")]


def get_cached(url: str, *, cache: Cache) -> Response:
    # Fetch a URL, using and populating the on-disk cache.
    if url not in cache:
        try:
            r = httpx.get(url, follow_redirects=True, timeout=TIMEOUT)
            r.raise_for_status()

            cache.set(
                url,
                (r.headers.get("content-type", ""), r.text),
                expire=CACHE_EXPIRE,
            )
        except (httpx.ConnectError, httpx.HTTPError) as e:
            return FailureResponse(str(e).splitlines()[0])

    content_type, text = cache.get(url)
    return SuccessResponse(content_type=content_type, text=text)


def main() -> None:
    pages = []

    # Pull the base model and create a variant limited to MODEL_THREADS threads.
    ollama.pull(MODEL)
    ollama.create(model=MODEL_CUSTOM, modelfile=MODELFILE)

    urls = user_site_urls()
    cache = Cache(CACHE_DIR, compression=CACHE_COMPRESSION, disk=JSONDisk)

    for url in tqdm(urls):
        resp = get_cached(url, cache=cache)

        match resp:
            case FailureResponse(cause):
                print(f"skipping: {cause}", file=sys.stderr)
                continue

            case SuccessResponse(content_type, text):
                page = process_page(text, html=content_type == "text/html")
                if page.empty:
                    continue

                pages.append({"title": page.title, "url": url})

    print(
        json.dumps(
            pages,
            ensure_ascii=False,
            indent=JSON_INDENT,
        ),
    )


if __name__ == "__main__":
    main()