#! /usr/bin/env -S pipx run
# /// script
# dependencies = [
#     "BeautifulSoup4==4.*",
#     "diskcache==5.*",
#     "html5lib==1.*",
#     "httpx<2",
#     "ollama==0.2.1",
#     "platformdirs==4.*",
#     "tqdm==4.*",
# ]
# requires-python = ">=3.10"
# ///
#
# Copyright (c) 2024 D. Bohdan
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Build a JSON catalog of tilde.institute user pages.  An Ollama language
# model is used to skip pages that look empty or like placeholders.

from __future__ import annotations

import json
import re
import sys
import textwrap
from dataclasses import dataclass

import httpx
import ollama
from bs4 import BeautifulSoup, Tag
from diskcache import Cache, JSONDisk
from platformdirs import PlatformDirs
from tqdm import tqdm

APP_NAME = "tilde.institute-catalog"
APP_AUTHOR = "dbohdan"
DIRS = PlatformDirs(APP_NAME, APP_AUTHOR)

CACHE_COMPRESSION = 6
CACHE_DIR = DIRS.user_cache_dir
CACHE_EXPIRE = 24 * 60 * 60

JSON_INDENT = 4
STATS_PAGE_URL = "https://tilde.institute/stats"
TIMEOUT = 10

# Pages that embed any of these tags are listed without asking the model.
INSTANT_INCLUDE_TAGS = ("audio", "iframe", "img", "script", "video")

MODEL = "llama3"
MODEL_THREADS = 4
MODEL_CUSTOM = f"llama3-{MODEL_THREADS}t"
MODELFILE = f"""
FROM {MODEL}
PARAMETER num_thread {MODEL_THREADS}
"""
MODEL_PROMPT = (
    "Is a webpage with the following text likely empty, a placeholder, "
    "a test page, or otherwise not worth listing in a catalog because of "
    'sparse content? Truncated text is okay. Answer "YES" or "NO".'
    "\n\n```\n{text}\n```"
)
MODEL_TEXT_CUTOFF = 6000
MODEL_YES = "YES"


@dataclass
class Page:
    empty: bool
    title: str


@dataclass
class Response:
    pass


@dataclass
class FailureResponse(Response):
    cause: str


@dataclass
class SuccessResponse(Response):
    content_type: str
    text: str


def default_soup(markup: str) -> BeautifulSoup:
    # Parse markup with the lenient html5lib parser.
    return BeautifulSoup(markup, "html5lib")


def compress_whitespace(s: str) -> str:
    # Collapse runs of two or more whitespace characters into a single space.
    return re.sub(r"\s{2,}", " ", s)


def model_says_empty(body_text: str) -> bool:
    # Ask the model whether the page text looks empty or like a placeholder.
    # The text is compacted and truncated to keep the prompt short.
    prepared_text = textwrap.shorten(
        compress_whitespace(body_text),
        width=MODEL_TEXT_CUTOFF,
    )

    msg = {
        "role": "user",
        "content": MODEL_PROMPT.format(text=prepared_text),
    }
    response = ollama.chat(model=MODEL_CUSTOM, messages=[msg])

    return response["message"]["content"].strip() == MODEL_YES


def process_page(contents: str, *, html: bool) -> Page:
    # Extract the title and body text from a page and decide whether it is
    # empty.  Pages with embedded media or scripts are never marked empty;
    # everything else is judged by the model.
    body_text = ""
    instant_include = False
    title = ""

    if html:
        soup = default_soup(contents)

        title_tag = soup.find("title")
        if title_tag:
            title = title_tag.text

        body_tag = soup.find("body")
        if body_tag:
            body_text = body_tag.text

        instant_include = any(soup.find(tag) for tag in INSTANT_INCLUDE_TAGS)
    else:
        body_text = contents

    return Page(
        empty=not instant_include and model_says_empty(body_text),
        title=title,
    )


def user_site_urls() -> list[str]:
    # Scrape the links to user sites from the stats page.
    soup = default_soup(httpx.get(STATS_PAGE_URL, timeout=TIMEOUT).text)

    user_list = soup.find(class_="userlist")
    if not isinstance(user_list, Tag):
        msg = "expected '.userlist' to be a Tag"
        raise TypeError(msg)

    return [elem.attrs["href"] for elem in user_list.find_all("a")]


def get_cached(url: str, *, cache: Cache) -> Response:
    # Fetch a URL, using and populating the on-disk cache.
    if url not in cache:
        try:
            r = httpx.get(url, follow_redirects=True, timeout=TIMEOUT)
            r.raise_for_status()

            cache.set(
                url,
                (r.headers.get("content-type", ""), r.text),
                expire=CACHE_EXPIRE,
            )
        except (httpx.ConnectError, httpx.HTTPError) as e:
            return FailureResponse(str(e).splitlines()[0])

    content_type, text = cache.get(url)
    return SuccessResponse(content_type=content_type, text=text)


def main() -> None:
    pages = []

    # Pull the base model and create a variant limited to MODEL_THREADS threads.
    ollama.pull(MODEL)
    ollama.create(model=MODEL_CUSTOM, modelfile=MODELFILE)

    urls = user_site_urls()
    cache = Cache(CACHE_DIR, compression=CACHE_COMPRESSION, disk=JSONDisk)

    for url in tqdm(urls):
        resp = get_cached(url, cache=cache)

        match resp:
            case FailureResponse(cause):
                print(f"skipping: {cause}", file=sys.stderr)
                continue

            case SuccessResponse(content_type, text):
                page = process_page(text, html=content_type == "text/html")
                if page.empty:
                    continue

                pages.append({"title": page.title, "url": url})

    print(
        json.dumps(
            pages,
            ensure_ascii=False,
            indent=JSON_INDENT,
        ),
    )


if __name__ == "__main__":
    main()