diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..84023d9fbe5519c9b714306b80f8243a77a42c0c
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,131 @@
+# WHY GitHub Actions CI: catches broken imports, TypeScript errors, and test
+# failures before they reach HuggingFace Spaces. Runs on every push to main.
+name: CI
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Detect C sources
+ id: detect-c
+ run: |
+ if find . -name "*.c" -o -name "*.h" | grep -q .; then
+ echo "present=true" >> "$GITHUB_OUTPUT"
+ else
+ echo "present=false" >> "$GITHUB_OUTPUT"
+ fi
+
+ - name: Install lcov
+ run: sudo apt-get update && sudo apt-get install -y lcov
+
+ - name: Install clang-format (if needed)
+ if: steps.detect-c.outputs.present == 'true'
+ run: sudo apt-get install -y clang-format
+
+ - name: Set up Python 3.12
+ id: setup-python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+
+ - name: Cache pip
+ uses: actions/cache@v4
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-pip-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-pip-${{ steps.setup-python.outputs.python-version }}-
+ ${{ runner.os }}-pip-
+
+ - name: Install Python dependencies
+ run: pip install -r requirements.txt -r requirements-dev.txt
+
+ - name: Python lint (ruff)
+ run: ruff check app/ tests/
+
+ - name: C style (clang-format)
+ if: steps.detect-c.outputs.present == 'true'
+ run: |
+ mapfile -d '' files < <(find . \( -name "*.c" -o -name "*.h" \) -print0)
+ clang-format --version
+ printf '%s\0' "${files[@]}" | xargs -0 clang-format --dry-run --Werror
+
+ - name: Check all Python modules import cleanly
+ run: |
+ python -c "from app.model import ToxicityClassifier"
+ python -c "from app.database import get_recent_posts, save_post, seed_if_empty"
+ python -c "from app.graph import build_cooccurrence_graph"
+ python -c "from app.ingestion import ALGOSPEAK_QUERIES"
+ python -c "from app.main import app"
+
+ - name: Run tests with coverage
+ env:
+ BLUESKY_HANDLE: "test@test.com"
+ BLUESKY_PASSWORD: "testpassword"
+ PYTHONPATH: .
+ run: |
+ python -m pytest tests/ -v --tb=short --cov=app --cov-report=xml
+
+ # ── Frontend build verification ──────────────────────────────────────────
+ # WHY build React in CI: catches TypeScript errors and missing imports
+ # before they cause a silent failure during Docker build on HuggingFace.
+ - name: Set up Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - name: Install pnpm
+ run: npm install -g pnpm
+
+ - name: Cache pnpm store
+ uses: actions/cache@v4
+ with:
+ path: ~/.local/share/pnpm/store
+ key: ${{ runner.os }}-pnpm-${{ hashFiles('frontend/pnpm-lock.yaml') }}
+ restore-keys: |
+ ${{ runner.os }}-pnpm-
+
+ - name: Install frontend dependencies
+ working-directory: frontend
+ run: pnpm install --frozen-lockfile
+
+ - name: Build frontend (verify no TypeScript errors)
+ working-directory: frontend
+ run: pnpm build
+
+ - name: Collect lcov coverage
+ id: lcov
+ run: |
+ if find . -name "*.gcda" -o -name "*.gcno" | grep -q .; then
+ lcov --capture --directory . --output-file coverage.info --ignore-errors unused --no-external
+ echo "generated=true" >> "$GITHUB_OUTPUT"
+ else
+ echo "No gcda/gcno files found; skipping lcov capture."
+ echo "generated=false" >> "$GITHUB_OUTPUT"
+ fi
+
+ - name: Upload Python coverage to Codecov
+ uses: codecov/codecov-action@v4
+ with:
+ files: ./coverage.xml
+ token: ${{ secrets.CODECOV_TOKEN }}
+ flags: python
+ fail_ci_if_error: false
+
+ - name: Upload C coverage to Codecov
+ if: steps.lcov.outputs.generated == 'true'
+ uses: codecov/codecov-action@v4
+ with:
+ files: ./coverage.info
+ token: ${{ secrets.CODECOV_TOKEN }}
+ flags: c
+ fail_ci_if_error: false
diff --git a/.github/workflows/hf-deploy.yml b/.github/workflows/hf-deploy.yml
new file mode 100644
index 0000000000000000000000000000000000000000..761a962cfffcca217070faf683830d95f9edaa79
--- /dev/null
+++ b/.github/workflows/hf-deploy.yml
@@ -0,0 +1,53 @@
+name: Deploy to HuggingFace Spaces
+
+on:
+ push:
+ branches: [main]
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Set up Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - name: Install pnpm
+ run: npm install -g pnpm
+
+ - name: Build React frontend
+ working-directory: frontend
+ run: |
+ pnpm install --frozen-lockfile
+ pnpm build
+
+ - name: Push to HuggingFace Space
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ run: |
+ git config --global user.email "deploy@github-actions.com"
+ git config --global user.name "GitHub Actions"
+
+ git checkout --orphan hf-deploy
+
+ git rm -rf assets/ 2>/dev/null || true
+ git rm -rf frontend/src/ 2>/dev/null || true
+ git rm -rf frontend/node_modules/ 2>/dev/null || true
+ git rm -rf frontend/public/ 2>/dev/null || true
+
+ cp README_HF.md README.md
+ git rm --cached README_HF.md 2>/dev/null || true
+ rm README_HF.md
+
+ git add -f frontend/dist/
+ git add .
+
+ git commit -m "Deploy to HuggingFace Spaces"
+ git remote add space https://odeliyach:$HF_TOKEN@huggingface.co/spaces/odeliyach/Algoscope
+ git push space hf-deploy:main --force
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d2249e0ab2f7b868385d4b494a0f4dff13d21e65
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,38 @@
+# Credentials — NEVER commit
+.env
+.env.local
+.env.*.local
+
+# Database — contains scraped user data
+algoscope.db
+*.db
+
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+venv/
+.venv/
+*.egg-info/
+dist/
+build/
+
+# HuggingFace model cache
+.cache/
+
+# Pyvis auto-generated output — not application code
+*.html
+!frontend/index.html
+
+# React build output — generated by `pnpm build`, not source code
+frontend/dist/
+frontend/node_modules/
+
+# OS
+.DS_Store
+Thumbs.db
+
+# IDE
+.vscode/
+.idea/
+*.swp
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..7b1cbed4c8fdabb8b8147980fe75c43b6422f22f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,19 @@
+FROM python:3.12-slim
+
+# Non-root user (uid 1000) — matches the uid HuggingFace Spaces containers
+# run as, and matches the README's stated "non-root user" hardening.
+RUN useradd -m -u 1000 appuser
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app/ ./app/
+COPY frontend/dist/ ./frontend/dist/
+
+USER appuser
+
+EXPOSE 7860
+
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "debug"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..00b9a13e594fbabe203ced36576087c89b7a6243
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Odeliya Charitonova
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..305f0481e66a1d2d4e7e80dd127e7a9588746117
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+.PHONY: install run-api run-dashboard test lint clean
+
+install:
+ pip install -r requirements.txt
+
+run-api:
+ uvicorn app.main:app --reload --port 8000
+
+run-dashboard:
+ streamlit run dashboard.py
+
+test:
+ python -m pytest tests/ -v --tb=short
+
+lint:
+ ruff check app/ dashboard.py tests/
+
+clean:
+ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
+ rm -f algoscope.db
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10c7b6f65d1142ad39146d7f157d9889118a87b3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,205 @@
+---
+title: AlgoScope
+emoji: 🔍
+colorFrom: red
+colorTo: yellow
+sdk: docker
+pinned: false
+---
+
+
+
+# 🔍 AlgoScope
+
+**Real-time algospeak & toxicity detection on Bluesky**
+
+[![CI](https://github.com/odeliyach/Algoscope/actions/workflows/ci.yml/badge.svg)](https://github.com/odeliyach/Algoscope/actions/workflows/ci.yml)
+[![codecov](https://codecov.io/gh/odeliyach/Algoscope/branch/main/graph/badge.svg)](https://codecov.io/gh/odeliyach/Algoscope)
+[![Tests](https://img.shields.io/badge/tests-pytest-blue)](https://github.com/odeliyach/Algoscope/actions/workflows/ci.yml)
+[![Python 3.12](https://img.shields.io/badge/python-3.12-blue)](https://python.org)
+[![Model](https://img.shields.io/badge/%F0%9F%A4%97%20Model-AlgoShield-yellow)](https://huggingface.co/odeliyach/AlgoShield-Algospeak-Detection)
+[![Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-AlgoScope-yellow)](https://huggingface.co/spaces/odeliyach/algoscope)
+[![Streamlit](https://img.shields.io/badge/Streamlit-app-FF4B4B)](https://streamlit.io)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
+
+*Odeliya Charitonova · Tel Aviv University, School of CS & AI · 2026*
+
+
+
+---
+
+## What is AlgoScope?
+
+Algospeak is the evolving coded language people use to evade content moderation — "unalive" instead of suicide, "seggs" instead of sex, "le dollar bean" instead of lesbian. Standard toxicity APIs score these near zero because they look benign to classifiers trained on explicit language.
+
+AlgoScope is a live dashboard that catches them anyway. It ingests posts from the Bluesky social network in real time, classifies each one with a fine-tuned DistilBERT model trained specifically on algospeak, and visualizes toxicity patterns, co-occurrence networks, and trend spikes in an interactive dashboard.
+
+> **Why this matters:** Algospeak evasion is an active research problem in content moderation. This project turns published NLP research into a live, clickable product.
+
+---
+
+## Live Demo
+
+| Resource | Link |
+|----------|------|
+| 🖥️ Live dashboard | [huggingface.co/spaces/odeliyach/algoscope](https://huggingface.co/spaces/odeliyach/algoscope) |
+| 🤗 Fine-tuned model | [odeliyach/AlgoShield-Algospeak-Detection](https://huggingface.co/odeliyach/AlgoShield-Algospeak-Detection) |
+| 💻 GitHub | [github.com/odeliyach/Algoscope](https://github.com/odeliyach/Algoscope) |
+
+---
+
+---
+
+## Features
+
+- **🚨 Spike alerts** — red banner when a tracked term exceeds 80% toxic in the last hour
+- **📊 Toxicity over time** — hourly line chart with color-coded data points (green/orange/red by toxicity level)
+- **🕸️ Co-occurrence graph** — interactive word graph built with NetworkX + Pyvis; nodes colored by toxicity rate
+- **⚖️ Term comparison** — side-by-side toxicity profiles for any two tracked terms
+- **📥 Export** — download all analyzed posts as CSV or JSON
+- **🎛️ Threshold slider** — tune precision/recall tradeoff at inference time without retraining
+
+---
+
+## Architecture
+
+```
+┌─────────────────┐ AT Protocol ┌───────────────────┐
+│ Bluesky API │ ───────────────▶ │ ingestion.py │
+└─────────────────┘ │ dedup + preproc │
+ └─────────┬─────────┘
+ │
+ ┌─────────▼─────────┐
+ │ model.py │
+ │ DistilBERT │
+ │ singleton + batch│
+ └─────────┬─────────┘
+ │
+ ┌────────────────────▼──────────────────────┐
+ │ database.py │
+ │ SQLite · URI-keyed deduplication │
+ └────────────────────┬──────────────────────┘
+ │
+ ┌────────────────────────────────▼────────────────────────────┐
+ │ dashboard.py │
+ │ Streamlit · Plotly · NetworkX · Pyvis (4 tabs) │
+ └─────────────────────────────────────────────────────────────┘
+```
+
+**Stack:** Python 3.12 · FastAPI · Streamlit · SQLite · NetworkX · Pyvis · Plotly · HuggingFace Transformers · AT Protocol (Bluesky)
+
+---
+
+## Model — AlgoShield
+
+The classifier powering AlgoScope is **AlgoShield**, a DistilBERT model fine-tuned on the [MADOC dataset](https://arxiv.org/abs/2306.01976) (Multimodal Algospeak Detection and Offensive Content). It was trained and evaluated separately — full training code, dataset preprocessing, and evaluation notebooks are in the [AlgoShield repository](https://huggingface.co/odeliyach/AlgoShield-Algospeak-Detection).
+
+| Metric | Baseline DistilBERT | AlgoShield (fine-tuned) |
+|--------|---------------------|------------------------|
+| Precision | 70.3% | 61.2% |
+| Recall | 33.2% | **73.2% (+40 pts)** |
+| F1 | 49.0% | **66.7% (+17.7 pts)** |
+
+The +40-point recall improvement comes at the cost of ~9 points of precision — a deliberate tradeoff. In content moderation, a false negative (missing a toxic post) causes real harm; a false positive just means a human reviews something innocent. The threshold slider in AlgoScope lets operators tune this tradeoff at deployment time without retraining.
+
+> Want to understand how AlgoShield was built? See the [model card and training details →](https://huggingface.co/odeliyach/AlgoShield-Algospeak-Detection)
+
+---
+
+## Key Engineering Decisions
+
+**Train/serve parity** — The same `preprocess_text()` function used during AlgoShield's training is applied at inference time in AlgoScope. Without this, the model sees out-of-distribution input on every prediction — a production ML bug called train/serve skew.
+
+**Threshold separation** — The model outputs a raw confidence score; a threshold slider converts it to a binary label. This separates the ML model from business policy — the same pattern used in Gmail spam and YouTube moderation. One model, multiple thresholds tuned per context.
+
+**Graph construction order** — The co-occurrence graph filters to the 1-hop neighborhood of algospeak seed words *before* frequency ranking. The naive approach (top-30 globally, then filter) always returns generic English function words ("get", "like", "know") — useless for the project's purpose.
+
+**Physics disabled** — Pyvis force-directed layout is O(n²) per animation frame. With 30+ nodes it froze the browser for 2+ minutes. A fixed `randomSeed` layout loads instantly with reproducible positions.
+
+**SQLite with clean abstraction** — All persistence is isolated in `database.py`. No other file imports `sqlite3` directly. Replacing SQLite with PostgreSQL or Cassandra requires changing only that one file.
+
+---
+
+## Running Locally
+
+**Requirements:** Python 3.12, a Bluesky account
+
+```bash
+git clone https://github.com/odeliyach/Algoscope
+cd Algoscope
+python -m venv venv
+venv\Scripts\activate # Windows
+# source venv/bin/activate # Mac/Linux
+pip install -r requirements.txt
+```
+
+Or with Make:
+```bash
+make install
+make run-dashboard # in one terminal
+make run-api # in another
+```
+
+Create `.env` in the project root:
+```env
+BLUESKY_HANDLE=yourhandle.bsky.social
+BLUESKY_PASSWORD=yourpassword
+```
+
+---
+
+## Project Structure
+
+```
+Algoscope/
+├── app/
+│ ├── main.py # FastAPI endpoints (/health, /predict)
+│ ├── model.py # ToxicityClassifier — singleton load, batch inference
+│ ├── ingestion.py # Bluesky AT Protocol client + preprocessing
+│ ├── database.py # SQLite persistence — isolated for easy swap
+│ └── graph.py # NetworkX co-occurrence graph + Pyvis HTML export
+├── assets/
+│ ├── overview.png # Dashboard overview screenshot
+│ ├── graph.png # Co-occurrence graph screenshot
+│ └── term_comparison.png # Term comparison screenshot
+├── tests/
+│ └── test_core.py # Preprocessing parity, DB round-trip, stopwords
+├── dashboard.py # Streamlit dashboard — 4 tabs
+├── Makefile # install / run / test / lint shortcuts
+├── requirements.txt # Runtime dependencies
+├── pyproject.toml # Project metadata + tooling config
+├── Dockerfile # python:3.12-slim, non-root user
+├── .github/workflows/
+│ └── ci.yml # Import checks + syntax + pytest on every push
+└── .env # Credentials — not committed
+```
+
+---
+
+## Deployment (HuggingFace Spaces)
+
+1. Push this repo to GitHub (verify `.env` and `algoscope.db` are in `.gitignore`)
+2. Go to [huggingface.co](https://huggingface.co) → New Space → Streamlit → connect this GitHub repo
+3. In Space Settings → Secrets, add `BLUESKY_HANDLE` and `BLUESKY_PASSWORD`
+4. The Space auto-deploys on every push to `main`
+
+---
+
+## Limitations & Future Work
+
+- **Bluesky-only** — the ingestion layer is modular; adding Reddit or Mastodon requires only a new adapter in `ingestion.py`
+- **Fetch-on-click** — a background ingestion loop would keep data flowing continuously without user interaction
+- **Static model** — algospeak evolves; periodic retraining or drift detection would maintain coverage over time
+- **SQLite single-writer** — replacing with PostgreSQL or Cassandra enables concurrent multi-worker ingestion
+
+---
+
+## License
+
+MIT — see [LICENSE](LICENSE)
+
+---
+
+
+AlgoScope · Tel Aviv University, School of CS & AI · Odeliya Charitonova · 2026
+
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1 @@
+
diff --git a/app/database.py b/app/database.py
new file mode 100644
index 0000000000000000000000000000000000000000..25f1153b8ac6ffdb916b8afe3b98cd0831402787
--- /dev/null
+++ b/app/database.py
@@ -0,0 +1,164 @@
+"""
+SQLite database for storing post classification results.
+
+ARCHITECTURE NOTE (interview talking point):
+All persistence is isolated in this file. No other module imports sqlite3
+directly. This means swapping SQLite for PostgreSQL or any other store
+requires changing only this one file — the rest of the codebase is
+completely unaware of how data is stored.
+"""
+
+import logging
+import os
+import sqlite3
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+_temp_dir = os.environ.get("TMPDIR")
+if not _temp_dir:
+ _temp_dir = os.environ.get("TEMP") or os.environ.get("TMP") or "/tmp"
+DB_PATH = os.environ.get(
+ "ALGOSCOPE_DB_PATH",
+ os.path.join(_temp_dir, "algoscope.db"),
+)
+
+_db_initialized = False
+
+
+def _get_connection() -> sqlite3.Connection:
+ conn = sqlite3.connect(DB_PATH)
+ conn.row_factory = sqlite3.Row
+ return conn
+
+
+def init_db() -> None:
+ """Create tables if they don't exist. Safe to call multiple times."""
+ with _get_connection() as conn:
+ conn.execute(
+ """
+ CREATE TABLE IF NOT EXISTS posts (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ text TEXT NOT NULL,
+ label TEXT NOT NULL,
+ score REAL NOT NULL,
+ platform TEXT NOT NULL,
+ query_term TEXT NOT NULL DEFAULT '',
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ )
+ """
+ )
+ try:
+ conn.execute("ALTER TABLE posts ADD COLUMN query_term TEXT NOT NULL DEFAULT ''")
+ except sqlite3.OperationalError:
+ pass
+ conn.commit()
+
+
+def _ensure_init() -> None:
+ """Initialize DB once per process, not on every call."""
+ global _db_initialized
+ if not _db_initialized:
+ init_db()
+ _db_initialized = True
+
+
+def save_post(
+ text: str,
+ label: str,
+ score: float,
+ platform: str,
+ query_term: str = "",
+) -> None:
+ """Insert a classified post into the posts table."""
+ _ensure_init()
+ with _get_connection() as conn:
+ conn.execute(
+ "INSERT INTO posts (text, label, score, platform, query_term) VALUES (?, ?, ?, ?, ?)",
+ (text, label, score, platform, query_term),
+ )
+ conn.commit()
+
+
+def get_recent_posts(limit: int = 100) -> list[dict[str, Any]]:
+ """Return the most recent posts as a list of dicts, newest first."""
+ _ensure_init()
+ with _get_connection() as conn:
+ cursor = conn.execute(
+ """
+ SELECT id, text, label, score, platform, query_term, created_at
+ FROM posts
+ ORDER BY created_at DESC
+ LIMIT ?
+ """,
+ (limit,),
+ )
+ rows = cursor.fetchall()
+ return [dict(row) for row in rows]
+
+
+def get_post_count() -> int:
+ """Return total number of posts in the DB."""
+ _ensure_init()
+ with _get_connection() as conn:
+ return conn.execute("SELECT COUNT(*) FROM posts").fetchone()[0]
+
+
+def seed_if_empty() -> None:
+ """
+ If the DB is empty (cold start or HF ephemeral filesystem wipe), fetch
+ a small batch of real posts from Bluesky and classify them so the
+ dashboard has data immediately without requiring the user to click FETCH.
+
+ WHY this is safe now (it was disabled before):
+ Previously this ran at module import time, triggering a model download
+ before uvicorn bound to port 7860, killing the container with no logs.
+ Now it is called from lifespan() AFTER the server is up and AFTER the
+ classifier has loaded. A failure here is non-fatal.
+
+ WHY 4 queries at limit=32 (not all queries at full limit):
+ Seeding is best-effort background work. ~30 posts is enough to populate
+ all dashboard widgets. Seeding all queries would add 10-30s to cold
+ start time, unacceptable for a free-tier Space that restarts often.
+ """
+ _ensure_init()
+ count = get_post_count()
+ if count > 0:
+ logger.info("seed_if_empty: DB has %d posts, skipping seed", count)
+ return
+
+ logger.info("seed_if_empty: DB is empty, seeding from Bluesky...")
+ try:
+ from app.ingestion import ALGOSPEAK_QUERIES, fetch_posts
+ from app.model import ToxicityClassifier
+
+ classifier = ToxicityClassifier()
+ if classifier._pipeline is None:
+ logger.warning("seed_if_empty: classifier not ready, skipping seed")
+ return
+
+ seed_queries = ALGOSPEAK_QUERIES[:4]
+ posts = fetch_posts(query=seed_queries[0], limit=32, queries=seed_queries)
+ if not posts:
+ logger.warning("seed_if_empty: no posts returned from Bluesky")
+ return
+
+ texts = [t for t, _ in posts]
+ timestamps = [ts for _, ts in posts]
+ predictions = classifier.predict_batch(texts)
+
+ for text, ts, pred in zip(texts, timestamps, predictions):
+ score = float(pred.get("score", 0.0) or 0.0)
+ label = "toxic" if score >= 0.70 else "non-toxic"
+ matched = next(
+ (q for q in seed_queries if q and q.lower() in text.lower()),
+ seed_queries[0],
+ )
+ save_post(text=text, label=label, score=score, platform="bluesky", query_term=matched)
+
+ logger.info("seed_if_empty: seeded %d posts", len(texts))
+ except Exception as exc:
+ # WHY catch-all: Bluesky credentials may not be set, the network may
+ # be unavailable, or the model may not have loaded. The app must start
+ # regardless - the user can always click FETCH manually.
+ logger.warning("seed_if_empty: failed (non-fatal): %s", exc)
diff --git a/app/graph.py b/app/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..a778828ab689bc211328fe85bd7308bec3b5d9ec
--- /dev/null
+++ b/app/graph.py
@@ -0,0 +1,211 @@
+"""
+Graph utilities for exploring algospeak co-occurrence patterns.
+
+Co-occurrence graphs are a classic NLP exploratory tool: by connecting words
+that frequently appear together, we can surface clusters of related slang
+or emergent euphemisms that would be hard to spot from raw text alone.
+"""
+
+from __future__ import annotations
+
+import re
+from itertools import combinations
+from typing import Dict, List
+
+import networkx as nx
+from pyvis.network import Network
+
+from app.database import get_recent_posts
+
+# All words to exclude from the graph. Centralizing here makes it easy to tune.
+STOPWORDS = {
+ # English function words
+ "the", "and", "or", "but", "if", "then", "else", "when", "where",
+ "what", "which", "who", "whom", "this", "that", "these", "those",
+ "a", "an", "of", "in", "on", "for", "to", "from", "by", "with",
+ "at", "as", "is", "am", "are", "was", "were", "be", "been", "being",
+ "it", "its", "he", "she", "they", "them", "we", "us", "you", "your",
+ "yours", "i", "me", "my", "mine", "our", "ours", "their", "theirs",
+ "do", "does", "did", "doing", "done", "have", "has", "had",
+ "will", "would", "can", "could", "should", "must", "may", "might",
+ "just", "like", "so", "very", "too", "not", "no", "yes",
+ "there", "here", "than", "also", "even", "more", "most",
+ "get", "got", "go", "going", "say", "said", "out", "now", "day",
+ "because", "some", "people", "love", "social", "really", "while",
+ "think", "know", "want", "see", "make", "take", "come", "look",
+ "good", "new", "first", "last", "long", "great", "little", "own",
+ "right", "big", "high", "small", "large", "next", "early", "old",
+ "well", "still", "way", "every", "never", "always", "much", "need",
+ "feel", "put", "keep", "let", "ask", "seem", "show", "try", "call",
+ "back", "other", "free", "real", "best", "true", "about", "after",
+ "again", "dont", "isnt", "cant", "wont", "didnt", "doesnt", "youre",
+ "theyre", "whats", "thats", "thing", "things", "time",
+ # Spanish function words (common on Bluesky)
+ "de", "que", "con", "como", "para", "una", "uno", "los", "las",
+ "por", "del", "sus", "pero", "todo", "esta", "este", "son", "hay",
+ "nos", "han", "fue", "ser", "ver", "vez", "sin", "sobre", "entre",
+ "cuando", "bien", "solo", "puede", "tiene", "desde", "hasta",
+ # Web / file tokens
+ "jpg", "jpeg", "png", "gif", "webp", "www", "http", "https",
+ "com", "org", "net", "html", "php", "amp", "via", "bit",
+}
+
+
+def _tokenize(text: str) -> List[str]:
+ """
+ Lightweight tokenizer for short social posts.
+ Drops stopwords, short tokens, and tokens with digits.
+ """
+ text = text.lower()
+ raw_tokens = re.split(r"\W+", text)
+ tokens = []
+ for tok in raw_tokens:
+ if not tok or len(tok) <= 2:
+ continue
+ if any(ch.isdigit() for ch in tok):
+ continue
+ if tok in STOPWORDS:
+ continue
+ tokens.append(tok)
+ return tokens
+
+
+def build_cooccurrence_graph(min_cooccurrence: int = 2) -> nx.Graph:
+ """
+ Build a word co-occurrence graph from all posts in the database.
+
+ WHY co-occurrence graphs for algospeak: slang evolves in clusters.
+ 'unalive' tends to appear with 'suicide', 'depression', 'mental'.
+ Mapping these clusters reveals semantic neighborhoods of evasive language
+ that a simple keyword list would miss.
+ """
+ posts = get_recent_posts(limit=10_000_000)
+
+ G = nx.Graph()
+ word_counts: Dict[str, int] = {}
+ toxic_word_counts: Dict[str, int] = {}
+
+ for row in posts:
+ text = (row.get("text") or "").strip()
+ label = (row.get("label") or "non-toxic").lower()
+ if not text:
+ continue
+ tokens = _tokenize(text)
+ if not tokens:
+ continue
+
+ # Use a set so repeated words in one post don't inflate edge weights
+ unique_words = set(tokens)
+
+ for w in unique_words:
+ word_counts[w] = word_counts.get(w, 0) + 1
+ if label == "toxic":
+ toxic_word_counts[w] = toxic_word_counts.get(w, 0) + 1
+
+ for w1, w2 in combinations(sorted(unique_words), 2):
+ if G.has_edge(w1, w2):
+ G[w1][w2]["weight"] += 1
+ else:
+ G.add_edge(w1, w2, weight=1)
+
+ # Remove weak edges and isolated nodes
+ G.remove_edges_from([
+ (u, v) for u, v, d in G.edges(data=True)
+ if d.get("weight", 0) < min_cooccurrence
+ ])
+ G.remove_nodes_from(list(nx.isolates(G)))
+
+ # Attach node metadata for visualization
+ for word, count in word_counts.items():
+ if word not in G:
+ continue
+ G.nodes[word]["count"] = count
+ G.nodes[word]["toxic_count"] = toxic_word_counts.get(word, 0)
+
+ # STEP 1: Filter to algospeak neighborhood FIRST.
+ # WHY order matters: filtering before top-30 ensures we get the most
+ # frequent *algospeak-related* words, not the most frequent generic words.
+ from app.ingestion import ALGOSPEAK_QUERIES
+ seed_words = {w for q in ALGOSPEAK_QUERIES for w in q.lower().split()}
+
+ relevant = set()
+ for seed in seed_words:
+ if seed in G:
+ relevant.add(seed)
+ relevant.update(G.neighbors(seed))
+
+ if relevant:
+ G = G.subgraph(relevant).copy()
+
+ # STEP 2: Take top 30 by frequency from the algospeak neighborhood
+ if G.number_of_nodes() > 30:
+ top_nodes = sorted(
+ G.nodes(data=True),
+ key=lambda x: x[1].get("count", 0),
+ reverse=True,
+ )[:30]
+ G = G.subgraph({n[0] for n in top_nodes}).copy()
+
+ return G
+
+
+def graph_to_pyvis(graph: nx.Graph, toxic_only: bool = False) -> str:
+ """
+ Convert a NetworkX graph into interactive Pyvis HTML.
+ Physics disabled for instant rendering (animation caused 2min load times).
+ """
+ net = Network(height="600px", width="100%", directed=False, notebook=False)
+ net.set_options('''{
+ "physics": {"enabled": false},
+ "configure": {"enabled": false},
+ "layout": {"randomSeed": 42}
+ }''')
+
+ included_nodes = set()
+ for node, data in graph.nodes(data=True):
+ count = int(data.get("count", 1) or 1)
+ toxic_count = int(data.get("toxic_count", 0) or 0)
+ toxic_ratio = toxic_count / count if count else 0.0
+
+ if toxic_only and toxic_count == 0:
+ continue
+
+ # 3-color system: red=mostly toxic, orange=mixed, green=mostly benign
+ # More informative than binary because it shows usage context gradient
+ if toxic_ratio > 0.7:
+ color = "#ff4b4b"
+ elif toxic_ratio >= 0.4:
+ color = "#ff9f43"
+ else:
+ color = "#2ecc71"
+
+ net.add_node(node, label=node, color=color, value=count)
+ included_nodes.add(node)
+
+ for u, v, data in graph.edges(data=True):
+ if u not in included_nodes or v not in included_nodes:
+ continue
+ net.add_edge(u, v, value=int(data.get("weight", 1) or 1))
+
+ # Replace local file reference that breaks in Streamlit's sandboxed iframe
+ html = net.generate_html()
+
+ import json # noqa: F401
+ center_script = """
+
+ """
+ html = html.replace("