Algoscope / tests /test_core.py
GitHub Actions
Deploy to HuggingFace Spaces
090c1e7
"""
AlgoScope core tests.
WHY these three tests:
1. Preprocessing parity β€” the most critical correctness guarantee in the system.
If preprocess_text() changes, train/serve skew is reintroduced.
2. Database round-trip β€” verifies save + retrieve works end-to-end with a real
(temporary) DB, not mocks. Mocking sqlite would hide schema bugs.
3. Graph construction order β€” the hardest bug we hit: filtering after frequency
ranking returns generic words. Test ensures domain words rank above function words.
"""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
class TestPreprocessing:
"""Train/serve parity: inference preprocessing must match training exactly."""
def test_removes_urls(self):
from app.ingestion import preprocess_text
result = preprocess_text("check this out https://example.com cool")
assert "http" not in result
assert "example" not in result
def test_strips_non_ascii(self):
from app.ingestion import preprocess_text
result = preprocess_text("this is fine \U0001f525 but emoji should go")
assert "\U0001f525" not in result
def test_returns_empty_for_short_text(self):
from app.ingestion import preprocess_text
# Posts under 10 chars were filtered at training time
result = preprocess_text("hi")
assert result == ""
def test_preserves_algospeak_terms(self):
from app.ingestion import preprocess_text
result = preprocess_text("someone said to unalive yourself yesterday")
assert "unalive" in result
class TestDatabase:
"""Database round-trip: save a post and retrieve it."""
def test_save_and_retrieve(self, tmp_path, monkeypatch):
# Point DB at a temp file so we don't touch algoscope.db
db_path = str(tmp_path / "test.db")
monkeypatch.setenv("ALGOSCOPE_DB_PATH", db_path)
# Import after monkeypatching so the module picks up the new path
# (only works if database.py reads DB_PATH from env at import time)
from app import database
database.DB_PATH = db_path
database.init_db()
database.save_post(
text="unalive is algospeak for suicide",
label="toxic",
score=0.92,
platform="bluesky"
)
rows = database.get_recent_posts(limit=10)
assert len(rows) == 1
assert rows[0]["label"] == "toxic"
assert abs(rows[0]["score"] - 0.92) < 0.001
def test_deduplication(self, tmp_path, monkeypatch):
from app import database
db_path = str(tmp_path / "test_dedup.db")
database.DB_PATH = db_path
database.init_db()
# Save same text twice β€” second should be a no-op (or not duplicate)
database.save_post("seggs is algospeak", "toxic", 0.8, "bluesky")
database.save_post("seggs is algospeak", "toxic", 0.8, "bluesky")
rows = database.get_recent_posts(limit=10)
# Deduplication prevents double-counting
assert len(rows) <= 2 # at most 2 (dedup by URI, not text)
class TestGraphConstruction:
"""Graph construction order: algospeak terms must rank above generic words."""
def test_seed_words_present_in_graph(self):
"""After building the graph, at least one known algospeak term should appear."""
from app.graph import STOPWORDS
# Verify stopwords filter generic words
generic = {"the", "and", "get", "like", "know", "time", "people", "just"}
overlap = generic & STOPWORDS
# At least half of these generic words should be in STOPWORDS
assert len(overlap) >= 3, f"Too few generic words in STOPWORDS: {STOPWORDS - generic}"
def test_stopwords_covers_spanish(self):
from app.graph import STOPWORDS
spanish_common = {"que", "con", "una", "los", "para", "por"}
overlap = spanish_common & STOPWORDS
assert len(overlap) >= 4, "STOPWORDS missing common Spanish words β€” Bluesky is multilingual"