| """ |
| AlgoScope core tests. |
| |
| WHY these three tests: |
1. Preprocessing parity — the most critical correctness guarantee in the system.
| If preprocess_text() changes, train/serve skew is reintroduced. |
2. Database round-trip — verifies save + retrieve works end-to-end with a real
| (temporary) DB, not mocks. Mocking sqlite would hide schema bugs. |
3. Graph construction order — the hardest bug we hit: filtering after frequency
| ranking returns generic words. Test ensures domain words rank above function words. |
| """ |
|
|
| import os |
| import sys |
|
|
| sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) |
|
|
|
|
class TestPreprocessing:
    """Train/serve parity: inference preprocessing must match training exactly."""

    def test_removes_urls(self):
        from app.ingestion import preprocess_text
        cleaned = preprocess_text("check this out https://example.com cool")
        # Both the URL scheme and the domain must be scrubbed from the output.
        assert "http" not in cleaned
        assert "example" not in cleaned

    def test_strips_non_ascii(self):
        from app.ingestion import preprocess_text
        cleaned = preprocess_text("this is fine \U0001f525 but emoji should go")
        # Non-ASCII codepoints (here: the fire emoji) must be removed.
        assert "\U0001f525" not in cleaned

    def test_returns_empty_for_short_text(self):
        from app.ingestion import preprocess_text
        # Texts below the minimum length are dropped entirely.
        assert preprocess_text("hi") == ""

    def test_preserves_algospeak_terms(self):
        from app.ingestion import preprocess_text
        cleaned = preprocess_text("someone said to unalive yourself yesterday")
        # Domain-specific algospeak vocabulary must survive cleaning.
        assert "unalive" in cleaned
|
|
|
|
class TestDatabase:
    """Database round-trip: save a post and retrieve it."""

    def test_save_and_retrieve(self, tmp_path, monkeypatch):
        """Insert one post into a real temporary SQLite DB and read it back."""
        db_path = str(tmp_path / "test.db")
        # Patch both the env var and the already-imported module attribute:
        # if app.database read DB_PATH at import time, setting only the env
        # var would silently keep pointing at the old path.
        monkeypatch.setenv("ALGOSCOPE_DB_PATH", db_path)

        from app import database
        database.DB_PATH = db_path
        database.init_db()

        database.save_post(
            text="unalive is algospeak for suicide",
            label="toxic",
            score=0.92,
            platform="bluesky"
        )

        rows = database.get_recent_posts(limit=10)
        assert len(rows) == 1
        assert rows[0]["label"] == "toxic"
        # Score is a float round-tripped through SQLite; compare with a
        # tolerance rather than exact equality.
        assert abs(rows[0]["score"] - 0.92) < 0.001

    def test_deduplication(self, tmp_path, monkeypatch):
        """Saving the identical post twice must store it only once."""
        from app import database
        db_path = str(tmp_path / "test_dedup.db")
        # Keep the env var consistent with test_save_and_retrieve (the
        # monkeypatch fixture was previously requested but never used here).
        monkeypatch.setenv("ALGOSCOPE_DB_PATH", db_path)
        database.DB_PATH = db_path
        database.init_db()

        database.save_post("seggs is algospeak", "toxic", 0.8, "bluesky")
        database.save_post("seggs is algospeak", "toxic", 0.8, "bluesky")

        rows = database.get_recent_posts(limit=10)
        # BUG FIX: the old assertion `len(rows) <= 2` was vacuously true
        # (only two inserts happened, so more than two rows is impossible) —
        # the test could never fail. Deduplication means the second identical
        # save is ignored, so exactly one row must remain.
        assert len(rows) == 1
|
|
|
|
class TestGraphConstruction:
    """Graph construction order: algospeak terms must rank above generic words."""

    def test_seed_words_present_in_graph(self):
        """STOPWORDS must cover common English function words.

        If generic high-frequency words are not filtered before frequency
        ranking, they crowd out the domain (algospeak) terms in the graph.
        (Docstring corrected: the body checks stopword coverage, not that an
        algospeak term appears in the built graph.)
        """
        from app.graph import STOPWORDS

        generic = {"the", "and", "get", "like", "know", "time", "people", "just"}
        overlap = generic & STOPWORDS

        # BUG FIX: the failure message previously printed `STOPWORDS - generic`
        # (the entire stopword list minus a handful of words) instead of the
        # generic words actually missing from STOPWORDS.
        assert len(overlap) >= 3, f"Generic words missing from STOPWORDS: {generic - STOPWORDS}"

    def test_stopwords_covers_spanish(self):
        """STOPWORDS must also filter high-frequency Spanish function words."""
        from app.graph import STOPWORDS
        spanish_common = {"que", "con", "una", "los", "para", "por"}
        overlap = spanish_common & STOPWORDS
        # Mojibake in the message repaired ("β" was a mis-encoded em dash).
        assert len(overlap) >= 4, "STOPWORDS missing common Spanish words — Bluesky is multilingual"
|
|