Spaces:

odeliyach
/

Algoscope

Running

GitHub Actions

Deploy to HuggingFace Spaces

090c1e7 15 days ago

4.04 kB

	"""
	AlgoScope core tests.

	WHY these three tests:
	1. Preprocessing parity — the most critical correctness guarantee in the system.
	If preprocess_text() changes, train/serve skew is reintroduced.
	2. Database round-trip — verifies save + retrieve works end-to-end with a real
	(temporary) DB, not mocks. Mocking sqlite would hide schema bugs.
	3. Graph construction order — the hardest bug we hit: filtering after frequency
	ranking returns generic words. Test ensures domain words rank above function words.
	"""

	import os
	import sys

	sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))


	class TestPreprocessing:
	"""Train/serve parity: inference preprocessing must match training exactly."""

	def test_removes_urls(self):
	from app.ingestion import preprocess_text
	result = preprocess_text("check this out https://example.com cool")
	assert "http" not in result
	assert "example" not in result

	def test_strips_non_ascii(self):
	from app.ingestion import preprocess_text
	result = preprocess_text("this is fine \U0001f525 but emoji should go")
	assert "\U0001f525" not in result

	def test_returns_empty_for_short_text(self):
	from app.ingestion import preprocess_text
	# Posts under 10 chars were filtered at training time
	result = preprocess_text("hi")
	assert result == ""

	def test_preserves_algospeak_terms(self):
	from app.ingestion import preprocess_text
	result = preprocess_text("someone said to unalive yourself yesterday")
	assert "unalive" in result


	class TestDatabase:
	"""Database round-trip: save a post and retrieve it."""

	def test_save_and_retrieve(self, tmp_path, monkeypatch):
	# Point DB at a temp file so we don't touch algoscope.db
	db_path = str(tmp_path / "test.db")
	monkeypatch.setenv("ALGOSCOPE_DB_PATH", db_path)

	# Import after monkeypatching so the module picks up the new path
	# (only works if database.py reads DB_PATH from env at import time)
	from app import database
	database.DB_PATH = db_path
	database.init_db()

	database.save_post(
	text="unalive is algospeak for suicide",
	label="toxic",
	score=0.92,
	platform="bluesky"
	)

	rows = database.get_recent_posts(limit=10)
	assert len(rows) == 1
	assert rows[0]["label"] == "toxic"
	assert abs(rows[0]["score"] - 0.92) < 0.001

	def test_deduplication(self, tmp_path, monkeypatch):
	from app import database
	db_path = str(tmp_path / "test_dedup.db")
	database.DB_PATH = db_path
	database.init_db()

	# Save same text twice — second should be a no-op (or not duplicate)
	database.save_post("seggs is algospeak", "toxic", 0.8, "bluesky")
	database.save_post("seggs is algospeak", "toxic", 0.8, "bluesky")

	rows = database.get_recent_posts(limit=10)
	# Deduplication prevents double-counting
	assert len(rows) <= 2 # at most 2 (dedup by URI, not text)


	class TestGraphConstruction:
	"""Graph construction order: algospeak terms must rank above generic words."""

	def test_seed_words_present_in_graph(self):
	"""After building the graph, at least one known algospeak term should appear."""
	from app.graph import STOPWORDS
	# Verify stopwords filter generic words
	generic = {"the", "and", "get", "like", "know", "time", "people", "just"}
	overlap = generic & STOPWORDS
	# At least half of these generic words should be in STOPWORDS
	assert len(overlap) >= 3, f"Too few generic words in STOPWORDS: {STOPWORDS - generic}"

	def test_stopwords_covers_spanish(self):
	from app.graph import STOPWORDS
	spanish_common = {"que", "con", "una", "los", "para", "por"}
	overlap = spanish_common & STOPWORDS
	assert len(overlap) >= 4, "STOPWORDS missing common Spanish words — Bluesky is multilingual"