Spaces:
Running
Running
"""Task 8 quality gate: flag low-quality units from a crawl batch.

Usage:
    python3 scripts/audit_crawl_quality.py --source aops --since 2024-01-01
    python3 scripts/audit_crawl_quality.py --source aops --since 2024-01-01 --delete-batch
"""
| import argparse | |
| import sqlite3 | |
| import sys | |
# SQLite database file opened by main() via sqlite3.connect().
DB_PATH: str = "math_wiki.db"

# Units whose content has fewer characters than this are flagged "too_short".
MIN_LENGTH: int = 50

# Case-sensitive substrings; a unit containing any of them is treated as
# navigation text rather than real content.
NAVIGATION_PATTERNS: list[str] = [
    "Retrieved from",
    "This article",
    "AoPS Wiki",
    "Art of Problem Solving",
    "Category:",
    "Navigation menu",
]
def is_navigation_text(content: str) -> bool:
    """Return True if *content* contains any NAVIGATION_PATTERNS substring.

    Matching is a plain, case-sensitive substring test; the first hit wins.
    """
    for marker in NAVIGATION_PATTERNS:
        if marker in content:
            return True
    return False
def main(source: str, since: str | None, delete_batch: bool) -> int:
    """Audit the non-deleted units of one source and report low-quality ones.

    A unit is flagged when its content is shorter than MIN_LENGTH characters
    ("too_short") or, failing that, when is_navigation_text() matches it
    ("navigation_text"). A summary and one line per flagged unit are printed.

    Args:
        source: Value matched against the ``source`` column of ``wiki_units``.
        since: Optional lower bound compared against ``created_at``. Ignored
            (with a printed warning) when the table has no such column.
        delete_batch: When true, set ``deleted=1`` on every flagged unit.

    Returns:
        The number of flagged units.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        query = "SELECT id, topic, subtopic, content, source_url FROM wiki_units WHERE deleted=0 AND source=?"
        params: list = [source]
        if since:
            # created_at column may not be present in older schema; fall back gracefully
            cols = {col[1] for col in conn.execute("PRAGMA table_info(wiki_units)")}
            if "created_at" in cols:
                query += " AND created_at >= ?"
                params.append(since)
            else:
                print("Warning: no created_at column — ignoring --since filter.")
        rows = conn.execute(query, params).fetchall()
        print(f"Checking {len(rows)} units from source='{source}'" + (f" since {since}" if since else ""))

        flagged: list[tuple[str, str, str]] = []  # (id, reason, content_preview)
        for row in rows:
            # A NULL content column comes back as None; treat it as empty so
            # it is reported as too_short instead of crashing on len(None).
            content = row["content"] or ""
            if len(content) < MIN_LENGTH:
                flagged.append((row["id"], f"too_short ({len(content)} chars)", content))
            elif is_navigation_text(content):
                flagged.append((row["id"], "navigation_text", content[:100]))

        total = len(rows)
        pct = (len(flagged) / total * 100) if total else 0
        print(f"Flagged: {len(flagged)} / {total} ({pct:.1f}%)")
        for uid, reason, preview in flagged:
            print(f"  [{reason}] {uid}: {preview[:80]!r}")

        if pct > 10:
            print("\nWARNING: >10% failure rate — diagnose concept_ingest before continuing.")
            # NOTE(review): the hint is shown only together with the warning —
            # confirm that nesting is intended. It also says "entire batch",
            # but the update below touches only the flagged units.
            if not delete_batch:
                print("Re-run with --delete-batch to soft-delete entire batch.")

        if delete_batch and flagged:
            ids = [uid for uid, _reason, _preview in flagged]
            # One bound parameter per statement: a single "IN (?,?,...)" list
            # can exceed SQLite's bound-parameter limit on large batches.
            # All updates share one transaction and are committed together.
            conn.executemany(
                "UPDATE wiki_units SET deleted=1 WHERE id=?",
                [(uid,) for uid in ids],
            )
            conn.commit()
            print(f"\nSoft-deleted {len(ids)} flagged units.")
    finally:
        # Always release the connection, even if a query above raised.
        conn.close()
    return len(flagged)
if __name__ == "__main__":
    # Script entry point: read the CLI flags and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("--source", default="aops")
    cli.add_argument("--since", default=None, help="ISO date, e.g. 2024-01-01")
    cli.add_argument("--delete-batch", action="store_true")
    opts = cli.parse_args()
    main(source=opts.source, since=opts.since, delete_batch=opts.delete_batch)