ai-agent-app / scripts /audit_crawl_quality.py
MinhTai's picture
deploy: 5e8a959
f0a6334
"""Task 8 quality gate: flag low-quality units from a crawl batch.
Usage:
    python3 scripts/audit_crawl_quality.py --source aops --since 2024-01-01
    python3 scripts/audit_crawl_quality.py --source aops --since 2024-01-01 --delete-batch
"""
import argparse
import sqlite3
import sys
# SQLite database file opened by main(); relative to the current working directory.
DB_PATH: str = "math_wiki.db"

# Units whose content has fewer characters than this are flagged as too short.
MIN_LENGTH: int = 50

# Substrings (matched case-sensitively) that mark a unit as navigation text
# rather than real content.
NAVIGATION_PATTERNS: list[str] = [
    "Retrieved from",
    "This article",
    "AoPS Wiki",
    "Art of Problem Solving",
    "Category:",
    "Navigation menu",
]
def is_navigation_text(content: str) -> bool:
    """Return True if *content* contains at least one NAVIGATION_PATTERNS entry.

    The check is a plain, case-sensitive substring search.
    """
    for marker in NAVIGATION_PATTERNS:
        if marker in content:
            return True
    return False
def main(source: str, since: str | None, delete_batch: bool) -> int:
    """Audit the live ``wiki_units`` rows of one source and report low-quality units.

    Rows with ``deleted=0`` and a matching ``source`` are read from the SQLite
    database at ``DB_PATH``. A unit is flagged when its content is shorter than
    ``MIN_LENGTH`` characters (a NULL content counts as empty) or when
    ``is_navigation_text`` matches it. A summary line and one line per flagged
    unit are printed to stdout.

    Args:
        source: Value matched against the ``source`` column.
        since: Optional lower bound bound to ``created_at >= ?``; the usage
            examples pass an ISO date such as ``2024-01-01``. If the table has
            no ``created_at`` column a warning is printed and the filter is
            skipped.
        delete_batch: When true, set ``deleted=1`` on every flagged unit and
            commit the change.

    Returns:
        The number of flagged units.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        query = "SELECT id, topic, subtopic, content, source_url FROM wiki_units WHERE deleted=0 AND source=?"
        params: list = [source]
        if since:
            # created_at column may not be present in older schema; fall back gracefully
            cols = {row[1] for row in conn.execute("PRAGMA table_info(wiki_units)")}
            if "created_at" in cols:
                query += " AND created_at >= ?"
                params.append(since)
            else:
                print("Warning: no created_at column — ignoring --since filter.")

        rows = conn.execute(query, params).fetchall()
        print(f"Checking {len(rows)} units from source='{source}'" + (f" since {since}" if since else ""))

        flagged: list[tuple[str, str, str]] = []  # (id, reason, content_preview)
        for row in rows:
            # A NULL content column comes back as None; treat it as empty so
            # it is reported as too_short instead of raising TypeError.
            content = row["content"] or ""
            if len(content) < MIN_LENGTH:
                flagged.append((row["id"], f"too_short ({len(content)} chars)", content))
            elif is_navigation_text(content):
                flagged.append((row["id"], "navigation_text", content[:100]))

        total = len(rows)
        pct = (len(flagged) / total * 100) if total else 0
        print(f"Flagged: {len(flagged)} / {total} ({pct:.1f}%)")
        for uid, reason, preview in flagged:
            print(f"  [{reason}] {uid}: {preview[:80]!r}")

        if pct > 10:
            print("\nWARNING: >10% failure rate — diagnose concept_ingest before continuing.")
            if not delete_batch:
                # NOTE(review): the message says "entire batch", but the code
                # below only soft-deletes the flagged units — confirm which
                # behaviour is intended.
                print("Re-run with --delete-batch to soft-delete entire batch.")

        if delete_batch and flagged:
            # One single-parameter statement per id: a single "IN (?,?,...)"
            # list can exceed SQLite's bound-parameter limit on large batches.
            conn.executemany(
                "UPDATE wiki_units SET deleted=1 WHERE id=?",
                [(uid,) for uid, _reason, _preview in flagged],
            )
            conn.commit()
            print(f"\nSoft-deleted {len(flagged)} flagged units.")
    finally:
        # Always release the connection, even when a query fails; uncommitted
        # changes are discarded on close.
        conn.close()
    return len(flagged)
if __name__ == "__main__":
    # Command-line entry point: parse the three flags and run the audit.
    cli = argparse.ArgumentParser()
    cli.add_argument("--source", default="aops")
    cli.add_argument("--since", default=None, help="ISO date, e.g. 2024-01-01")
    cli.add_argument("--delete-batch", action="store_true")
    opts = cli.parse_args()
    main(source=opts.source, since=opts.since, delete_batch=opts.delete_batch)