Spaces:
Running
Running
r"""Task 2: Strip markdown bold (**text**) and HTML-LaTeX artifacts.

Converts '\(...\)' to '$...$', '\[...\]' to '$$...$$', and '**text**' to 'text'.
Also soft-deletes any unit whose content contains the word 'References',
which is treated as boilerplate.
"""
| import argparse | |
| import re | |
| import sqlite3 | |
# SQLite database file opened by main(); a relative path, so it is resolved
# against the current working directory.
DB_PATH = "math_wiki.db"

# Matches the case-sensitive whole word "References"; units whose content
# matches are soft-deleted instead of cleaned.
REFERENCES_PATTERN = re.compile(r"\bReferences\b")
def clean_content(text: str) -> str:
    r"""Return *text* with LaTeX delimiters normalised and bold markers removed.

    The substitutions are applied in this order:

    1. ``\(...\)`` becomes ``$...$`` (may span lines).
    2. ``\[...\]`` becomes ``$$...$$`` (may span lines).
    3. ``**text**`` becomes ``text`` (single line only).

    Every rule needs at least one character between its delimiters, so an
    empty pair such as ``\(\)`` is left untouched.
    """
    # (pattern, replacement, flags) triples, applied top to bottom.
    rewrites = (
        (r"\\\((.+?)\\\)", r"$\1$", re.DOTALL),
        (r"\\\[(.+?)\\\]", r"$$\1$$", re.DOTALL),
        (r"\*\*(.+?)\*\*", r"\1", 0),
    )
    for pattern, replacement, flags in rewrites:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text
def main(dry_run: bool) -> None:
    """Clean all live wiki units and soft-delete 'References' boilerplate.

    Reads every ``wiki_units`` row with ``deleted=0`` from ``DB_PATH``. A unit
    whose content matches ``REFERENCES_PATTERN`` is queued for soft-deletion
    (``deleted=1``); any other unit whose content is changed by
    ``clean_content`` is queued for a content update. The plan, with a
    100-character before/after preview per cleaned unit, is always printed.

    Args:
        dry_run: When true, print the plan and return without writing.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, content FROM wiki_units WHERE deleted=0"
        ).fetchall()

        # Keep the original text next to the cleaned text so the preview
        # below does not need to re-query the database for every unit.
        changes: list[tuple[str, str, str]] = []  # (id, old_content, new_content)
        deletes: list[str] = []
        for row in rows:
            content = row["content"]
            if content is None:
                # NULL content has nothing to clean and would make the regex
                # search raise TypeError.
                continue
            if REFERENCES_PATTERN.search(content):
                deletes.append(row["id"])
                continue
            cleaned = clean_content(content)
            if cleaned != content:
                changes.append((row["id"], content, cleaned))

        print(f"Units to clean: {len(changes)}")
        for uid, old, new_content in changes:
            print(f" {uid}:")
            print(f" BEFORE: {old[:100]!r}")
            print(f" AFTER: {new_content[:100]!r}")
        print(f"\nUnits to soft-delete (References boilerplate): {len(deletes)}")
        for uid in deletes:
            print(f" {uid}")

        if dry_run:
            print("\nDRY RUN — no changes made.")
            return

        # The connection context manager commits both batches together and
        # rolls back if either one fails.
        with conn:
            conn.executemany(
                "UPDATE wiki_units SET content=? WHERE id=?",
                [(new_content, uid) for uid, _old, new_content in changes],
            )
            conn.executemany(
                "UPDATE wiki_units SET deleted=1 WHERE id=?",
                [(uid,) for uid in deletes],
            )

        # Post-run sanity check: count live units that still contain the
        # markers this script is meant to remove.
        remaining_bold = conn.execute(
            "SELECT COUNT(*) FROM wiki_units WHERE content LIKE '%**%' AND deleted=0"
        ).fetchone()[0]
        remaining_latex = conn.execute(
            "SELECT COUNT(*) FROM wiki_units WHERE content LIKE '%\\(%' AND deleted=0"
        ).fetchone()[0]
        print(f"\nDone. Remaining '**' units: {remaining_bold} (target 0)")
        print(f"Remaining '\\(' units: {remaining_latex} (target 0)")
    finally:
        # Always release the database handle, including on the dry-run return
        # and when a query raises.
        conn.close()
if __name__ == "__main__":
    # Command-line entry point: parse the optional --dry-run flag and run the
    # cleanup. argparse comes from the module-level import.
    parser = argparse.ArgumentParser(
        description=(
            "Strip markdown bold and LaTeX delimiter artifacts from wiki units "
            "and soft-delete 'References' boilerplate."
        )
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="print the planned changes without writing to the database",
    )
    args = parser.parse_args()
    main(args.dry_run)