File size: 2,662 Bytes
ecd70d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""Task 2: Strip markdown bold (**text**) and HTML-LaTeX artifacts.
Converts r'\(...\)' to '$...$', r'\[...\]' to '$$...$$', and '**text**' to 'text'.
Also soft-deletes any unit whose content contains 'References' boilerplate.
"""
import argparse
import re
import sqlite3


# Path of the SQLite database file that main() opens with sqlite3.connect().
DB_PATH = "math_wiki.db"

# Case-sensitive, whole-word match for "References". main() soft-deletes any
# unit whose content matches this pattern anywhere, instead of cleaning it.
REFERENCES_PATTERN = re.compile(r"\bReferences\b")


def clean_content(text: str) -> str:
    """Return *text* with LaTeX delimiters normalised and markdown bold removed.

    Three rewrites are applied in order:

    1. ``\\(...\\)`` becomes ``$...$`` (the body may span several lines).
    2. ``\\[...\\]`` becomes ``$$...$$`` (the body may span several lines).
    3. ``**text**`` becomes ``text`` (single-line only, since ``.`` does not
       match a newline for this rule).

    Every match is non-greedy and requires at least one character between
    the delimiters.
    """
    # (pattern, replacement template, regex flags), applied top to bottom.
    rewrite_rules = (
        (r"\\\((.+?)\\\)", r"$\1$", re.DOTALL),
        (r"\\\[(.+?)\\\]", r"$$\1$$", re.DOTALL),
        (r"\*\*(.+?)\*\*", r"\1", 0),
    )
    result = text
    for pattern, template, flags in rewrite_rules:
        result = re.sub(pattern, template, result, flags=flags)
    return result


def main(dry_run: bool) -> None:
    """Clean markup artifacts in live wiki units and soft-delete boilerplate.

    Reads every row of ``wiki_units`` with ``deleted=0`` from the database at
    ``DB_PATH``. Rows whose content matches ``REFERENCES_PATTERN`` are queued
    for soft-deletion (``deleted=1``). All other rows are run through
    ``clean_content`` and queued for an UPDATE when the text changed. A
    before/after preview (first 100 characters) is printed for each planned
    content change, followed by the ids queued for soft-deletion.

    Args:
        dry_run: When true, only print the plan and leave the database
            untouched. When false, apply the changes in one transaction and
            print how many live units still contain ``**`` or ``\\(``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row

        rows = conn.execute(
            "SELECT id, content FROM wiki_units WHERE deleted=0"
        ).fetchall()

        # Keep the old content next to the new one so the preview below does
        # not have to re-query the database once per changed unit.
        changes: list[tuple[str, str, str]] = []   # (id, old_content, new_content)
        deletes: list[str] = []

        for row in rows:
            content = row["content"]
            if content is None:
                # NULL content has nothing to clean; the regex calls below
                # would raise TypeError on it.
                continue
            if REFERENCES_PATTERN.search(content):
                deletes.append(row["id"])
                continue
            cleaned = clean_content(content)
            if cleaned != content:
                changes.append((row["id"], content, cleaned))

        print(f"Units to clean: {len(changes)}")
        for uid, old, new_content in changes:
            print(f"  {uid}:")
            print(f"    BEFORE: {old[:100]!r}")
            print(f"    AFTER:  {new_content[:100]!r}")

        print(f"\nUnits to soft-delete (References boilerplate): {len(deletes)}")
        for uid in deletes:
            print(f"  {uid}")

        if dry_run:
            print("\nDRY RUN — no changes made.")
            return

        # The connection context manager commits on success and rolls back
        # if any statement raises, so a failure cannot leave a partial update.
        with conn:
            conn.executemany(
                "UPDATE wiki_units SET content=? WHERE id=?",
                [(new_content, uid) for uid, _, new_content in changes],
            )
            conn.executemany(
                "UPDATE wiki_units SET deleted=1 WHERE id=?",
                [(uid,) for uid in deletes],
            )

        # Post-run sanity check: count live units that still contain the
        # raw markers this task is meant to remove.
        remaining_bold = conn.execute(
            "SELECT COUNT(*) FROM wiki_units WHERE content LIKE '%**%' AND deleted=0"
        ).fetchone()[0]
        remaining_latex = conn.execute(
            "SELECT COUNT(*) FROM wiki_units WHERE content LIKE '%\\(%' AND deleted=0"
        ).fetchone()[0]
        print(f"\nDone. Remaining '**' units: {remaining_bold} (target 0)")
        print(f"Remaining '\\(' units: {remaining_latex} (target 0)")
    finally:
        # Always release the database handle, including on error and on the
        # dry-run early return.
        conn.close()


if __name__ == "__main__":
    # Command-line entry point. argparse is already imported at the top of
    # the module, so it is not re-imported here.
    parser = argparse.ArgumentParser(
        description="Strip markdown bold and LaTeX delimiter artifacts from wiki_units.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="print the planned changes without modifying the database",
    )
    args = parser.parse_args()
    main(args.dry_run)