# ai-agent-app/scripts/fix_markdown_artifacts.py
# Repository-viewer residue kept for provenance: "MinhTai's picture",
# "deploy: ccb63e1", "dd6cc27".
r"""Task 2: Strip markdown bold (**text**) and HTML-LaTeX artifacts.

Converts r'\(...\)' to '$...$', r'\[...\]' to '$$...$$', and '**text**' to 'text'.
Also soft-deletes any unit whose content contains 'References' boilerplate.
"""
import argparse
import re
import sqlite3
# Path of the SQLite database opened by main(); being relative, it is resolved
# against the current working directory.
DB_PATH = "math_wiki.db"
# Case-sensitive, whole-word match for "References"; main() soft-deletes any
# unit whose content matches instead of cleaning it.
REFERENCES_PATTERN = re.compile(r"\bReferences\b")
# Math delimiters. The negative lookbehind on each opener keeps a LaTeX line
# break such as ``\\[2pt]`` or ``\\(`` from being mistaken for the start of a
# ``\[`` / ``\(`` math span. DOTALL lets a span cross line boundaries.
_INLINE_MATH_RE = re.compile(r"(?<!\\)\\\((.+?)\\\)", re.DOTALL)
_DISPLAY_MATH_RE = re.compile(r"(?<!\\)\\\[(.+?)\\\]", re.DOTALL)
# Markdown bold. No DOTALL: a bold run must open and close on the same line.
_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")


def clean_content(text: str) -> str:
    r"""Return *text* with LaTeX delimiters normalised and bold markers removed.

    Three rewrites are applied, in this order:

    1. ``\(...\)`` becomes ``$...$`` (may span several lines).
    2. ``\[...\]`` becomes ``$$...$$`` (may span several lines).
    3. ``**text**`` becomes ``text`` (single line only).

    An opener whose backslash is itself preceded by a backslash is left
    alone, so a line break with optional spacing (``\\[2pt]``) is not
    rewritten. Empty spans such as ``\(\)`` are left unchanged because each
    pattern requires at least one character between the delimiters.

    Args:
        text: Raw unit content.

    Returns:
        The cleaned content; identical to *text* when nothing matched.
    """
    text = _INLINE_MATH_RE.sub(r"$\1$", text)
    text = _DISPLAY_MATH_RE.sub(r"$$\1$$", text)
    return _BOLD_RE.sub(r"\1", text)
def main(dry_run: bool) -> None:
    """Clean or soft-delete every live row of ``wiki_units`` and report the result.

    Every row with ``deleted=0`` is read from the database at ``DB_PATH``.
    A row whose content matches ``REFERENCES_PATTERN`` is queued for
    soft-deletion (``deleted=1``); any other row is passed through
    ``clean_content`` and queued for an update when the text changed.
    The plan is always printed. It is written to the database only when
    *dry_run* is false, after which the number of live rows still
    containing ``**`` or ``\\(`` is printed.

    Args:
        dry_run: When true, print the planned changes and write nothing.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, content FROM wiki_units WHERE deleted=0"
        ).fetchall()

        # (id, original content, cleaned content). Keeping the original here
        # avoids re-querying each row just to print the BEFORE preview.
        changes = []
        deletes = []
        for row in rows:
            content = row["content"]
            if content is None:
                # NOTE(review): the schema is not visible here; if content is
                # nullable, a NULL row has nothing to clean and would crash
                # the regex search, so it is skipped.
                continue
            if REFERENCES_PATTERN.search(content):
                deletes.append(row["id"])
                continue
            cleaned = clean_content(content)
            if cleaned != content:
                changes.append((row["id"], content, cleaned))

        print(f"Units to clean: {len(changes)}")
        for uid, old, new_content in changes:
            print(f" {uid}:")
            print(f" BEFORE: {old[:100]!r}")
            print(f" AFTER: {new_content[:100]!r}")
        print(f"\nUnits to soft-delete (References boilerplate): {len(deletes)}")
        for uid in deletes:
            print(f" {uid}")

        if dry_run:
            print("\nDRY RUN — no changes made.")
            return

        # The connection context manager commits on success and rolls back if
        # any statement raises, so the two batches are applied all-or-nothing.
        with conn:
            conn.executemany(
                "UPDATE wiki_units SET content=? WHERE id=?",
                [(new_content, uid) for uid, _, new_content in changes],
            )
            conn.executemany(
                "UPDATE wiki_units SET deleted=1 WHERE id=?",
                [(uid,) for uid in deletes],
            )

        # Post-run check: count the live rows that still contain an artifact.
        remaining_bold = conn.execute(
            "SELECT COUNT(*) FROM wiki_units WHERE content LIKE '%**%' AND deleted=0"
        ).fetchone()[0]
        remaining_latex = conn.execute(
            "SELECT COUNT(*) FROM wiki_units WHERE content LIKE '%\\(%' AND deleted=0"
        ).fetchone()[0]
        print(f"\nDone. Remaining '**' units: {remaining_bold} (target 0)")
        print(f"Remaining '\\(' units: {remaining_latex} (target 0)")
    finally:
        # Close the connection on every exit path, including errors.
        conn.close()
# Command-line entry point: parse the optional --dry-run flag and run main().
# argparse is already imported at the top of the file, so it is not
# re-imported here.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="print the planned changes without writing to the database",
    )
    args = parser.parse_args()
    main(args.dry_run)