datamatters24
/

research-document-archive

+#!/usr/bin/env python3
+"""
+Export document text for topic classification on RunPod.
+Creates a JSONL file with one record per document: id, section, and text
+(OCR text of the first 5 pages that have usable text, concatenated and truncated).
+"""
+import json
+import logging
+import sys
+from db import get_conn
+logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(levelname)-8s  %(message)s")  # INFO and above; timestamp, left-aligned level name padded to 8 chars, message
+log = logging.getLogger(__name__)
+OUTPUT = "/var/www/research/ml/topic_export.jsonl"  # Path of the JSONL file written by export()
+MAX_CHARS = 2000  # Truncate text for classifier input
+def export():
+    conn = get_conn()
+    cur = conn.cursor()
+    # Get docs that don't have topics yet
+    cur.execute("""
+        SELECT d.id, d.source_section
+        FROM documents d
+        LEFT JOIN document_topics dt ON dt.document_id = d.id
+        WHERE dt.id IS NULL
+        ORDER BY d.id
+    """)
+    doc_ids = cur.fetchall()
+    log.info(f"Exporting {len(doc_ids)} documents")
+    written = 0
+    with open(OUTPUT, "w") as f:
+        for i, (doc_id, section) in enumerate(doc_ids):
+            cur.execute("""
+                SELECT string_agg(ocr_text, ' ' ORDER BY page_number) as text
+                FROM (
+                    SELECT ocr_text, page_number FROM pages
+                    WHERE document_id = %s AND ocr_text IS NOT NULL AND word_count > 5
+                    ORDER BY page_number LIMIT 5
+                ) sub
+            """, (doc_id,))
+            row = cur.fetchone()
+            text = (row[0] or "").strip()[:MAX_CHARS]
+            if len(text) < 50:
+                continue
+            f.write(json.dumps({
+                "id": doc_id,
+                "section": section,
+                "text": text,
+            }) + "\n")
+            written += 1
+            if (i + 1) % 10000 == 0:
+                log.info(f"  Exported {i+1}/{len(doc_ids)}...")
+    conn.close()
+    log.info(f"Wrote {written} docs to {OUTPUT}")
+if __name__ == "__main__":  # Run the export only when executed as a script, not on import
+    export()