datamatters24 committed on
Commit
225534a
·
verified ·
1 Parent(s): fe76792

Upload ml/04_export_for_topics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/04_export_for_topics.py +68 -0
ml/04_export_for_topics.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Export document text for topic classification on RunPod.
4
+
5
+ Creates a JSONL file with document_id + concatenated OCR text (first 5 pages).
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import sys
11
+
12
+ from db import get_conn
13
+
14
# Root logger setup: INFO and above, "<timestamp> <LEVEL padded to 8> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Path of the JSONL file the export is written to.
OUTPUT = "/var/www/research/ml/topic_export.jsonl"
# Truncate text for classifier input
MAX_CHARS = 2000
19
+
20
+
21
def export():
    """Export OCR text of documents that have no topics yet to a JSONL file.

    Selects every row of ``documents`` that has no matching row in
    ``document_topics``. For each one, the OCR text of its first five usable
    pages (non-null ``ocr_text`` and ``word_count > 5``) is concatenated in
    page order, stripped, and truncated to ``MAX_CHARS`` characters.

    One JSON object per line is written to ``OUTPUT`` with the keys ``id``,
    ``section`` and ``text``. Documents whose resulting text is shorter than
    50 characters are skipped. ``OUTPUT`` is overwritten on every run.

    The database connection is closed even when a query or a file write
    raises; the exception itself is propagated to the caller.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()

        # Get docs that don't have topics yet
        cur.execute("""
            SELECT d.id, d.source_section
            FROM documents d
            LEFT JOIN document_topics dt ON dt.document_id = d.id
            WHERE dt.id IS NULL
            ORDER BY d.id
        """)
        doc_ids = cur.fetchall()
        total = len(doc_ids)
        log.info(f"Exporting {total} documents")

        written = 0
        # Pin the encoding so the output does not depend on the host locale.
        with open(OUTPUT, "w", encoding="utf-8") as f:
            for i, (doc_id, section) in enumerate(doc_ids, start=1):
                # The inner query picks the first five usable pages; the outer
                # string_agg joins them with spaces in page order.
                cur.execute("""
                    SELECT string_agg(ocr_text, ' ' ORDER BY page_number) as text
                    FROM (
                        SELECT ocr_text, page_number FROM pages
                        WHERE document_id = %s AND ocr_text IS NOT NULL AND word_count > 5
                        ORDER BY page_number LIMIT 5
                    ) sub
                """, (doc_id,))
                row = cur.fetchone()
                # string_agg yields NULL when no page qualifies.
                text = (row[0] or "").strip()[:MAX_CHARS]

                # Too little text to classify: skip the document, but still
                # fall through to the progress check below.
                if len(text) >= 50:
                    f.write(json.dumps({
                        "id": doc_id,
                        "section": section,
                        "text": text,
                    }) + "\n")
                    written += 1

                # Progress counts documents processed, not documents written,
                # so it must also fire when the current document was skipped.
                if i % 10000 == 0:
                    log.info(f" Exported {i}/{total}...")
    finally:
        # Release the connection on both the success and the failure path.
        conn.close()

    log.info(f"Wrote {written} docs to {OUTPUT}")
65
+
66
+
67
# Script entry point: run the export only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    export()