datamatters24 committed on
Commit
6c33ee2
·
verified ·
1 Parent(s): 9f39ddf

Upload ml/01_extract_dates.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ml/01_extract_dates.py +267 -0
ml/01_extract_dates.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Stage 1: Extract estimated dates for documents.

Sources (in priority order):
1. Filename parsing (congress session, year folders, JFK doc IDs)
2. DATE entities already in the entities table (most frequent date per doc)
3. Regex patterns in OCR text (fallback)

Populates: document_dates table
"""

import re
import logging
import sys
from datetime import date, datetime
from collections import Counter

import psycopg2
import psycopg2.extras
from config import CONGRESS_DATES, BATCH_SIZE
from db import get_conn, fetch_all, fetch_one  # NOTE(review): fetch_one appears unused here

# Log to stdout so output is captured by whatever runs this stage
# (cron / pipeline runner / container logs).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log = logging.getLogger(__name__)
30
+
31
+
32
+ def parse_congress_from_path(file_path: str) -> int | None:
33
+ """Extract congress session number from file path or filename."""
34
+ # Match patterns like congress_118, BILLS-118hr, congress_103
35
+ m = re.search(r'congress_(\d{2,3})', file_path)
36
+ if m:
37
+ return int(m.group(1))
38
+ m = re.search(r'BILLS-(\d{2,3})', file_path)
39
+ if m:
40
+ return int(m.group(1))
41
+ # Congressional Record with ordinal congress
42
+ m = re.search(r'(\d{2,3})(st|nd|rd|th)\s+Congress', file_path, re.IGNORECASE)
43
+ if m:
44
+ return int(m.group(1))
45
+ return None
46
+
47
+
48
+ def parse_year_from_path(file_path: str) -> int | None:
49
+ """Extract a year from folder structure like /2021/ or /2017-2018/."""
50
+ # Folder-based year
51
+ m = re.search(r'/(\d{4})(?:[_/-](\d{4}))?/', file_path)
52
+ if m:
53
+ return int(m.group(1))
54
+ # Year in filename
55
+ m = re.search(r'[_-](\d{4})[_.-]', file_path)
56
+ if m:
57
+ yr = int(m.group(1))
58
+ if 1800 <= yr <= 2030:
59
+ return yr
60
+ return None
61
+
62
+
63
def congress_to_date_range(session: int) -> tuple[date | None, date | None]:
    """Map a congress session number to its (start, end) date range.

    Exact ranges come from CONGRESS_DATES when the session is listed
    there; otherwise the range is approximated as a two-year term
    starting Jan 3 of the appropriate year (congress N is assumed to
    begin 1789 + (N - 1) * 2).  Returns (None, None) for sessions whose
    approximate start would fall outside 1789-2030.
    """
    if session in CONGRESS_DATES:
        start_iso, end_iso = CONGRESS_DATES[session]
        return date.fromisoformat(start_iso), date.fromisoformat(end_iso)
    # Fallback approximation for sessions missing from the config table.
    first_year = 1789 + 2 * (session - 1)
    if not (1789 <= first_year <= 2030):
        return None, None
    return date(first_year, 1, 3), date(first_year + 2, 1, 3)
74
+
75
+
76
+ def parse_date_entities(doc_id: int, conn) -> tuple[date | None, float]:
77
+ """
78
+ Find the most common parseable date from DATE entities for a document.
79
+ Returns (estimated_date, confidence).
80
+ """
81
+ with conn.cursor() as cur:
82
+ cur.execute(
83
+ "SELECT entity_text FROM entities "
84
+ "WHERE document_id = %s AND entity_type = 'DATE'",
85
+ (doc_id,)
86
+ )
87
+ rows = cur.fetchall()
88
+
89
+ if not rows:
90
+ return None, 0.0
91
+
92
+ year_counts = Counter()
93
+ full_dates = []
94
+
95
+ for (text,) in rows:
96
+ text = text.strip()
97
+ # Try full date patterns
98
+ for fmt in ("%B %d, %Y", "%b %d, %Y", "%m/%d/%Y", "%Y-%m-%d", "%d %B %Y"):
99
+ try:
100
+ dt = datetime.strptime(text, fmt).date()
101
+ if 1800 <= dt.year <= 2030:
102
+ full_dates.append(dt)
103
+ year_counts[dt.year] += 1
104
+ break
105
+ except ValueError:
106
+ continue
107
+ else:
108
+ # Try just year
109
+ m = re.search(r'\b(1[89]\d{2}|20[0-2]\d)\b', text)
110
+ if m:
111
+ year_counts[int(m.group(1))] += 1
112
+
113
+ if full_dates:
114
+ # Return most common full date
115
+ date_counts = Counter(full_dates)
116
+ best_date, count = date_counts.most_common(1)[0]
117
+ confidence = min(count / len(rows), 1.0)
118
+ return best_date, confidence
119
+
120
+ if year_counts:
121
+ best_year, count = year_counts.most_common(1)[0]
122
+ confidence = min(count / len(rows) * 0.5, 0.8) # lower confidence for year-only
123
+ return date(best_year, 7, 1), confidence # midpoint of year
124
+
125
+ return None, 0.0
126
+
127
+
128
def process_documents():
    """Assign an estimated date to every document that lacks one.

    For each document with no document_dates row, tries sources in
    priority order: (1) congress session parsed from the path,
    (2) a year parsed from folder/filename, (3) DATE entities from NER,
    (4) hard-coded per-collection defaults.  Results are upserted in
    batches of BATCH_SIZE via _flush_batch, then summary stats logged.
    """
    conn = get_conn()
    conn.autocommit = False  # batches are committed explicitly in _flush_batch

    # Get documents that don't have dates yet
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute("""
            SELECT d.id, d.file_path, d.source_section
            FROM documents d
            LEFT JOIN document_dates dd ON dd.document_id = d.id
            WHERE dd.document_id IS NULL
            ORDER BY d.id
        """)
        docs = cur.fetchall()

    total = len(docs)
    log.info(f"Processing {total} documents for date extraction")

    batch = []
    processed = 0

    for doc in docs:
        doc_id = doc["id"]
        path = doc["file_path"]
        section = doc["source_section"]

        # Fields of the document_dates row; remain None/0.0 if no source hits.
        estimated_date = None
        date_source = None
        date_confidence = 0.0
        date_range_start = None
        date_range_end = None
        congress_session = None

        # Priority 1: Congress session from filename
        congress = parse_congress_from_path(path)
        if congress:
            congress_session = congress
            start, end = congress_to_date_range(congress)
            if start and end:
                date_range_start = start
                date_range_end = end
                # Midpoint as estimate
                mid = start.toordinal() + (end.toordinal() - start.toordinal()) // 2
                estimated_date = date.fromordinal(mid)
                date_source = "filename_congress"
                date_confidence = 0.7

        # Priority 2: Year from folder/filename
        if not estimated_date:
            year = parse_year_from_path(path)
            if year:
                estimated_date = date(year, 7, 1)  # mid-year estimate
                date_range_start = date(year, 1, 1)
                date_range_end = date(year, 12, 31)
                date_source = "filename_year"
                date_confidence = 0.6

        # Priority 3: DATE entities from NER
        if not estimated_date:
            ner_date, ner_conf = parse_date_entities(doc_id, conn)
            if ner_date:
                estimated_date = ner_date
                date_source = "ner_entities"
                date_confidence = ner_conf

        # Priority 4: Collection-level defaults
        # Tuple layout: (estimate, source label, confidence, range start, range end)
        if not estimated_date:
            defaults = {
                "cia_mkultra": (date(1963, 1, 1), "collection_default", 0.3,
                                date(1953, 1, 1), date(1973, 12, 31)),
                "cia_stargate": (date(1986, 1, 1), "collection_default", 0.3,
                                 date(1978, 1, 1), date(1995, 12, 31)),
                "lincoln_archives": (date(1865, 1, 1), "collection_default", 0.3,
                                     date(1860, 1, 1), date(1877, 12, 31)),
            }
            if section in defaults:
                d = defaults[section]
                estimated_date = d[0]
                date_source = d[1]
                date_confidence = d[2]
                date_range_start = d[3]
                date_range_end = d[4]

        # Rows with no usable source are still inserted (all-None date
        # fields), so the NOT-EXISTS query above skips them next run.
        batch.append((
            doc_id, estimated_date, date_source, date_confidence,
            date_range_start, date_range_end, congress_session,
        ))

        if len(batch) >= BATCH_SIZE:
            _flush_batch(conn, batch)
            processed += len(batch)
            # total >= 1 here since we are inside the loop, so no div-by-zero.
            log.info(f"Progress: {processed}/{total} ({processed*100//total}%)")
            batch = []

    # Flush the final partial batch.
    if batch:
        _flush_batch(conn, batch)
        processed += len(batch)

    conn.close()
    log.info(f"Done. Processed {processed} documents.")

    # Stats
    # NOTE(review): fetch_all presumably opens its own connection -- the
    # main conn is already closed at this point; confirm in db.py.
    stats = fetch_all("""
        SELECT date_source, COUNT(*) as cnt,
               ROUND(AVG(date_confidence)::numeric, 2) as avg_conf
        FROM document_dates
        GROUP BY date_source
        ORDER BY cnt DESC
    """)
    log.info("Date extraction stats:")
    for row in stats:
        log.info(f"  {row['date_source'] or 'no_date'}: {row['cnt']} docs (avg conf: {row['avg_conf']})")
241
+
242
+
243
def _flush_batch(conn, batch):
    """Upsert a batch of document_dates rows and commit.

    Each tuple in *batch* is (document_id, estimated_date, date_source,
    date_confidence, date_range_start, date_range_end, congress_session).
    On conflict the existing row is fully overwritten.

    NOTE(review): the conflict branch also resets created_at to NOW();
    if that column is meant to record first insertion, this should
    probably touch an updated_at column instead -- confirm schema intent.
    """
    with conn.cursor() as cur:
        # execute_batch groups the INSERTs into multi-statement round
        # trips (page_size per trip) for throughput.
        psycopg2.extras.execute_batch(
            cur,
            """INSERT INTO document_dates
               (document_id, estimated_date, date_source, date_confidence,
                date_range_start, date_range_end, congress_session)
               VALUES (%s, %s, %s, %s, %s, %s, %s)
               ON CONFLICT (document_id) DO UPDATE SET
                 estimated_date = EXCLUDED.estimated_date,
                 date_source = EXCLUDED.date_source,
                 date_confidence = EXCLUDED.date_confidence,
                 date_range_start = EXCLUDED.date_range_start,
                 date_range_end = EXCLUDED.date_range_end,
                 congress_session = EXCLUDED.congress_session,
                 created_at = NOW()
            """,
            batch,
            page_size=500,
        )
    conn.commit()
264
+
265
+
266
# Script entry point: run the full extraction pass when executed directly.
if __name__ == "__main__":
    process_documents()