ramanna committed on
Commit
15bc3e7
·
verified ·
1 Parent(s): 0f65484

Upload data_updating_scripts/build_calendar.py with huggingface_hub

Browse files
data_updating_scripts/build_calendar.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ build_calendar.py
4
+ -----------------
5
+ Reads bill action history from known_bills_visualize.json and builds a
6
+ lightweight calendar of recent legislative milestones (committee referrals,
7
+ floor votes, governor actions, etc.).
8
+
9
+ Output: data/bill_calendar.json
10
+
11
+ Can be run standalone or as part of the weekly pipeline.
12
+ """
13
+
14
+ import json
15
+ import logging
16
+ import os
17
+ import re
18
+ import sys
19
+ import time
20
+ from datetime import datetime, timedelta, timezone
21
+ from pathlib import Path
22
+
23
+ # Add project root to path
24
+ sys.path.append(str(Path(__file__).parent.parent))
25
+
26
+ # ── Pipeline status tracking (no-op when running standalone) ──────────────
27
# Value of the PIPELINE_CURRENT_SCRIPT environment variable; None when unset,
# in which case no status tracker is created.
_PIPELINE_SCRIPT = os.environ.get("PIPELINE_CURRENT_SCRIPT")
# PipelineStatus instance used for progress reporting, or None when disabled.
_pipeline = None
# time.time() of the last progress write; 0.0 means nothing was written yet.
_last_status_write = 0.0
if _PIPELINE_SCRIPT:
    try:
        from pipeline_status import PipelineStatus
        _pipeline = PipelineStatus()
    except Exception as exc:
        # Status tracking is best-effort and must never stop the build, but
        # report why it is disabled instead of failing silently.
        logging.getLogger(__name__).warning(
            "Pipeline status tracking disabled: %s", exc
        )
36
+
37
+
38
def _update_pipeline_progress(current, total, unit="bills", message=""):
    """Report progress to the pipeline status tracker, throttled to one write per 3 s.

    Does nothing when no tracker exists (``_pipeline`` is falsy). Intermediate
    updates arriving less than 3 seconds after the previous write are dropped.
    The final update (``current >= total``) is always written, so the
    completion message is not lost when the run finishes inside the throttle
    window.

    Args:
        current: Number of items processed so far.
        total: Total number of items to process.
        unit: Label for the items being counted.
        message: Status text forwarded to the tracker.
    """
    global _last_status_write
    if not _pipeline:
        return
    now = time.time()
    is_final = current >= total
    if not is_final and now - _last_status_write < 3.0:
        return
    _last_status_write = now
    try:
        _pipeline.update_progress(_PIPELINE_SCRIPT, current, total, unit, message)
    except Exception as exc:
        # Progress reporting is best-effort: never interrupt the build for it.
        logging.getLogger(__name__).debug(
            "Pipeline progress update failed: %s", exc
        )
50
+
51
+
52
+ # ── Paths ─────────────────────────────────────────────────────────────────
53
# Input and output files live under ./data, relative to the working directory.
DATA_DIR = Path("data")
BILLS_FILE = DATA_DIR.joinpath("known_bills_visualize.json")
CALENDAR_FILE = DATA_DIR.joinpath("bill_calendar.json")

# The log directory has to exist before the FileHandler below opens its file.
Path("data_updating_scripts/logs").mkdir(parents=True, exist_ok=True)

# ── Logging ───────────────────────────────────────────────────────────────
# Every record goes both to the console and to the script's own log file.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("data_updating_scripts/logs/build_calendar.log"),
    ],
)
logger = logging.getLogger(__name__)
69
+
70
+
71
+ # ── Milestone patterns ────────────────────────────────────────────────────
72
+ # Each key maps to a list of regex patterns that identify that milestone type.
73
# Order matters: _classify_action walks this dict in insertion order and
# returns the first milestone type with a matching pattern, so an action text
# that fits several types is labelled with the earliest one listed here.
# All patterns carry the inline (?i) flag, i.e. they match case-insensitively.
MILESTONE_PATTERNS = {
    "hearing_scheduled": [
        r"(?i)scheduled\s+for\s+hearing",
        r"(?i)hearing\s+set\s+for",
        r"(?i)notice\s+of\s+hearing",
        r"(?i)public\s+hearing",
    ],
    "committee_referral": [
        r"(?i)referred\s+to\s+(the\s+)?(committee|subcommittee|comm\.)",
        r"(?i)assigned\s+to\s+(the\s+)?(committee|subcommittee|comm\.)",
        r"(?i)re-?referred\s+to",
    ],
    "committee_passed": [
        r"(?i)passed\s+(the\s+)?(committee|subcommittee|comm\.)",
        r"(?i)reported\s+(out\s+)?(favorably|without\s+amendment)",
        r"(?i)\bdo\s+pass\b",
        r"(?i)recommended\s+for\s+passage",
        r"(?i)reported\s+with\s+recommendation",
    ],
    "floor_vote": [
        r"(?i)third\s+reading",
        r"(?i)3rd\s+reading",
        r"(?i)placed\s+on\s+(the\s+)?calendar",
        r"(?i)ordered\s+(to\s+)?(be\s+)?engrossed",
        r"(?i)passed\s+(the\s+)?(house|senate|assembly|chamber)",
        r"(?i)adopted\s+by\s+(the\s+)?(house|senate|assembly)",
    ],
    "sent_to_governor": [
        r"(?i)sent\s+to\s+(the\s+)?governor",
        r"(?i)presented\s+to\s+(the\s+)?governor",
        r"(?i)\benrolled\b",
        r"(?i)transmitted\s+to\s+(the\s+)?governor",
    ],
}

# Human-readable labels for each milestone type
# (keys mirror MILESTONE_PATTERNS; used as "event_label" in the output).
MILESTONE_LABELS = {
    "hearing_scheduled": "Hearing Scheduled",
    "committee_referral": "Committee Referral",
    "committee_passed": "Passed Committee",
    "floor_vote": "Floor Vote",
    "sent_to_governor": "Sent to Governor",
}
116
+
117
+
118
+ def _classify_action(action_text: str) -> str | None:
119
+ """Match action text against milestone patterns. Returns type or None."""
120
+ for milestone_type, patterns in MILESTONE_PATTERNS.items():
121
+ for pattern in patterns:
122
+ if re.search(pattern, action_text):
123
+ return milestone_type
124
+ return None
125
+
126
+
127
def build_calendar(bills: list, lookback_days: int = 30) -> list:
    """Scan bill actions for legislative milestones within the lookback window.

    Args:
        bills: Bill records as dicts. A bill contributes events only when its
            ``actions`` value is a non-empty list; each action is expected to
            be a dict with an ``action`` text and a ``date`` or
            ``action_date`` string. Entries that are not dicts are skipped.
        lookback_days: Actions dated before today (UTC) minus this many days
            are ignored.

    Returns:
        A list of event dicts sorted by ``event_date`` descending. Each event
        carries the bill's identifying fields (title cut to 150 characters),
        the milestone type and label, and the action text, date and chamber.
    """
    # Dates are compared as plain strings against a "YYYY-MM-DD" cutoff.
    # NOTE(review): this assumes action dates are ISO formatted — confirm.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=lookback_days)).strftime("%Y-%m-%d")
    events = []
    total = len(bills)

    for i, bill in enumerate(bills):
        if i % 500 == 0:
            _update_pipeline_progress(
                i, total, "bills",
                f"Scanning actions — {len(events)} milestones found",
            )

        # A malformed (non-dict) entry must not abort the whole scan.
        if not isinstance(bill, dict):
            continue

        actions = bill.get("actions", [])
        if not actions or not isinstance(actions, list):
            continue

        bill_id = str(bill.get("bill_id", ""))
        state = bill.get("state", "")
        bill_number = bill.get("bill_number", "")
        # A JSON null title arrives as None; slicing it would raise for every
        # action and silently drop all of this bill's events.
        title = str(bill.get("title") or "")[:150]
        bill_url = bill.get("bill_url", "")
        bill_status = bill.get("status", "")

        for action in actions:
            try:
                action_text = action.get("action", "") or ""
                # LegiScan uses "date" or "action_date" depending on context
                action_date = action.get("date") or action.get("action_date") or ""

                if not action_text or not action_date:
                    continue

                # Skip actions older than the lookback window
                if action_date < cutoff:
                    continue

                milestone_type = _classify_action(action_text)
                if not milestone_type:
                    continue

                events.append({
                    "bill_id": bill_id,
                    "state": state,
                    "bill_number": bill_number,
                    "title": title,
                    "event_type": milestone_type,
                    "event_label": MILESTONE_LABELS.get(milestone_type, milestone_type),
                    "event_description": action_text.strip(),
                    "event_date": action_date,
                    "chamber": action.get("chamber", ""),
                    "bill_url": bill_url,
                    "bill_status": bill_status,
                })
            except Exception as e:
                # One malformed action must not lose the rest of the bill.
                logger.warning("Error processing action for bill %s: %s", bill_id, e)

    # Sort by date descending (most recent first)
    events.sort(key=lambda e: e.get("event_date", ""), reverse=True)
    return events
187
+
188
+
189
def _write_calendar(events: list) -> bool:
    """Write *events* to CALENDAR_FILE as JSON; log and return False on failure."""
    try:
        with open(CALENDAR_FILE, "w", encoding="utf-8") as f:
            json.dump(events, f, indent=2, ensure_ascii=False)
    except Exception as e:
        logger.error(f"Failed to write calendar: {e}")
        return False
    return True


def main():
    """Load the bills file, build the milestone calendar and write it to disk.

    Reads ``BILLS_FILE``, collects the milestone events of the last 30 days
    with ``build_calendar`` and writes them as JSON to ``CALENDAR_FILE``. When
    no bill has any action history an empty calendar is written instead.
    Failures are logged and end the run early; nothing is raised.
    """
    from collections import Counter

    lookback_days = 30  # single source for the window and the log message

    logger.info("=" * 60)
    logger.info("Building legislative calendar from bill actions")
    logger.info("=" * 60)

    # Load bills
    if not BILLS_FILE.exists():
        logger.error(f"Bills file not found: {BILLS_FILE}")
        return

    try:
        with open(BILLS_FILE, "r", encoding="utf-8") as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Failed to load bills: {e}")
        return

    # Everything below iterates over bill dicts; any other top-level JSON
    # value would otherwise fail later with an unrelated AttributeError.
    if not isinstance(bills, list):
        logger.error(
            f"Failed to load bills: expected a JSON list, got {type(bills).__name__}"
        )
        return

    logger.info(f"Loaded {len(bills)} bills")

    # Count bills with non-empty actions
    bills_with_actions = sum(
        1 for b in bills if isinstance(b, dict) and b.get("actions")
    )
    logger.info(f"Bills with action history: {bills_with_actions}")

    if bills_with_actions == 0:
        logger.warning("No bills have action history yet. Writing empty calendar.")
        if _write_calendar([]):
            logger.info(f"Wrote empty {CALENDAR_FILE}")
        return

    # Build calendar
    events = build_calendar(bills, lookback_days=lookback_days)
    logger.info(f"Found {len(events)} milestone events in the last {lookback_days} days")

    # Log breakdown by type, most frequent first (ties keep first-seen order)
    type_counts = Counter(e["event_type"] for e in events)
    for t, c in type_counts.most_common():
        logger.info(f"  {MILESTONE_LABELS.get(t, t)}: {c}")

    # Write output
    if not _write_calendar(events):
        return
    logger.info(f"Wrote {len(events)} events to {CALENDAR_FILE}")

    _update_pipeline_progress(len(bills), len(bills), "bills",
                              f"Done — {len(events)} milestones")
    logger.info("Calendar build complete.")
243
+
244
+
245
# Run the calendar build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()