# legislation-tracker/utils/data_loader.py
# Uploaded by ramanna: "Upload utils/data_loader.py with huggingface_hub"
# (commit 0f65484, verified).
"""
utils/data_loader.py
--------------------
Shared data loading functions used across all pages.
"""
import json
import time
from pathlib import Path
import pandas as pd
import streamlit as st
# Root of the data files. The path is relative, so it is resolved against the
# process's current working directory.
DATA_DIR = Path("data")
# Main bills dataset read by load_bills().
BILLS_FILE = DATA_DIR / "known_bills_visualize.json"
# Pre-generated bill summaries read by load_summaries().
SUMMARIES_FILE = DATA_DIR / "bill_summaries.json"
# Pre-generated suggested questions read by load_suggested_questions().
QUESTIONS_FILE = DATA_DIR / "bill_suggested_questions.json"
# Pre-generated bill reports read by load_reports().
REPORTS_FILE = DATA_DIR / "bill_reports.json"
# Directory globbed by load_newsletters() for "newsletter_*.md" drafts.
NEWSLETTER_DIR = DATA_DIR / "newsletter_drafts"
# Directory for weekly change data; not referenced by the functions visible in
# this part of the file.
CHANGES_DIR = DATA_DIR / "weekly_changes"
# Pre-computed calendar events read by load_calendar().
CALENDAR_FILE = DATA_DIR / "bill_calendar.json"
@st.cache_data(show_spinner=False)
def load_bills() -> pd.DataFrame:
    """Read the bills JSON file into a DataFrame with parsed date columns.

    Returns:
        The bills as a DataFrame. An empty DataFrame is returned when the
        file does not exist, or when reading/converting it raises; in the
        latter case the error is also reported through ``st.error``.
    """
    if not BILLS_FILE.exists():
        return pd.DataFrame()

    try:
        with BILLS_FILE.open("r", encoding="utf-8") as handle:
            records = json.load(handle)
        frame = pd.DataFrame(records)
        # Values that cannot be parsed as dates become NaT rather than raising.
        for date_col in ("last_action_date", "lastUpdatedAt"):
            if date_col in frame.columns:
                frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")
    except Exception as exc:
        st.error(f"Error loading bills: {exc}")
        return pd.DataFrame()
    return frame
@st.cache_data(show_spinner=False)
def load_summaries() -> dict:
    """Return the pre-generated bill summaries, or ``{}`` if unavailable.

    The file content is expected to be keyed by ``state_billnumber``. A
    missing file, or any error while reading or parsing it, yields an empty
    dict.
    """
    summaries = {}
    try:
        if SUMMARIES_FILE.exists():
            with open(SUMMARIES_FILE, "r", encoding="utf-8") as handle:
                summaries = json.load(handle)
    except Exception:
        # Best-effort load: any failure falls back to the empty default.
        summaries = {}
    return summaries
@st.cache_data(show_spinner=False)
def load_suggested_questions() -> dict:
    """Return the pre-generated suggested questions, or ``{}`` if unavailable.

    The file content is expected to be keyed by ``state_billnumber``. A
    missing file, or any error while reading or parsing it, yields an empty
    dict.
    """
    try:
        if not QUESTIONS_FILE.exists():
            return {}
        with open(QUESTIONS_FILE, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception:
        # Best-effort load: unreadable or invalid files are treated as absent.
        return {}
@st.cache_data(show_spinner=False)
def load_reports() -> dict:
    """Load pre-generated bill reports keyed by bill_id.

    The reports file is read as an iterable of records, each carrying a
    ``bill_id`` and a ``report_markdown`` entry. Keys are kept exactly as
    they appear in the file.

    Returns:
        A ``{bill_id: report_markdown}`` mapping. An empty dict is returned
        when the file is missing or cannot be read or parsed.
    """
    try:
        if REPORTS_FILE.exists():
            with open(REPORTS_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Skip records that are not dicts or lack either field, so that a
            # single malformed entry does not discard every other report.
            return {
                r["bill_id"]: r["report_markdown"]
                for r in data
                if isinstance(r, dict)
                and "bill_id" in r
                and "report_markdown" in r
            }
    except Exception:
        # Best-effort load: an unreadable or invalid file is treated as absent.
        pass
    return {}
@st.cache_data(show_spinner=False)
def load_calendar() -> list:
    """Return the pre-computed legislative calendar events.

    A missing file, or any error while reading or parsing it, yields an
    empty list.
    """
    events = []
    try:
        if CALENDAR_FILE.exists():
            with open(CALENDAR_FILE, "r", encoding="utf-8") as handle:
                events = json.load(handle)
    except Exception:
        # Best-effort load: any failure falls back to the empty default.
        events = []
    return events
def get_summary(bill_data: dict, summaries: dict) -> str:
    """Look up the stored summary text for a bill.

    Args:
        bill_data: Bill record; its ``state`` and ``bill_number`` values form
            the ``"{state}_{bill_number}"`` lookup key (missing values are
            treated as empty strings).
        summaries: Mapping from that key to an entry dict holding a
            ``summary`` value.

    Returns:
        The summary text, or ``""`` when there is no entry, the entry is not
        a dict, the summary is empty, or it starts with ``"ERROR:"``.
    """
    state = bill_data.get("state", "")
    number = bill_data.get("bill_number", "")
    record = summaries.get(f"{state}_{number}", {})
    if not isinstance(record, dict):
        return ""

    text = record.get("summary", "")
    if text and not text.startswith("ERROR:"):
        return text
    return ""
def get_suggested_questions(bill_data: dict, questions: dict) -> list:
    """Return the suggested questions for a bill, or a generic default set.

    Args:
        bill_data: Bill record; its ``state`` and ``bill_number`` values form
            the ``"{state}_{bill_number}"`` lookup key (missing values are
            treated as empty strings).
        questions: Mapping from that key to an entry dict holding a
            ``suggested_questions`` value.

    Returns:
        The stored questions when a non-empty value exists for the bill;
        otherwise a fresh list of five generic questions.
    """
    state = bill_data.get("state", "")
    number = bill_data.get("bill_number", "")
    record = questions.get(f"{state}_{number}", {})

    stored = []
    if isinstance(record, dict):
        stored = record.get("suggested_questions", [])

    # Fall back to the generic prompts when nothing usable is stored.
    return stored or [
        "What are the key definitions in this bill?",
        "What are the enforcement mechanisms?",
        "Who does this bill apply to?",
        "What are the compliance requirements?",
        "What penalties are specified?",
    ]
def get_report(bill_data: dict, reports: dict) -> str:
    """Return the stored report for a bill, or ``""`` if none is usable.

    Args:
        bill_data: Bill record; its ``bill_id`` value identifies the report.
        reports: Mapping from bill id to report text, as produced by
            ``load_reports``.

    Returns:
        The report, or ``""`` when no report exists for the bill, the report
        is empty, or its text starts with ``"ERROR:"``.
    """
    raw_id = bill_data.get("bill_id", "")
    report = reports.get(str(raw_id), "")
    if not report and not isinstance(raw_id, str):
        # load_reports keys the mapping by the id exactly as it appears in
        # the JSON, which may be an int; retry with the unconverted id so
        # such reports are still found.
        try:
            report = reports.get(raw_id, "")
        except TypeError:
            # Unhashable id value: nothing can be stored under it.
            report = ""
    if not report or str(report).startswith("ERROR:"):
        return ""
    return report
def get_last_updated(df: pd.DataFrame) -> str:
    """Describe the most recent ``lastUpdatedAt`` value as an HTML snippet.

    Args:
        df: Bills DataFrame; ``lastUpdatedAt`` is expected to hold datetime
            values (as produced by ``load_bills``).

    Returns:
        ``"N/A"`` when the frame is empty, lacks the column, or has no
        non-null timestamps. Otherwise ``"YYYY-MM-DD"`` followed by a
        ``<span>`` containing ``(Today)`` or ``(<n>d ago)``.
    """
    if df.empty or "lastUpdatedAt" not in df.columns:
        return "N/A"
    valid = df["lastUpdatedAt"].dropna()
    if valid.empty:
        return "N/A"

    most_recent = valid.max()
    # Match "now" to the timestamp's timezone (None for naive values) so the
    # subtraction never mixes naive and aware datetimes.
    now = pd.Timestamp.now(tz=most_recent.tzinfo)
    # A timestamp slightly in the future (clock skew, or naive UTC compared
    # against local time) would give -1 days; clamp so it reads as "Today".
    days_ago = max((now - most_recent).days, 0)

    date_str = most_recent.strftime("%Y-%m-%d")
    ago_text = "Today" if days_ago == 0 else f"{days_ago}d ago"
    # NOTE(review): the span colour is fixed; a freshness-based colour
    # (<=3d / <=7d / older) used to be computed here but was never applied.
    return f'{date_str} <span style="color:#CFB991;">({ago_text})</span>'
def load_newsletters() -> dict:
    """Map a display label to the path of each newsletter draft.

    Drafts are the ``newsletter_*.md`` files in ``NEWSLETTER_DIR``, taken in
    reverse-sorted path order (newest first when the date part of the file
    name sorts lexicographically). Each label is ``"Week of <date_part>"``,
    where the date part is the file stem with ``"newsletter_"`` removed.

    Returns:
        ``{label: Path}``; empty when the directory does not exist.
    """
    if not NEWSLETTER_DIR.exists():
        return {}

    drafts = sorted(NEWSLETTER_DIR.glob("newsletter_*.md"), reverse=True)
    return {
        "Week of " + draft.stem.replace("newsletter_", ""): draft
        for draft in drafts
    }