Spaces:

VAILL
/

legislation-tracker

Running on CPU Upgrade

File size: 4,876 Bytes

"""
utils/data_loader.py
--------------------
Shared data loading functions used across all pages.
"""

import json
import logging
import time
from pathlib import Path

import pandas as pd
import streamlit as st


# Root directory for the data files read by this module. The path is
# relative, so it resolves against the process's current working directory.
DATA_DIR = Path("data")
# Main bills dataset; read by load_bills().
BILLS_FILE = DATA_DIR / "known_bills_visualize.json"
# Pre-generated bill summaries; read by load_summaries().
SUMMARIES_FILE = DATA_DIR / "bill_summaries.json"
# Pre-generated suggested questions; read by load_suggested_questions().
QUESTIONS_FILE = DATA_DIR / "bill_suggested_questions.json"
# Pre-generated bill reports; read by load_reports().
REPORTS_FILE = DATA_DIR / "bill_reports.json"
# Directory scanned by load_newsletters() for "newsletter_*.md" drafts.
NEWSLETTER_DIR = DATA_DIR / "newsletter_drafts"
# NOTE(review): not referenced in the visible part of this file; presumably
# holds weekly change reports consumed elsewhere -- confirm against callers.
CHANGES_DIR = DATA_DIR / "weekly_changes"
# Pre-computed legislative calendar events; read by load_calendar().
CALENDAR_FILE = DATA_DIR / "bill_calendar.json"


@st.cache_data(show_spinner=False)
def load_bills() -> pd.DataFrame:
    """Read the bills JSON file and return its contents as a DataFrame.

    The ``last_action_date`` and ``lastUpdatedAt`` columns, when present, are
    converted to datetimes; values that cannot be parsed become ``NaT``.

    Returns:
        The bills DataFrame. An empty DataFrame is returned when the file
        does not exist, or when reading/converting fails (in which case the
        error is also shown to the user via ``st.error``).
    """
    if not BILLS_FILE.exists():
        return pd.DataFrame()
    try:
        with BILLS_FILE.open("r", encoding="utf-8") as handle:
            raw_bills = json.load(handle)
        frame = pd.DataFrame(raw_bills)
        # Both date-like columns get the same lenient conversion.
        for column in ("last_action_date", "lastUpdatedAt"):
            if column in frame.columns:
                frame[column] = pd.to_datetime(frame[column], errors="coerce")
    except Exception as exc:
        st.error(f"Error loading bills: {exc}")
        return pd.DataFrame()
    return frame


@st.cache_data(show_spinner=False)
def load_summaries() -> dict:
    """Load pre-generated bill summaries keyed by state_billnumber.

    Loading is best-effort: the function never raises for a missing,
    unreadable, or malformed file.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing,
        cannot be read, is not valid JSON, or does not contain a JSON object.
        Every failure other than a missing file is logged as a warning so
        that corrupt data does not go unnoticed.
    """
    log = logging.getLogger(__name__)
    try:
        # Open directly instead of checking exists() first: avoids the
        # check-then-open race, and a missing file is handled just below.
        with SUMMARIES_FILE.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        # The summaries file is optional; its absence is not an error.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Could not load bill summaries from %s: %s", SUMMARIES_FILE, exc)
        return {}
    if not isinstance(payload, dict):
        # get_summary() calls .get() on this value, so anything but a dict
        # would crash later; fall back to "no summaries" instead.
        log.warning(
            "Ignoring %s: expected a JSON object, got %s",
            SUMMARIES_FILE,
            type(payload).__name__,
        )
        return {}
    return payload


@st.cache_data(show_spinner=False)
def load_suggested_questions() -> dict:
    """Load pre-generated suggested questions keyed by state_billnumber.

    Loading is best-effort: the function never raises for a missing,
    unreadable, or malformed file.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing,
        cannot be read, is not valid JSON, or does not contain a JSON object.
        Every failure other than a missing file is logged as a warning so
        that corrupt data does not go unnoticed.
    """
    log = logging.getLogger(__name__)
    try:
        # Open directly instead of checking exists() first: avoids the
        # check-then-open race, and a missing file is handled just below.
        with QUESTIONS_FILE.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        # The questions file is optional; its absence is not an error.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Could not load suggested questions from %s: %s", QUESTIONS_FILE, exc)
        return {}
    if not isinstance(payload, dict):
        # get_suggested_questions() calls .get() on this value, so anything
        # but a dict would crash later; fall back to "no questions" instead.
        log.warning(
            "Ignoring %s: expected a JSON object, got %s",
            QUESTIONS_FILE,
            type(payload).__name__,
        )
        return {}
    return payload


@st.cache_data(show_spinner=False)
def load_reports() -> dict:
    """Load pre-generated bill reports keyed by bill_id.

    The file is expected to hold a JSON array of records, each carrying a
    ``bill_id`` and a ``report_markdown`` entry. Keys are kept exactly as
    they appear in the JSON (no type conversion).

    Loading is best-effort: the function never raises for a missing,
    unreadable, or malformed file.

    Returns:
        A ``{bill_id: report_markdown}`` dict. It is empty when the file is
        missing, unreadable, not valid JSON, or not a JSON array. Records
        lacking either key (or that are not objects) are skipped instead of
        discarding every report; problems are logged as warnings.
    """
    log = logging.getLogger(__name__)
    try:
        # Open directly instead of checking exists() first: avoids the
        # check-then-open race, and a missing file is handled just below.
        with REPORTS_FILE.open("r", encoding="utf-8") as handle:
            records = json.load(handle)
    except FileNotFoundError:
        # The reports file is optional; its absence is not an error.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning("Could not load bill reports from %s: %s", REPORTS_FILE, exc)
        return {}
    if not isinstance(records, list):
        log.warning(
            "Ignoring %s: expected a JSON array, got %s",
            REPORTS_FILE,
            type(records).__name__,
        )
        return {}

    reports = {}
    skipped = 0
    for record in records:
        try:
            reports[record["bill_id"]] = record["report_markdown"]
        except (KeyError, TypeError, IndexError):
            # Missing key, non-object record, or unhashable bill_id: drop
            # just this record so one bad entry cannot hide all the others.
            skipped += 1
    if skipped:
        log.warning("Skipped %d malformed record(s) in %s", skipped, REPORTS_FILE)
    return reports


@st.cache_data(show_spinner=False)
def load_calendar() -> list:
    """Load pre-computed legislative calendar events.

    Loading is best-effort: the function never raises for a missing,
    unreadable, or malformed file.

    Returns:
        The parsed JSON content of the calendar file, or an empty list when
        the file is missing, cannot be read, or is not valid JSON. Every
        failure other than a missing file is logged as a warning so that
        corrupt data does not go unnoticed.
    """
    try:
        # Open directly instead of checking exists() first: avoids the
        # check-then-open race, and a missing file is handled just below.
        with CALENDAR_FILE.open("r", encoding="utf-8") as handle:
            # NOTE(review): the payload is returned as parsed; it is presumably
            # a JSON array of events -- confirm against the calendar page.
            return json.load(handle)
    except FileNotFoundError:
        # The calendar file is optional; its absence is not an error.
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logging.getLogger(__name__).warning(
            "Could not load calendar events from %s: %s", CALENDAR_FILE, exc
        )
        return []


def get_summary(bill_data: dict, summaries: dict) -> str:
    """Return the stored summary text for a bill, or "" if none is usable.

    Args:
        bill_data: Bill record; its ``state`` and ``bill_number`` values are
            joined as ``"<state>_<bill_number>"`` to form the lookup key.
        summaries: Mapping from that key to an entry dict holding the text
            under ``"summary"``.

    Returns:
        The summary string. An empty string is returned when there is no
        entry, the entry is not a dict, the summary is empty or not a string,
        or the summary starts with ``"ERROR:"``.
    """
    key = f"{bill_data.get('state', '')}_{bill_data.get('bill_number', '')}"
    entry = summaries.get(key, {})
    if not isinstance(entry, dict):
        return ""
    summary = entry.get("summary", "")
    # A non-string value (e.g. a number or list in the JSON) previously raised
    # AttributeError on .startswith(); treat it as "no summary" instead.
    if not isinstance(summary, str):
        return ""
    if not summary or summary.startswith("ERROR:"):
        return ""
    return summary


def get_suggested_questions(bill_data: dict, questions: dict) -> list:
    """Return the suggested questions for a bill, or a generic default set.

    The lookup key is ``"<state>_<bill_number>"`` built from ``bill_data``.
    When the matching entry is a dict with a non-empty
    ``"suggested_questions"`` value, that value is returned; in every other
    case a fresh list of five generic questions is returned.
    """
    lookup_key = "{}_{}".format(
        bill_data.get("state", ""), bill_data.get("bill_number", "")
    )
    record = questions.get(lookup_key, {})
    stored = []
    if isinstance(record, dict):
        stored = record.get("suggested_questions", [])
    # Fall back to the generic questions whenever nothing usable is stored.
    return stored or [
        "What are the key definitions in this bill?",
        "What are the enforcement mechanisms?",
        "Who does this bill apply to?",
        "What are the compliance requirements?",
        "What penalties are specified?",
    ]


def get_report(bill_data: dict, reports: dict) -> str:
    """Return the stored report for a bill, or "" if none is usable.

    Args:
        bill_data: Bill record; its ``bill_id`` value identifies the report.
        reports: Mapping from bill id to report text.

    Returns:
        The report, or an empty string when no report is stored, the stored
        value is empty, or its text starts with ``"ERROR:"``.
    """
    raw_id = bill_data.get("bill_id", "")
    report = reports.get(str(raw_id), "")
    if not report:
        # load_reports() keys the mapping by bill_id exactly as it appears in
        # the JSON, which may be an int rather than a string. Retry with the
        # raw value so int-keyed reports are found as well.
        try:
            report = reports.get(raw_id, "")
        except TypeError:
            # Unhashable bill_id (e.g. a list): it cannot be a key anyway.
            report = ""
    if not report or str(report).startswith("ERROR:"):
        return ""
    return report


def get_last_updated(df: pd.DataFrame) -> str:
    """Describe how recently the bills data was updated, as an HTML snippet.

    Args:
        df: Bills DataFrame; the ``lastUpdatedAt`` column is expected to hold
            datetimes (``load_bills`` converts it).

    Returns:
        ``"N/A"`` when the column is missing, the frame is empty, or every
        value is null. Otherwise the most recent date formatted as
        ``YYYY-MM-DD`` followed by a ``<span>`` containing ``(Today)`` or
        ``(<n>d ago)``.
    """
    if "lastUpdatedAt" not in df.columns or df.empty:
        return "N/A"
    timestamps = df["lastUpdatedAt"].dropna()
    if timestamps.empty:
        return "N/A"
    most_recent = timestamps.max()
    # Use "now" in the timestamp's own timezone (None for naive values) so
    # that aware and naive datetimes are never mixed in the subtraction.
    now = pd.Timestamp.now(tz=most_recent.tzinfo)
    # A timestamp slightly in the future (clock skew, timezone mismatch) gives
    # a negative day count; clamp it so the label reads "Today", not "-1d ago".
    days_ago = max((now - most_recent).days, 0)
    date_str = most_recent.strftime("%Y-%m-%d")
    # NOTE(review): a freshness color (green/amber/red by age) used to be
    # computed here but was never applied; the span color below is fixed.
    # Confirm whether the label was meant to be color-coded by age.
    ago_text = "Today" if days_ago == 0 else f"{days_ago}d ago"
    return f'{date_str} <span style="color:#CFB991;">({ago_text})</span>'


def load_newsletters() -> dict:
    """Map a display label to the path of each newsletter draft.

    Files matching ``newsletter_*.md`` in the newsletter directory are sorted
    in descending path order (newest first when the file names embed a
    sortable date). Each label is ``"Week of <stem without 'newsletter_'>"``.
    An empty dict is returned when the directory does not exist.
    """
    if not NEWSLETTER_DIR.exists():
        return {}
    drafts = sorted(NEWSLETTER_DIR.glob("newsletter_*.md"), reverse=True)
    return {
        f"Week of {draft.stem.replace('newsletter_', '')}": draft
        for draft in drafts
    }