ramanna's picture
Upload 30 files
b5a9373 verified
"""
generate_reports.py
--------------------
Generates detailed Markdown reports for AI-related bills from `known_bills_visualize.json`
using the latest LangChain pipeline syntax.
Now includes resume functionality - can be safely stopped and restarted.
"""
from __future__ import annotations
import json
import logging
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import dotenv
dotenv.load_dotenv()
# Create logs directory if it doesn't exist
os.makedirs("data_updating_scripts/logs", exist_ok=True)
# Latest LangChain imports
try:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
except ImportError: # pragma: no cover
ChatOpenAI = None # type: ignore
ChatPromptTemplate = None # type: ignore
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_reports.log")],
)
logger = logging.getLogger(__name__)
@dataclass
class BillReport:
"""Stores a bill ID and its generated detailed report."""
bill_id: str
report_markdown: str
# Prompt template
DETAILED_REPORT_PROMPT = ChatPromptTemplate.from_template(
"""You are a seasoned legislative analyst adept at interpreting and
summarising bills related to artificial intelligence. Using the bill
information provided as JSON, produce a detailed report in Markdown
format for stakeholders.
Include:
- Bill's title, number, and state
- Status and key dates
- URL to the bill on legiscan
- Sponsors and scope
- Goals and intent
- Key provisions, regulatory approaches, implementation & enforcement
- Unique aspects or notable features
Format:
- Use Markdown headings and bullet points
- Paraphrase content
- Do not invent facts
- If bill text is truncated in source JSON, note this at the end
Bill JSON:
```json
{bill_json}
```
Now craft the detailed report.
"""
)
def _ensure_llm() -> ChatOpenAI:
"""Initialise ChatOpenAI with latest settings."""
if ChatOpenAI is None:
raise RuntimeError(
"The 'langchain' and 'openai' packages are required. Install them via 'pip install langchain openai'."
)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise RuntimeError("The OPENAI_API_KEY environment variable is not set.")
model_name = os.getenv("MODEL_NAME", "gpt-4o")
logger.debug("Initialising ChatOpenAI with model %s", model_name)
return ChatOpenAI(model=model_name, temperature=0)
def create_detailed_report(
bill: Dict[str, Any], *, llm: Optional[ChatOpenAI] = None
) -> BillReport:
"""Generate a detailed report for a single bill using latest LangChain syntax."""
if llm is None:
llm = _ensure_llm()
bill_json = json.dumps(bill, ensure_ascii=False, indent=2)
# Latest syntax: prompt | llm
chain = DETAILED_REPORT_PROMPT | llm
result = chain.invoke({"bill_json": bill_json})
# result can be AIMessage; get text
report_text = getattr(result, "content", str(result))
return BillReport(bill_id=str(bill.get("bill_id")), report_markdown=report_text)
def load_existing_reports(output_path: str) -> Dict[str, str]:
"""Load existing reports from file if it exists."""
if os.path.exists(output_path):
try:
with open(output_path, "r", encoding="utf-8") as f:
reports_list = json.load(f)
# Convert list to dict for easy lookup
reports_dict = {
report["bill_id"]: report["report_markdown"]
for report in reports_list
if "bill_id" in report and "report_markdown" in report
}
logger.info(f"Loaded {len(reports_dict)} existing reports from {output_path}")
return reports_dict
except Exception as e:
logger.warning(f"Could not load existing reports: {e}")
return {}
return {}
def save_reports_to_file(reports_dict: Dict[str, str], output_path: str) -> None:
"""Save reports dictionary to a JSON file."""
# Convert dict back to list format for consistency
out_list = [
{"bill_id": bill_id, "report_markdown": report_markdown}
for bill_id, report_markdown in reports_dict.items()
]
with open(output_path, "w", encoding="utf-8") as f:
json.dump(out_list, f, ensure_ascii=False, indent=2)
logger.info("Saved %d reports to %s", len(out_list), output_path)
def create_reports_with_resume(
bills: List[Dict[str, Any]],
output_path: str,
*,
llm: Optional[ChatOpenAI] = None,
save_interval: int = 10
) -> Dict[str, str]:
"""
Generate detailed reports for multiple bills with resume capability.
Args:
bills: List of bill dictionaries
output_path: Path to save reports
llm: Optional LLM instance
save_interval: Save progress every N bills
Returns:
Dictionary of bill_id -> report_markdown
"""
if not bills:
return {}
if llm is None:
llm = _ensure_llm()
# Load existing reports
reports_dict = load_existing_reports(output_path)
# Track progress
total_bills = len(bills)
processed = 0
skipped = 0
errors = 0
logger.info(f"Starting report generation for {total_bills} bills")
for i, bill in enumerate(bills, 1):
bill_id = str(bill.get("bill_id"))
# Skip if already processed
if bill_id in reports_dict and reports_dict[bill_id] and not reports_dict[bill_id].startswith("ERROR:"):
logger.info(f"Skipping bill {bill_id} - already processed ({i}/{total_bills})")
skipped += 1
continue
logger.info(f"Processing {i}/{total_bills}: Bill ID {bill_id}")
try:
report = create_detailed_report(bill, llm=llm)
reports_dict[bill_id] = report.report_markdown
processed += 1
except Exception as exc:
logger.exception(
"Failed to generate report for bill %s: %s", bill_id, exc
)
reports_dict[bill_id] = f"ERROR: Failed to generate report - {str(exc)}"
errors += 1
# Save progress periodically
if i % save_interval == 0:
save_reports_to_file(reports_dict, output_path)
logger.info(f"Progress: {i}/{total_bills} - Processed: {processed}, Skipped: {skipped}, Errors: {errors}")
# Rate limiting to avoid API throttling
if bill_id not in reports_dict or reports_dict[bill_id].startswith("ERROR:"):
time.sleep(1) # 1 second delay between API calls
# Final save
save_reports_to_file(reports_dict, output_path)
logger.info(f"Report generation complete!")
logger.info(f"Total bills: {total_bills}")
logger.info(f"Successfully processed: {processed}")
logger.info(f"Skipped (already done): {skipped}")
logger.info(f"Errors: {errors}")
return reports_dict
def read_bills_from_file(path: str) -> List[Dict[str, Any]]:
"""Read bill records from a JSON file."""
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if not isinstance(data, list):
raise ValueError(f"Expected list of bills in {path}, got {type(data)}")
return data
def generate_reports_from_files(
input_path: str = "data/known_bills_visualize.json",
output_path: str = "data/bill_reports.json",
) -> None:
"""Read bills, generate reports with resume capability, and write them to disk."""
bills = read_bills_from_file(input_path)
create_reports_with_resume(bills, output_path)
def main() -> None:
import argparse
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
parser = argparse.ArgumentParser(
description="Generate detailed AI legislation reports from bill data with resume capability."
)
parser.add_argument("--input", default="data/known_bills_visualize.json", help="Path to input JSON file")
parser.add_argument("--output", default="data/bill_reports.json", help="Path to output JSON file")
parser.add_argument("--save-interval", type=int, default=10, help="Save progress every N bills (default: 10)")
args = parser.parse_args()
try:
bills = read_bills_from_file(args.input)
create_reports_with_resume(bills, args.output, save_interval=args.save_interval)
print(f"✅ Report generation completed successfully!")
print(f" Reports saved to: {args.output}")
except Exception as e:
logger.error(f"Fatal error: {e}")
print(f"❌ Error: {e}")
import sys
sys.exit(1)
if __name__ == "__main__":
main()