Spaces:

VAILL
/

legislation-tracker

Running on CPU Upgrade

App Files Files Community

legislation-tracker / data_updating_scripts /generate_reports.py

ramanna

Upload 30 files

b5a9373 verified about 1 month ago

raw

history blame contribute delete

9.06 kB

	"""
	generate_reports.py
	--------------------

	Generates detailed Markdown reports for AI-related bills from `known_bills_visualize.json`
	using the latest LangChain pipeline syntax.

	Supports resuming: progress is written to the output file periodically and
	bills that already have a report there are skipped, so the script can be
	stopped and restarted.
	"""

	from __future__ import annotations

	import json
	import logging
	import os
	import time
	from dataclasses import dataclass
	from typing import Any, Dict, List, Optional
	import dotenv


	# Load environment variables from a local .env file (python-dotenv); the
	# code below reads OPENAI_API_KEY and MODEL_NAME from the environment.
	dotenv.load_dotenv()

	# The file handler configured further down writes into this directory, so
	# it has to exist before logging is set up.
	os.makedirs("data_updating_scripts/logs", exist_ok=True)

	# LangChain imports are guarded: when they are unavailable the names are
	# bound to None, which _ensure_llm() checks before building a model.
	try:
	    from langchain_openai import ChatOpenAI
	    from langchain.prompts import ChatPromptTemplate
	except ImportError:  # pragma: no cover
	    ChatOpenAI = None  # type: ignore
	    ChatPromptTemplate = None  # type: ignore

	# Send log records both to the console and to a log file.
	logging.basicConfig(
	    format="%(asctime)s [%(levelname)s] %(message)s",
	    level=logging.INFO,
	    handlers=[
	        logging.StreamHandler(),
	        logging.FileHandler("data_updating_scripts/logs/generate_reports.log"),
	    ],
	)

	# Module-level logger used by every function in this file.
	logger = logging.getLogger(__name__)


	@dataclass
	class BillReport:
	    """Pairs a bill identifier with the Markdown report generated for it.

	    Attributes:
	        bill_id: Identifier of the bill, held as a string.
	        report_markdown: Text of the generated report.
	    """

	    bill_id: str
	    report_markdown: str


	# Prompt template text. Kept as a plain string so that it can be defined
	# even when the guarded LangChain import above fell back to None.
	# ``{bill_json}`` is the single input variable filled in per bill.
	_DETAILED_REPORT_TEMPLATE = """You are a seasoned legislative analyst adept at interpreting and
	summarising bills related to artificial intelligence. Using the bill
	information provided as JSON, produce a detailed report in Markdown
	format for stakeholders.

	Include:
	- Bill's title, number, and state
	- Status and key dates
	- URL to the bill on legiscan
	- Sponsors and scope
	- Goals and intent
	- Key provisions, regulatory approaches, implementation & enforcement
	- Unique aspects or notable features

	Format:
	- Use Markdown headings and bullet points
	- Paraphrase content
	- Do not invent facts
	- If bill text is truncated in source JSON, note this at the end

	Bill JSON:
	```json
	{bill_json}
	```

	Now craft the detailed report.
	"""

	# Build the prompt only when LangChain imported successfully. Previously the
	# template was built unconditionally, so a missing LangChain install crashed
	# at import time with an AttributeError on None instead of reaching the
	# explicit RuntimeError raised by _ensure_llm(). With LangChain absent this
	# is None.
	DETAILED_REPORT_PROMPT = (
	    ChatPromptTemplate.from_template(_DETAILED_REPORT_TEMPLATE)
	    if ChatPromptTemplate is not None
	    else None
	)


	def _ensure_llm() -> ChatOpenAI:
	    """Build the ChatOpenAI model used for report generation.

	    The model name is taken from the ``MODEL_NAME`` environment variable
	    (default ``"gpt-4o"``) and the temperature is fixed at 0.

	    Returns:
	        A configured ``ChatOpenAI`` instance.

	    Raises:
	        RuntimeError: If the LangChain imports failed at module load, or if
	            ``OPENAI_API_KEY`` is not set in the environment.
	    """
	    if ChatOpenAI is None:
	        # The message names the packages this module actually imports
	        # (``langchain`` and ``langchain_openai``); it previously pointed
	        # users at ``openai``, which does not provide ``langchain_openai``.
	        raise RuntimeError(
	            "The 'langchain' and 'langchain-openai' packages are required. "
	            "Install them via 'pip install langchain langchain-openai'."
	        )
	    # Fail early with a clear message rather than on the first API call.
	    if not os.getenv("OPENAI_API_KEY"):
	        raise RuntimeError("The OPENAI_API_KEY environment variable is not set.")
	    model_name = os.getenv("MODEL_NAME", "gpt-4o")
	    logger.debug("Initialising ChatOpenAI with model %s", model_name)
	    return ChatOpenAI(model=model_name, temperature=0)


	def create_detailed_report(
	    bill: Dict[str, Any], *, llm: Optional[ChatOpenAI] = None
	) -> BillReport:
	    """Produce the Markdown report for one bill.

	    The bill is serialised to indented JSON, substituted into
	    ``DETAILED_REPORT_PROMPT`` and the prompt is piped into the model.

	    Args:
	        bill: Bill record. Its ``bill_id`` entry, passed through ``str()``,
	            becomes the report's identifier.
	        llm: Model to run the prompt through. When omitted, one is obtained
	            from ``_ensure_llm()``.

	    Returns:
	        A ``BillReport`` with the bill's id and the generated text.
	    """
	    model = _ensure_llm() if llm is None else llm

	    payload = {"bill_json": json.dumps(bill, ensure_ascii=False, indent=2)}

	    # Pipe composition: the rendered prompt is fed to the model.
	    pipeline = DETAILED_REPORT_PROMPT | model
	    response = pipeline.invoke(payload)

	    # Use the response's ``content`` attribute when it has one (e.g. an
	    # AIMessage); otherwise fall back to its string form.
	    fallback_text = str(response)
	    markdown = getattr(response, "content", fallback_text)

	    identifier = str(bill.get("bill_id"))
	    return BillReport(bill_id=identifier, report_markdown=markdown)


	def load_existing_reports(output_path: str) -> Dict[str, str]:
	    """Load previously saved reports so an interrupted run can resume.

	    The file is expected to hold a JSON list of objects with ``bill_id``
	    and ``report_markdown`` keys (the format written by
	    ``save_reports_to_file``).

	    Args:
	        output_path: Path of the reports JSON file.

	    Returns:
	        Mapping of ``str(bill_id)`` to report text. Empty when the file does
	        not exist, cannot be read or parsed, or is not a JSON list.
	    """
	    if not os.path.exists(output_path):
	        return {}

	    try:
	        with open(output_path, "r", encoding="utf-8") as f:
	            reports_list = json.load(f)
	    except (OSError, ValueError) as e:
	        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
	        logger.warning(f"Could not load existing reports: {e}")
	        return {}

	    if not isinstance(reports_list, list):
	        logger.warning(
	            "Could not load existing reports: expected a JSON list in %s, got %s",
	            output_path,
	            type(reports_list).__name__,
	        )
	        return {}

	    # Skip malformed entries individually; previously a single non-dict
	    # entry raised inside the comprehension and discarded every saved
	    # report. Ids are normalised to str because callers look reports up
	    # with str(bill.get("bill_id")).
	    reports_dict = {
	        str(report["bill_id"]): report["report_markdown"]
	        for report in reports_list
	        if isinstance(report, dict)
	        and "bill_id" in report
	        and isinstance(report.get("report_markdown"), str)
	    }
	    logger.info(f"Loaded {len(reports_dict)} existing reports from {output_path}")
	    return reports_dict


	def save_reports_to_file(reports_dict: Dict[str, str], output_path: str) -> None:
	    """Write the reports to ``output_path`` as a JSON list.

	    Each entry is an object with ``bill_id`` and ``report_markdown`` keys.

	    The data is first written to ``<output_path>.tmp`` and then moved over
	    the destination with ``os.replace``. Writing in place meant that an
	    interruption during the dump left a truncated file, which the next run
	    could not parse and therefore treated as "no reports yet".

	    Args:
	        reports_dict: Mapping of bill id to report text.
	        output_path: Destination file; its parent directory is created if
	            it does not exist.

	    Raises:
	        OSError: If the directory or file cannot be written.
	    """
	    # Convert the dict back to the list format used on disk.
	    out_list = [
	        {"bill_id": bill_id, "report_markdown": report_markdown}
	        for bill_id, report_markdown in reports_dict.items()
	    ]

	    parent_dir = os.path.dirname(output_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    # A leftover .tmp file from an interrupted save is simply overwritten
	    # by the next one; the real output file is only ever swapped whole.
	    tmp_path = f"{output_path}.tmp"
	    with open(tmp_path, "w", encoding="utf-8") as f:
	        json.dump(out_list, f, ensure_ascii=False, indent=2)
	    os.replace(tmp_path, output_path)

	    logger.info("Saved %d reports to %s", len(out_list), output_path)


	def create_reports_with_resume(
	    bills: List[Dict[str, Any]],
	    output_path: str,
	    *,
	    llm: Optional[ChatOpenAI] = None,
	    save_interval: int = 10,
	    request_delay: float = 1.0,
	) -> Dict[str, str]:
	    """Generate detailed reports for multiple bills with resume capability.

	    Reports already present in ``output_path`` are reused. A bill is skipped
	    when it has a non-empty stored report that does not start with
	    ``"ERROR:"``; failed bills are stored with an ``"ERROR:"`` prefix and
	    are therefore retried on the next run.

	    Args:
	        bills: List of bill dictionaries.
	        output_path: Path the reports are loaded from and saved to.
	        llm: Optional LLM instance; built via ``_ensure_llm()`` when omitted.
	        save_interval: Save progress after this many generation attempts.
	            Values below 1 are treated as 1.
	        request_delay: Seconds to wait after each generation attempt to
	            avoid API throttling. Use 0 to disable.

	    Returns:
	        Dictionary of bill_id -> report_markdown (including reused reports
	        and ``"ERROR:"`` entries). Empty if ``bills`` is empty.
	    """
	    if not bills:
	        return {}

	    if llm is None:
	        llm = _ensure_llm()

	    # A zero interval used to raise ZeroDivisionError in the periodic-save
	    # check; clamp so that the worst case is "save after every bill".
	    save_interval = max(1, save_interval)

	    reports_dict = load_existing_reports(output_path)

	    total_bills = len(bills)
	    processed = 0
	    skipped = 0
	    errors = 0
	    unsaved = 0  # generation attempts since the last save

	    logger.info("Starting report generation for %d bills", total_bills)

	    try:
	        for i, bill in enumerate(bills, 1):
	            bill_id = str(bill.get("bill_id"))

	            # Skip bills that already have a usable (non-error) report.
	            existing = reports_dict.get(bill_id)
	            if isinstance(existing, str) and existing and not existing.startswith("ERROR:"):
	                logger.info(
	                    "Skipping bill %s - already processed (%d/%d)", bill_id, i, total_bills
	                )
	                skipped += 1
	                continue

	            logger.info("Processing %d/%d: Bill ID %s", i, total_bills, bill_id)

	            try:
	                report = create_detailed_report(bill, llm=llm)
	            except Exception as exc:
	                # Record the failure and keep going; one bad bill must not
	                # abort the whole batch.
	                logger.exception(
	                    "Failed to generate report for bill %s: %s", bill_id, exc
	                )
	                reports_dict[bill_id] = f"ERROR: Failed to generate report - {exc}"
	                errors += 1
	            else:
	                reports_dict[bill_id] = report.report_markdown
	                processed += 1

	            # Count attempts rather than list positions, so skipped bills
	            # landing on multiples of the interval cannot suppress saves.
	            unsaved += 1
	            if unsaved >= save_interval:
	                save_reports_to_file(reports_dict, output_path)
	                unsaved = 0
	                logger.info(
	                    "Progress: %d/%d - Processed: %d, Skipped: %d, Errors: %d",
	                    i, total_bills, processed, skipped, errors,
	                )

	            # Rate limiting. Skipped bills never reach this point, so every
	            # pass here follows an API call. The old condition was only true
	            # after an error, so successful calls ran back-to-back.
	            if request_delay > 0 and i < total_bills:
	                time.sleep(request_delay)
	    finally:
	        # Final save. Also runs when the loop is interrupted (e.g. Ctrl-C),
	        # so work done since the last periodic save is not lost.
	        save_reports_to_file(reports_dict, output_path)

	    logger.info("Report generation complete!")
	    logger.info("Total bills: %d", total_bills)
	    logger.info("Successfully processed: %d", processed)
	    logger.info("Skipped (already done): %d", skipped)
	    logger.info("Errors: %d", errors)

	    return reports_dict


	def read_bills_from_file(path: str) -> List[Dict[str, Any]]:
	    """Load the bill records stored as a JSON array in ``path``.

	    Args:
	        path: Location of the UTF-8 encoded JSON file.

	    Returns:
	        The parsed list, unchanged.

	    Raises:
	        ValueError: If the top-level JSON value is not a list.
	    """
	    with open(path, "r", encoding="utf-8") as handle:
	        bills = json.load(handle)

	    if isinstance(bills, list):
	        return bills
	    raise ValueError(f"Expected list of bills in {path}, got {type(bills)}")


	def generate_reports_from_files(
	    input_path: str = "data/known_bills_visualize.json",
	    output_path: str = "data/bill_reports.json",
	) -> None:
	    """Run the whole pipeline from an input file to an output file.

	    Bills are read from ``input_path`` and handed to
	    ``create_reports_with_resume``, which writes the reports to
	    ``output_path``.

	    Args:
	        input_path: JSON file containing the list of bills.
	        output_path: JSON file the reports are written to.
	    """
	    create_reports_with_resume(read_bills_from_file(input_path), output_path)


	def main() -> None:
	    """Command-line entry point.

	    Parses ``--input``, ``--output`` and ``--save-interval``, generates the
	    reports, and prints a success or failure line. Any exception raised
	    while reading bills or generating reports is logged and the process
	    exits with status 1.
	    """
	    import argparse
	    import sys

	    # NOTE(review): logging is already configured at module import, and
	    # basicConfig does nothing once the root logger has handlers, so this
	    # call presumably has no effect - confirm before removing.
	    logging.basicConfig(
	        level=logging.INFO,
	        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
	    )

	    cli = argparse.ArgumentParser(
	        description="Generate detailed AI legislation reports from bill data with resume capability."
	    )
	    cli.add_argument(
	        "--input",
	        default="data/known_bills_visualize.json",
	        help="Path to input JSON file",
	    )
	    cli.add_argument(
	        "--output",
	        default="data/bill_reports.json",
	        help="Path to output JSON file",
	    )
	    cli.add_argument(
	        "--save-interval",
	        type=int,
	        default=10,
	        help="Save progress every N bills (default: 10)",
	    )
	    options = cli.parse_args()

	    try:
	        loaded_bills = read_bills_from_file(options.input)
	        create_reports_with_resume(
	            loaded_bills, options.output, save_interval=options.save_interval
	        )
	        print("✅ Report generation completed successfully!")
	        print(f" Reports saved to: {options.output}")
	    except Exception as failure:
	        logger.error(f"Fatal error: {failure}")
	        print(f"❌ Error: {failure}")
	        sys.exit(1)


	if __name__ == "__main__":
	    main()