# NOTE: the lines that previously appeared here ("Spaces: Running on CPU
# Upgrade") were Hugging Face Spaces page chrome left over from a web
# scrape, not part of this script.
# --- Imports (grouped per PEP 8: stdlib, then third-party) -------------------
import base64
import json
import logging
import os
import re
import shutil
import sys
import time  # moved here from mid-file; used for API rate limiting
from datetime import datetime, timezone
from io import BytesIO

import requests
from dotenv import load_dotenv
import PyPDF2

# Load environment variables (expects LEGISCAN_API_KEY in .env or the shell).
load_dotenv()
API_KEY = os.getenv("LEGISCAN_API_KEY")

# File locations: input dataset, fixed output, and a safety backup.
INPUT_FILE = "data/known_bills.json"
OUTPUT_FILE = "data/known_bills_fixed.json"
BACKUP_FILE = "data/known_bills_backup.json"

# Seconds to sleep between LegiScan API requests (client-side rate limit).
RATE_LIMIT = 0.2

# Logging: mirror every message to stdout and to a log file.
LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
os.makedirs("data_updating_scripts/logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
)
logger = logging.getLogger(__name__)
# Matches a PDF magic header at the start of a document: "%PDF-1.4" etc.,
# plus the dash-less variant ("%PDF1.4") seen in some malformed payloads.
# Generalized from a hard-coded list (versions 1.3-1.7 only) so that 1.0-1.2,
# 2.0, and future versions are also detected.
_PDF_HEADER_RE = re.compile(r"%PDF-?\d+\.\d+")


def is_pdf_content(text):
    """Return True if *text* looks like raw, unprocessed PDF content.

    Only the first 20 characters are inspected; every PDF begins with a
    ``%PDF-x.y`` magic header, so that prefix is sufficient.
    """
    if not text:
        return False
    # Slicing never raises, so no explicit length check is needed.
    return _PDF_HEADER_RE.match(text[:20]) is not None
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract plain text from raw PDF bytes using PyPDF2.

    Returns the concatenated page text with paragraph breaks preserved and
    excess whitespace collapsed, or None if extraction fails.
    """
    try:
        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        # Iterate pages directly rather than indexing via range(len(...)).
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "\n".join(t for t in page_texts if t)
        # Collapse runs of 3+ newlines to one paragraph break, squeeze
        # repeated spaces, then trim.
        full_text = re.sub(r'\n{3,}', '\n\n', full_text)
        full_text = re.sub(r' {2,}', ' ', full_text)
        return full_text.strip()
    except Exception as e:
        # PyPDF2 raises a wide variety of errors on malformed PDFs; treat
        # any failure as "no text available".
        logger.error(f"Error extracting text from PDF: {e}")
        return None
def legi_request(op, params):
    """Make a GET request to the LegiScan API.

    Args:
        op: API operation name (e.g. "getBill", "getBillText").
        params: operation parameters; NOT mutated (the original version
            updated the caller's dict in place, leaking the API key into it).

    Returns:
        The decoded JSON payload on success, or None on any failure
        (transport error, non-2xx status, or API-level status != "OK").
    """
    base = "https://api.legiscan.com/"
    # Copy before adding credentials so the caller's dict is left untouched.
    query = {**params, "key": API_KEY, "op": op}
    try:
        resp = requests.get(base, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
def _html_to_text(decoded):
    """Best-effort HTML -> plain-text conversion; returns None on failure."""
    try:
        from bs4 import BeautifulSoup

        html = decoded.decode("utf-8", errors="ignore")
        return BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)
    except Exception:
        # Narrowed from a bare ``except:`` (which also swallowed
        # KeyboardInterrupt/SystemExit); bs4 may be missing or parsing may fail.
        return None


def fix_pdf_bill(bill):
    """Re-fetch a bill's document from LegiScan and return its extracted text.

    The bill currently stores raw PDF bytes in its "text" field.  This
    re-downloads the first available text document, base64-decodes it, and
    extracts readable text (PDF via PyPDF2, HTML via BeautifulSoup as a
    fallback).

    Returns:
        The extracted text, or None if any step fails or the result is too
        short (<= 100 chars after stripping) to be meaningful.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")
    logger.info(f"Fixing PDF content for {state} {bill_num} (ID: {bill_id})")

    # Fetch fresh bill details to discover the available text documents.
    details_resp = legi_request("getBill", {"id": bill_id})
    if not details_resp:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None
    texts = details_resp.get("bill", {}).get("texts", [])
    if not texts:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Download the first listed document.  NOTE(review): assumes index 0 is
    # the desired version per API ordering — confirm against LegiScan docs.
    doc_id = texts[0].get("doc_id")
    text_resp = legi_request("getBillText", {"id": doc_id})
    if not text_resp or "text" not in text_resp:
        logger.warning(f"Could not fetch text for {bill_id}")
        return None
    raw_b64 = text_resp["text"].get("doc", "")
    if not raw_b64:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        decoded = base64.b64decode(raw_b64)
        if decoded[:4] == b'%PDF':
            # PDF payload: run text extraction.
            extracted_text = extract_text_from_pdf_bytes(decoded)
            if extracted_text and len(extracted_text.strip()) > 100:
                logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF for {bill_id}")
                return extracted_text
            logger.warning(f"Extracted text too short or empty for {bill_id}")
            return None
        # Not a PDF: try HTML (shouldn't happen for these cases, but just in case).
        plain_text = _html_to_text(decoded)
        if plain_text and len(plain_text.strip()) > 100:
            logger.info(f"Successfully extracted HTML text for {bill_id}")
            return plain_text
        logger.warning(f"Could not process document for {bill_id}")
        return None
    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None
def _save_bills(bills, path):
    """Write the bill list to *path* as pretty-printed JSON."""
    with open(path, 'w') as f:
        json.dump(bills, f, indent=2)


def main(overwrite: bool | None = None):
    """Find bills whose "text" is raw PDF content and replace it with text.

    Loads INPUT_FILE, backs it up to BACKUP_FILE, re-fetches and extracts
    text for each PDF-contaminated bill, and writes results to OUTPUT_FILE.

    Args:
        overwrite: True/False forces non-interactive overwrite behavior for
            INPUT_FILE; None (CLI mode) prompts on stdin before overwriting.
    """
    # Load the bills; a missing/corrupt input file is fatal.
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        with open(INPUT_FILE, 'r') as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)
    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before mutating anything.
    logger.info(f"Creating backup at {BACKUP_FILE}")
    _save_bills(bills, BACKUP_FILE)

    # Indices of bills whose stored text is an unprocessed PDF.
    pdf_bills = [i for i, bill in enumerate(bills) if is_pdf_content(bill.get("text"))]
    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content")

    fixed_count = 0
    failed_count = 0
    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill.get('state')} {bill.get('bill_number')}")

        fixed_text = fix_pdf_bill(bill)
        now_iso = datetime.now(timezone.utc).isoformat()
        if fixed_text:
            bills[bill_idx]["text"] = fixed_text
            bills[bill_idx]["lastUpdatedAt"] = now_iso
            bills[bill_idx]["text_fixed"] = True  # mark as repaired
            fixed_count += 1
            logger.info(f"Successfully fixed bill {bill.get('bill_id')}")
        else:
            # Mark that we tried but failed, so reruns can skip/inspect these.
            bills[bill_idx]["text_extraction_failed"] = True
            bills[bill_idx]["lastUpdatedAt"] = now_iso
            failed_count += 1
            logger.warning(f"Failed to fix bill {bill.get('bill_id')}")

        # Client-side rate limiting between API calls.
        time.sleep(RATE_LIMIT)

        # Checkpoint progress every 50 bills so a crash loses little work.
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            _save_bills(bills, OUTPUT_FILE)

    # Save final results.
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    _save_bills(bills, OUTPUT_FILE)

    logger.info("Processing complete!")
    logger.info(f"Successfully fixed: {fixed_count} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior.
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved).
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline).
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")
if __name__ == "__main__":
    # Under Streamlit / pipeline runs, FIX_PDF_OVERWRITE controls overwriting:
    #   "yes" / "y" / "true" / "1"  -> overwrite=True
    #   "no" / "n" / "false" / "0"  -> overwrite=False
    # When the variable is unset, fall back to interactive CLI mode and
    # prompt via input().
    env_choice = os.getenv("FIX_PDF_OVERWRITE")
    if env_choice is None:
        # Local CLI run -> still interactive.
        main(overwrite=None)
    else:
        normalized = env_choice.strip().lower()
        truthy = {"yes", "y", "true", "1"}
        falsy = {"no", "n", "false", "0"}
        if normalized in truthy:
            main(overwrite=True)
        elif normalized in falsy:
            main(overwrite=False)
        else:
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{env_choice}', defaulting to no overwrite."
            )
            main(overwrite=False)