"""Re-fetch and repair bills whose stored text is an unprocessed PDF blob.

Reads INPUT_FILE, finds entries whose "text" field still contains raw PDF
bytes, re-downloads each document from the LegiScan API, extracts plain text
with PyPDF2 (or BeautifulSoup for HTML payloads), and writes the repaired
data to OUTPUT_FILE. Optionally overwrites INPUT_FILE at the end.
"""

import base64
import json
import logging
import os
import re
import shutil
import sys
import time
from datetime import datetime, timezone
from io import BytesIO

import requests
from dotenv import load_dotenv
import PyPDF2

# Load environment variables (expects LEGISCAN_API_KEY in .env or environment)
load_dotenv()
API_KEY = os.getenv("LEGISCAN_API_KEY")
if not API_KEY:
    # Fail loudly up front instead of producing opaque API errors later.
    logging.getLogger(__name__).warning("LEGISCAN_API_KEY is not set; API requests will fail.")

# Files
INPUT_FILE = "data/known_bills.json"
OUTPUT_FILE = "data/known_bills_fixed.json"
BACKUP_FILE = "data/known_bills_backup.json"

# Rate limiting: seconds to sleep between LegiScan API requests
RATE_LIMIT = 0.2

# Logging configuration: log to stdout and to a file under the scripts dir
LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
os.makedirs("data_updating_scripts/logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
)
logger = logging.getLogger(__name__)


def is_pdf_content(text):
    """Return True if *text* looks like raw (unextracted) PDF content.

    Checks the first 20 characters against known PDF header signatures,
    including the malformed "%PDF1.x" variants observed in the data.
    """
    if not text:
        return False
    pdf_signatures = (
        "%PDF-1.3", "%PDF-1.4", "%PDF-1.5", "%PDF-1.6", "%PDF-1.7",
        "%PDF1.3", "%PDF1.4", "%PDF1.5", "%PDF1.6", "%PDF1.7",
    )
    # Slicing already clamps to the string length; str.startswith accepts a tuple.
    return text[:20].startswith(pdf_signatures)


def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract plain text from PDF bytes using PyPDF2.

    Returns the cleaned text, or None if extraction fails.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        full_text = "\n".join(t for t in page_texts if t)

        # Clean up: collapse runs of blank lines (preserving paragraph
        # breaks) and runs of spaces.
        full_text = re.sub(r'\n{3,}', '\n\n', full_text)
        full_text = re.sub(r' {2,}', ' ', full_text)
        return full_text.strip()
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return None


def legi_request(op, params):
    """Make a request to the LegiScan API.

    Returns the decoded JSON payload on success, or None on any
    transport/API error (logged).
    """
    base = "https://api.legiscan.com/"
    # Build a fresh dict instead of mutating the caller's params.
    query = {**params, "key": API_KEY, "op": op}
    try:
        resp = requests.get(base, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None


def fix_pdf_bill(bill):
    """Fix a single bill with unprocessed PDF content.

    Re-fetches the bill's first text document from LegiScan, decodes the
    base64 payload, and extracts plain text (PDF via PyPDF2, otherwise
    HTML via BeautifulSoup). Returns the extracted text, or None if the
    document could not be fetched or yielded too little text.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")
    logger.info(f"Fixing PDF content for {state} {bill_num} (ID: {bill_id})")

    # First, try to get the bill details again
    details_resp = legi_request("getBill", {"id": bill_id})
    if not details_resp:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None

    details = details_resp.get("bill", {})
    texts = details.get("texts", [])
    if not texts:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Try to get the text document (first listed version)
    doc_id = texts[0].get("doc_id")
    text_resp = legi_request("getBillText", {"id": doc_id})
    if not text_resp or "text" not in text_resp:
        logger.warning(f"Could not fetch text for {bill_id}")
        return None

    raw_b64 = text_resp["text"].get("doc", "")
    if not raw_b64:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        decoded = base64.b64decode(raw_b64)

        # Check the magic bytes to decide how to process the document.
        if decoded[:4] == b'%PDF':
            extracted_text = extract_text_from_pdf_bytes(decoded)
            # Require >100 chars so we don't accept a near-empty extraction.
            if extracted_text and len(extracted_text.strip()) > 100:
                logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF for {bill_id}")
                return extracted_text
            logger.warning(f"Extracted text too short or empty for {bill_id}")
            return None

        # Not a PDF: try to decode as HTML (shouldn't happen for these
        # cases, but just in case).
        try:
            from bs4 import BeautifulSoup
            html = decoded.decode("utf-8", errors="ignore")
            soup = BeautifulSoup(html, "html.parser")
            plain_text = soup.get_text(separator="\n", strip=True)
            if plain_text and len(plain_text.strip()) > 100:
                logger.info(f"Successfully extracted HTML text for {bill_id}")
                return plain_text
        except Exception:
            # Narrowed from a bare except: never swallow KeyboardInterrupt/SystemExit.
            pass

        logger.warning(f"Could not process document for {bill_id}")
        return None
    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None


def main(overwrite: bool | None = None):
    """Run the full fix pass.

    overwrite: None  -> interactive CLI prompt before replacing INPUT_FILE
               True  -> replace INPUT_FILE without asking
               False -> never replace INPUT_FILE
    """
    # Load the bills
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)

    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before touching anything
    logger.info(f"Creating backup at {BACKUP_FILE}")
    with open(BACKUP_FILE, 'w', encoding='utf-8') as f:
        json.dump(bills, f, indent=2)

    # Find indexes of bills whose text is still a raw PDF blob
    pdf_bills = [i for i, bill in enumerate(bills) if is_pdf_content(bill.get("text"))]
    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content")

    # Process each PDF bill
    fixed_count = 0
    failed_count = 0
    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill.get('state')} {bill.get('bill_number')}")

        fixed_text = fix_pdf_bill(bill)
        if fixed_text:
            # Update the bill with the fixed text
            bills[bill_idx]["text"] = fixed_text
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            bills[bill_idx]["text_fixed"] = True  # Mark that we fixed this
            fixed_count += 1
            logger.info(f"Successfully fixed bill {bill.get('bill_id')}")
        else:
            # Mark that we tried but failed
            bills[bill_idx]["text_extraction_failed"] = True
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            failed_count += 1
            logger.warning(f"Failed to fix bill {bill.get('bill_id')}")

        # Rate limiting between API round trips
        time.sleep(RATE_LIMIT)

        # Save progress every 50 bills so a crash loses little work
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
                json.dump(bills, f, indent=2)

    # Save final results
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(bills, f, indent=2)

    logger.info(f"Processing complete!")
    logger.info(f"Successfully fixed: {fixed_count} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved)
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline)
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")


if __name__ == "__main__":
    # If running under Streamlit / pipeline, we expect FIX_PDF_OVERWRITE in env:
    #   "yes", "y", "true", "1"  -> overwrite=True
    #   "no", "n", "false", "0"  -> overwrite=False
    # If it's not set, we fall back to CLI mode and ask via input().
    env_choice = os.getenv("FIX_PDF_OVERWRITE")
    if env_choice is None:
        # Local CLI run -> still interactive
        main(overwrite=None)
    else:
        choice = env_choice.strip().lower()
        if choice in ("yes", "y", "true", "1"):
            main(overwrite=True)
        elif choice in ("no", "n", "false", "0"):
            main(overwrite=False)
        else:
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{env_choice}', defaulting to no overwrite."
            )
            main(overwrite=False)