# NOTE: the lines that previously appeared here ("Spaces: Running on CPU
# Upgrade") were Hugging Face Spaces page chrome left over from a web
# scrape, not part of this script.
# --- Imports (grouped per PEP 8: stdlib, then third-party) -------------------
import base64
import json
import logging
import os
import re
import shutil
import sys
import time  # moved here from mid-file; used for API rate limiting
from datetime import datetime, timezone
from io import BytesIO

import requests
from dotenv import load_dotenv
import PyPDF2

# Load environment variables (expects LEGISCAN_API_KEY in .env or the shell).
load_dotenv()
API_KEY = os.getenv("LEGISCAN_API_KEY")

# File locations: input dataset, fixed output, and a safety backup.
INPUT_FILE = "data/known_bills.json"
OUTPUT_FILE = "data/known_bills_fixed.json"
BACKUP_FILE = "data/known_bills_backup.json"

# Seconds to sleep between LegiScan API requests (client-side rate limit).
RATE_LIMIT = 0.2

# Logging: mirror every message to stdout and to a log file.
LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
os.makedirs("data_updating_scripts/logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
)
logger = logging.getLogger(__name__)
# Matches a PDF magic header at the start of a document: "%PDF-1.4" etc.,
# plus the dash-less variant ("%PDF1.4") seen in some malformed payloads.
# Generalized from a hard-coded list (versions 1.3-1.7 only) so that 1.0-1.2,
# 2.0, and future versions are also detected.
_PDF_HEADER_RE = re.compile(r"%PDF-?\d+\.\d+")


def is_pdf_content(text):
    """Return True if *text* looks like raw, unprocessed PDF content.

    Only the first 20 characters are inspected; every PDF begins with a
    ``%PDF-x.y`` magic header, so that prefix is sufficient.
    """
    if not text:
        return False
    # Slicing never raises, so no explicit length check is needed.
    return _PDF_HEADER_RE.match(text[:20]) is not None
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract plain text from raw PDF bytes using PyPDF2.

    Returns the concatenated page text with paragraph breaks preserved and
    excess whitespace collapsed, or None if extraction fails.
    """
    try:
        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        # Iterate pages directly rather than indexing via range(len(...)).
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "\n".join(t for t in page_texts if t)
        # Collapse runs of 3+ newlines to one paragraph break, squeeze
        # repeated spaces, then trim.
        full_text = re.sub(r'\n{3,}', '\n\n', full_text)
        full_text = re.sub(r' {2,}', ' ', full_text)
        return full_text.strip()
    except Exception as e:
        # PyPDF2 raises a wide variety of errors on malformed PDFs; treat
        # any failure as "no text available".
        logger.error(f"Error extracting text from PDF: {e}")
        return None
def legi_request(op, params):
    """Make a GET request to the LegiScan API.

    Args:
        op: API operation name (e.g. "getBill", "getBillText").
        params: operation parameters; NOT mutated (the original version
            updated the caller's dict in place, leaking the API key into it).

    Returns:
        The decoded JSON payload on success, or None on any failure
        (transport error, non-2xx status, or API-level status != "OK").
    """
    base = "https://api.legiscan.com/"
    # Copy before adding credentials so the caller's dict is left untouched.
    query = {**params, "key": API_KEY, "op": op}
    try:
        resp = requests.get(base, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
def _html_to_text(decoded):
    """Best-effort HTML -> plain-text conversion; returns None on failure."""
    try:
        from bs4 import BeautifulSoup

        html = decoded.decode("utf-8", errors="ignore")
        return BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)
    except Exception:
        # Narrowed from a bare ``except:`` (which also swallowed
        # KeyboardInterrupt/SystemExit); bs4 may be missing or parsing may fail.
        return None


def fix_pdf_bill(bill):
    """Re-fetch a bill's document from LegiScan and return its extracted text.

    The bill currently stores raw PDF bytes in its "text" field.  This
    re-downloads the first available text document, base64-decodes it, and
    extracts readable text (PDF via PyPDF2, HTML via BeautifulSoup as a
    fallback).

    Returns:
        The extracted text, or None if any step fails or the result is too
        short (<= 100 chars after stripping) to be meaningful.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")
    logger.info(f"Fixing PDF content for {state} {bill_num} (ID: {bill_id})")

    # Fetch fresh bill details to discover the available text documents.
    details_resp = legi_request("getBill", {"id": bill_id})
    if not details_resp:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None
    texts = details_resp.get("bill", {}).get("texts", [])
    if not texts:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Download the first listed document.  NOTE(review): assumes index 0 is
    # the desired version per API ordering — confirm against LegiScan docs.
    doc_id = texts[0].get("doc_id")
    text_resp = legi_request("getBillText", {"id": doc_id})
    if not text_resp or "text" not in text_resp:
        logger.warning(f"Could not fetch text for {bill_id}")
        return None
    raw_b64 = text_resp["text"].get("doc", "")
    if not raw_b64:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        decoded = base64.b64decode(raw_b64)
        if decoded[:4] == b'%PDF':
            # PDF payload: run text extraction.
            extracted_text = extract_text_from_pdf_bytes(decoded)
            if extracted_text and len(extracted_text.strip()) > 100:
                logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF for {bill_id}")
                return extracted_text
            logger.warning(f"Extracted text too short or empty for {bill_id}")
            return None
        # Not a PDF: try HTML (shouldn't happen for these cases, but just in case).
        plain_text = _html_to_text(decoded)
        if plain_text and len(plain_text.strip()) > 100:
            logger.info(f"Successfully extracted HTML text for {bill_id}")
            return plain_text
        logger.warning(f"Could not process document for {bill_id}")
        return None
    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None
def _save_bills(bills, path):
    """Write the bill list to *path* as pretty-printed JSON."""
    with open(path, 'w') as f:
        json.dump(bills, f, indent=2)


def main(overwrite: bool | None = None):
    """Find bills whose "text" is raw PDF content and replace it with text.

    Loads INPUT_FILE, backs it up to BACKUP_FILE, re-fetches and extracts
    text for each PDF-contaminated bill, and writes results to OUTPUT_FILE.

    Args:
        overwrite: True/False forces non-interactive overwrite behavior for
            INPUT_FILE; None (CLI mode) prompts on stdin before overwriting.
    """
    # Load the bills; a missing/corrupt input file is fatal.
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        with open(INPUT_FILE, 'r') as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)
    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before mutating anything.
    logger.info(f"Creating backup at {BACKUP_FILE}")
    _save_bills(bills, BACKUP_FILE)

    # Indices of bills whose stored text is an unprocessed PDF.
    pdf_bills = [i for i, bill in enumerate(bills) if is_pdf_content(bill.get("text"))]
    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content")

    fixed_count = 0
    failed_count = 0
    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill.get('state')} {bill.get('bill_number')}")

        fixed_text = fix_pdf_bill(bill)
        now_iso = datetime.now(timezone.utc).isoformat()
        if fixed_text:
            bills[bill_idx]["text"] = fixed_text
            bills[bill_idx]["lastUpdatedAt"] = now_iso
            bills[bill_idx]["text_fixed"] = True  # mark as repaired
            fixed_count += 1
            logger.info(f"Successfully fixed bill {bill.get('bill_id')}")
        else:
            # Mark that we tried but failed, so reruns can skip/inspect these.
            bills[bill_idx]["text_extraction_failed"] = True
            bills[bill_idx]["lastUpdatedAt"] = now_iso
            failed_count += 1
            logger.warning(f"Failed to fix bill {bill.get('bill_id')}")

        # Client-side rate limiting between API calls.
        time.sleep(RATE_LIMIT)

        # Checkpoint progress every 50 bills so a crash loses little work.
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            _save_bills(bills, OUTPUT_FILE)

    # Save final results.
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    _save_bills(bills, OUTPUT_FILE)

    logger.info("Processing complete!")
    logger.info(f"Successfully fixed: {fixed_count} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior.
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved).
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline).
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")
if __name__ == "__main__":
    # Under Streamlit / pipeline runs, FIX_PDF_OVERWRITE controls overwriting:
    #   "yes" / "y" / "true" / "1"  -> overwrite=True
    #   "no" / "n" / "false" / "0"  -> overwrite=False
    # When the variable is unset, fall back to interactive CLI mode and
    # prompt via input().
    env_choice = os.getenv("FIX_PDF_OVERWRITE")
    if env_choice is None:
        # Local CLI run -> still interactive.
        main(overwrite=None)
    else:
        normalized = env_choice.strip().lower()
        truthy = {"yes", "y", "true", "1"}
        falsy = {"no", "n", "false", "0"}
        if normalized in truthy:
            main(overwrite=True)
        elif normalized in falsy:
            main(overwrite=False)
        else:
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{env_choice}', defaulting to no overwrite."
            )
            main(overwrite=False)