# Spaces: VAILL / legislation-tracker (Running on CPU Upgrade)
# File size: 12,372 Bytes
# 98bf60c

import os
import json
import base64
import logging
import sys
import time as _time_mod
from datetime import datetime, timezone
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO
import re
import shutil

# Load environment variables
# (must run before the os.environ lookup below so values loaded here are visible to it)
load_dotenv()

# Pipeline status tracking (no-op when running standalone)
# PIPELINE_CURRENT_SCRIPT is read from the environment; when it is unset or
# empty, _pipeline stays None and both pipeline helper functions below return
# immediately without doing anything.
_PIPELINE_SCRIPT = os.environ.get("PIPELINE_CURRENT_SCRIPT")
_pipeline = None
# time.time() value of the last progress write; used by
# _update_pipeline_progress to throttle updates. 0.0 means "never written".
_last_status_write = 0.0
if _PIPELINE_SCRIPT:
    try:
        # Imported lazily so the script still runs when pipeline_status is
        # not importable.
        from pipeline_status import PipelineStatus
        _pipeline = PipelineStatus()
    except Exception:
        # Best-effort: any import/construction failure just disables status
        # tracking (_pipeline remains None).
        pass

def _update_pipeline_progress(current, total, unit="bills", message=""):
    """Forward a progress report to the pipeline status tracker, throttled.

    Does nothing when no pipeline tracker is active. Otherwise the report is
    passed to ``_pipeline.update_progress`` only if at least 3 seconds have
    passed since the previous forwarded report; reports arriving sooner are
    dropped. Errors raised by the tracker are swallowed so status reporting
    can never interrupt the main work.

    Args:
        current: Number of items processed so far.
        total: Total number of items to process.
        unit: Label for the items being counted.
        message: Free-form status text passed through to the tracker.
    """
    global _last_status_write
    if not _pipeline:
        return

    timestamp = _time_mod.time()
    elapsed = timestamp - _last_status_write
    if elapsed >= 3.0:
        # Record the write time before calling out, so a failing tracker is
        # still throttled.
        _last_status_write = timestamp
        try:
            _pipeline.update_progress(_PIPELINE_SCRIPT, current, total, unit, message)
        except Exception:
            pass

def _log_pipeline_error(error, bill_id="", bill_key=""):
    """Report an error to the pipeline status tracker, if one is active.

    Best-effort: when no tracker is active nothing happens, and any exception
    raised by ``_pipeline.log_error`` is swallowed.

    Args:
        error: Description of the error.
        bill_id: Identifier of the affected bill (optional).
        bill_key: Human-readable key/label of the affected bill (optional).
    """
    if _pipeline:
        try:
            _pipeline.log_error(_PIPELINE_SCRIPT, error, bill_id, bill_key)
        except Exception:
            pass

# Import shared LegiScan API helper
# legi_request performs the API calls, RATE_LIMIT is the pause (passed to
# time.sleep) between bills that needed the API, and get_api_call_count is
# used for the end-of-run summary.
from legiscan_api import legi_request, RATE_LIMIT, get_api_call_count

# Files
# All paths are relative, so they resolve against the current working directory.
INPUT_FILE = "data/known_bills.json"          # bills loaded by main()
OUTPUT_FILE = "data/known_bills_fixed.json"   # where main() writes the repaired bills
BACKUP_FILE = "data/known_bills_backup.json"  # copy of the loaded bills, written before any change

# Used for time.sleep() in main(); the module is also imported as
# _time_mod near the top of the file.
import time

# Logging configuration
LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
# Create the log directory up front so the FileHandler below can open LOG_FILE.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Log INFO and above to both stdout and LOG_FILE.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE)
    ]
)
logger = logging.getLogger(__name__)


# Matches a PDF file header ("%PDF-M.m") for any version number. The dash is
# optional so dash-less headers such as "%PDF1.4" keep being recognised.
_PDF_HEADER_RE = re.compile(r"%PDF-?[0-9]\.[0-9]")


def is_pdf_content(text):
    """Check if the text content is an unprocessed PDF.

    A text counts as raw PDF data when it begins with a PDF file header,
    i.e. ``%PDF`` followed by an optional dash and a ``major.minor`` version
    (``%PDF-1.4``, ``%PDF1.7``, ``%PDF-2.0``, ...). Any version is accepted
    rather than a fixed list, so older (1.0-1.2) and newer (2.0) files are
    detected as well.

    Args:
        text: The stored bill text. May be ``None`` or empty.

    Returns:
        True if ``text`` is a string starting with a PDF header, else False.
        Non-string values (including ``bytes``) yield False instead of
        raising.
    """
    if not isinstance(text, str) or not text:
        return False
    # match() anchors at position 0: the header must be the very first thing
    # in the text, not merely appear somewhere inside it.
    return _PDF_HEADER_RE.match(text) is not None


def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract and tidy the text of a PDF supplied as raw bytes.

    The PDF is parsed with PyPDF2; the text of every page that yields any is
    joined with newlines, then runs of three or more newlines are collapsed
    to a single blank line, runs of spaces to one space, and the result is
    stripped.

    Args:
        pdf_bytes: The complete PDF document as bytes.

    Returns:
        The cleaned text (possibly an empty string), or None if parsing or
        extraction raised any exception (the error is logged).
    """
    try:
        reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))

        # Keep only pages that actually produced text.
        page_texts = []
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)

        combined = "\n".join(page_texts)

        # Normalise whitespace: at most one blank line between paragraphs,
        # at most one space between words.
        combined = re.sub(r'\n{3,}', '\n\n', combined)
        combined = re.sub(r' {2,}', ' ', combined)
        return combined.strip()
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return None


def try_local_pdf_extraction(bill):
    """Attempt to recover readable text from a bill's stored PDF data, offline.

    Older runs stored the raw PDF as the bill's ``text`` (PDF bytes decoded
    to a string). This re-encodes that string to bytes and runs the normal
    PDF extraction on it. The round trip is lossy, so failure is expected
    for some bills; no API call is ever made here.

    Args:
        bill: Bill record (dict) whose ``"text"`` entry may hold raw PDF data.

    Returns:
        The extracted text when extraction succeeds and yields more than 100
        characters after stripping; otherwise None (missing/empty text, no
        PDF header after re-encoding, short or failed extraction, or any
        exception along the way).
    """
    stored = bill.get("text", "")
    if not stored:
        return None
    try:
        # latin-1 maps code points 0-255 straight back to single bytes;
        # characters outside that range are dropped by errors="ignore".
        raw = stored.encode("latin-1", errors="ignore")
        if raw.startswith(b'%PDF'):
            recovered = extract_text_from_pdf_bytes(raw)
            if recovered and len(recovered.strip()) > 100:
                return recovered
    except Exception:
        pass
    return None


def fix_pdf_bill_via_api(bill):
    """Re-fetch a bill's document from the LegiScan API and extract its text.

    Fallback used when local extraction fails. Issues a ``getBill`` request
    for the bill, then a ``getBillText`` request for the first entry of the
    bill's ``texts`` list, base64-decodes the returned document and extracts
    text from it: via PDF extraction when the document starts with ``%PDF``,
    otherwise by parsing it as HTML.

    Args:
        bill: Bill record (dict); ``bill_id``, ``state`` and ``bill_number``
            are read from it.

    Returns:
        The extracted text if it is longer than 100 characters after
        stripping, otherwise None. Every failure path (missing API response,
        no text documents, empty document, short/failed extraction, decoding
        error) is logged and yields None.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")

    logger.info(f"Local extraction failed for {state} {bill_num} (ID: {bill_id}), fetching from API")

    # Step 1: bill details, to discover which text documents exist.
    bill_payload = legi_request("getBill", {"id": bill_id})
    if not bill_payload:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None

    text_entries = bill_payload.get("bill", {}).get("texts", [])
    if not text_entries:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Step 2: download the first listed text document.
    first_doc_id = text_entries[0].get("doc_id")
    text_payload = legi_request("getBillText", {"id": first_doc_id})
    if not (text_payload and "text" in text_payload):
        logger.warning(f"Could not fetch text for {bill_id}")
        return None

    encoded_doc = text_payload["text"].get("doc", "")
    if not encoded_doc:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        document = base64.b64decode(encoded_doc)

        # PDF documents are identified by their magic bytes.
        if document.startswith(b'%PDF'):
            pdf_text = extract_text_from_pdf_bytes(document)
            if not pdf_text or len(pdf_text.strip()) <= 100:
                logger.warning(f"Extracted text too short or empty for {bill_id}")
                return None
            logger.info(f"Successfully extracted {len(pdf_text)} characters from PDF for {bill_id}")
            return pdf_text

        # Anything else is treated as HTML; parsing problems are ignored and
        # fall through to the generic "could not process" warning below.
        try:
            markup = document.decode("utf-8", errors="ignore")
            parsed = BeautifulSoup(markup, "html.parser")
            html_text = parsed.get_text(separator="\n", strip=True)
            if html_text and len(html_text.strip()) > 100:
                logger.info(f"Successfully extracted HTML text for {bill_id}")
                return html_text
        except Exception:
            pass

        logger.warning(f"Could not process document for {bill_id}")
        return None

    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None


def _save_bills(path, bills):
    """Write *bills* to *path* as indented JSON, always encoded as UTF-8."""
    with open(path, 'w', encoding="utf-8") as f:
        json.dump(bills, f, indent=2)


def _apply_fixed_text(bill, fixed_text):
    """Store recovered text on *bill* and flag it so later runs skip it."""
    bill["text"] = fixed_text
    bill["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
    bill["text_fixed"] = True


def main(overwrite: bool | None = None):
    """Replace raw PDF data stored as bill text with extracted plain text.

    Workflow:
      1. Load all bills from INPUT_FILE and write a backup to BACKUP_FILE.
      2. Select bills whose text looks like raw PDF data, skipping bills
         already flagged ``text_fixed``.
      3. For each selected bill try local extraction first and fall back to
         the LegiScan API. Successful bills get the new text, a fresh
         ``lastUpdatedAt`` and ``text_fixed``; failures are flagged with
         ``text_extraction_failed``.
      4. Write the result to OUTPUT_FILE (with a checkpoint every 50 bills)
         and optionally copy it over INPUT_FILE.

    Pipeline progress reporting and the periodic checkpoint run for every
    processed bill, whether it was fixed locally or via the API; only the
    rate-limit pause is restricted to bills that actually hit the API.

    Args:
        overwrite: Whether to copy OUTPUT_FILE over INPUT_FILE when at least
            one bill was fixed. ``True``/``False`` decide non-interactively;
            ``None`` asks on stdin (and leaves the original untouched if no
            input is available).

    Raises:
        SystemExit: With status 1 if INPUT_FILE cannot be loaded.
    """
    # Load the bills
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        # Explicit UTF-8 so loading does not depend on the platform's
        # default text encoding.
        with open(INPUT_FILE, 'r', encoding="utf-8") as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)

    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before anything is modified
    logger.info(f"Creating backup at {BACKUP_FILE}")
    _save_bills(BACKUP_FILE, bills)

    # Find bills with unprocessed PDF content (skip already-fixed bills)
    pdf_bills = []
    already_fixed = 0
    for i, bill in enumerate(bills):
        if bill.get("text_fixed"):
            already_fixed += 1
            continue
        if is_pdf_content(bill.get("text")):
            pdf_bills.append(i)

    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content ({already_fixed} already fixed, skipped)")

    if not pdf_bills:
        logger.info("No PDF bills to fix - saving output as-is")
        _save_bills(OUTPUT_FILE, bills)
        logger.info(f"Output saved to: {OUTPUT_FILE}")
        return

    # Process each PDF bill: try local extraction first, then API fallback
    fixed_local = 0
    fixed_api = 0
    failed_count = 0

    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        bill_label = f"{bill.get('state')} {bill.get('bill_number')}"
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill_label}")

        # Step 1: Try local extraction (no API call)
        fixed_text = try_local_pdf_extraction(bill)
        used_api = not fixed_text

        if not used_api:
            _apply_fixed_text(bill, fixed_text)
            fixed_local += 1
            logger.info(f"Fixed locally (no API call): {bill_label}")
        else:
            # Step 2: Fall back to API
            fixed_text = fix_pdf_bill_via_api(bill)

            if fixed_text:
                _apply_fixed_text(bill, fixed_text)
                fixed_api += 1
                logger.info(f"Fixed via API: {bill_label}")
            else:
                bill["text_extraction_failed"] = True
                bill["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
                failed_count += 1
                logger.warning(f"Failed to fix: {bill_label}")
                _log_pipeline_error("Failed to fix PDF text",
                    bill_id=str(bill.get("bill_id")), bill_key=bill_label)

        # Report progress for every bill, including locally fixed ones.
        _update_pipeline_progress(idx + 1, len(pdf_bills), "PDF bills",
            f"Fixed {fixed_local + fixed_api} bills, {failed_count} failed")

        # Rate limiting (only needed after API calls)
        if used_api:
            time.sleep(RATE_LIMIT)

        # Save progress every 50 bills, regardless of how each was fixed, so
        # an interrupted run never loses more than 50 bills of work.
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            _save_bills(OUTPUT_FILE, bills)

    fixed_count = fixed_local + fixed_api

    # Save final results
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    _save_bills(OUTPUT_FILE, bills)

    api_calls = get_api_call_count()
    logger.info("Processing complete!")
    logger.info(f"Fixed locally (no API calls): {fixed_local} bills")
    logger.info(f"Fixed via API: {fixed_api} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Already fixed (skipped): {already_fixed} bills")
    logger.info(f"Total LegiScan API calls this run: {api_calls}")
    if fixed_local > 0:
        # Each API fallback costs two requests (getBill + getBillText).
        logger.info(f"Saved ~{fixed_local * 2} API calls by using local PDF extraction")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved)
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline)
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")



if __name__ == "__main__":
    # FIX_PDF_OVERWRITE controls what happens to the original file when the
    # script runs non-interactively (e.g. under Streamlit / the pipeline):
    #   "yes", "y", "true", "1"  -> overwrite=True
    #   "no", "n", "false", "0"  -> overwrite=False
    #   anything else            -> warning, then overwrite=False
    # When the variable is not set at all, main() runs in CLI mode and asks
    # via input().
    raw_choice = os.getenv("FIX_PDF_OVERWRITE")

    if raw_choice is None:
        # Local CLI run: stay interactive
        overwrite_choice = None
    else:
        normalized = raw_choice.strip().lower()
        if normalized in ("yes", "y", "true", "1"):
            overwrite_choice = True
        elif normalized in ("no", "n", "false", "0"):
            overwrite_choice = False
        else:
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{raw_choice}', defaulting to no overwrite."
            )
            overwrite_choice = False

    main(overwrite=overwrite_choice)