"""Re-fetch and repair bills whose stored text is an unprocessed PDF blob.

Reads INPUT_FILE, finds entries whose "text" field still contains raw PDF
bytes, re-downloads each document from the LegiScan API, extracts plain text
with PyPDF2 (or BeautifulSoup for HTML payloads), and writes the repaired
data to OUTPUT_FILE. Optionally overwrites INPUT_FILE at the end.
"""

import base64
import json
import logging
import os
import re
import shutil
import sys
import time
from datetime import datetime, timezone
from io import BytesIO

import requests
from dotenv import load_dotenv
import PyPDF2

# Load environment variables (expects LEGISCAN_API_KEY in .env or environment)
load_dotenv()
API_KEY = os.getenv("LEGISCAN_API_KEY")
if not API_KEY:
    # Fail loudly up front instead of producing opaque API errors later.
    logging.getLogger(__name__).warning("LEGISCAN_API_KEY is not set; API requests will fail.")

# Files
INPUT_FILE = "data/known_bills.json"
OUTPUT_FILE = "data/known_bills_fixed.json"
BACKUP_FILE = "data/known_bills_backup.json"

# Rate limiting: seconds to sleep between LegiScan API requests
RATE_LIMIT = 0.2

# Logging configuration: log to stdout and to a file under the scripts dir
LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
os.makedirs("data_updating_scripts/logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
)
logger = logging.getLogger(__name__)


def is_pdf_content(text):
    """Return True if *text* looks like raw (unextracted) PDF content.

    Checks the first 20 characters against known PDF header signatures,
    including the malformed "%PDF1.x" variants observed in the data.
    """
    if not text:
        return False
    pdf_signatures = (
        "%PDF-1.3", "%PDF-1.4", "%PDF-1.5", "%PDF-1.6", "%PDF-1.7",
        "%PDF1.3", "%PDF1.4", "%PDF1.5", "%PDF1.6", "%PDF1.7",
    )
    # Slicing already clamps to the string length; str.startswith accepts a tuple.
    return text[:20].startswith(pdf_signatures)


def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract plain text from PDF bytes using PyPDF2.

    Returns the cleaned text, or None if extraction fails.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        full_text = "\n".join(t for t in page_texts if t)

        # Clean up: collapse runs of blank lines (preserving paragraph
        # breaks) and runs of spaces.
        full_text = re.sub(r'\n{3,}', '\n\n', full_text)
        full_text = re.sub(r' {2,}', ' ', full_text)
        return full_text.strip()
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return None


def legi_request(op, params):
    """Make a request to the LegiScan API.

    Returns the decoded JSON payload on success, or None on any
    transport/API error (logged).
    """
    base = "https://api.legiscan.com/"
    # Build a fresh dict instead of mutating the caller's params.
    query = {**params, "key": API_KEY, "op": op}
    try:
        resp = requests.get(base, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None


def fix_pdf_bill(bill):
    """Fix a single bill with unprocessed PDF content.

    Re-fetches the bill's first text document from LegiScan, decodes the
    base64 payload, and extracts plain text (PDF via PyPDF2, otherwise
    HTML via BeautifulSoup). Returns the extracted text, or None if the
    document could not be fetched or yielded too little text.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")
    logger.info(f"Fixing PDF content for {state} {bill_num} (ID: {bill_id})")

    # First, try to get the bill details again
    details_resp = legi_request("getBill", {"id": bill_id})
    if not details_resp:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None

    details = details_resp.get("bill", {})
    texts = details.get("texts", [])
    if not texts:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Try to get the text document (first listed version)
    doc_id = texts[0].get("doc_id")
    text_resp = legi_request("getBillText", {"id": doc_id})
    if not text_resp or "text" not in text_resp:
        logger.warning(f"Could not fetch text for {bill_id}")
        return None

    raw_b64 = text_resp["text"].get("doc", "")
    if not raw_b64:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        decoded = base64.b64decode(raw_b64)

        # Check the magic bytes to decide how to process the document.
        if decoded[:4] == b'%PDF':
            extracted_text = extract_text_from_pdf_bytes(decoded)
            # Require >100 chars so we don't accept a near-empty extraction.
            if extracted_text and len(extracted_text.strip()) > 100:
                logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF for {bill_id}")
                return extracted_text
            logger.warning(f"Extracted text too short or empty for {bill_id}")
            return None

        # Not a PDF: try to decode as HTML (shouldn't happen for these
        # cases, but just in case).
        try:
            from bs4 import BeautifulSoup
            html = decoded.decode("utf-8", errors="ignore")
            soup = BeautifulSoup(html, "html.parser")
            plain_text = soup.get_text(separator="\n", strip=True)
            if plain_text and len(plain_text.strip()) > 100:
                logger.info(f"Successfully extracted HTML text for {bill_id}")
                return plain_text
        except Exception:
            # Narrowed from a bare except: never swallow KeyboardInterrupt/SystemExit.
            pass

        logger.warning(f"Could not process document for {bill_id}")
        return None
    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None


def main(overwrite: bool | None = None):
    """Run the full fix pass.

    overwrite: None  -> interactive CLI prompt before replacing INPUT_FILE
               True  -> replace INPUT_FILE without asking
               False -> never replace INPUT_FILE
    """
    # Load the bills
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)

    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before touching anything
    logger.info(f"Creating backup at {BACKUP_FILE}")
    with open(BACKUP_FILE, 'w', encoding='utf-8') as f:
        json.dump(bills, f, indent=2)

    # Find indexes of bills whose text is still a raw PDF blob
    pdf_bills = [i for i, bill in enumerate(bills) if is_pdf_content(bill.get("text"))]
    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content")

    # Process each PDF bill
    fixed_count = 0
    failed_count = 0
    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill.get('state')} {bill.get('bill_number')}")

        fixed_text = fix_pdf_bill(bill)
        if fixed_text:
            # Update the bill with the fixed text
            bills[bill_idx]["text"] = fixed_text
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            bills[bill_idx]["text_fixed"] = True  # Mark that we fixed this
            fixed_count += 1
            logger.info(f"Successfully fixed bill {bill.get('bill_id')}")
        else:
            # Mark that we tried but failed
            bills[bill_idx]["text_extraction_failed"] = True
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            failed_count += 1
            logger.warning(f"Failed to fix bill {bill.get('bill_id')}")

        # Rate limiting between API round trips
        time.sleep(RATE_LIMIT)

        # Save progress every 50 bills so a crash loses little work
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
                json.dump(bills, f, indent=2)

    # Save final results
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(bills, f, indent=2)

    logger.info(f"Processing complete!")
    logger.info(f"Successfully fixed: {fixed_count} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved)
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline)
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")


if __name__ == "__main__":
    # If running under Streamlit / pipeline, we expect FIX_PDF_OVERWRITE in env:
    #   "yes", "y", "true", "1"  -> overwrite=True
    #   "no", "n", "false", "0"  -> overwrite=False
    # If it's not set, we fall back to CLI mode and ask via input().
    env_choice = os.getenv("FIX_PDF_OVERWRITE")
    if env_choice is None:
        # Local CLI run -> still interactive
        main(overwrite=None)
    else:
        choice = env_choice.strip().lower()
        if choice in ("yes", "y", "true", "1"):
            main(overwrite=True)
        elif choice in ("no", "n", "false", "0"):
            main(overwrite=False)
        else:
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{env_choice}', defaulting to no overwrite."
            )
            main(overwrite=False)