import gradio as gr
import fitz  # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd

# Header that opens a numbered context block, e.g. "Line# 12".
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")

# Seconds to wait on the PDF server; without a timeout a stalled host would
# block the request handler indefinitely.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def _scan_for_identifiers(text, identifiers):
    """Collect ``(line_number, identifier)`` pairs found below ``Line#`` headers.

    Each non-blank line of *text* is stripped and inspected. A line that
    starts with ``Line# <digits>`` sets the current line number and is not
    itself searched. Every later line is searched for each identifier and
    attributed to the most recent header; lines before the first header are
    ignored.
    """
    # Compile once, outside the per-line loop. Lookarounds are used instead of
    # ``\b`` so identifiers that begin or end with a non-word character
    # (e.g. "#12") still match as standalone tokens; for purely alphanumeric
    # identifiers the two forms are equivalent.
    patterns = {
        ident: re.compile(rf"(?<!\w){re.escape(ident)}(?!\w)")
        for ident in identifiers
    }

    found = set()
    current_line = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header = _LINE_HEADER_RE.match(line)
        if header:
            current_line = int(header.group(1))
            continue

        if current_line is None:
            continue  # no Line# context yet, nothing to attribute a hit to

        for ident, pattern in patterns.items():
            if pattern.search(line):
                found.add((current_line, ident))
    return found


def check_latest_section(pdf_url, identifiers_input, split_marker):
    """Scan a PDF fetched from a URL for message identifiers under ``Line#`` headers.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers to look for. Blank
            entries are dropped and duplicates are collapsed.
        split_marker: Optional marker text. When it occurs in the extracted
            text, only the block between its first and second occurrence
            (or the end of the text) is scanned; otherwise the whole text is.

    Returns:
        A ``(status, table)`` tuple. ``status`` is a human-readable message.
        ``table`` is a DataFrame with columns ``"Line#"`` and
        ``"Message Identifier"`` sorted by both, or ``None`` when there are no
        matches or an error occurred. Errors are reported through ``status``
        rather than raised, so the UI always receives a displayable result.
    """
    # Step 1: Prepare identifiers (tolerate None from a cleared input field)
    identifiers = {
        token.strip()
        for token in (identifiers_input or "").split(",")
        if token.strip()
    }
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None

    # Step 2: Download PDF
    pdf_url = (pdf_url or "").strip()
    if not pdf_url:
        return "❌ Failed to load PDF: no URL provided.", None
    try:
        response = requests.get(pdf_url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:  # UI boundary: report any failure as a status
        return f"❌ Failed to load PDF: {e}", None

    # Step 3: Extract full text from PDF; the context manager closes the
    # document even if text extraction fails part-way through.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:  # UI boundary: report any failure as a status
        return f"❌ Failed to extract text: {e}", None

    # Step 4: Split by user-defined marker (optional)
    split_marker = split_marker or ""
    if not split_marker.strip():
        latest_block = full_text
        note = "⚠️ No split marker provided. Using entire PDF content."
    elif split_marker in full_text:
        # Index 1 is the first block *after* the marker; maxsplit=2 avoids
        # splitting the remainder of the document needlessly.
        latest_block = full_text.split(split_marker, 2)[1]
        note = f"✅ Found marker '{split_marker}', using the latest block."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 5: Track Line# context and find message identifiers below it
    matches = _scan_for_identifiers(latest_block, identifiers)
    if not matches:
        return note + " No matching Message Identifiers found.", None

    df = pd.DataFrame(sorted(matches), columns=["Line#", "Message Identifier"])
    return note + " Matches found:", df

# Gradio Interface
# Three text inputs are passed positionally to check_latest_section, whose
# (status, table) return value fills the two output components in order.
url_box = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
identifiers_box = gr.Textbox(
    label="Message Identifier List",
    value="523, P00, P02, 831",
)
marker_box = gr.Textbox(label="Split Marker (optional)", value="Record #")

status_box = gr.Textbox(label="Status")
matches_table = gr.Dataframe(label="Matching Lines", type="pandas")

demo = gr.Interface(
    fn=check_latest_section,
    inputs=[url_box, identifiers_box, marker_box],
    outputs=[status_box, matches_table],
    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
    description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow.",
)

# Start the web UI as soon as the script runs.
demo.launch()