joycecast's picture
Update app.py
042f009 verified
raw
history blame
2.83 kB
import gradio as gr
import fitz # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd
def check_latest_section(pdf_url, identifiers_input, split_marker):
    """Report which message identifiers appear under each ``Line#`` header of a PDF.

    The PDF at *pdf_url* is downloaded, its text is extracted page by page,
    and (optionally) narrowed to one section delimited by *split_marker*.
    The remaining text is scanned line by line: a line starting with
    ``Line# <digits>`` sets the current line number, and every following
    line is searched for each requested identifier as a whole word.

    Args:
        pdf_url: HTTP(S) address of the PDF to scan.
        identifiers_input: Comma-separated identifiers, e.g. ``"523, P00"``.
            Surrounding whitespace and empty entries are ignored.
        split_marker: Optional section delimiter. When it is non-blank and
            occurs in the text, only the text between its first and second
            occurrence is scanned; otherwise the whole document is scanned.

    Returns:
        A ``(status, table)`` pair. ``status`` is a human-readable message.
        ``table`` is a DataFrame with columns ``"Line#"`` and
        ``"Message Identifier"`` (sorted, de-duplicated), or ``None`` when
        nothing matched or an error occurred. Errors are reported through
        ``status`` rather than raised, since this is a UI callback.
    """
    # Defensive: a programmatic caller may pass None instead of "".
    identifiers_input = identifiers_input or ""
    split_marker = split_marker or ""

    # Step 1: Prepare identifiers (trimmed, blanks dropped, de-duplicated).
    identifiers = {
        token
        for token in (raw.strip() for raw in identifiers_input.split(","))
        if token
    }
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None

    # Step 2: Download PDF. The timeout keeps a stalled server from
    # blocking the handler indefinitely (requests has no default timeout).
    try:
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:  # UI boundary: surface any failure as a message.
        return f"❌ Failed to load PDF: {str(e)}", None

    # Step 3: Extract full text from PDF. The context manager closes the
    # document even if text extraction fails part-way through.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:  # UI boundary: surface any failure as a message.
        return f"❌ Failed to extract text: {str(e)}", None

    # Step 4: Split by user-defined marker (optional).
    if split_marker.strip() and split_marker in full_text:
        # Text between the first and second marker occurrence.
        # NOTE(review): the status text calls this the "latest" block, which
        # presumes the newest record comes first in the PDF -- confirm.
        latest_block = full_text.split(split_marker, 2)[1]
        note = f"✅ Found marker '{split_marker}', using the latest block."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 5: Track Line# context and find message identifiers below it.
    # Patterns are compiled once instead of once per (line, identifier) pair.
    line_header = re.compile(r"Line#\s+(\d+)")
    id_patterns = [
        (ident, re.compile(rf"\b{re.escape(ident)}\b")) for ident in identifiers
    ]

    matches_set = set()
    current_line = None
    for raw_line in latest_block.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header = line_header.match(line)
        if header:
            current_line = int(header.group(1))
            continue

        # Identifiers seen before the first Line# header have no context.
        if current_line is None:
            continue

        for ident, pattern in id_patterns:
            if pattern.search(line):
                matches_set.add((current_line, ident))

    if not matches_set:
        return note + " No matching Message Identifiers found.", None

    df = pd.DataFrame(sorted(matches_set), columns=["Line#", "Message Identifier"])
    return note + " Matches found:", df
# Gradio UI wiring: three text inputs feed check_latest_section, whose
# (status, table) result is shown in a textbox and a dataframe.
_input_components = [
    gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
    gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
    gr.Textbox(label="Split Marker (optional)", value="Record #"),
]
_output_components = [
    gr.Textbox(label="Status"),
    gr.Dataframe(label="Matching Lines", type="pandas"),
]

demo = gr.Interface(
    fn=check_latest_section,
    inputs=_input_components,
    outputs=_output_components,
    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
    description=(
        "Scans a PDF from URL, tracks Line# blocks and matches identifiers "
        "in the lines that follow."
    ),
)

# Start the web server as soon as the module is executed.
demo.launch()