joycecast's picture
Update app.py
d2d693b verified
raw
history blame
2.54 kB
import gradio as gr
import fitz # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd
def check_latest_section(pdf_url: str, identifiers_input: str, split_marker: str) -> "tuple[str, pd.DataFrame | None]":
    """Download a PDF and list the ``Line#`` entries that carry given Message Identifiers.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated Message Identifiers. Entries that
            are not purely numeric are ignored.
        split_marker: Optional marker string. When it is non-blank and occurs
            in the extracted text, only the text before its first occurrence
            is scanned; otherwise the whole document is scanned.

    Returns:
        A ``(status, table)`` pair. ``status`` is a human-readable message.
        ``table`` is a DataFrame with columns ``"Line#"`` (int) and
        ``"Message Identifier"`` (str), sorted by ``"Line#"``, or ``None``
        when there was an error or nothing matched.
    """
    # Step 1: Prepare identifiers (tolerate None, drop duplicates, keep order)
    tokens = (token.strip() for token in (identifiers_input or "").split(","))
    identifiers = list(dict.fromkeys(token for token in tokens if token.isdigit()))
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None

    # Step 2: Download PDF
    try:
        # A timeout keeps a stalled server from hanging the request forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and every failure
        # should be reported to the user as a status message.
        return f"❌ Failed to load PDF: {str(e)}", None

    # Step 3: Extract full text from PDF
    try:
        # The context manager closes the document once the text is extracted.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text() for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}", None

    # Step 4: Split by user-defined marker (optional)
    marker = split_marker or ""
    if not marker.strip():
        latest_block = full_text
        note = "⚠️ No split marker provided. Using entire PDF content."
    elif marker in full_text:
        # NOTE(review): this keeps the text *before* the first marker
        # occurrence, which presumably is the newest section - confirm
        # against the actual PDF layout.
        latest_block = full_text.partition(marker)[0]
        note = f"✅ Found marker '{marker}', using the latest block."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{marker}' not found. Using entire PDF content."

    # Step 5: Match Line# and Message Identifier
    id_pattern = "|".join(re.escape(identifier) for identifier in identifiers)
    # The negative lookahead rejects partial hits, e.g. identifier "523"
    # must not match the leading digits of "5234".
    regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})(?!\d)")
    matches = [
        {"Line#": int(line_num), "Message Identifier": msg_id}
        for line_num, msg_id in regex.findall(latest_block)
    ]
    if not matches:
        return note + " No matching Message Identifiers found.", None

    df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
    return note + " Matches found:", df
# Gradio Interface
# Input widgets, listed in the positional order of check_latest_section's
# parameters: PDF URL, identifier list, split marker.
_input_components = [
    gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
    gr.Textbox(label="Message Identifier List", value="523"),
    gr.Textbox(label="Split Marker (optional)", value="Record #"),
]

# Output widgets: the status message followed by the table of matches.
_output_components = [
    gr.Textbox(label="Status"),
    gr.Dataframe(label="Matching Lines", type="pandas"),
]

demo = gr.Interface(
    fn=check_latest_section,
    inputs=_input_components,
    outputs=_output_components,
    title="PDF Line# Identifier Checker (Latest Only)",
    description=(
        "Checks Line# entries with specified Message Identifiers. "
        "If a split marker is provided, only the latest section is used; "
        "otherwise, the full document is scanned."
    ),
)

# Start the web app.
demo.launch()