Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Files Community

ErrorMsgIdentifier / app.py

joycecast

Update app.py

042f009 verified 10 months ago

raw

history blame

2.83 kB

	import gradio as gr
	import fitz # PyMuPDF
	import re
	import requests
	from io import BytesIO
	import pandas as pd

	# Upper bound (seconds) on the PDF download so a stalled server cannot hang the UI.
	_REQUEST_TIMEOUT_SECONDS = 30

	# Header that opens a numbered block, e.g. "Line# 12".
	_LINE_NUMBER_RE = re.compile(r"Line#\s+(\d+)")


	def _find_identifier_matches(text, identifiers):
	    """Return sorted, de-duplicated (line number, identifier) pairs found in *text*.

	    A line starting with ``Line# <n>`` sets the current line number; every
	    non-blank line after it (until the next header) is searched for each
	    identifier as a whole token. Header lines themselves are not searched,
	    and text before the first header is ignored.
	    """
	    # Compile once per identifier instead of once per (line, identifier).
	    # Lookarounds are used rather than \b so identifiers that begin or end
	    # with a non-word character (e.g. "A+") still match as whole tokens;
	    # for purely alphanumeric identifiers the behaviour equals \b...\b.
	    patterns = {
	        ident: re.compile(rf"(?<!\w){re.escape(ident)}(?!\w)")
	        for ident in identifiers
	    }

	    found = set()
	    current_line = None
	    for raw_line in text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue

	        header = _LINE_NUMBER_RE.match(line)
	        if header:
	            current_line = int(header.group(1))
	            continue

	        if current_line is None:
	            continue

	        for ident, pattern in patterns.items():
	            if pattern.search(line):
	                found.add((current_line, ident))

	    return sorted(found)


	def check_latest_section(pdf_url, identifiers_input, split_marker):
	    """Download a PDF and report which identifiers appear under which ``Line#``.

	    Args:
	        pdf_url: URL of the PDF to download.
	        identifiers_input: Comma-separated identifiers to look for.
	        split_marker: Optional marker text. When it occurs in the PDF text,
	            only the text between its first and second occurrence (or to
	            the end, if it occurs once) is scanned; otherwise the whole
	            text is scanned.

	    Returns:
	        A ``(status, table)`` tuple. ``status`` is a human-readable message;
	        ``table`` is a DataFrame with columns ``Line#`` and
	        ``Message Identifier``, or ``None`` when there is nothing to show
	        (invalid input, download/parse failure, or no matches).
	    """
	    # Tolerate None from the caller so blank fields yield a message, not a crash.
	    identifiers_input = identifiers_input or ""
	    split_marker = split_marker or ""
	    pdf_url = (pdf_url or "").strip()

	    # Step 1: Prepare identifiers
	    identifiers = [part.strip() for part in identifiers_input.split(",") if part.strip()]
	    if not identifiers:
	        return "❌ No valid Message Identifiers entered.", None

	    # Step 2: Download PDF (broad catch: any failure is reported in the UI)
	    try:
	        response = requests.get(pdf_url, timeout=_REQUEST_TIMEOUT_SECONDS)
	        response.raise_for_status()
	        pdf_bytes = BytesIO(response.content)
	    except Exception as e:
	        return f"❌ Failed to load PDF: {e}", None

	    # Step 3: Extract full text from PDF; the context manager closes the document
	    try:
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            full_text = "\n".join(page.get_text("text") for page in doc)
	    except Exception as e:
	        return f"❌ Failed to extract text: {e}", None

	    # Step 4: Split by user-defined marker (optional)
	    if not split_marker.strip():
	        # The marker is optional; a blank one is not a "marker not found" warning.
	        latest_block = full_text
	        note = "ℹ️ No split marker provided. Using entire PDF content."
	    elif split_marker in full_text:
	        # First block after the marker; maxsplit=2 avoids splitting the rest.
	        latest_block = full_text.split(split_marker, 2)[1]
	        note = f"✅ Found marker '{split_marker}', using the latest block."
	    else:
	        latest_block = full_text
	        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

	    # Step 5: Track Line# context and find message identifiers below it
	    matches = _find_identifier_matches(latest_block, identifiers)
	    if not matches:
	        return note + " No matching Message Identifiers found.", None

	    df = pd.DataFrame(matches, columns=["Line#", "Message Identifier"])
	    return note + " Matches found:", df

	# Build the Gradio UI: three text inputs feed check_latest_section, which
	# returns a status message and a table of matches.
	input_components = [
	    gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
	    gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
	    gr.Textbox(label="Split Marker (optional)", value="Record #"),
	]
	output_components = [
	    gr.Textbox(label="Status"),
	    gr.Dataframe(label="Matching Lines", type="pandas"),
	]

	demo = gr.Interface(
	    fn=check_latest_section,
	    inputs=input_components,
	    outputs=output_components,
	    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
	    description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow.",
	)

	# Start the web server for the interface.
	demo.launch()