Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Files Community

ErrorMsgIdentifier / app.py

joycecast

Update app.py

d2d693b verified 12 months ago

raw

history blame

2.54 kB

	import gradio as gr
	import fitz # PyMuPDF
	import re
	import requests
	from io import BytesIO
	import pandas as pd

	def check_latest_section(pdf_url, identifiers_input, split_marker):
	    """Scan a remote PDF for ``Line# <n> <id>`` entries with given identifiers.

	    Args:
	        pdf_url: URL the PDF is downloaded from.
	        identifiers_input: Comma-separated Message Identifiers. Entries that
	            are not purely numeric after stripping whitespace are ignored.
	        split_marker: Optional marker. When it is non-blank and occurs in the
	            extracted text, only the text before its first occurrence is
	            searched; otherwise the whole document is searched.

	    Returns:
	        A ``(status, table)`` tuple. ``status`` is a human-readable message.
	        ``table`` is a DataFrame with the columns ``Line#`` and
	        ``Message Identifier`` sorted by ``Line#``, or ``None`` when nothing
	        matched or an earlier step failed.
	    """
	    # Step 1: Prepare identifiers (keep only purely numeric entries).
	    identifiers = [
	        token.strip()
	        for token in identifiers_input.split(",")
	        if token.strip().isdigit()
	    ]
	    if not identifiers:
	        return "❌ No valid Message Identifiers entered.", None

	    # Step 2: Download PDF. The timeout keeps a stalled server from hanging
	    # the UI forever. Failures are reported as a status message, not raised,
	    # because this function is the UI boundary.
	    try:
	        response = requests.get(pdf_url, timeout=30)
	        response.raise_for_status()
	    except Exception as e:
	        return f"❌ Failed to load PDF: {e}", None

	    # Step 3: Extract full text from PDF. The context manager closes the
	    # document even when text extraction fails part-way through.
	    try:
	        with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
	            full_text = "\n".join(page.get_text() for page in doc)
	    except Exception as e:
	        return f"❌ Failed to extract text: {e}", None

	    # Step 4: Split by user-defined marker (optional).
	    # NOTE(review): the "latest" block is taken to be everything before the
	    # first marker occurrence -- confirm this matches the PDF layout.
	    if split_marker.strip() and split_marker in full_text:
	        latest_block = full_text.partition(split_marker)[0]
	        note = f"✅ Found marker '{split_marker}', using the latest block."
	    else:
	        latest_block = full_text
	        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

	    # Step 5: Match Line# and Message Identifier.
	    # Join with a plain "|" so the identifiers are regex alternatives; an
	    # escaped "\|" would only match a literal pipe character. The trailing
	    # (?!\d) stops identifier "52" from matching the start of "523".
	    id_pattern = "|".join(re.escape(identifier) for identifier in identifiers)
	    regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})(?!\d)")

	    matches = [
	        {"Line#": int(line_num), "Message Identifier": msg_id}
	        for line_num, msg_id in regex.findall(latest_block)
	    ]
	    if not matches:
	        return note + " No matching Message Identifiers found.", None

	    df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
	    return note + " Matches found:", df

	# Gradio Interface: three text inputs are passed to check_latest_section,
	# and its (status, table) result fills a status textbox and a dataframe.
	_input_fields = [
	    gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
	    gr.Textbox(label="Message Identifier List", value="523"),
	    gr.Textbox(label="Split Marker (optional)", value="Record #"),
	]
	_output_fields = [
	    gr.Textbox(label="Status"),
	    gr.Dataframe(label="Matching Lines", type="pandas"),
	]

	demo = gr.Interface(
	    fn=check_latest_section,
	    inputs=_input_fields,
	    outputs=_output_fields,
	    title="PDF Line# Identifier Checker (Latest Only)",
	    description="Checks Line# entries with specified Message Identifiers. If a split marker is provided, only the latest section is used; otherwise, the full document is scanned.",
	)

	# Start the web app.
	demo.launch()