Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Files Community

ErrorMsgIdentifier / app.py

joycecast

Update app.py

c080e37 verified 8 months ago

raw

history blame contribute delete

9.51 kB

	import gradio as gr
	import fitz # PyMuPDF
	import re
	import requests
	from io import BytesIO
	import pandas as pd
	from datetime import datetime

	def extract_first_datetime(pdf_url, timeout=30):
	    """Return a status message naming the first datetime stamp found in a PDF.

	    The PDF is downloaded from ``pdf_url``, its text is extracted page by page
	    with PyMuPDF, and the text is searched for a stamp shaped like
	    ``Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)``.

	    Args:
	        pdf_url: URL of the PDF to download.
	        timeout: Seconds to wait on the HTTP request. Without a timeout
	            ``requests.get`` can block indefinitely on a stalled server.

	    Returns:
	        A string. On success it is the matched stamp prefixed with
	        ``✅ First datetime found:``. Otherwise it is a ``❌``-prefixed message
	        for a download failure, a text-extraction failure, or no match.
	    """
	    # Failures are reported as text instead of raised, so the broad
	    # ``except Exception`` clauses below are deliberate.
	    try:
	        response = requests.get(pdf_url, timeout=timeout)
	        response.raise_for_status()
	        pdf_bytes = BytesIO(response.content)
	    except Exception as e:
	        return f"❌ Failed to load PDF: {e}"

	    try:
	        # The context manager closes the document once the text is collected.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            full_text = "\n".join(page.get_text("text") for page in doc)
	    except Exception as e:
	        return f"❌ Failed to extract text: {e}"

	    # Matches e.g. "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
	    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'

	    match = re.search(datetime_pattern, full_text)
	    if match is None:
	        return "❌ No datetime pattern found in the PDF"
	    return f"✅ First datetime found: {match.group(0)}"

	# Labels that introduce a narrative message inside a Line# entry, tried in order.
	_NARRATIVE_PATTERNS = (
	    r'Message:\s*(.+)',
	    r'Narrative:\s*(.+)',
	    r'Description:\s*(.+)',
	    r'Note:\s*(.+)',
	)

	# Matches e.g. "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
	_DATETIME_STAMP_PATTERN = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'

	# Header that starts a new entry, e.g. "Line# 12".
	_LINE_HEADER_PATTERN = r"Line#\s+(\d+)"


	def _extract_narrative(content):
	    """Return the text after the first narrative label, else a 200-char preview."""
	    narrative = ""
	    for pattern in _NARRATIVE_PATTERNS:
	        match = re.search(pattern, content, re.IGNORECASE)
	        if match:
	            narrative = match.group(1).strip()
	            break
	    if narrative:
	        return narrative
	    # No label (or only whitespace after it): fall back to a truncated preview.
	    return content[:200] + "..." if len(content) > 200 else content


	def _append_if_missing(records, line_number, content_lines, identifiers):
	    """Append a record for one Line# entry when it contains none of the identifiers."""
	    # Entries with no number yet, or with no collected content, are never reported.
	    if line_number is None or not content_lines:
	        return
	    content = " ".join(content_lines)
	    if any(identifier in content for identifier in identifiers):
	        return
	    records.append({
	        "Line#": line_number,
	        # A reported entry contains no identifier by construction, so there is
	        # no surrounding context to quote; the column is kept so the output
	        # table keeps its original shape.
	        "Identifier Message": "",
	        "Narrative Message": _extract_narrative(content),
	        "Full Content": content,
	    })


	def _collect_lines_missing_identifiers(block_text, identifiers):
	    """Group ``block_text`` by ``Line# N`` headers and return entries lacking every identifier."""
	    records = []
	    current_line = None
	    content_lines = []

	    for raw_line in block_text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue

	        header = re.match(_LINE_HEADER_PATTERN, line)
	        if header:
	            # A new header closes the previous entry. Text that follows the
	            # number on the header line itself is not added to the content.
	            _append_if_missing(records, current_line, content_lines, identifiers)
	            current_line = int(header.group(1))
	            content_lines = []
	        elif current_line is not None:
	            content_lines.append(line)

	    # The final entry has no following header to trigger its check.
	    _append_if_missing(records, current_line, content_lines, identifiers)
	    return records


	def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index, timeout=30):
	    """Report the ``Line#`` entries of a PDF section that contain none of the identifiers.

	    The PDF is downloaded and converted to text. The text is optionally split
	    on ``split_marker`` and one part is selected by ``parts_index``. Within the
	    selected text, lines are grouped under ``Line# N`` headers, and every group
	    whose joined content contains none of the identifiers (plain substring
	    test) is reported.

	    Args:
	        pdf_url: URL of the PDF to download.
	        identifiers_input: Comma-separated identifiers; blanks are ignored.
	        split_marker: Text to split the PDF content on. If it is blank or not
	            present in the text, the whole text is analysed.
	        parts_index: 0-based index of the part to analyse after splitting.
	            Non-numeric or out-of-range values fall back to 1.
	        timeout: Seconds to wait on the HTTP request. Without a timeout
	            ``requests.get`` can block indefinitely on a stalled server.

	    Returns:
	        A ``(status, table, datetime_status)`` tuple. ``status`` is a message
	        string. ``table`` is a pandas DataFrame with the columns ``Line#``,
	        ``Identifier Message``, ``Narrative Message`` and ``Full Content``, or
	        ``None`` when nothing is reported or an error occurred.
	        ``datetime_status`` describes the first datetime stamp found in the
	        PDF text and is ``""`` when the function returns before searching.
	    """
	    # Treat a missing value like an empty text box instead of raising.
	    identifiers_input = identifiers_input or ""
	    split_marker = split_marker or ""

	    # Step 1: Prepare identifiers
	    identifiers = [token.strip() for token in identifiers_input.split(',') if token.strip()]
	    if not identifiers:
	        return "❌ No valid Identifiers entered.", None, ""

	    # Step 2: Download PDF. Failures are reported as text, so the broad
	    # ``except Exception`` clauses here and in step 3 are deliberate.
	    try:
	        response = requests.get(pdf_url, timeout=timeout)
	        response.raise_for_status()
	        pdf_bytes = BytesIO(response.content)
	    except Exception as e:
	        return f"❌ Failed to load PDF: {e}", None, ""

	    # Step 3: Extract full text from PDF; the context manager closes the document.
	    try:
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            full_text = "\n".join(page.get_text("text") for page in doc)
	    except Exception as e:
	        return f"❌ Failed to extract text: {e}", None, ""

	    # Step 4: Extract the first datetime stamp from the whole document
	    datetime_match = re.search(_DATETIME_STAMP_PATTERN, full_text)
	    if datetime_match:
	        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
	    else:
	        datetime_result = "❌ No datetime pattern found in the PDF"

	    # Step 5: Split by user-defined marker (optional)
	    if split_marker.strip() and split_marker in full_text:
	        parts = full_text.split(split_marker)
	        # The marker occurs at least once, so parts[1] always exists and is a
	        # safe fallback for invalid or out-of-range input.
	        try:
	            parts_index = int(parts_index)
	        except (ValueError, TypeError, OverflowError):
	            parts_index = 1
	        else:
	            if not 0 <= parts_index < len(parts):
	                parts_index = 1

	        latest_block = parts[parts_index]
	        note = f"✅ Found marker '{split_marker}', using block {parts_index} (0-indexed)."
	    else:
	        latest_block = full_text
	        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

	    # Step 6: Collect Line# entries that contain none of the identifiers
	    missing_identifiers_data = _collect_lines_missing_identifiers(latest_block, identifiers)

	    if not missing_identifiers_data:
	        return note + f" All lines contain at least one of the identifiers: {', '.join(identifiers)}.", None, datetime_result

	    df = pd.DataFrame(missing_identifiers_data)
	    return note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({', '.join(identifiers)}):", df, datetime_result

	# Gradio front end: one form that runs check_latest_section on a PDF URL.
	with gr.Blocks(title="PDF Analysis Tool") as demo:
	    gr.Markdown("# PDF Analysis Tool")

	    gr.Markdown("## PDF Analysis and Datetime Extraction")

	    # Inputs, in the order check_latest_section expects them.
	    with gr.Row():
	        pdf_url = gr.Textbox(
	            label="PDF URL",
	            placeholder="https://.../yourfile.pdf",
	        )
	        identifiers_input = gr.Textbox(
	            label="Identifier List",
	            value="628, 995",
	            placeholder="Enter identifiers separated by commas",
	        )
	        split_marker = gr.Textbox(
	            label="Split Marker (optional)",
	            value="Record #",
	        )
	        parts_index = gr.Number(
	            label="Parts Index",
	            value=1,
	            minimum=0,
	            step=1,
	        )

	    with gr.Row():
	        check_btn = gr.Button("Analyze PDF")

	    # Text outputs: overall status and the first datetime stamp found.
	    with gr.Row():
	        result_text = gr.Textbox(label="Status")
	        datetime_result = gr.Textbox(label="Datetime Result")

	    # Table output: one row per Line# entry that lacks every identifier.
	    with gr.Row():
	        result_df = gr.Dataframe(
	            label="Lines Missing All Identifiers",
	            type="pandas",
	        )

	    # The function's three return values map onto the outputs in this order.
	    check_btn.click(
	        fn=check_latest_section,
	        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
	        outputs=[result_text, result_df, datetime_result],
	    )

	demo.launch()