# joycecast's picture
# Update app.py
# c080e37 verified
import gradio as gr
import fitz # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd
from datetime import datetime
def extract_first_datetime(pdf_url):
    """Return a status message describing the first datetime stamp in a PDF.

    The PDF is fetched from ``pdf_url`` with an HTTP GET, the plain text of
    every page is joined with newlines, and that text is searched for a stamp
    shaped like ``Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)``.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A human-readable string. On success it starts with "✅" and contains
        the matched stamp. Download errors, text-extraction errors and a
        missing stamp are each reported as a string starting with "❌"
        instead of being raised.
    """
    # Download the document. Failures are turned into a message because the
    # result is meant to be displayed, not handled programmatically.
    try:
        # The timeout keeps an unresponsive server from blocking forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {str(e)}"

    # Extract the text; the context manager closes the document afterwards.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}"

    # Pattern to match datetime format like
    # "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)"
    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
    match = re.search(datetime_pattern, full_text)
    if match:
        return f"✅ First datetime found: {match.group(0)}"
    return "❌ No datetime pattern found in the PDF"
# Stamp such as "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
_DATETIME_RE = re.compile(
    r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
)

# Header that starts a new entry, e.g. "Line# 12"; group 1 is the number.
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")

# Labels that may introduce a narrative message, tried in this order.
_NARRATIVE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Message:\s*(.+)',
        r'Narrative:\s*(.+)',
        r'Description:\s*(.+)',
        r'Note:\s*(.+)',
    )
)


def _select_block(full_text, split_marker, parts_index):
    """Return ``(block, note)``: the chosen marker-delimited part, or all text."""
    split_marker = split_marker or ""
    if not (split_marker.strip() and split_marker in full_text):
        return full_text, f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    parts = full_text.split(split_marker)
    # The marker occurs at least once, so ``parts`` has two or more items and
    # the fallback index 1 is always valid.
    try:
        index = int(parts_index)
    except (ValueError, TypeError, OverflowError):
        index = 1  # Default to 1 if invalid input
    else:
        if index < 0 or index >= len(parts):
            index = 1  # Default to 1 if out of range
    note = f"✅ Found marker '{split_marker}', using block {index} (0-indexed)."
    return parts[index], note


def _narrative_for(content):
    """Return the text after the first narrative label, else a 200-char preview."""
    narrative = ""
    for pattern in _NARRATIVE_RES:
        match = pattern.search(content)
        if match:
            narrative = match.group(1).strip()
            break
    if narrative:
        return narrative
    return content[:200] + "..." if len(content) > 200 else content


def _row_if_missing(line_number, entry_lines, identifiers):
    """Return a result row for an entry containing no identifier, else ``None``."""
    # Entries without a header or without any body text are not reported.
    if line_number is None or not entry_lines:
        return None
    content = " ".join(entry_lines)
    if any(identifier in content for identifier in identifiers):
        return None
    return {
        "Line#": line_number,
        # Always empty: a reported entry contains none of the identifiers, so
        # there is no identifier context to quote. The column is kept so the
        # table layout stays the same.
        "Identifier Message": "",
        "Narrative Message": _narrative_for(content),
        "Full Content": content,
    }


def _rows_missing_identifiers(block_text, identifiers):
    """Collect a row for every "Line# N" entry that lacks all identifiers."""
    rows = []
    current_line = None
    entry_lines = []
    for raw_line in block_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header = _LINE_HEADER_RE.match(line)
        if header:
            # A new header closes the previous entry.
            row = _row_if_missing(current_line, entry_lines, identifiers)
            if row is not None:
                rows.append(row)
            # Only the number is kept; any other text on the header line is
            # not part of the entry's content.
            current_line = int(header.group(1))
            entry_lines = []
        elif current_line is not None:
            # Text before the first header belongs to no entry and is skipped.
            entry_lines.append(line)
    # The last entry has no following header to close it.
    row = _row_if_missing(current_line, entry_lines, identifiers)
    if row is not None:
        rows.append(row)
    return rows


def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report the "Line# N" entries of a PDF that contain none of the identifiers.

    The PDF is downloaded, the plain text of every page is joined with
    newlines, and the first datetime stamp in that text is looked up. If
    ``split_marker`` is non-blank and occurs in the text, the text is split on
    it and the part at ``parts_index`` is analysed; otherwise the whole text
    is. Within the analysed text every line matching ``Line#`` followed by a
    number starts an entry, and the non-blank lines after it, joined by
    spaces, form its content.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers. Each is matched as a
            plain substring of an entry's content, so "628" also matches
            inside "16280".
        split_marker: Text to split the PDF content on.
        parts_index: 0-based index of the split part to analyse. Values that
            cannot be converted to an int, or that are out of range, fall
            back to 1.

    Returns:
        A ``(status, table, datetime_status)`` tuple. ``status`` is a
        human-readable message. ``table`` is a ``pandas.DataFrame`` with the
        columns "Line#", "Identifier Message", "Narrative Message" and
        "Full Content", one row per entry lacking every identifier, or
        ``None`` when there is no such entry or an error occurred.
        ``datetime_status`` describes the first datetime stamp found, and is
        ``""`` when the function returns before the PDF text is available.
        Download and extraction errors are reported in ``status`` rather
        than raised.
    """
    # Step 1: Prepare identifiers
    identifiers = [
        token.strip()
        for token in (identifiers_input or "").split(',')
        if token.strip()
    ]
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    # Step 2: Download PDF
    try:
        # The timeout keeps an unresponsive server from blocking forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {str(e)}", None, ""

    # Step 3: Extract full text; the context manager closes the document.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}", None, ""

    # Step 4: Extract datetime first (searched in the full text, not the block)
    datetime_match = _DATETIME_RE.search(full_text)
    if datetime_match:
        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
    else:
        datetime_result = "❌ No datetime pattern found in the PDF"

    # Step 5: Split by user-defined marker (optional)
    latest_block, note = _select_block(full_text, split_marker, parts_index)

    # Step 6: Find entries that don't have any of the specified identifiers
    missing_identifiers_data = _rows_missing_identifiers(latest_block, identifiers)
    if not missing_identifiers_data:
        return note + f" All lines contain at least one of the identifiers: {', '.join(identifiers)}.", None, datetime_result

    df = pd.DataFrame(missing_identifiers_data)
    return note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({', '.join(identifiers)}):", df, datetime_result
# Gradio user interface: one form that feeds check_latest_section and shows
# the three values it returns.
with gr.Blocks(title="PDF Analysis Tool") as demo:
    gr.Markdown("# PDF Analysis Tool")
    gr.Markdown("## PDF Analysis and Datetime Extraction")

    # Inputs, in the order of check_latest_section's parameters.
    with gr.Row():
        pdf_url = gr.Textbox(
            label="PDF URL",
            placeholder="https://.../yourfile.pdf",
        )
        identifiers_input = gr.Textbox(
            label="Identifier List",
            value="628, 995",
            placeholder="Enter identifiers separated by commas",
        )
        split_marker = gr.Textbox(
            label="Split Marker (optional)",
            value="Record #",
        )
        parts_index = gr.Number(
            label="Parts Index",
            value=1,
            minimum=0,
            step=1,
        )

    # Button that starts the analysis.
    with gr.Row():
        check_btn = gr.Button("Analyze PDF")

    # Text outputs: overall status and the datetime lookup result.
    with gr.Row():
        result_text = gr.Textbox(label="Status")
        datetime_result = gr.Textbox(label="Datetime Result")

    # Table output: entries that contain none of the identifiers.
    with gr.Row():
        result_df = gr.Dataframe(
            label="Lines Missing All Identifiers",
            type="pandas",
        )

    # The outputs list follows the (status, table, datetime) order of the
    # tuple returned by check_latest_section.
    check_btn.click(
        fn=check_latest_section,
        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
        outputs=[result_text, result_df, datetime_result],
    )

demo.launch()