# Hugging Face Space source file (Space status: Sleeping; file size: 9,506 bytes).
# The per-line commit hashes and the 1-226 line-number gutter emitted by the
# web file viewer were page residue, not part of the program.
import gradio as gr
import fitz # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd
from datetime import datetime
def extract_first_datetime(pdf_url):
    """Return a status message with the first datetime stamp found in a PDF.

    The PDF is downloaded from ``pdf_url``, the text of every page is joined
    with newlines, and that text is searched for a stamp shaped like
    ``Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)``.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A human-readable string: the matched stamp on success, or a message
        describing the download failure, the text-extraction failure, or the
        absence of any match. Failures are reported through the return value
        instead of being raised.
    """
    try:
        # A timeout keeps an unresponsive server from blocking the caller forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        return f"❌ Failed to load PDF: {e}"

    try:
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}"

    # Pattern to match datetime format like
    # "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)"
    datetime_pattern = (
        r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}'
        r'\s+GMT[+-]\d{4}\s+\([^)]+\)'
    )
    match = re.search(datetime_pattern, full_text)
    if match:
        return f"✅ First datetime found: {match.group(0)}"
    return "❌ No datetime pattern found in the PDF"
# Stamp shaped like "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
_SECTION_DATETIME_RE = re.compile(
    r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}'
    r'\s+GMT[+-]\d{4}\s+\([^)]+\)'
)
# Header that opens a new entry, e.g. "Line# 12".
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")
# Labels tried in order when looking for an entry's narrative text.
_NARRATIVE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Message:\s*(.+)',
        r'Narrative:\s*(.+)',
        r'Description:\s*(.+)',
        r'Note:\s*(.+)',
    )
)
# Length of the content preview used when no narrative label is present.
_NARRATIVE_PREVIEW_CHARS = 200


def _iter_line_entries(block_text):
    """Yield ``(line_number, content)`` for each "Line# N" entry in the text.

    Blank lines are dropped, text before the first header is ignored, and an
    entry whose header is followed by no content lines is not yielded.
    """
    current_line = None
    buffered = []
    for raw in block_text.splitlines():
        line = raw.strip()
        if not line:
            continue
        header = _LINE_HEADER_RE.match(line)
        if header:
            if current_line is not None and buffered:
                yield current_line, " ".join(buffered)
            current_line = int(header.group(1))
            buffered = []
        elif current_line is not None:
            buffered.append(line)
    # The final entry has no following header to flush it.
    if current_line is not None and buffered:
        yield current_line, " ".join(buffered)


def _narrative_for(content):
    """Return the text after the first narrative label, else a content preview."""
    narrative = ""
    for pattern in _NARRATIVE_RES:
        match = pattern.search(content)
        if match:
            narrative = match.group(1).strip()
            break
    if narrative:
        return narrative
    if len(content) > _NARRATIVE_PREVIEW_CHARS:
        return content[:_NARRATIVE_PREVIEW_CHARS] + "..."
    return content


def _rows_missing_identifiers(block_text, identifiers):
    """Build one row per entry whose content contains none of the identifiers."""
    return [
        {
            "Line#": line_number,
            # Reported entries contain no identifier by definition, so there is
            # never identifier context to show; the column is kept so the
            # table layout stays the same for consumers.
            "Identifier Message": "",
            "Narrative Message": _narrative_for(content),
            "Full Content": content,
        }
        for line_number, content in _iter_line_entries(block_text)
        if not any(identifier in content for identifier in identifiers)
    ]


def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report the "Line# N" entries of a PDF section that lack every identifier.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers; surrounding whitespace
            is stripped and empty items are ignored.
        split_marker: Optional text used to split the PDF text into blocks.
            When it is blank or absent from the text, the whole text is used.
        parts_index: 0-based index of the block to analyse after splitting.
            Values that are not integers or are out of range fall back to 1.

    Returns:
        A ``(status, table, datetime_status)`` tuple. ``status`` is a
        human-readable message. ``table`` is a ``pandas.DataFrame`` with the
        columns "Line#", "Identifier Message", "Narrative Message" and
        "Full Content", or ``None`` when nothing is reported or an error
        occurred. ``datetime_status`` describes the first datetime stamp found
        in the full text and is ``""`` when the PDF could not be read.
        Failures are reported through ``status`` instead of being raised.
    """
    # Step 1: Prepare identifiers
    identifiers = [
        item.strip() for item in (identifiers_input or "").split(',') if item.strip()
    ]
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    # Step 2: Download PDF
    try:
        # A timeout keeps an unresponsive server from blocking the UI forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None, ""

    # Step 3: Extract full text from PDF
    try:
        # The context manager closes the document even if a page fails to parse.
        with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}", None, ""

    # Step 4: Extract datetime first (searched in the whole text, not the block)
    datetime_match = _SECTION_DATETIME_RE.search(full_text)
    if datetime_match:
        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
    else:
        datetime_result = "❌ No datetime pattern found in the PDF"

    # Step 5: Split by user-defined marker (optional)
    marker = split_marker or ""
    if marker.strip() and marker in full_text:
        parts = full_text.split(marker)
        try:
            index = int(parts_index)
        except (ValueError, TypeError, OverflowError):
            index = 1  # Default to 1 if invalid input
        if not 0 <= index < len(parts):
            # The marker occurs at least once, so block 1 always exists.
            index = 1
        latest_block = parts[index]
        note = f"✅ Found marker '{marker}', using block {index} (0-indexed)."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{marker}' not found. Using entire PDF content."

    # Step 6: Collect Line# entries that don't have any of the specified identifiers
    missing_identifiers_data = _rows_missing_identifiers(latest_block, identifiers)
    joined_identifiers = ', '.join(identifiers)
    if not missing_identifiers_data:
        return (
            note + f" All lines contain at least one of the identifiers: {joined_identifiers}.",
            None,
            datetime_result,
        )

    df = pd.DataFrame(missing_identifiers_data)
    return (
        note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({joined_identifiers}):",
        df,
        datetime_result,
    )
# Gradio Interface
# Builds the page: inputs, an "Analyze PDF" button, and three outputs that
# receive the (status, table, datetime_status) tuple from check_latest_section.
# NOTE(review): the grouping of components into rows is reconstructed from the
# statement order — confirm against the intended layout.
with gr.Blocks(title="PDF Analysis Tool") as demo:
    gr.Markdown("# PDF Analysis Tool")
    gr.Markdown("## PDF Analysis and Datetime Extraction")

    with gr.Row():
        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
        identifiers_input = gr.Textbox(
            label="Identifier List",
            value="628, 995",
            placeholder="Enter identifiers separated by commas",
        )
        split_marker = gr.Textbox(label="Split Marker (optional)", value="Record #")
        parts_index = gr.Number(label="Parts Index", value=1, minimum=0, step=1)

    with gr.Row():
        check_btn = gr.Button("Analyze PDF")

    with gr.Row():
        result_text = gr.Textbox(label="Status")
        datetime_result = gr.Textbox(label="Datetime Result")

    with gr.Row():
        result_df = gr.Dataframe(label="Lines Missing All Identifiers", type="pandas")

    # Output order must match the tuple returned by check_latest_section.
    check_btn.click(
        fn=check_latest_section,
        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
        outputs=[result_text, result_df, datetime_result],
    )

# Start the server only when run as a script, so importing this module
# (for reuse or testing) does not launch the app.
if __name__ == "__main__":
    demo.launch()