"""Gradio tool that scans a PDF for 'Line#' entries lacking given identifiers.

The PDF is downloaded from a URL, its text is extracted with PyMuPDF, and the
selected section of the text is searched for ``Line# <n>`` entries whose
content contains none of the user-supplied identifiers.
"""

import re
from datetime import datetime
from io import BytesIO

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import requests

# Upper bound (seconds) for the PDF download so a stalled server cannot hang
# the UI callback forever.
REQUEST_TIMEOUT_SECONDS = 30

# Matches timestamps such as
# "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
DATETIME_PATTERN = re.compile(
    r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
)

# Header that starts a new entry, e.g. "Line# 12".
LINE_HEADER_PATTERN = re.compile(r"Line#\s+(\d+)")

# Labels that may introduce a narrative message; tried in this order and the
# first pattern that matches wins.
NARRATIVE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Message:\s*(.+)',
        r'Narrative:\s*(.+)',
        r'Description:\s*(.+)',
        r'Note:\s*(.+)',
    )
)

# Number of characters of raw content shown when no narrative label is found.
NARRATIVE_PREVIEW_LENGTH = 200

# Block used when the requested parts index is missing, invalid or out of range.
DEFAULT_PARTS_INDEX = 1


def _load_pdf_text(pdf_url):
    """Download a PDF and return ``(full_text, error_message)``.

    Exactly one element of the returned pair is ``None``: on success the
    error is ``None``; on failure the text is ``None`` and the error is a
    user-facing message.
    """
    # Broad catches are deliberate: this is the UI boundary, and every
    # failure must become a status message instead of an unhandled error.
    try:
        response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return None, f"❌ Failed to load PDF: {e}"

    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return None, f"❌ Failed to extract text: {e}"

    return full_text, None


def _describe_first_datetime(full_text):
    """Return a status message describing the first timestamp in the text."""
    match = DATETIME_PATTERN.search(full_text)
    if match:
        return f"✅ First datetime found: {match.group(0)}"
    return "❌ No datetime pattern found in the PDF"


def _parse_identifiers(identifiers_input):
    """Split a comma-separated string into non-empty, stripped identifiers."""
    tokens = (token.strip() for token in (identifiers_input or "").split(','))
    return [token for token in tokens if token]


def _select_block(full_text, split_marker, parts_index):
    """Pick the section of the text to analyse; return ``(block, note)``.

    When the marker is blank or absent, the whole text is used. Otherwise
    the text is split on the marker and the piece at ``parts_index`` is
    used, falling back to ``DEFAULT_PARTS_INDEX`` for an invalid or
    out-of-range index.
    """
    marker = split_marker or ""
    if not marker.strip():
        return full_text, "⚠️ No split marker provided. Using entire PDF content."
    if marker not in full_text:
        return full_text, f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # The marker occurs at least once, so there are at least two parts and
    # DEFAULT_PARTS_INDEX (1) is always a valid fallback.
    parts = full_text.split(marker)
    try:
        index = int(parts_index)
    except (ValueError, TypeError, OverflowError):
        index = DEFAULT_PARTS_INDEX
    else:
        if not 0 <= index < len(parts):
            index = DEFAULT_PARTS_INDEX

    return parts[index], f"✅ Found marker '{split_marker}', using block {index} (0-indexed)."


def _iter_line_entries(block_text):
    """Yield ``(line_number, content)`` for each 'Line#' entry in the text.

    The content is the entry's non-blank lines (those after the header and
    before the next header), stripped and joined with single spaces.

    NOTE: text on the same line as the ``Line# <n>`` header is discarded,
    and entries with no content lines are not yielded.
    """
    current_line = None
    collected = []
    for raw_line in block_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header = LINE_HEADER_PATTERN.match(line)
        if header:
            if current_line is not None and collected:
                yield current_line, " ".join(collected)
            current_line = int(header.group(1))
            collected = []
        elif current_line is not None:
            collected.append(line)

    # Flush the final entry, which has no following header to trigger it.
    if current_line is not None and collected:
        yield current_line, " ".join(collected)


def _extract_narrative(content):
    """Return the text after the first matching narrative label.

    Falls back to a preview of the content (truncated with "..." beyond
    ``NARRATIVE_PREVIEW_LENGTH`` characters) when no label matches or the
    captured text is blank.
    """
    narrative = ""
    for pattern in NARRATIVE_PATTERNS:
        match = pattern.search(content)
        if match:
            narrative = match.group(1).strip()
            break
    if narrative:
        return narrative
    if len(content) > NARRATIVE_PREVIEW_LENGTH:
        return content[:NARRATIVE_PREVIEW_LENGTH] + "..."
    return content


def extract_first_datetime(pdf_url):
    """Extract the first datetime value from the PDF at ``pdf_url``.

    Returns:
        A status string: the first timestamp found, a "not found" message,
        or a download/extraction error message.
    """
    full_text, error = _load_pdf_text(pdf_url)
    if error is not None:
        return error
    return _describe_first_datetime(full_text)


def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report 'Line#' entries that contain none of the given identifiers.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers; an entry passes when
            any identifier occurs as a substring of its content.
        split_marker: Optional text used to split the PDF into blocks.
        parts_index: 0-based index of the block to analyse after splitting.

    Returns:
        A ``(status, dataframe, datetime_status)`` tuple. ``dataframe`` is
        ``None`` on error or when every entry contains an identifier;
        ``datetime_status`` is ``""`` when processing stops before the text
        has been extracted.
    """
    identifiers = _parse_identifiers(identifiers_input)
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    full_text, error = _load_pdf_text(pdf_url)
    if error is not None:
        return error, None, ""

    datetime_result = _describe_first_datetime(full_text)
    latest_block, note = _select_block(full_text, split_marker, parts_index)

    missing_identifiers_data = [
        {
            "Line#": line_number,
            # Always empty: a reported entry contains no identifier by
            # definition, so there is no surrounding context to show. The
            # column is kept so the output table keeps its shape.
            "Identifier Message": "",
            "Narrative Message": _extract_narrative(content),
            "Full Content": content,
        }
        for line_number, content in _iter_line_entries(latest_block)
        if not any(identifier in content for identifier in identifiers)
    ]

    identifier_list = ", ".join(identifiers)
    if not missing_identifiers_data:
        return (
            note + f" All lines contain at least one of the identifiers: {identifier_list}.",
            None,
            datetime_result,
        )

    df = pd.DataFrame(missing_identifiers_data)
    return (
        note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({identifier_list}):",
        df,
        datetime_result,
    )


# Gradio Interface
with gr.Blocks(title="PDF Analysis Tool") as demo:
    gr.Markdown("# PDF Analysis Tool")
    gr.Markdown("## PDF Analysis and Datetime Extraction")

    with gr.Row():
        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
        identifiers_input = gr.Textbox(
            label="Identifier List",
            value="628, 995",
            placeholder="Enter identifiers separated by commas",
        )
        split_marker = gr.Textbox(label="Split Marker (optional)", value="Record #")
        parts_index = gr.Number(label="Parts Index", value=1, minimum=0, step=1)

    with gr.Row():
        check_btn = gr.Button("Analyze PDF")

    with gr.Row():
        result_text = gr.Textbox(label="Status")
        datetime_result = gr.Textbox(label="Datetime Result")

    with gr.Row():
        result_df = gr.Dataframe(label="Lines Missing All Identifiers", type="pandas")

    check_btn.click(
        fn=check_latest_section,
        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
        outputs=[result_text, result_df, datetime_result],
    )

demo.launch()