Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import re | |
| import requests | |
| from io import BytesIO | |
| import pandas as pd | |
| from datetime import datetime | |
def extract_first_datetime(pdf_url):
    """Download a PDF and report the first timestamp found in its text.

    The timestamp format searched for looks like
    ``Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)``.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A status string for display. It starts with a check mark and the
        matched timestamp on success, or with a cross mark and a reason when
        the download fails, text extraction fails, or no timestamp is present.
        The function never raises; every failure is reported in the string.
    """
    # The broad catches below are deliberate: this function feeds a UI text
    # box, so any failure is turned into a readable message instead of a
    # traceback.
    try:
        # A timeout keeps an unresponsive server from blocking the caller
        # forever (requests waits indefinitely by default).
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}"

    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}"

    # Weekday, month, day, year, HH:MM:SS, GMT offset, then the zone name in
    # parentheses.
    datetime_pattern = (
        r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}'
        r'\s+GMT[+-]\d{4}\s+\([^)]+\)'
    )
    match = re.search(datetime_pattern, full_text)
    if match:
        return f"✅ First datetime found: {match.group(0)}"
    return "❌ No datetime pattern found in the PDF"
# Timestamp such as "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
_SECTION_DATETIME_RE = re.compile(
    r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}'
    r'\s+GMT[+-]\d{4}\s+\([^)]+\)'
)

# Header that starts a new numbered entry, e.g. "Line# 12".
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")

# Labels tried in order when looking for an entry's narrative text.
_NARRATIVE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Message:\s*(.+)',
        r'Narrative:\s*(.+)',
        r'Description:\s*(.+)',
        r'Note:\s*(.+)',
    )
)

# Length of the fallback narrative preview when no label is found.
_NARRATIVE_PREVIEW_CHARS = 200

# Seconds to wait for the PDF server before giving up.
_SECTION_DOWNLOAD_TIMEOUT = 30


def _select_block(full_text, split_marker, parts_index):
    """Return ``(block, note)``: the text to scan and a note about the marker."""
    if not (split_marker.strip() and split_marker in full_text):
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
        return full_text, note

    parts = full_text.split(split_marker)
    # The marker occurs at least once, so ``parts`` has two or more items and
    # the fallback index 1 is always valid.
    try:
        index = int(parts_index)
    except (ValueError, TypeError, OverflowError):
        index = 1
    else:
        if not 0 <= index < len(parts):
            index = 1
    note = f"✅ Found marker '{split_marker}', using block {index} (0-indexed)."
    return parts[index], note


def _narrative_for(content):
    """Return the first labelled narrative in *content*, else a short preview."""
    narrative = ""
    for pattern in _NARRATIVE_RES:
        found = pattern.search(content)
        if found:
            narrative = found.group(1).strip()
            break
    if narrative:
        return narrative
    if len(content) > _NARRATIVE_PREVIEW_CHARS:
        return content[:_NARRATIVE_PREVIEW_CHARS] + "..."
    return content


def _row_if_missing(line_number, collected, identifiers):
    """Return a report row for one entry, or None if it should not be reported."""
    # Entries with no collected text are skipped rather than reported.
    if line_number is None or not collected:
        return None
    content = " ".join(collected)
    # Plain substring test: an identifier counts as present wherever it occurs.
    if any(identifier in content for identifier in identifiers):
        return None
    return {
        "Line#": line_number,
        # Always empty: a reported entry contains none of the identifiers, so
        # there is no identifier context to quote. The column is kept so the
        # shape of the output table is unchanged.
        "Identifier Message": "",
        "Narrative Message": _narrative_for(content),
        "Full Content": content,
    }


def _rows_missing_identifiers(block, identifiers):
    """Return one row per ``Line#`` entry whose text has none of *identifiers*."""
    rows = []
    current_line = None
    collected = []
    for raw_line in block.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header = _LINE_HEADER_RE.match(line)
        if header:
            # A new header closes the previous entry. Only the lines that
            # follow a header are collected; text after the number on the
            # header line itself is not inspected.
            row = _row_if_missing(current_line, collected, identifiers)
            if row is not None:
                rows.append(row)
            current_line = int(header.group(1))
            collected = []
        elif current_line is not None:
            collected.append(line)
    # The final entry has no following header to close it.
    row = _row_if_missing(current_line, collected, identifiers)
    if row is not None:
        rows.append(row)
    return rows


def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report the ``Line#`` entries of a PDF section that lack every identifier.

    The PDF is downloaded, its text extracted, optionally split on
    *split_marker*, and the selected piece is scanned entry by entry.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers; blank items are ignored.
        split_marker: Text to split the PDF content on. If blank or absent
            from the text, the whole content is scanned.
        parts_index: 0-based index of the piece to scan after splitting.
            Values that are not integers or are out of range fall back to 1.

    Returns:
        A ``(status, table, datetime_status)`` tuple. ``table`` is a pandas
        DataFrame with columns "Line#", "Identifier Message",
        "Narrative Message" and "Full Content", or None when nothing is
        reported or processing failed. ``datetime_status`` describes the first
        timestamp found in the full text, and is "" when processing stopped
        before the text was available.
    """
    identifiers = [
        token.strip() for token in identifiers_input.split(',') if token.strip()
    ]
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    # The broad catches below are deliberate: results go straight to UI
    # widgets, so failures are reported as messages rather than tracebacks.
    try:
        # A timeout keeps an unresponsive server from blocking the request
        # forever (requests waits indefinitely by default).
        response = requests.get(pdf_url, timeout=_SECTION_DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None, ""

    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}", None, ""

    # The timestamp is searched in the full text, before any splitting.
    datetime_match = _SECTION_DATETIME_RE.search(full_text)
    if datetime_match:
        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
    else:
        datetime_result = "❌ No datetime pattern found in the PDF"

    latest_block, note = _select_block(full_text, split_marker, parts_index)
    rows = _rows_missing_identifiers(latest_block, identifiers)
    identifier_list = ', '.join(identifiers)

    if not rows:
        status = (
            note
            + f" All lines contain at least one of the identifiers: {identifier_list}."
        )
        return status, None, datetime_result

    df = pd.DataFrame(rows)
    status = (
        note
        + f" Found {len(rows)} lines missing all identifiers ({identifier_list}):"
    )
    return status, df, datetime_result
# Gradio interface: four inputs, one button, and three outputs wired to
# check_latest_section.
# NOTE(review): row grouping below is reconstructed from statement order;
# confirm it matches the intended layout.
with gr.Blocks(title="PDF Analysis Tool") as demo:
    gr.Markdown("# PDF Analysis Tool")
    gr.Markdown("## PDF Analysis and Datetime Extraction")

    # Inputs, passed to check_latest_section in this order.
    with gr.Row():
        pdf_url = gr.Textbox(
            label="PDF URL",
            placeholder="https://.../yourfile.pdf",
        )
        identifiers_input = gr.Textbox(
            label="Identifier List",
            value="628, 995",
            placeholder="Enter identifiers separated by commas",
        )
        split_marker = gr.Textbox(
            label="Split Marker (optional)",
            value="Record #",
        )
        parts_index = gr.Number(label="Parts Index", value=1, minimum=0, step=1)

    with gr.Row():
        check_btn = gr.Button("Analyze PDF")

    # Text outputs: overall status and the first-timestamp result.
    with gr.Row():
        result_text = gr.Textbox(label="Status")
        datetime_result = gr.Textbox(label="Datetime Result")

    # Table output: one row per entry that lacks every identifier.
    with gr.Row():
        result_df = gr.Dataframe(
            label="Lines Missing All Identifiers",
            type="pandas",
        )

    # The outputs list order matches the function's
    # (status, table, datetime_status) return tuple.
    check_btn.click(
        fn=check_latest_section,
        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
        outputs=[result_text, result_df, datetime_result],
    )

demo.launch()