# Spaces: APEXlogistics / ErrorMsgIdentifier (status: Sleeping)
# File size: 9,506 Bytes

import gradio as gr
import fitz  # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd
from datetime import datetime

def extract_first_datetime(pdf_url):
    """Extract the first datetime value from a PDF fetched over HTTP.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A status string. On success it starts with "✅ First datetime found:"
        followed by the first timestamp matched in the PDF text. Otherwise it
        starts with "❌" and says whether the download, the text extraction,
        or the pattern search failed. Errors are reported in the returned
        string rather than raised.
    """
    # Reject a missing/blank URL up front instead of attempting a request
    # that can only fail.
    if not pdf_url or not str(pdf_url).strip():
        return "❌ Failed to load PDF: no PDF URL provided"

    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}"

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "\n".join(page.get_text("text") for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except Exception as e:
        return f"❌ Failed to extract text: {e}"

    # Pattern to match datetime format like "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)"
    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'

    match = re.search(datetime_pattern, full_text)
    if match is None:
        return "❌ No datetime pattern found in the PDF"
    return f"✅ First datetime found: {match.group(0)}"

# Timestamp format like "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
_SECTION_DATETIME_RE = re.compile(
    r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
)

# A line that starts a new numbered entry, e.g. "Line# 12".
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")

# Labels tried, in this order, to pull a narrative message out of an entry.
_NARRATIVE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Message:\s*(.+)',
        r'Narrative:\s*(.+)',
        r'Description:\s*(.+)',
        r'Note:\s*(.+)',
    )
)

# Seconds to wait for the PDF download before giving up.
_PDF_TIMEOUT_SECONDS = 30


def _iter_line_entries(block_text):
    """Yield (line_number, content) for each "Line#" entry that has body text.

    Content is the stripped, non-empty lines following a "Line# N" header,
    joined with single spaces. Entries with no following lines are skipped.
    """
    current_line = None
    buffered = []
    for raw_line in block_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header = _LINE_HEADER_RE.match(line)
        if header:
            if current_line is not None and buffered:
                yield current_line, " ".join(buffered)
            # NOTE(review): any text after "Line# N" on the header line itself
            # is discarded — confirm this matches the PDF layout.
            current_line = int(header.group(1))
            buffered = []
        elif current_line is not None:
            buffered.append(line)

    # The final entry has no following header to trigger its emission.
    if current_line is not None and buffered:
        yield current_line, " ".join(buffered)


def _summarize_entry(line_number, content):
    """Build the report row for one entry that contains none of the identifiers."""
    narrative_msg = ""
    for pattern in _NARRATIVE_RES:
        match = pattern.search(content)
        if match:
            narrative_msg = match.group(1).strip()
            break

    # No labelled narrative: fall back to (at most) the first 200 characters.
    if not narrative_msg:
        narrative_msg = content[:200] + "..." if len(content) > 200 else content

    return {
        "Line#": line_number,
        # Always empty: a reported entry contains no identifier by definition,
        # so there is no identifier context to show. The column is kept so the
        # output table keeps its shape.
        "Identifier Message": "",
        "Narrative Message": narrative_msg,
        "Full Content": content,
    }


def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report "Line#" entries of a PDF section that contain none of the identifiers.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers to look for.
        split_marker: Optional text used to split the PDF text into blocks.
        parts_index: 0-based index of the block to analyse after splitting;
            falls back to 1 when it is not a valid in-range integer.

    Returns:
        A 3-tuple ``(status, table, datetime_result)``: ``status`` is a
        human-readable message, ``table`` is a pandas DataFrame of entries
        missing every identifier (or ``None`` when there are none or on
        error), and ``datetime_result`` describes the first timestamp found
        in the PDF (empty string on early failure).
    """
    # Step 1: Prepare identifiers (blank pieces dropped; None treated as empty)
    identifiers = [
        token.strip()
        for token in str(identifiers_input or "").split(',')
        if token.strip()
    ]
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    # Step 2: Download PDF
    try:
        # Without a timeout a stalled server would block this handler forever.
        response = requests.get(pdf_url, timeout=_PDF_TIMEOUT_SECONDS)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None, ""

    # Step 3: Extract full text from PDF
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "\n".join(page.get_text("text") for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except Exception as e:
        return f"❌ Failed to extract text: {e}", None, ""

    # Step 4: Extract datetime first
    datetime_match = _SECTION_DATETIME_RE.search(full_text)
    if datetime_match:
        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
    else:
        datetime_result = "❌ No datetime pattern found in the PDF"

    # Step 5: Split by user-defined marker (optional)
    split_marker = split_marker or ""
    if split_marker.strip() and split_marker in full_text:
        parts = full_text.split(split_marker)
        # The marker occurs at least once, so index 1 always exists and is a
        # safe fallback for invalid or out-of-range input.
        try:
            parts_index = int(parts_index)
            if parts_index < 0 or parts_index >= len(parts):
                parts_index = 1
        except (ValueError, TypeError, OverflowError):
            parts_index = 1

        latest_block = parts[parts_index]
        note = f"✅ Found marker '{split_marker}', using block {parts_index} (0-indexed)."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 6: Collect Line# entries that don't contain any of the identifiers.
    # NOTE(review): this is a plain substring test, so "628" also matches
    # inside "16280" — confirm that is intended.
    missing_identifiers_data = [
        _summarize_entry(line_number, content)
        for line_number, content in _iter_line_entries(latest_block)
        if not any(identifier in content for identifier in identifiers)
    ]

    if not missing_identifiers_data:
        return note + f" All lines contain at least one of the identifiers: {', '.join(identifiers)}.", None, datetime_result

    # Create DataFrame with all the collected data
    df = pd.DataFrame(missing_identifiers_data)
    return note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({', '.join(identifiers)}):", df, datetime_result

# Gradio Interface
with gr.Blocks(title="PDF Analysis Tool") as demo:
    # Page headings.
    gr.Markdown("# PDF Analysis Tool")
    gr.Markdown("## PDF Analysis and Datetime Extraction")

    # Row 1: user inputs, passed to check_latest_section in this order.
    with gr.Row():
        pdf_url = gr.Textbox(
            label="PDF URL",
            placeholder="https://.../yourfile.pdf",
        )
        identifiers_input = gr.Textbox(
            label="Identifier List",
            value="628, 995",
            placeholder="Enter identifiers separated by commas",
        )
        split_marker = gr.Textbox(
            label="Split Marker (optional)",
            value="Record #",
        )
        parts_index = gr.Number(
            label="Parts Index",
            value=1,
            minimum=0,
            step=1,
        )

    # Row 2: the button that starts the analysis.
    with gr.Row():
        check_btn = gr.Button("Analyze PDF")

    # Row 3: textual results.
    with gr.Row():
        result_text = gr.Textbox(label="Status")
        datetime_result = gr.Textbox(label="Datetime Result")

    # Row 4: table of entries that matched none of the identifiers.
    with gr.Row():
        result_df = gr.Dataframe(
            label="Lines Missing All Identifiers",
            type="pandas",
        )

    # Outputs are listed in the same order as the handler's returned 3-tuple.
    check_btn.click(
        check_latest_section,
        [pdf_url, identifiers_input, split_marker, parts_index],
        [result_text, result_df, datetime_result],
    )

demo.launch()