Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Community

joycecast committed on Jun 23, 2025

Commit

c080e37

verified ·

1 Parent(s): 042f009

Update app.py

Browse files

Files changed (1) hide show

app.py +180 -35

app.py CHANGED Viewed

@@ -4,12 +4,38 @@ import re
 import requests
 from io import BytesIO
 import pandas as pd
-def check_latest_section(pdf_url, identifiers_input, split_marker):
     # Step 1: Prepare identifiers (alphanumeric-safe)
     identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip()]
     if not identifiers:
-        return "❌ No valid Message Identifiers entered.", None
     # Step 2: Download PDF
     try:
@@ -17,65 +43,184 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
         response.raise_for_status()
         pdf_bytes = BytesIO(response.content)
     except Exception as e:
-        return f"❌ Failed to load PDF: {str(e)}", None
     # Step 3: Extract full text from PDF
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
         full_text = "\n".join([page.get_text("text") for page in doc])
     except Exception as e:
-        return f"❌ Failed to extract text: {str(e)}", None
-    # Step 4: Split by user-defined marker (optional)
     if split_marker.strip() and split_marker in full_text:
         parts = full_text.split(split_marker)
-        latest_block = parts[1]  # First block *after* the split
-        note = f"✅ Found marker '{split_marker}', using the latest block."
     else:
         latest_block = full_text
         note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
-    # Step 5: Track Line# context and find message identifiers below it
-    id_pattern = set(identifiers)
-    matches_set = set()
     current_line = None
     for line in latest_block.splitlines():
         line = line.strip()
         if not line:
             continue
         line_match = re.match(r"Line#\s+(\d+)", line)
         if line_match:
             current_line = int(line_match.group(1))
-            continue
-        if current_line is not None:
-            for ident in id_pattern:
-                if re.search(rf"\b{re.escape(ident)}\b", line):
-                    matches_set.add((current_line, ident))
-    if not matches_set:
-        return note + " No matching Message Identifiers found.", None
-    df = pd.DataFrame(sorted(matches_set), columns=["Line#", "Message Identifier"])
-    return note + " Matches found:", df
 # Gradio Interface
-demo = gr.Interface(
-    fn=check_latest_section,
-    inputs=[
-        gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
-        gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
-        gr.Textbox(label="Split Marker (optional)", value="Record #"),
-    ],
-    outputs=[
-        gr.Textbox(label="Status"),
-        gr.Dataframe(label="Matching Lines", type="pandas"),
-    ],
-    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
-    description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow."
-)
 demo.launch()

 import requests
 from io import BytesIO
 import pandas as pd
+from datetime import datetime
+def extract_first_datetime(pdf_url):
+    """Extract the first datetime value from PDF"""
+    try:
+        response = requests.get(pdf_url)
+        response.raise_for_status()
+        pdf_bytes = BytesIO(response.content)
+    except Exception as e:
+        return f"❌ Failed to load PDF: {str(e)}"
+    try:
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = "\n".join([page.get_text("text") for page in doc])
+    except Exception as e:
+        return f"❌ Failed to extract text: {str(e)}"
+    # Pattern to match datetime format like "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)"
+    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
+    match = re.search(datetime_pattern, full_text)
+    if match:
+        datetime_str = match.group(0)
+        return f"✅ First datetime found: {datetime_str}"
+    else:
+        return "❌ No datetime pattern found in the PDF"
+def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
     # Step 1: Prepare identifiers (alphanumeric-safe)
     identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip()]
     if not identifiers:
+        return "❌ No valid Identifiers entered.", None, ""
     # Step 2: Download PDF
     try:
         response.raise_for_status()
         pdf_bytes = BytesIO(response.content)
     except Exception as e:
+        return f"❌ Failed to load PDF: {str(e)}", None, ""
     # Step 3: Extract full text from PDF
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
         full_text = "\n".join([page.get_text("text") for page in doc])
     except Exception as e:
+        return f"❌ Failed to extract text: {str(e)}", None, ""
+    # Step 4: Extract datetime first
+    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
+    datetime_match = re.search(datetime_pattern, full_text)
+    datetime_result = ""
+    if datetime_match:
+        datetime_str = datetime_match.group(0)
+        datetime_result = f"✅ First datetime found: {datetime_str}"
+    else:
+        datetime_result = "❌ No datetime pattern found in the PDF"
+    # Step 5: Split by user-defined marker (optional)
     if split_marker.strip() and split_marker in full_text:
         parts = full_text.split(split_marker)
+        # Use custom parts index, default to 1 if invalid
+        try:
+            parts_index = int(parts_index)
+            if parts_index < 0 or parts_index >= len(parts):
+                parts_index = 1  # Default to 1 if out of range
+        except (ValueError, TypeError):
+            parts_index = 1  # Default to 1 if invalid input
+        latest_block = parts[parts_index]  # Use custom parts index
+        note = f"✅ Found marker '{split_marker}', using block {parts_index} (0-indexed)."
     else:
         latest_block = full_text
         note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
+    # Step 6: Track Line# entries that don't have any of the specified identifiers
+    missing_identifiers_data = []  # List to store line data with content
     current_line = None
+    lines_content = []
     for line in latest_block.splitlines():
         line = line.strip()
         if not line:
             continue
         line_match = re.match(r"Line#\s+(\d+)", line)
         if line_match:
+            # If we had a previous line number, check if it should be included
+            if current_line is not None and lines_content:
+                content = " ".join(lines_content)
+                # Check if ANY of the identifiers are present
+                has_any_identifier = any(identifier in content for identifier in identifiers)
+                if not has_any_identifier:
+                    # Extract identifier message and narrative message
+                    identifier_msg = ""
+                    narrative_msg = ""
+                    # Look for identifier patterns in the content
+                    for identifier in identifiers:
+                        if identifier in content:
+                            # Find the context around the identifier
+                            idx = content.find(identifier)
+                            start = max(0, idx - 50)
+                            end = min(len(content), idx + len(identifier) + 50)
+                            identifier_msg = content[start:end].strip()
+                            break
+                    # Look for narrative message patterns (you may need to adjust this based on your PDF structure)
+                    # Common patterns for narrative messages
+                    narrative_patterns = [
+                        r'Message:\s*(.+)',
+                        r'Narrative:\s*(.+)',
+                        r'Description:\s*(.+)',
+                        r'Note:\s*(.+)'
+                    ]
+                    for pattern in narrative_patterns:
+                        match = re.search(pattern, content, re.IGNORECASE)
+                        if match:
+                            narrative_msg = match.group(1).strip()
+                            break
+                    # If no specific narrative pattern found, use the full content
+                    if not narrative_msg:
+                        narrative_msg = content[:200] + "..." if len(content) > 200 else content
+                    missing_identifiers_data.append({
+                        "Line#": current_line,
+                        "Identifier Message": identifier_msg,
+                        "Narrative Message": narrative_msg,
+                        "Full Content": content
+                    })
+            # Reset for new line number
             current_line = int(line_match.group(1))
+            lines_content = []
+        elif current_line is not None:
+            lines_content.append(line)
+    # Check the last line number
+    if current_line is not None and lines_content:
+        content = " ".join(lines_content)
+        has_any_identifier = any(identifier in content for identifier in identifiers)
+        if not has_any_identifier:
+            # Extract identifier message and narrative message
+            identifier_msg = ""
+            narrative_msg = ""
+            # Look for identifier patterns in the content
+            for identifier in identifiers:
+                if identifier in content:
+                    # Find the context around the identifier
+                    idx = content.find(identifier)
+                    start = max(0, idx - 50)
+                    end = min(len(content), idx + len(identifier) + 50)
+                    identifier_msg = content[start:end].strip()
+                    break
+            # Look for narrative message patterns
+            narrative_patterns = [
+                r'Message:\s*(.+)',
+                r'Narrative:\s*(.+)',
+                r'Description:\s*(.+)',
+                r'Note:\s*(.+)'
+            ]
+            for pattern in narrative_patterns:
+                match = re.search(pattern, content, re.IGNORECASE)
+                if match:
+                    narrative_msg = match.group(1).strip()
+                    break
+            # If no specific narrative pattern found, use the full content
+            if not narrative_msg:
+                narrative_msg = content[:200] + "..." if len(content) > 200 else content
+            missing_identifiers_data.append({
+                "Line#": current_line,
+                "Identifier Message": identifier_msg,
+                "Narrative Message": narrative_msg,
+                "Full Content": content
+            })
+    if not missing_identifiers_data:
+        return note + f" All lines contain at least one of the identifiers: {', '.join(identifiers)}.", None, datetime_result
+    # Create DataFrame with all the collected data
+    df = pd.DataFrame(missing_identifiers_data)
+    return note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({', '.join(identifiers)}):", df, datetime_result
 # Gradio Interface
+with gr.Blocks(title="PDF Analysis Tool") as demo:
+    gr.Markdown("# PDF Analysis Tool")
+    gr.Markdown("## PDF Analysis and Datetime Extraction")
+    with gr.Row():
+        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
+        identifiers_input = gr.Textbox(label="Identifier List", value="628, 995", placeholder="Enter identifiers separated by commas")
+        split_marker = gr.Textbox(label="Split Marker (optional)", value="Record #")
+        parts_index = gr.Number(label="Parts Index", value=1, minimum=0, step=1)
+    with gr.Row():
+        check_btn = gr.Button("Analyze PDF")
+    with gr.Row():
+        result_text = gr.Textbox(label="Status")
+        datetime_result = gr.Textbox(label="Datetime Result")
+    with gr.Row():
+        result_df = gr.Dataframe(label="Lines Missing All Identifiers", type="pandas")
+    check_btn.click(
+        fn=check_latest_section,
+        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
+        outputs=[result_text, result_df, datetime_result]
+    )
 demo.launch()