Spaces:

APEXlogistics
/

ErrorMsgIdentifier

Sleeping

App Files Community

joycecast committed on May 1, 2025

Commit

56bb301

verified ·

1 Parent(s): 64fabbb

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -17

app.py CHANGED Viewed

@@ -5,13 +5,13 @@ import requests
 from io import BytesIO
 import pandas as pd
-def check_pdf_messages(pdf_url, identifiers_input):
-    # Parse identifiers
     identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip().isdigit()]
     if not identifiers:
         return "❌ No valid Message Identifiers entered.", None
-    # Download PDF
     try:
         response = requests.get(pdf_url)
         response.raise_for_status()
@@ -19,43 +19,51 @@ def check_pdf_messages(pdf_url, identifiers_input):
     except Exception as e:
         return f"❌ Failed to load PDF: {str(e)}", None
-    # Extract text using PyMuPDF
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
         full_text = "\n".join([page.get_text() for page in doc])
     except Exception as e:
-        return f"❌ Failed to extract text from PDF: {str(e)}", None
-    # Regex to match Line# + Message Identifier
-    id_pattern = "|".join(re.escape(id_) for id_ in identifiers)
     regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})")
-    # Parse matches
     matches = []
-    for match in regex.finditer(full_text):
         line_num, msg_id = match.groups()
         matches.append({"Line#": int(line_num), "Message Identifier": msg_id})
     if not matches:
-        return "✅ No matching Message Identifiers found in the PDF.", None
-    # Return results as DataFrame
-    df = pd.DataFrame(matches).sort_values(by="Line#").reset_index(drop=True)
-    return "✅ Matches found:", df
-# Gradio UI
 demo = gr.Interface(
-    fn=check_pdf_messages,
     inputs=[
         gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
         gr.Textbox(label="Message Identifier List", value="523"),
     ],
     outputs=[
         gr.Textbox(label="Status"),
         gr.Dataframe(label="Matching Lines", type="pandas"),
     ],
-    title="PDF Message Identifier Line Checker",
-    description="Paste a PDF URL and a comma-separated list of Message Identifiers. This tool finds which Line# entries match those IDs."
 )
 demo.launch()

 from io import BytesIO
 import pandas as pd
+def check_latest_section(pdf_url, identifiers_input, split_marker):
+    # Step 1: Prepare identifiers
     identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip().isdigit()]
     if not identifiers:
         return "❌ No valid Message Identifiers entered.", None
+    # Step 2: Download PDF
     try:
         response = requests.get(pdf_url)
         response.raise_for_status()
     except Exception as e:
         return f"❌ Failed to load PDF: {str(e)}", None
+    # Step 3: Extract full text from PDF
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
         full_text = "\n".join([page.get_text() for page in doc])
     except Exception as e:
+        return f"❌ Failed to extract text: {str(e)}", None
+    # Step 4: Split by user-defined marker (optional)
+    if split_marker.strip() and split_marker in full_text:
+        parts = full_text.split(split_marker)
+        latest_block = parts[1]
+        note = f"✅ Found marker '{split_marker}', using the latest block."
+    else:
+        latest_block = full_text
+        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
+    # Step 5: Match Line# and Message Identifier
+    id_pattern = "|".join(re.escape(i) for i in identifiers)
     regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})")
     matches = []
+    for match in regex.finditer(latest_block):
         line_num, msg_id = match.groups()
         matches.append({"Line#": int(line_num), "Message Identifier": msg_id})
     if not matches:
+        return note + " No matching Message Identifiers found.", None
+    df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
+    return note + " Matches found:", df
+# Gradio Interface
 demo = gr.Interface(
+    fn=check_latest_section,
     inputs=[
         gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
         gr.Textbox(label="Message Identifier List", value="523"),
+        gr.Textbox(label="Split Marker (optional)", value="Record #"),
     ],
     outputs=[
         gr.Textbox(label="Status"),
         gr.Dataframe(label="Matching Lines", type="pandas"),
     ],
+    title="PDF Line# Identifier Checker (Latest Only)",
+    description="Checks Line# entries with specified Message Identifiers. If a split marker is provided, only the latest section is used; otherwise, the full document is scanned."
 )
 demo.launch()