"""Gradio app that scans a PDF for message identifiers listed under ``Line#`` headers."""

import re
from io import BytesIO
from typing import Iterable, List, Optional, Tuple

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import requests

# Upper bound (seconds) on the PDF download so a stalled server cannot hang
# the UI callback forever; requests has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 30

# A "Line# <number>" header opens a context; identifiers found on the lines
# that follow are attributed to that number.
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")


def _find_matches(text: str, identifiers: Iterable[str]) -> List[Tuple[int, str]]:
    """Return sorted, de-duplicated ``(line number, identifier)`` pairs.

    The text is scanned line by line. Lines appearing before the first
    ``Line#`` header are ignored because they have no line number to be
    attributed to.
    """
    # Compile one whole-word pattern per distinct identifier up front instead
    # of rebuilding it for every line of the document.
    patterns = [
        (ident, re.compile(rf"\b{re.escape(ident)}\b"))
        for ident in set(identifiers)
    ]

    found = set()
    current_line = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header = _LINE_HEADER_RE.match(line)
        if header:
            current_line = int(header.group(1))
            continue

        if current_line is None:
            continue

        for ident, pattern in patterns:
            if pattern.search(line):
                found.add((current_line, ident))

    return sorted(found)


def check_latest_section(
    pdf_url: str, identifiers_input: str, split_marker: str
) -> Tuple[str, Optional[pd.DataFrame]]:
    """Download a PDF and report which identifiers appear under which ``Line#``.

    Args:
        pdf_url: URL the PDF is fetched from.
        identifiers_input: Comma-separated message identifiers to look for.
        split_marker: Optional marker text. When it is non-blank and present
            in the PDF text, only the block between its first and second
            occurrence (or the end of the text) is scanned.

    Returns:
        A ``(status, table)`` pair. ``table`` is a DataFrame with the columns
        ``Line#`` and ``Message Identifier``, or ``None`` when there is
        nothing to show (invalid input, download/parse failure, no matches).
    """
    # Step 1: Prepare identifiers (blank entries are dropped).
    identifiers = [
        token.strip()
        for token in (identifiers_input or "").split(",")
        if token.strip()
    ]
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None
    split_marker = split_marker or ""

    # Step 2: Download the PDF. The broad except is deliberate: this is the
    # UI boundary and every failure should surface as a status message.
    try:
        response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None

    # Step 3: Extract the full text; the context manager closes the document
    # even if a page fails to render.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}", None

    # Step 4: Narrow the text down with the user-defined marker (optional).
    if split_marker.strip() and split_marker in full_text:
        # NOTE(review): this takes the first block after the first marker
        # occurrence, which presumably is the most recent record - confirm
        # against the PDF layout.
        latest_block = full_text.split(split_marker, 2)[1]
        note = f"✅ Found marker '{split_marker}', using the latest block."
    elif split_marker.strip():
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
    else:
        # The marker is optional; a blank one is not a "not found" condition.
        latest_block = full_text
        note = "ℹ️ No split marker provided. Using entire PDF content."

    # Step 5: Track Line# context and find message identifiers below it.
    matches = _find_matches(latest_block, identifiers)
    if not matches:
        return note + " No matching Message Identifiers found.", None

    df = pd.DataFrame(matches, columns=["Line#", "Message Identifier"])
    return note + " Matches found:", df


# Gradio Interface
demo = gr.Interface(
    fn=check_latest_section,
    inputs=[
        gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
        gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
        gr.Textbox(label="Split Marker (optional)", value="Record #"),
    ],
    outputs=[
        gr.Textbox(label="Status"),
        gr.Dataframe(label="Matching Lines", type="pandas"),
    ],
    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
    description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow.",
)

# Only start the web server when run as a script, so importing this module
# (e.g. to reuse check_latest_section) has no side effects.
if __name__ == "__main__":
    demo.launch()