Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import re | |
| import requests | |
| from io import BytesIO | |
| import pandas as pd | |
# Upper bound (seconds) on the PDF download so a stalled server cannot hang the UI.
_PDF_DOWNLOAD_TIMEOUT_SECONDS = 30


def _scan_for_identifiers(text, identifiers):
    """Return sorted, de-duplicated ``(line_number, identifier)`` pairs.

    The text is walked line by line. A line starting with ``Line# <digits>``
    sets the current line number; every following non-blank line is searched
    for each identifier as a whole word (``\\b``-delimited). Lines that occur
    before the first ``Line#`` header are ignored.
    """
    line_header = re.compile(r"Line#\s+(\d+)")
    # Compile each identifier pattern once instead of once per scanned line.
    ident_patterns = {
        ident: re.compile(rf"\b{re.escape(ident)}\b") for ident in set(identifiers)
    }

    found = set()
    current_line = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header = line_header.match(line)
        if header:
            current_line = int(header.group(1))
            continue
        if current_line is None:
            # No "Line#" context yet, so there is nothing to attribute a match to.
            continue
        for ident, pattern in ident_patterns.items():
            if pattern.search(line):
                found.add((current_line, ident))
    return sorted(found)


def check_latest_section(pdf_url, identifiers_input, split_marker):
    """Download a PDF and report which identifiers appear under which ``Line#``.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated message identifiers; surrounding
            whitespace and empty entries are discarded. ``None`` is treated
            as an empty string.
        split_marker: Optional marker text. When it occurs in the extracted
            text, only the text between its first and second occurrence (or
            up to the end of the document if it occurs once) is scanned.
            When blank or absent from the text, the whole document is scanned.

    Returns:
        A ``(status, table)`` tuple. ``status`` is a human-readable message.
        ``table`` is a pandas DataFrame with columns ``"Line#"`` and
        ``"Message Identifier"`` when matches exist, otherwise ``None``.
        Download and parsing failures are reported through ``status`` rather
        than raised, because this function is used as a UI callback.
    """
    # Step 1: Prepare identifiers.
    identifiers = [
        part.strip() for part in (identifiers_input or "").split(",") if part.strip()
    ]
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None

    # Step 2: Download the PDF. Any failure (bad URL, HTTP error, timeout) is
    # turned into a status message for the UI.
    try:
        response = requests.get(pdf_url, timeout=_PDF_DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None

    # Step 3: Extract the full text; the context manager closes the document
    # even if text extraction fails part-way through.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}", None

    # Step 4: Narrow the text using the optional marker.
    split_marker = split_marker or ""
    if not split_marker.strip():
        latest_block = full_text
        note = "⚠️ No split marker provided. Using entire PDF content."
    elif split_marker in full_text:
        # maxsplit=2 is enough to isolate the block right after the first marker.
        # NOTE(review): the status calls this "the latest block"; presumably the
        # PDF lists the newest record first -- confirm against real documents.
        latest_block = full_text.split(split_marker, 2)[1]
        note = f"✅ Found marker '{split_marker}', using the latest block."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 5: Track Line# context and find message identifiers below it.
    matches = _scan_for_identifiers(latest_block, identifiers)
    if not matches:
        return note + " No matching Message Identifiers found.", None

    df = pd.DataFrame(matches, columns=["Line#", "Message Identifier"])
    return note + " Matches found:", df
# Gradio interface: three text inputs are passed positionally to
# check_latest_section, whose (status, table) result fills the two outputs.
demo = gr.Interface(
    fn=check_latest_section,
    inputs=[
        gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
        gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
        gr.Textbox(label="Split Marker (optional)", value="Record #"),
    ],
    outputs=[
        gr.Textbox(label="Status"),
        gr.Dataframe(label="Matching Lines", type="pandas"),
    ],
    title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
    description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow.",
)

# Start the server only when this file is executed as a script, so importing
# the module (e.g. to reuse `demo` or the checker function) has no side effects.
if __name__ == "__main__":
    demo.launch()