# Spaces status: Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import re | |
| import requests | |
| from io import BytesIO | |
| import pandas as pd | |
def check_latest_section(
    pdf_url: str, identifiers_input: str, split_marker: str
) -> "tuple[str, pd.DataFrame | None]":
    """Find ``Line#`` entries in a remote PDF that carry given identifiers.

    The PDF at *pdf_url* is downloaded, its text is extracted page by page,
    and the text is searched for ``Line# <number> <identifier>`` sequences.
    When *split_marker* is non-blank and occurs in the text, only the text
    before its first occurrence is searched; otherwise the whole document is.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated Message Identifiers. Entries that
            are not purely digits after stripping whitespace are ignored.
        split_marker: Optional text that separates sections of the document.

    Returns:
        A ``(status, table)`` pair. ``status`` is a human-readable message.
        ``table`` is a DataFrame with the columns ``"Line#"`` (int) and
        ``"Message Identifier"`` (str), sorted by ``"Line#"``, or ``None``
        when an error occurred or nothing matched.
    """
    # Step 1: Prepare identifiers (non-numeric entries are dropped).
    identifiers = [
        token.strip()
        for token in identifiers_input.split(",")
        if token.strip().isdigit()
    ]
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None

    # Step 2: Download PDF.
    # The broad ``except`` is deliberate: this function is a UI callback and
    # every failure is reported to the user as a status message.
    try:
        # Without a timeout an unresponsive server would block the handler
        # indefinitely.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None

    # Step 3: Extract full text from PDF.
    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text() for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {e}", None

    # Step 4: Split by user-defined marker (optional).
    if not split_marker.strip():
        # The marker is optional, so a blank one is not worth a warning.
        latest_block = full_text
        note = "ℹ️ No split marker provided. Using entire PDF content."
    elif split_marker in full_text:
        # NOTE(review): this keeps the text *before* the first marker, which
        # is empty when the document begins with the marker -- confirm this
        # matches the layout of the PDFs being checked.
        latest_block = full_text.split(split_marker, 1)[0]
        note = f"✅ Found marker '{split_marker}', using the latest block."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 5: Match Line# and Message Identifier.
    id_pattern = "|".join(re.escape(identifier) for identifier in identifiers)
    # The trailing (?!\d) stops an identifier from matching only the leading
    # digits of a longer number (e.g. "52" inside "523").
    regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})(?!\d)")
    matches = [
        {"Line#": int(found.group(1)), "Message Identifier": found.group(2)}
        for found in regex.finditer(latest_block)
    ]
    if not matches:
        return note + " No matching Message Identifiers found.", None

    df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
    return note + " Matches found:", df
# Gradio Interface
# The three text inputs are passed positionally to check_latest_section, and
# its (status, table) return value fills the two outputs in order.
demo = gr.Interface(
    fn=check_latest_section,
    inputs=[
        gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
        gr.Textbox(label="Message Identifier List", value="523"),
        gr.Textbox(label="Split Marker (optional)", value="Record #"),
    ],
    outputs=[
        gr.Textbox(label="Status"),
        gr.Dataframe(label="Matching Lines", type="pandas"),
    ],
    title="PDF Line# Identifier Checker (Latest Only)",
    description=(
        "Checks Line# entries with specified Message Identifiers. "
        "If a split marker is provided, only the latest section is used; "
        "otherwise, the full document is scanned."
    ),
)

# Launch only when executed as a script, so that importing this module
# (for example from tests or tooling) does not start the server.
if __name__ == "__main__":
    demo.launch()