joycecast committed on
Commit
fcad26a
·
verified ·
1 Parent(s): 1c86181

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import re
4
+ import requests
5
+ from io import BytesIO
6
+ import pandas as pd
7
+
8
def _parse_identifiers(identifiers_input):
    """Return the unique, purely numeric tokens of a comma-separated string.

    Tokens are stripped of surrounding whitespace; anything that is not all
    digits is dropped. Order of first appearance is kept. ``None`` is treated
    like an empty string so a cleared input field does not raise.
    """
    tokens = (token.strip() for token in (identifiers_input or "").split(","))
    return list(dict.fromkeys(token for token in tokens if token.isdigit()))


def _find_matches(full_text, identifiers):
    """Return one record per ``Line# <n> <identifier>`` occurrence in the text."""
    id_pattern = "|".join(re.escape(identifier) for identifier in identifiers)
    # The trailing (?!\d) stops a short identifier from matching the prefix of
    # a longer number (e.g. "52" must not match inside "523").
    regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})(?!\d)")
    return [
        {"Line#": int(line_num), "Message Identifier": msg_id}
        for line_num, msg_id in regex.findall(full_text)
    ]


def check_pdf_messages(pdf_url, identifiers_input):
    """Find which ``Line#`` entries of a remote PDF carry the given identifiers.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated Message Identifiers; non-numeric
            entries are ignored.

    Returns:
        A ``(status, table)`` pair. ``status`` is a human-readable message.
        ``table`` is a DataFrame with ``Line#`` and ``Message Identifier``
        columns sorted by ``Line#``, or ``None`` when there is nothing to show
        (no valid identifiers, download/parse failure, or no matches).
    """
    identifiers = _parse_identifiers(identifiers_input)
    if not identifiers:
        return "❌ No valid Message Identifiers entered.", None

    # Download PDF. Failures are reported to the UI rather than raised, so the
    # broad except is deliberate at this boundary.
    try:
        # A timeout keeps an unresponsive server from hanging the request.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {e}", None

    # Extract text using PyMuPDF; the context manager closes the document.
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text() for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text from PDF: {e}", None

    matches = _find_matches(full_text, identifiers)
    if not matches:
        return "✅ No matching Message Identifiers found in the PDF.", None

    # Return results as DataFrame
    df = pd.DataFrame(matches).sort_values(by="Line#").reset_index(drop=True)
    return "✅ Matches found:", df
45
+
46
# Gradio UI: two text inputs are passed to check_pdf_messages, and its
# (status, table) result is shown in a status box and a results table.
url_box = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
ids_box = gr.Textbox(label="Message Identifier List", placeholder="e.g., 523,600")
status_box = gr.Textbox(label="Status")
results_table = gr.Dataframe(label="Matching Lines", type="pandas")

demo = gr.Interface(
    fn=check_pdf_messages,
    inputs=[url_box, ids_box],
    outputs=[status_box, results_table],
    title="PDF Message Identifier Line Checker",
    description="Paste a PDF URL and a comma-separated list of Message Identifiers. This tool finds which Line# entries match those IDs.",
)

demo.launch()