joycecast committed on
Commit
56bb301
·
verified ·
1 Parent(s): 64fabbb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -17
app.py CHANGED
@@ -5,13 +5,13 @@ import requests
5
  from io import BytesIO
6
  import pandas as pd
7
 
8
- def check_pdf_messages(pdf_url, identifiers_input):
9
- # Parse identifiers
10
  identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip().isdigit()]
11
  if not identifiers:
12
  return "❌ No valid Message Identifiers entered.", None
13
 
14
- # Download PDF
15
  try:
16
  response = requests.get(pdf_url)
17
  response.raise_for_status()
@@ -19,43 +19,51 @@ def check_pdf_messages(pdf_url, identifiers_input):
19
  except Exception as e:
20
  return f"❌ Failed to load PDF: {str(e)}", None
21
 
22
- # Extract text using PyMuPDF
23
  try:
24
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
25
  full_text = "\n".join([page.get_text() for page in doc])
26
  except Exception as e:
27
- return f"❌ Failed to extract text from PDF: {str(e)}", None
28
 
29
- # Regex to match Line# + Message Identifier
30
- id_pattern = "|".join(re.escape(id_) for id_ in identifiers)
 
 
 
 
 
 
 
 
 
31
  regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})")
32
 
33
- # Parse matches
34
  matches = []
35
- for match in regex.finditer(full_text):
36
  line_num, msg_id = match.groups()
37
  matches.append({"Line#": int(line_num), "Message Identifier": msg_id})
38
 
39
  if not matches:
40
- return "✅ No matching Message Identifiers found in the PDF.", None
41
 
42
- # Return results as DataFrame
43
- df = pd.DataFrame(matches).sort_values(by="Line#").reset_index(drop=True)
44
- return "✅ Matches found:", df
45
 
46
- # Gradio UI
47
  demo = gr.Interface(
48
- fn=check_pdf_messages,
49
  inputs=[
50
  gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
51
  gr.Textbox(label="Message Identifier List", value="523"),
 
52
  ],
53
  outputs=[
54
  gr.Textbox(label="Status"),
55
  gr.Dataframe(label="Matching Lines", type="pandas"),
56
  ],
57
- title="PDF Message Identifier Line Checker",
58
- description="Paste a PDF URL and a comma-separated list of Message Identifiers. This tool finds which Line# entries match those IDs."
59
  )
60
 
61
  demo.launch()
 
5
  from io import BytesIO
6
  import pandas as pd
7
 
8
+ def check_latest_section(pdf_url, identifiers_input, split_marker):
9
+ # Step 1: Prepare identifiers
10
  identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip().isdigit()]
11
  if not identifiers:
12
  return "❌ No valid Message Identifiers entered.", None
13
 
14
+ # Step 2: Download PDF
15
  try:
16
  response = requests.get(pdf_url)
17
  response.raise_for_status()
 
19
  except Exception as e:
20
  return f"❌ Failed to load PDF: {str(e)}", None
21
 
22
+ # Step 3: Extract full text from PDF
23
  try:
24
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
25
  full_text = "\n".join([page.get_text() for page in doc])
26
  except Exception as e:
27
+ return f"❌ Failed to extract text: {str(e)}", None
28
 
29
+ # Step 4: Split by user-defined marker (optional)
30
+ if split_marker.strip() and split_marker in full_text:
31
+ parts = full_text.split(split_marker)
32
+ latest_block = parts[1]
33
+ note = f"✅ Found marker '{split_marker}', using the latest block."
34
+ else:
35
+ latest_block = full_text
36
+ note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
37
+
38
+ # Step 5: Match Line# and Message Identifier
39
+ id_pattern = "|".join(re.escape(i) for i in identifiers)
40
  regex = re.compile(rf"Line#\s+(\d+)\s+({id_pattern})")
41
 
 
42
  matches = []
43
+ for match in regex.finditer(latest_block):
44
  line_num, msg_id = match.groups()
45
  matches.append({"Line#": int(line_num), "Message Identifier": msg_id})
46
 
47
  if not matches:
48
+ return note + " No matching Message Identifiers found.", None
49
 
50
+ df = pd.DataFrame(matches).sort_values("Line#").reset_index(drop=True)
51
+ return note + " Matches found:", df
 
52
 
53
+ # Gradio Interface
54
  demo = gr.Interface(
55
+ fn=check_latest_section,
56
  inputs=[
57
  gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
58
  gr.Textbox(label="Message Identifier List", value="523"),
59
+ gr.Textbox(label="Split Marker (optional)", value="Record #"),
60
  ],
61
  outputs=[
62
  gr.Textbox(label="Status"),
63
  gr.Dataframe(label="Matching Lines", type="pandas"),
64
  ],
65
+ title="PDF Line# Identifier Checker (Latest Only)",
66
+ description="Checks Line# entries with specified Message Identifiers. If a split marker is provided, only the latest section is used; otherwise, the full document is scanned."
67
  )
68
 
69
  demo.launch()