joycecast committed on
Commit
c080e37
·
verified ·
1 Parent(s): 042f009

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -35
app.py CHANGED
@@ -4,12 +4,38 @@ import re
4
  import requests
5
  from io import BytesIO
6
  import pandas as pd
 
7
 
8
- def check_latest_section(pdf_url, identifiers_input, split_marker):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # Step 1: Prepare identifiers (alphanumeric-safe)
10
  identifiers = [id.strip() for id in identifiers_input.split(',') if id.strip()]
11
  if not identifiers:
12
- return "❌ No valid Message Identifiers entered.", None
13
 
14
  # Step 2: Download PDF
15
  try:
@@ -17,65 +43,184 @@ def check_latest_section(pdf_url, identifiers_input, split_marker):
17
  response.raise_for_status()
18
  pdf_bytes = BytesIO(response.content)
19
  except Exception as e:
20
- return f"❌ Failed to load PDF: {str(e)}", None
21
 
22
  # Step 3: Extract full text from PDF
23
  try:
24
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
25
  full_text = "\n".join([page.get_text("text") for page in doc])
26
  except Exception as e:
27
- return f"❌ Failed to extract text: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
28
 
29
- # Step 4: Split by user-defined marker (optional)
30
  if split_marker.strip() and split_marker in full_text:
31
  parts = full_text.split(split_marker)
32
- latest_block = parts[1] # First block *after* the split
33
- note = f"βœ… Found marker '{split_marker}', using the latest block."
 
 
 
 
 
 
 
 
34
  else:
35
  latest_block = full_text
36
  note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."
37
 
38
- # Step 5: Track Line# context and find message identifiers below it
39
- id_pattern = set(identifiers)
40
- matches_set = set()
41
-
42
  current_line = None
 
 
43
  for line in latest_block.splitlines():
44
  line = line.strip()
45
-
46
  if not line:
47
  continue
48
 
49
  line_match = re.match(r"Line#\s+(\d+)", line)
50
  if line_match:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  current_line = int(line_match.group(1))
52
- continue
 
 
53
 
54
- if current_line is not None:
55
- for ident in id_pattern:
56
- if re.search(rf"\b{re.escape(ident)}\b", line):
57
- matches_set.add((current_line, ident))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- if not matches_set:
60
- return note + " No matching Message Identifiers found.", None
61
 
62
- df = pd.DataFrame(sorted(matches_set), columns=["Line#", "Message Identifier"])
63
- return note + " Matches found:", df
 
64
 
65
  # Gradio Interface
66
- demo = gr.Interface(
67
- fn=check_latest_section,
68
- inputs=[
69
- gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf"),
70
- gr.Textbox(label="Message Identifier List", value="523, P00, P02, 831"),
71
- gr.Textbox(label="Split Marker (optional)", value="Record #"),
72
- ],
73
- outputs=[
74
- gr.Textbox(label="Status"),
75
- gr.Dataframe(label="Matching Lines", type="pandas"),
76
- ],
77
- title="PDF Line# Identifier Checker (Reliable Contextual Matching)",
78
- description="Scans a PDF from URL, tracks Line# blocks and matches identifiers in the lines that follow."
79
- )
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  demo.launch()
 
4
  import requests
5
  from io import BytesIO
6
  import pandas as pd
7
+ from datetime import datetime
8
 
9
def extract_first_datetime(pdf_url):
    """Return a status message for the first datetime stamp found in a PDF.

    The PDF at ``pdf_url`` is downloaded, the text of every page is joined
    with newlines, and the result is searched for a stamp shaped like
    ``Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)``.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A human-readable string. On success it contains the matched stamp;
        otherwise it explains whether the download failed, the text
        extraction failed, or no stamp was present. Errors are reported in
        the returned string rather than raised.
    """
    try:
        # A timeout keeps an unresponsive server from blocking the caller
        # indefinitely (requests waits forever by default).
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {str(e)}"

    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}"

    # Pattern to match datetime format like
    # "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)"
    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'

    match = re.search(datetime_pattern, full_text)
    if match is None:
        return "❌ No datetime pattern found in the PDF"
    return f"✅ First datetime found: {match.group(0)}"
33
+
34
def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report the ``Line#`` entries of a PDF section that contain no identifier.

    The PDF is downloaded and converted to text. If ``split_marker`` occurs
    in the text, the text is split on it and the part at ``parts_index`` is
    analysed; otherwise the whole text is analysed. Within that text, every
    ``Line# <number>`` header starts an entry made of the non-empty lines
    that follow it. Entries whose text contains none of the identifiers are
    returned in a table.

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers, e.g. ``"628, 995"``.
        split_marker: Text used to split the PDF into parts; blank or absent
            from the PDF means the whole text is used.
        parts_index: 0-based index of the part to analyse. Values that are
            not integers or are out of range fall back to ``1``.

    Returns:
        A ``(status, table, datetime_status)`` tuple. ``status`` is a
        human-readable message, ``table`` is a ``pandas.DataFrame`` of the
        entries missing every identifier (``None`` when there are none or on
        failure), and ``datetime_status`` describes the first datetime stamp
        found in the PDF (``""`` when processing stopped before that step).
    """
    # Step 1: Prepare identifiers (alphanumeric-safe)
    identifiers = [item.strip() for item in identifiers_input.split(',') if item.strip()]
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    # Step 2: Download PDF
    try:
        # A timeout keeps an unresponsive server from blocking the UI
        # indefinitely (requests waits forever by default).
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {str(e)}", None, ""

    # Step 3: Extract full text from PDF
    try:
        # The context manager closes the document once the text is read.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            full_text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}", None, ""

    # Step 4: Extract datetime first
    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
    datetime_match = re.search(datetime_pattern, full_text)
    if datetime_match:
        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
    else:
        datetime_result = "❌ No datetime pattern found in the PDF"

    # Step 5: Split by user-defined marker (optional)
    if split_marker.strip() and split_marker in full_text:
        parts = full_text.split(split_marker)
        # The marker is present, so there are at least two parts and the
        # fallback index 1 is always valid.
        try:
            parts_index = int(parts_index)
        except (ValueError, TypeError, OverflowError):
            parts_index = 1  # Default to 1 if invalid input
        if not 0 <= parts_index < len(parts):
            parts_index = 1  # Default to 1 if out of range

        latest_block = parts[parts_index]
        note = f"✅ Found marker '{split_marker}', using block {parts_index} (0-indexed)."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 6: Collect Line# entries that contain none of the identifiers.
    # Identifiers are matched as whole tokens so that "628" is not
    # considered present merely because "16280" appears in the text.
    identifier_regex = re.compile(
        "|".join(rf"(?<!\w){re.escape(item)}(?!\w)" for item in identifiers)
    )
    missing_identifiers_data = [
        _missing_identifier_row(line_number, content)
        for line_number, content in _iter_line_entries(latest_block)
        if not identifier_regex.search(content)
    ]

    joined_identifiers = ', '.join(identifiers)
    if not missing_identifiers_data:
        return note + f" All lines contain at least one of the identifiers: {joined_identifiers}.", None, datetime_result

    # Create DataFrame with all the collected data
    df = pd.DataFrame(missing_identifiers_data)
    return note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({joined_identifiers}):", df, datetime_result


def _iter_line_entries(block_text):
    """Yield ``(line_number, content)`` for each ``Line#`` header with text under it."""
    current_line = None
    collected = []
    for raw_line in block_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        header = re.match(r"Line#\s+(\d+)", line)
        if header:
            # A new header closes the previous entry; headers with no text
            # beneath them produce no entry.
            if current_line is not None and collected:
                yield current_line, " ".join(collected)
            current_line = int(header.group(1))
            collected = []
        elif current_line is not None:
            collected.append(line)

    # The final entry has no following header to close it.
    if current_line is not None and collected:
        yield current_line, " ".join(collected)


def _missing_identifier_row(line_number, content):
    """Build the table row for an entry that contains none of the identifiers."""
    narrative_msg = ""
    # Look for narrative message patterns (may need adjusting to the PDF
    # structure); the first pattern that matches wins.
    for pattern in (r'Message:\s*(.+)', r'Narrative:\s*(.+)', r'Description:\s*(.+)', r'Note:\s*(.+)'):
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            narrative_msg = match.group(1).strip()
            break

    # If no specific narrative pattern found, use the (truncated) content
    if not narrative_msg:
        narrative_msg = content[:200] + "..." if len(content) > 200 else content

    return {
        "Line#": line_number,
        # Always empty: rows are only built for entries without any
        # identifier. The column is kept so the table layout is unchanged.
        "Identifier Message": "",
        "Narrative Message": narrative_msg,
        "Full Content": content,
    }
197
 
198
  # Gradio Interface
199
with gr.Blocks(title="PDF Analysis Tool") as demo:
    # Page heading and section heading.
    gr.Markdown("# PDF Analysis Tool")
    gr.Markdown("## PDF Analysis and Datetime Extraction")

    # Row 1: the four inputs passed to check_latest_section, in argument order.
    with gr.Row():
        url_box = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
        identifiers_box = gr.Textbox(
            label="Identifier List",
            value="628, 995",
            placeholder="Enter identifiers separated by commas",
        )
        marker_box = gr.Textbox(label="Split Marker (optional)", value="Record #")
        index_box = gr.Number(label="Parts Index", value=1, minimum=0, step=1)

    # Row 2: the button that starts the analysis.
    with gr.Row():
        analyze_button = gr.Button("Analyze PDF")

    # Row 3: textual results.
    with gr.Row():
        status_box = gr.Textbox(label="Status")
        datetime_box = gr.Textbox(label="Datetime Result")

    # Row 4: tabular result.
    with gr.Row():
        missing_table = gr.Dataframe(label="Lines Missing All Identifiers", type="pandas")

    # The output order mirrors the (status, table, datetime) tuple returned
    # by check_latest_section.
    analysis_inputs = [url_box, identifiers_box, marker_box, index_box]
    analysis_outputs = [status_box, missing_table, datetime_box]
    analyze_button.click(
        fn=check_latest_section,
        inputs=analysis_inputs,
        outputs=analysis_outputs,
    )

demo.launch()