File size: 9,506 Bytes
fcad26a
 
 
 
 
 
c080e37
fcad26a
c080e37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d303d7
 
fcad26a
c080e37
fcad26a
56bb301
fcad26a
 
 
 
 
c080e37
fcad26a
56bb301
fcad26a
 
042f009
fcad26a
c080e37
 
 
 
 
 
 
 
 
 
 
fcad26a
c080e37
56bb301
 
c080e37
 
 
 
 
 
 
 
 
 
56bb301
 
 
 
c080e37
 
042f009
c080e37
 
042f009
 
c080e37
042f009
 
 
 
 
c080e37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
042f009
c080e37
 
 
042f009
c080e37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcad26a
c080e37
 
fcad26a
c080e37
 
 
fcad26a
56bb301
c080e37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcad26a
6d303d7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import gradio as gr
import fitz  # PyMuPDF
import re
import requests
from io import BytesIO
import pandas as pd
from datetime import datetime

def extract_first_datetime(pdf_url):
    """Return a status message with the first datetime stamp found in a PDF.

    Downloads the PDF at ``pdf_url``, joins the text of every page and looks
    for a stamp shaped like
    ``Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)``.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        A human-readable string: the matched stamp on success, otherwise a
        "❌ ..." message describing the failed download, the failed text
        extraction, or the absence of any stamp. Download and extraction
        errors are reported in the returned string rather than raised.
    """
    # The broad ``except Exception`` clauses below are deliberate: every
    # failure is turned into a status message instead of propagating.
    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {str(e)}"

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "\n".join(page.get_text("text") for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}"

    # Pattern to match datetime format like
    # "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)"
    datetime_pattern = r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'

    match = re.search(datetime_pattern, full_text)
    if match is None:
        return "❌ No datetime pattern found in the PDF"
    return f"✅ First datetime found: {match.group(0)}"

# Stamp shaped like "Wed May 21 2025 05:40:47 GMT-0700 (Pacific Daylight Time)".
_SECTION_DATETIME_RE = re.compile(
    r'\w{3}\s+\w{3}\s+\d{1,2}\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+GMT[+-]\d{4}\s+\([^)]+\)'
)

# A line starting with "Line# <number>" opens a new entry.
_LINE_HEADER_RE = re.compile(r"Line#\s+(\d+)")

# Labels that may introduce the narrative text of an entry, tried in this
# order (adjust to the structure of the PDFs being analysed).
_NARRATIVE_RES = (
    re.compile(r'Message:\s*(.+)', re.IGNORECASE),
    re.compile(r'Narrative:\s*(.+)', re.IGNORECASE),
    re.compile(r'Description:\s*(.+)', re.IGNORECASE),
    re.compile(r'Note:\s*(.+)', re.IGNORECASE),
)


def _narrative_message(content):
    """Return the labelled narrative of an entry, else a 200-char preview."""
    for pattern in _NARRATIVE_RES:
        match = pattern.search(content)
        if match:
            narrative = match.group(1).strip()
            if narrative:
                return narrative
            # Only the first matching label is considered.
            break
    return content[:200] + "..." if len(content) > 200 else content


def _missing_identifier_row(line_number, content_lines, identifiers):
    """Build the report row for one entry, or None when no row is needed."""
    if not content_lines:
        # Entries without any content lines are not reported.
        return None
    content = " ".join(content_lines)
    if any(identifier in content for identifier in identifiers):
        return None
    return {
        "Line#": line_number,
        # Rows exist only when no identifier occurs in the content, so there
        # is never identifier context to show; the column is kept so the
        # resulting table keeps its shape.
        "Identifier Message": "",
        "Narrative Message": _narrative_message(content),
        "Full Content": content,
    }


def _rows_missing_identifiers(block_text, identifiers):
    """Group text under its "Line#" header and report entries with no identifier."""
    entries = []  # (line number, content lines) in document order
    for raw_line in block_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header = _LINE_HEADER_RE.match(line)
        if header:
            # NOTE(review): any text following the number on the header line
            # itself is not treated as entry content.
            entries.append((int(header.group(1)), []))
        elif entries:
            # Text before the first header belongs to no entry and is skipped.
            entries[-1][1].append(line)

    rows = []
    for line_number, content_lines in entries:
        row = _missing_identifier_row(line_number, content_lines, identifiers)
        if row is not None:
            rows.append(row)
    return rows


def check_latest_section(pdf_url, identifiers_input, split_marker, parts_index):
    """Report the "Line#" entries of a PDF section that contain no identifier.

    The PDF is downloaded, its page text is joined, and optionally split on
    ``split_marker`` so that only one of the resulting blocks is analysed.
    Within that block, text is grouped under the preceding ``Line# <n>``
    header; an entry is reported when none of the identifiers occurs in it
    (plain substring test).

    Args:
        pdf_url: URL of the PDF to download.
        identifiers_input: Comma-separated identifiers; blanks are ignored.
        split_marker: Optional marker text. When it is non-blank and occurs
            in the PDF text, the text is split on it; otherwise the whole
            text is analysed.
        parts_index: 0-based index of the block to analyse after splitting.
            Values that are not integers or are out of range fall back to 1.

    Returns:
        A ``(status, table, datetime_status)`` tuple. ``status`` is a
        human-readable message; ``table`` is a ``pandas.DataFrame`` with the
        columns "Line#", "Identifier Message", "Narrative Message" and
        "Full Content", or ``None`` when there is nothing to show;
        ``datetime_status`` reports the first datetime stamp found in the
        PDF (empty string when the analysis stopped before that step).
        Download and extraction errors are reported through ``status``
        rather than raised.
    """
    # Step 1: Prepare identifiers
    identifiers = [
        token.strip()
        for token in (identifiers_input or "").split(',')
        if token.strip()
    ]
    if not identifiers:
        return "❌ No valid Identifiers entered.", None, ""

    # The broad ``except Exception`` clauses below are deliberate: every
    # failure is turned into a status message instead of propagating.

    # Step 2: Download PDF
    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_bytes = BytesIO(response.content)
    except Exception as e:
        return f"❌ Failed to load PDF: {str(e)}", None, ""

    # Step 3: Extract full text from PDF
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "\n".join(page.get_text("text") for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except Exception as e:
        return f"❌ Failed to extract text: {str(e)}", None, ""

    # Step 4: Extract datetime first (searched in the whole text, not just
    # the selected block)
    datetime_match = _SECTION_DATETIME_RE.search(full_text)
    if datetime_match:
        datetime_result = f"✅ First datetime found: {datetime_match.group(0)}"
    else:
        datetime_result = "❌ No datetime pattern found in the PDF"

    # Step 5: Split by user-defined marker (optional)
    split_marker = split_marker or ""
    if split_marker.strip() and split_marker in full_text:
        parts = full_text.split(split_marker)
        # The marker occurs at least once, so index 1 always exists and is a
        # safe fallback.
        try:
            parts_index = int(parts_index)
        except (ValueError, TypeError, OverflowError):
            parts_index = 1
        else:
            if not 0 <= parts_index < len(parts):
                parts_index = 1
        latest_block = parts[parts_index]
        note = f"✅ Found marker '{split_marker}', using block {parts_index} (0-indexed)."
    else:
        latest_block = full_text
        note = f"⚠️ Marker '{split_marker}' not found. Using entire PDF content."

    # Step 6: Collect Line# entries that contain none of the identifiers
    missing_identifiers_data = _rows_missing_identifiers(latest_block, identifiers)
    identifier_list = ', '.join(identifiers)

    if not missing_identifiers_data:
        return note + f" All lines contain at least one of the identifiers: {identifier_list}.", None, datetime_result

    # Create DataFrame with all the collected data
    df = pd.DataFrame(missing_identifiers_data)
    return note + f" Found {len(missing_identifiers_data)} lines missing all identifiers ({identifier_list}):", df, datetime_result

# Gradio Interface
with gr.Blocks(title="PDF Analysis Tool") as demo:
    gr.Markdown("# PDF Analysis Tool")

    gr.Markdown("## PDF Analysis and Datetime Extraction")

    # Inputs, in the order they are passed to check_latest_section.
    with gr.Row():
        pdf_url = gr.Textbox(label="PDF URL", placeholder="https://.../yourfile.pdf")
        identifiers_input = gr.Textbox(label="Identifier List", value="628, 995", placeholder="Enter identifiers separated by commas")
        split_marker = gr.Textbox(label="Split Marker (optional)", value="Record #")
        parts_index = gr.Number(label="Parts Index", value=1, minimum=0, step=1)

    with gr.Row():
        check_btn = gr.Button("Analyze PDF")

    # Outputs: status text and datetime text side by side, table below.
    with gr.Row():
        result_text = gr.Textbox(label="Status")
        datetime_result = gr.Textbox(label="Datetime Result")

    with gr.Row():
        result_df = gr.Dataframe(label="Lines Missing All Identifiers", type="pandas")

    # The outputs list follows the order of the tuple returned by
    # check_latest_section: (status, table, datetime status).
    check_btn.click(
        fn=check_latest_section,
        inputs=[pdf_url, identifiers_input, split_marker, parts_index],
        outputs=[result_text, result_df, datetime_result]
    )

# Start the server only when this file is run as a script, so importing the
# module (e.g. to reuse the functions or the `demo` object) has no side effect.
if __name__ == "__main__":
    demo.launch()