File size: 11,827 Bytes
a081bdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import gradio as gr
import os
import pandas as pd
import shutil
import sys

# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory
try:
    import logiccode
except ImportError as import_error:
    # The app cannot do anything without the processing module, so report
    # the problem and stop with a non-zero exit status.
    print(
        "CRITICAL ERROR: Could not import 'logiccode.py'.",
        f"Ensure logiccode.py is in the same directory as app.py. Error: {import_error}",
        sep="\n",
    )
    sys.exit(1)

# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
# This class mimics the argparse object that logiccode expects
class MockArgs:
    """Stand-in for the ``args`` object that logiccode reads its settings from.

    Every attribute is an instance attribute set in ``__init__``, so each
    instance owns its own list objects.
    """

    def __init__(self):
        # Flag that process_documents overwrites from the UI on each run.
        self.debug: bool = False
        # Page limit handed to logiccode.get_ocr_text.
        self.pages: int = 3
        # The remaining fields are not read in this file.
        # presumably they mirror logiccode's CLI options; verify there.
        self.file: list = []
        self.inputkeywords: str = ""
        self.required: list = []
        # Flag that process_documents overwrites from the UI on each run.
        self.fuzzy: bool = True
        self.visualize: bool = False

# Give logiccode a default ``args`` object unless the module already has one.
try:
    logiccode.args
except AttributeError:
    logiccode.args = MockArgs()

# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """
    Process uploaded files using the imported logiccode module.

    For each file this optionally adds a gallery preview, extracts text with
    ``logiccode.get_ocr_text``, classifies it with
    ``logiccode.calculate_doc_type`` and checks the user's keywords with
    ``logiccode.verify_keywords``. A failure on one file is recorded in the
    results table and the log; it does not stop the remaining files.

    Args:
        files: Uploaded items from the file input. Each item may be a path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``. ``None`` or an empty list short-circuits.
        keywords_input: Whitespace-separated keywords to look for.
            ``None`` is treated as "no keywords".
        required_docs: Document type names that must be present.
            ``None`` is treated as "none required".
        fuzzy_match_enabled: Stored on ``logiccode.args.fuzzy`` and passed to
            ``logiccode.verify_keywords``.
        debug_enabled: Stored on ``logiccode.args.debug``, passed to
            ``logiccode.calculate_doc_type``; also adds tracebacks to the log.

    Returns:
        A 4-tuple ``(summary_html, gallery_images, results_dataframe, log_text)``
        matching the four output components wired to the verify button.
    """
    # 1. Update global args in logiccode based on UI inputs
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    # Bail out before touching any other input so an empty form never raises.
    if not files:
        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    # A cleared textbox / dropdown may hand over None instead of "" / [].
    # str.split() with no argument already drops empty and whitespace-only tokens.
    user_keywords = (keywords_input or "").split()
    required_docs = required_docs or []

    # Output containers
    results = []
    gallery_images = []
    logs = [
        f"Starting processing of {len(files)} files...",
        f"Target Keywords: {user_keywords}",
        f"Required Documents: {required_docs}",
    ]

    # Aggregates used for the final summary
    found_documents = set()
    keywords_found_across_all_files = set()

    # 2. Iterate through uploaded files
    for file_obj in files:
        file_path = _uploaded_file_path(file_obj)
        filename = os.path.basename(file_path)
        lower_path = file_path.lower()

        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate Previews for Gallery ---
        if lower_path.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif lower_path.endswith('.pdf'):
            # A failed preview is cosmetic only; log it and carry on to analysis.
            try:
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text Extraction & Analysis ---
        # Broad catch is deliberate: this is the per-file boundary, and one bad
        # file must not abort the whole batch.
        try:
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)

            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": 0,
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize text
            ocr_tokens = logiccode.normalize_text(" ".join(ocr_texts))

            # Classify document
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify keywords; each result is read via its 'keyword' and 'matched' keys.
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            keywords_found_across_all_files.update(matched_kws)

            file_status = _keyword_status(matched_kws, user_keywords)
            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")

            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })

        except Exception as e:
            logs.append(f"Error processing {filename}: {str(e)}")
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())

            results.append({
                "File": filename, "Type": "Error", "Score": 0,
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Calculate summary logic
    required_set = set(required_docs)
    missing_docs = required_set - found_documents
    missing_keywords = set(user_keywords) - keywords_found_across_all_files

    # 4. Build HTML report
    return (
        build_html_summary(required_set, missing_docs, missing_keywords),
        gallery_images,
        pd.DataFrame(results),
        "\n".join(logs),
    )


def _uploaded_file_path(file_obj):
    """Return the filesystem path for one uploaded item."""
    # Paths (including str subclasses) are used as-is; any other upload object
    # is expected to expose its path as ``.name``.
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    return file_obj.name


def _keyword_status(matched_keywords, requested_keywords):
    """Return the per-file status label for the given keyword match lists."""
    if not requested_keywords:
        return "INFO ONLY"
    if not matched_keywords:
        return "FAILED"
    if len(matched_keywords) == len(requested_keywords):
        return "VERIFIED"
    return "PARTIAL"

def build_html_summary(required_set, missing_docs, missing_keywords):
    """Render the verification summary card as an HTML string.

    Args:
        required_set: Document types marked as required. When empty, the
            document check is reported as not applicable and counts as passed.
        missing_docs: Required document types that were not found.
        missing_keywords: Keywords that were not matched.

    Returns:
        An HTML fragment with one line for documents, one for keywords, and an
        overall verdict: "VERIFIED" when both checks pass, otherwise
        "ACTION REQUIRED".
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def _escaped_list(values):
        # Values can come from user input and are rendered as raw HTML,
        # so escape each one. Sort first to keep the original ordering.
        return ", ".join(escape(str(value)) for value in sorted(values))

    row_style = "margin-bottom: 8px;"
    parts = [
        "<div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>",
        "<h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>",
    ]

    # Document status: only a failure when something was required and is missing.
    docs_ok = not (required_set and missing_docs)
    if not required_set:
        parts.append(f"<div style='{row_style} color: #666;'>ℹ️ No specific document types required.</div>")
    elif missing_docs:
        parts.append(
            f"<div style='{row_style}'>❌ <b>Missing Documents:</b> "
            f"<span style='color: #ef4444;'>{_escaped_list(missing_docs)}</span></div>"
        )
    else:
        parts.append(f"<div style='{row_style}'>✅ <b>Documents:</b> All required types found.</div>")

    # Keyword status
    keywords_ok = not missing_keywords
    if keywords_ok:
        parts.append(f"<div style='{row_style}'>✅ <b>Keywords:</b> All keywords found.</div>")
    else:
        parts.append(
            f"<div style='{row_style}'>❌ <b>Missing Keywords:</b> "
            f"<span style='color: #ef4444;'>{_escaped_list(missing_keywords)}</span></div>"
        )

    # Final status
    all_ok = docs_ok and keywords_ok
    overall_color = "#10b981" if all_ok else "#ef4444"
    overall_text = "VERIFIED" if all_ok else "ACTION REQUIRED"

    parts.append("<hr style='margin: 15px 0; border-color: #eee;'>")
    parts.append(f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>")
    parts.append("</div>")
    return "".join(parts)

# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
# Soft base theme in a blue/slate palette, with the page background, block
# background and block border width overridden.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate").set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px",
)

# Page layout: an input row (uploads + keywords on the left, configuration on
# the right), a divider, then a results row (summary card + detail tabs).
with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification

        Upload documents, specify required types, and verify content matches automatically.
        """
    )

    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )

            # NOTE(review): the placeholder reads like comma-separated phrases,
            # but process_documents splits this text on whitespace, as the
            # info line says. Confirm which wording is intended.
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                placeholder="Name, ID Number, Date of Birth...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )

        # Right Column: Configuration
        with gr.Column(scale=3):
            # Fetch doc types dynamically from logiccode; empty when the
            # module does not define DOC_KEYWORDS.
            available_types = sorted(getattr(logiccode, 'DOC_KEYWORDS', {}))

            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )

            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")

            verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")

    gr.Markdown("---")

    # Results Area
    with gr.Row():
        # Summary Box
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")

        # Detailed Tabs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )

                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )

                with gr.TabItem("📝 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )

    # Event Trigger: input and output order must match process_documents'
    # parameters and its returned 4-tuple.
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )

if __name__ == "__main__":
    # Listen on all interfaces; public share links stay disabled.
    launch_options = {"share": False, "server_name": "0.0.0.0"}
    demo.launch(**launch_options)