# app.py — Gradio front-end for the logiccode document-verification pipeline.
import gradio as gr
import os
import pandas as pd
import shutil
import sys
# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory
# logiccode.py must sit next to app.py; fail fast with an actionable
# message rather than hitting a confusing NameError later on.
try:
    import logiccode
except ImportError as e:
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)
# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
# This class mimics the argparse object that logiccode expects
class MockArgs:
    """Stand-in for the argparse namespace that logiccode expects.

    logiccode was written as a CLI tool and reads configuration from a
    module-level ``args`` object; this supplies equivalent defaults for
    the web UI, which overrides the relevant fields per request.
    """

    def __init__(self):
        self.debug = False        # verbose/debug output off by default
        self.pages = 3            # max pages OCR'd per document
        self.file = []            # no files preloaded from the CLI
        self.inputkeywords = ""   # keywords come from the UI textbox
        self.required = []        # required doc types come from the UI
        self.fuzzy = True         # approximate keyword matching enabled
        self.visualize = False    # no plotting windows in a web app
# Give logiccode a default args namespace unless one was already set
# (e.g. when logiccode itself was launched from its own CLI entry point).
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()
# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """Run OCR classification and keyword verification over uploaded files.

    Args:
        files: list of Gradio file objects (each exposes a ``.name`` path),
            or a falsy value when nothing was uploaded.
        keywords_input: whitespace-separated keywords that must appear.
        required_docs: document-type names that are mandatory.
        fuzzy_match_enabled: allow approximate (fuzzy) keyword matching.
        debug_enabled: emit tracebacks and verbose classifier output.

    Returns:
        Tuple of (summary_html, gallery_images, results_dataframe, log_text),
        matching the four Gradio output components wired to the button.
    """
    # 1. Propagate UI toggles into logiccode's global args namespace.
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    results = []
    gallery_images = []
    logs = []

    # Keywords are whitespace-separated (matches the textbox `info` hint).
    user_keywords = [kw.strip() for kw in keywords_input.split() if kw.strip()]

    # Track which document types were found, for the "required" check.
    found_documents = set()
    all_matched_keywords_per_file = []

    if not files:
        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files.
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)
        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate previews for the gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif file_path.lower().endswith('.pdf'):
            try:
                # Render only the first PDF page as a preview image.
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                # Preview failure is non-fatal; analysis still proceeds.
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text extraction & analysis ---
        try:
            # logiccode dispatches PDF vs image extraction internally.
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": 0,
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize the concatenated page text into tokens.
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify the document type.
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify keywords; returns [{'keyword': ..., 'matched': bool, ...}].
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Per-file status: all keywords -> VERIFIED, some -> PARTIAL,
            # none -> FAILED; no keywords requested -> INFO ONLY.
            if user_keywords:
                if len(matched_kws) == len(user_keywords):
                    file_status = "VERIFIED"
                elif matched_kws:
                    file_status = "PARTIAL"
                else:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"
            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")

            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())
            results.append({
                "File": filename, "Type": "Error", "Score": 0,
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Summary logic: required types / keywords never seen in ANY file.
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)
    missing_keywords = set(user_keywords) - keywords_found_across_all_files

    # 4. Build the HTML report and return all four UI payloads.
    return build_html_summary(required_set, missing_docs, missing_keywords), \
        gallery_images, pd.DataFrame(results), "\n".join(logs)
def build_html_summary(required_set, missing_docs, missing_keywords):
    """Render the verification summary card as an HTML string.

    Args:
        required_set: document types the user marked as mandatory.
        missing_docs: required types that were not found in any upload.
        missing_keywords: user keywords never matched in any file.

    Returns:
        HTML with a per-category status line for documents and keywords,
        plus an overall VERIFIED / ACTION REQUIRED verdict.
    """
    html = """
    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
    <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
    """

    # Document status: only meaningful if the user required specific types.
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            html += ("<div style='margin-bottom: 8px;'>❌ <b>Missing Documents:</b> "
                     f"<span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>")
        else:
            html += "<div style='margin-bottom: 8px;'>✅ <b>Documents:</b> All required types found.</div>"
    else:
        html += "<div style='margin-bottom: 8px; color: #666;'>ℹ️ No specific document types required.</div>"

    # Keyword status: empty missing set means everything matched (or none given).
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        html += ("<div style='margin-bottom: 8px;'>❌ <b>Missing Keywords:</b> "
                 f"<span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>")
    else:
        html += "<div style='margin-bottom: 8px;'>✅ <b>Keywords:</b> All keywords found.</div>"

    # Overall verdict: green VERIFIED only when both categories pass.
    overall_ok = doc_status_bool and kw_status_bool
    overall_color = "#10b981" if overall_ok else "#ef4444"
    overall_text = "VERIFIED" if overall_ok else "ACTION REQUIRED"
    html += "<hr style='margin: 15px 0; border-color: #eee;'>"
    html += f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>"
    html += "</div>"
    return html
# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
# App-wide Gradio theme: soft blue/slate palette on a light page background.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px",
)
# ---------------------------------------------------------
# UI layout: inputs row (uploads + config), verify button, results row
# (summary card + tabbed table/gallery/logs).
# ---------------------------------------------------------
with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )

    with gr.Row():
        # Left column: file uploads and keywords to verify.
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                # Keywords are split on whitespace, so the example shows
                # space-separated single tokens (multi-word phrases won't match).
                placeholder="Name IDNumber DateOfBirth ...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )
        # Right column: configuration options.
        with gr.Column(scale=3):
            # Offer only the document types logiccode can actually classify.
            available_types = sorted(list(logiccode.DOC_KEYWORDS.keys())) if hasattr(logiccode, 'DOC_KEYWORDS') else []
            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )
            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")

    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")
    gr.Markdown("---")

    # Results area: summary card on the left, detail tabs on the right.
    with gr.Row():
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )
                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )
                with gr.TabItem("📋 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )

    # Wire the button to the processing pipeline; output order must match
    # the tuple returned by process_documents.
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from inside a
    # container (e.g. a HuggingFace Space); keep public sharing off.
    demo.launch(share=False, server_name="0.0.0.0")