# Source: Hugging Face Space file upload ("Upload 2 files", commit a081bdc, user triflix).
import gradio as gr
import os
import pandas as pd
import shutil
import sys
# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory
try:
    import logiccode
except ImportError as e:
    # Fail fast: every processing path below depends on logiccode's OCR and
    # classification helpers, so the app cannot run without it.
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)
# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
class MockArgs:
    """Stand-in for the argparse namespace that logiccode expects."""

    def __init__(self):
        # Mirror the attribute set an argparse parser would produce.
        defaults = {
            "debug": False,       # verbose diagnostics off by default
            "pages": 3,           # max PDF pages sent to OCR
            "file": [],           # no pre-selected input files
            "inputkeywords": "",  # no CLI keyword string
            "required": [],       # no mandatory document types
            "fuzzy": True,        # approximate keyword matching enabled
            "visualize": False,   # no debug visualisations
        }
        self.__dict__.update(defaults)
# Initialize args in logiccode if not already present
# (hasattr guard keeps a namespace logiccode built itself at import time).
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()
# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """
    Process uploaded files using the imported logiccode module.

    Args:
        files: list of Gradio file objects (each exposes a ``.name`` path), or None.
        keywords_input: keywords to verify, separated by spaces and/or commas.
        required_docs: list of document-type names that must be present.
        fuzzy_match_enabled: enable approximate keyword matching in logiccode.
        debug_enabled: include tracebacks and extra diagnostics in the logs.

    Returns:
        Tuple of (summary_html, gallery_images, results_dataframe, log_text)
        matching the four Gradio output components.
    """
    # Guard the empty case first so a no-op call never touches logiccode.
    if not files:
        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    # 1. Update global args in logiccode based on UI inputs
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    results = []          # one row per file for the results table
    gallery_images = []   # (path, caption) tuples for the gallery
    logs = []             # human-readable processing log lines

    # Accept commas as well as whitespace as separators (the UI placeholder
    # shows comma-separated examples); space-separated input still works.
    user_keywords = [kw.strip() for kw in keywords_input.replace(",", " ").split() if kw.strip()]

    found_documents = set()               # doc types seen, for the "required" check
    all_matched_keywords_per_file = []    # per-file sets of matched keywords

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)
        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate Previews for Gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif file_path.lower().endswith('.pdf'):
            try:
                # Use logiccode's utility to get a preview of the 1st page
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                # Preview failure is non-fatal; text extraction below may still work.
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text Extraction & Analysis ---
        try:
            # Extract text (logiccode handles PDF vs Image internally)
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": "0.0%",
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize text
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify Document
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify Keywords
            # logiccode.verify_keywords returns [{'keyword': 'x', 'matched': True/False, ...}]
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Per-file status: all keywords -> VERIFIED, some -> PARTIAL,
            # none -> FAILED; without keywords the row is informational only.
            if user_keywords:
                if len(matched_kws) == len(user_keywords):
                    file_status = "VERIFIED"
                elif matched_kws:
                    file_status = "PARTIAL"
                else:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"
            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")

            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())
            results.append({
                "File": filename, "Type": "Error", "Score": "0.0%",
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Calculate Summary Logic: which required doc types / keywords are missing?
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)
    missing_keywords = set(user_keywords) - keywords_found_across_all_files

    # 4. Build HTML Report
    return build_html_summary(required_set, missing_docs, missing_keywords), gallery_images, pd.DataFrame(results), "\n".join(logs)
def build_html_summary(required_set, missing_docs, missing_keywords):
    """
    Render the overall verification summary as an HTML card.

    Args:
        required_set: set of document types the user marked as mandatory.
        missing_docs: subset of required_set that was not found in any file.
        missing_keywords: user keywords that matched in no file.

    Returns:
        HTML string ending in a large VERIFIED / ACTION REQUIRED banner.
    """
    html = """
    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
    <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
    """
    # Document Status (only meaningful when specific types were required).
    # NOTE: the original '✅' glyphs were mojibake ("βœ…"); restored here.
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Documents:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>"
        else:
            html += "<div style='margin-bottom: 8px;'>✅ <b>Documents:</b> All required types found.</div>"
    else:
        html += "<div style='margin-bottom: 8px; color: #666;'>ℹ️ No specific document types required.</div>"

    # Keyword Status
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Keywords:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>"
    else:
        html += "<div style='margin-bottom: 8px;'>✅ <b>Keywords:</b> All keywords found.</div>"

    # Final Status: green VERIFIED only when both checks pass.
    verified = doc_status_bool and kw_status_bool
    overall_color = "#10b981" if verified else "#ef4444"
    overall_text = "VERIFIED" if verified else "ACTION REQUIRED"
    html += "<hr style='margin: 15px 0; border-color: #eee;'>"
    html += f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>"
    html += "</div>"
    return html
# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
# Soft blue/slate theme: light-grey page background with white, bordered cards.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate").set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px",
)
# Build the Gradio interface. Mojibake emoji in the user-facing labels
# ("πŸ“„", "πŸ”", "πŸ“Š", "πŸ–ΌοΈ", "πŸ“") are restored to their intended glyphs.
with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )
    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )
            # NOTE(review): placeholder shows comma-separated examples while the
            # info text says space separated — confirm the intended format against
            # how process_documents tokenizes this field.
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                placeholder="Name, ID Number, Date of Birth...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )
        # Right Column: Configuration
        with gr.Column(scale=3):
            # Fetch doc types dynamically from logiccode
            available_types = sorted(list(logiccode.DOC_KEYWORDS.keys())) if hasattr(logiccode, 'DOC_KEYWORDS') else []
            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )
            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")
    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")
    gr.Markdown("---")
    # Results Area
    with gr.Row():
        # Summary Box
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")
        # Detailed Tabs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )
                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )
                with gr.TabItem("📝 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )
    # Event Trigger: wire the button to the processing function.
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )
if __name__ == "__main__":
    # Increase max file size if needed, allow sharing
    # server_name="0.0.0.0" binds all interfaces (required inside containers /
    # HF Spaces); share=False keeps the app off Gradio's public tunnel.
    demo.launch(share=False, server_name="0.0.0.0")