# app.py — Gradio front-end for the logiccode document-verification pipeline.
import gradio as gr
import os
import pandas as pd
import shutil
import sys
# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory
# logiccode.py must sit next to app.py; fail fast with an actionable
# message rather than hitting a confusing NameError later on.
try:
    import logiccode
except ImportError as e:
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)
# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
# This class mimics the argparse object that logiccode expects
class MockArgs:
    """Stand-in for the argparse namespace that logiccode expects.

    logiccode was written as a CLI tool and reads configuration from a
    module-level ``args`` object; this supplies equivalent defaults for
    the web UI, which overrides the relevant fields per request.
    """

    def __init__(self):
        self.debug = False        # verbose/debug output off by default
        self.pages = 3            # max pages OCR'd per document
        self.file = []            # no files preloaded from the CLI
        self.inputkeywords = ""   # keywords come from the UI textbox
        self.required = []        # required doc types come from the UI
        self.fuzzy = True         # approximate keyword matching enabled
        self.visualize = False    # no plotting windows in a web app
# Give logiccode a default args namespace unless one was already set
# (e.g. when logiccode itself was launched from its own CLI entry point).
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()
# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """Run OCR classification and keyword verification over uploaded files.

    Args:
        files: list of Gradio file objects (each exposes a ``.name`` path),
            or a falsy value when nothing was uploaded.
        keywords_input: whitespace-separated keywords that must appear.
        required_docs: document-type names that are mandatory.
        fuzzy_match_enabled: allow approximate (fuzzy) keyword matching.
        debug_enabled: emit tracebacks and verbose classifier output.

    Returns:
        Tuple of (summary_html, gallery_images, results_dataframe, log_text),
        matching the four Gradio output components wired to the button.
    """
    # 1. Propagate UI toggles into logiccode's global args namespace.
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    results = []
    gallery_images = []
    logs = []

    # Keywords are whitespace-separated (matches the textbox `info` hint).
    user_keywords = [kw.strip() for kw in keywords_input.split() if kw.strip()]

    # Track which document types were found, for the "required" check.
    found_documents = set()
    all_matched_keywords_per_file = []

    if not files:
        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files.
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)
        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate previews for the gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif file_path.lower().endswith('.pdf'):
            try:
                # Render only the first PDF page as a preview image.
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                # Preview failure is non-fatal; analysis still proceeds.
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text extraction & analysis ---
        try:
            # logiccode dispatches PDF vs image extraction internally.
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": 0,
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize the concatenated page text into tokens.
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify the document type.
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify keywords; returns [{'keyword': ..., 'matched': bool, ...}].
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Per-file status: all keywords -> VERIFIED, some -> PARTIAL,
            # none -> FAILED; no keywords requested -> INFO ONLY.
            if user_keywords:
                if len(matched_kws) == len(user_keywords):
                    file_status = "VERIFIED"
                elif matched_kws:
                    file_status = "PARTIAL"
                else:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"
            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")

            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())
            results.append({
                "File": filename, "Type": "Error", "Score": 0,
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Summary logic: required types / keywords never seen in ANY file.
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)
    missing_keywords = set(user_keywords) - keywords_found_across_all_files

    # 4. Build the HTML report and return all four UI payloads.
    return build_html_summary(required_set, missing_docs, missing_keywords), \
        gallery_images, pd.DataFrame(results), "\n".join(logs)
def build_html_summary(required_set, missing_docs, missing_keywords):
    """Render the verification summary card as an HTML string.

    Args:
        required_set: document types the user marked as mandatory.
        missing_docs: required types that were not found in any upload.
        missing_keywords: user keywords never matched in any file.

    Returns:
        HTML with a per-category status line for documents and keywords,
        plus an overall VERIFIED / ACTION REQUIRED verdict.
    """
    html = """
    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
    <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
    """

    # Document status: only meaningful if the user required specific types.
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            html += ("<div style='margin-bottom: 8px;'>❌ <b>Missing Documents:</b> "
                     f"<span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>")
        else:
            html += "<div style='margin-bottom: 8px;'>✅ <b>Documents:</b> All required types found.</div>"
    else:
        html += "<div style='margin-bottom: 8px; color: #666;'>ℹ️ No specific document types required.</div>"

    # Keyword status: empty missing set means everything matched (or none given).
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        html += ("<div style='margin-bottom: 8px;'>❌ <b>Missing Keywords:</b> "
                 f"<span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>")
    else:
        html += "<div style='margin-bottom: 8px;'>✅ <b>Keywords:</b> All keywords found.</div>"

    # Overall verdict: green VERIFIED only when both categories pass.
    overall_ok = doc_status_bool and kw_status_bool
    overall_color = "#10b981" if overall_ok else "#ef4444"
    overall_text = "VERIFIED" if overall_ok else "ACTION REQUIRED"
    html += "<hr style='margin: 15px 0; border-color: #eee;'>"
    html += f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>"
    html += "</div>"
    return html
# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
# App-wide Gradio theme: soft blue/slate palette on a light page background.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px",
)
# ---------------------------------------------------------
# UI layout: inputs row (uploads + config), verify button, results row
# (summary card + tabbed table/gallery/logs).
# ---------------------------------------------------------
with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )

    with gr.Row():
        # Left column: file uploads and keywords to verify.
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                # Keywords are split on whitespace, so the example shows
                # space-separated single tokens (multi-word phrases won't match).
                placeholder="Name IDNumber DateOfBirth ...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )
        # Right column: configuration options.
        with gr.Column(scale=3):
            # Offer only the document types logiccode can actually classify.
            available_types = sorted(list(logiccode.DOC_KEYWORDS.keys())) if hasattr(logiccode, 'DOC_KEYWORDS') else []
            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )
            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")

    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")
    gr.Markdown("---")

    # Results area: summary card on the left, detail tabs on the right.
    with gr.Row():
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )
                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )
                with gr.TabItem("📋 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )

    # Wire the button to the processing pipeline; output order must match
    # the tuple returned by process_documents.
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from inside a
    # container (e.g. a HuggingFace Space); keep public sharing off.
    demo.launch(share=False, server_name="0.0.0.0")