"""Gradio front-end (app.py) for the `logiccode` document-verification engine.

Lets a user upload PDFs/images, classifies each document via logiccode,
verifies that user-supplied keywords appear in the OCR'd text, and renders
an HTML verdict, a results table, an image gallery, and processing logs.
"""

import os
import re
import shutil  # kept: may be relied on elsewhere / by future handlers
import sys

import gradio as gr
import pandas as pd

# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory as app.py.
try:
    import logiccode
except ImportError as e:
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)


# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
class MockArgs:
    """Mimics the argparse namespace object that logiccode expects.

    logiccode reads a module-level ``args`` attribute; when launched from
    this UI (instead of the CLI) we install this stand-in with defaults.
    """

    def __init__(self):
        self.debug = False        # verbose diagnostics inside logiccode
        self.pages = 3            # max pages to OCR per PDF
        self.file = []
        self.inputkeywords = ""
        self.required = []
        self.fuzzy = True         # approximate (fuzzy) keyword matching
        self.visualize = False


# Initialize args in logiccode if not already present (e.g. CLI run).
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()


# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs,
                      fuzzy_match_enabled, debug_enabled):
    """Process uploaded files using the imported logiccode module.

    Args:
        files: list of Gradio file objects (each has a ``.name`` path),
            or None/empty when nothing was uploaded.
        keywords_input: raw keyword string from the textbox; split on
            commas and/or whitespace.
        required_docs: list of document-type names that must be present.
        fuzzy_match_enabled: enable approximate keyword matching.
        debug_enabled: include tracebacks / debug output in the logs.

    Returns:
        (summary_html, gallery_images, results_dataframe, logs_text)
        matching the four Gradio output components.
    """
    # 1. Propagate UI toggles into logiccode's global args.
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    results = []
    gallery_images = []
    logs = []

    # Parse keywords: accept comma- and/or whitespace-separated input so
    # "Name, ID Number" does not yield the unmatched token "Name,".
    user_keywords = [kw.strip()
                     for kw in re.split(r"[,\s]+", keywords_input or "")
                     if kw.strip()]

    # Track classified document types for the "required documents" check.
    found_documents = set()
    all_matched_keywords_per_file = []

    if not files:
        no_files_html = (
            "<div style='padding:16px;border-radius:8px;"
            "background:#fef3c7;color:#92400e;font-weight:bold;'>"
            "⚠️ No files uploaded</div>"
        )
        return no_files_html, [], pd.DataFrame(), "Please upload files to begin."

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files.
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)
        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate previews for the gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif file_path.lower().endswith('.pdf'):
            try:
                # Use logiccode's utility to render only the first page.
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                # Preview is best-effort; a failure must not stop analysis.
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text extraction & analysis ---
        try:
            # logiccode handles PDF vs image dispatch internally.
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename,
                    "Type": "Unreadable",
                    "Score": 0,
                    "Status": "FAILED",
                    "Matched Keywords": ""
                })
                continue

            # Normalize the concatenated page text into tokens.
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify the document type.
            doc_type, doc_score = logiccode.calculate_doc_type(
                ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify keywords. logiccode.verify_keywords returns
            # [{'keyword': 'x', 'matched': True/False, ...}, ...].
            verification_results = logiccode.verify_keywords(
                ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Determine per-file status: if keywords were provided, all of
            # them must match for "VERIFIED"; none matching is "FAILED".
            if user_keywords:
                file_status = ("VERIFIED"
                               if len(matched_kws) == len(user_keywords)
                               else "PARTIAL")
                if len(matched_kws) == 0:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"

            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")
            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })

        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())
            results.append({
                "File": filename,
                "Type": "Error",
                "Score": 0,
                "Status": "ERROR",
                "Matched Keywords": str(e)
            })

    # 3. Calculate summary logic: which required docs / keywords are missing
    #    across ALL files combined (a keyword found in any file counts).
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    all_user_keywords = set(user_keywords)
    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)
    missing_keywords = all_user_keywords - keywords_found_across_all_files

    # 4. Build the HTML report and return all four outputs.
    return (build_html_summary(required_set, missing_docs, missing_keywords),
            gallery_images,
            pd.DataFrame(results),
            "\n".join(logs))


def build_html_summary(required_set, missing_docs, missing_keywords):
    """Render the overall verification verdict as a styled HTML card.

    Args:
        required_set: set of document types the user marked as mandatory.
        missing_docs: subset of required_set not found in any file.
        missing_keywords: user keywords not matched in any file.

    Returns:
        An HTML string for the gr.HTML status component.
    """
    html = (
        "<div style='padding:20px;border-radius:12px;background:white;"
        "box-shadow:0 1px 3px rgba(0,0,0,0.1);'>"
        "<h3 style='margin-top:0;'>Verification Summary</h3>"
    )

    # Document status
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            html += (
                f"<p style='color:#ef4444;'>❌ Missing Documents: "
                f"{', '.join(sorted(missing_docs))}</p>"
            )
        else:
            html += ("<p style='color:#10b981;'>✅ Documents: "
                     "All required types found.</p>")
    else:
        html += ("<p style='color:#6b7280;'>ℹ️ No specific document "
                 "types required.</p>")

    # Keyword status
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        html += (
            f"<p style='color:#ef4444;'>❌ Missing Keywords: "
            f"{', '.join(sorted(missing_keywords))}</p>"
        )
    else:
        html += ("<p style='color:#10b981;'>✅ Keywords: "
                 "All keywords found.</p>")

    # Final status banner
    overall_ok = doc_status_bool and kw_status_bool
    overall_color = "#10b981" if overall_ok else "#ef4444"
    overall_text = "VERIFIED" if overall_ok else "ACTION REQUIRED"
    html += (
        f"<div style='margin-top:16px;padding:12px;border-radius:8px;"
        f"text-align:center;color:white;font-weight:bold;"
        f"background:{overall_color};'>{overall_text}</div>"
    )
    html += "</div>"
    return html


# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px"
)

with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )

    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                placeholder="Name, ID Number, Date of Birth...",
                info="Enter values that MUST appear in the documents (comma or space separated)",
                lines=2
            )

        # Right column: configuration
        with gr.Column(scale=3):
            # Fetch doc types dynamically from logiccode (if defined there).
            available_types = (sorted(list(logiccode.DOC_KEYWORDS.keys()))
                               if hasattr(logiccode, 'DOC_KEYWORDS') else [])
            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )
            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(
                    value=True,
                    label="Enable Fuzzy Matching (Approximate spelling)"
                )
                debug_checkbox = gr.Checkbox(
                    value=False,
                    label="Show Debug Logs"
                )

    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")
    gr.Markdown("---")

    # Results area
    with gr.Row():
        # Summary box
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")

        # Detailed tabs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )
                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3],
                        rows=[2],
                        object_fit="contain",
                        height="auto"
                    )
                with gr.TabItem("📝 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )

    # Event trigger: wire the button to the processing function.
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input,
                fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside containers.
    demo.launch(share=False, server_name="0.0.0.0")