# app.py — Gradio front-end for the document verification Space.
import gradio as gr
import os
import pandas as pd
import shutil  # NOTE(review): not referenced in this file — confirm before removing
import sys

# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory.
# Fail fast with a clear message instead of a raw traceback, since the
# whole app is unusable without the processing backend.
try:
    import logiccode
except ImportError as e:
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)
| # --------------------------------------------------------- | |
| # MOCK ARGUMENTS | |
| # --------------------------------------------------------- | |
| # This class mimics the argparse object that logiccode expects | |
class MockArgs:
    """Stand-in for the argparse.Namespace that logiccode expects.

    Supplies the same attribute names and defaults the CLI parser would,
    so logiccode can run unmodified inside the web app.
    """

    def __init__(self):
        # Defaults mirror the CLI configuration; the dict is rebuilt per
        # instance, so the mutable list defaults are never shared.
        cli_defaults = {
            "debug": False,
            "pages": 3,
            "file": [],
            "inputkeywords": "",
            "required": [],
            "fuzzy": True,
            "visualize": False,
        }
        for attr_name, default_value in cli_defaults.items():
            setattr(self, attr_name, default_value)
# Initialize args in logiccode if not already present.
# logiccode reads its configuration from a module-level 'args' object;
# the hasattr guard keeps a CLI-supplied namespace if one already exists.
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()
| # --------------------------------------------------------- | |
| # CORE PROCESSING FUNCTION | |
| # --------------------------------------------------------- | |
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """
    Process uploaded files using the imported logiccode module.

    Args:
        files: list of Gradio file objects (each exposing a ``.name`` path),
            or None/empty when nothing was uploaded.
        keywords_input: whitespace-separated keywords that must appear in
            the documents.
        required_docs: list of document-type names that must be present
            across the batch.
        fuzzy_match_enabled: allow approximate (fuzzy) keyword matching.
        debug_enabled: include tracebacks and extra detail in the logs.

    Returns:
        Tuple of (summary_html, gallery_images, results_dataframe, log_text)
        matching the four Gradio output components.
    """
    # 1. Update global args in logiccode based on UI inputs
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    # Initialize output containers
    results = []
    gallery_images = []
    logs = []

    # Parse keywords (whitespace separated; blanks dropped)
    user_keywords = [kw.strip() for kw in keywords_input.split() if kw.strip()]

    # Track found documents for "Required" check
    found_documents = set()
    all_matched_keywords_per_file = []

    if not files:
        return "<h3>β οΈ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)
        # FIX: restored the {filename} interpolation here and below — the
        # f-strings had lost their placeholder and printed literal text.
        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate Previews for Gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif file_path.lower().endswith('.pdf'):
            try:
                # Use logiccode's utility to get a preview of the 1st page
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                # Preview failure is non-fatal; text extraction still runs below.
                logs.append(f"β οΈ PDF Preview failed for {filename}: {e}")

        # --- B. Text Extraction & Analysis ---
        try:
            # Extract text (logiccode handles PDF vs Image internally)
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
            if not ocr_texts:
                logs.append(f"β οΈ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": 0,
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize text
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify Document
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify Keywords
            # logiccode.verify_keywords returns [{'keyword': 'x', 'matched': True/False, ...}]
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Determine File Status: all keywords matched -> VERIFIED,
            # some -> PARTIAL, none -> FAILED, no keywords given -> INFO ONLY.
            if user_keywords:
                if len(matched_kws) == len(user_keywords):
                    file_status = "VERIFIED"
                elif matched_kws:
                    file_status = "PARTIAL"
                else:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"

            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")
            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())
            # Record the failure as a row so the file still appears in the table.
            results.append({
                "File": filename, "Type": "Error", "Score": 0,
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Calculate Summary Logic
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    # A keyword counts as found if it matched in ANY uploaded file.
    all_user_keywords = set(user_keywords)
    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)
    missing_keywords = all_user_keywords - keywords_found_across_all_files

    # 4. Build HTML Report
    return build_html_summary(required_set, missing_docs, missing_keywords), gallery_images, pd.DataFrame(results), "\n".join(logs)
def build_html_summary(required_set, missing_docs, missing_keywords):
    """Render the overall verification summary as an HTML card.

    Args:
        required_set: set of document types the user marked as required.
        missing_docs: subset of required_set that was not found.
        missing_keywords: keywords that matched in no uploaded file.

    Returns:
        HTML string with one status line per category and an overall
        VERIFIED / ACTION REQUIRED verdict.
    """
    # NOTE(review): the "β"/"β οΈ"/"βΉοΈ" markers look like mojibake of the
    # original status emoji — confirm the source encoding before normalizing.
    # Build via list + join instead of repeated string concatenation.
    parts = [
        """
    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
        <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
    """
    ]

    # Document Status
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            parts.append(
                "<div style='margin-bottom: 8px;'>β <b>Missing Documents:</b> "
                f"<span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>"
            )
        else:
            parts.append("<div style='margin-bottom: 8px;'>β <b>Documents:</b> All required types found.</div>")
    else:
        parts.append("<div style='margin-bottom: 8px; color: #666;'>βΉοΈ No specific document types required.</div>")

    # Keyword Status
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        parts.append(
            "<div style='margin-bottom: 8px;'>β <b>Missing Keywords:</b> "
            f"<span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>"
        )
    else:
        parts.append("<div style='margin-bottom: 8px;'>β <b>Keywords:</b> All keywords found.</div>")

    # Final Status: green VERIFIED only when both categories are satisfied.
    all_ok = doc_status_bool and kw_status_bool
    overall_color = "#10b981" if all_ok else "#ef4444"
    overall_text = "VERIFIED" if all_ok else "ACTION REQUIRED"
    parts.append("<hr style='margin: 15px 0; border-color: #eee;'>")
    parts.append(f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>")
    parts.append("</div>")
    return "".join(parts)
# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
# Soft light theme: near-white page background with white bordered cards,
# so the HTML summary card (white background) blends in.
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px"
)
with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # π Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )
    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )
            # NOTE(review): placeholder shows comma-separated examples, but
            # process_documents splits on whitespace — confirm intended format.
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                placeholder="Name, ID Number, Date of Birth...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )
        # Right Column: Configuration
        with gr.Column(scale=3):
            # Fetch doc types dynamically from logiccode; fall back to an
            # empty list if the constant is absent so the UI still loads.
            available_types = sorted(list(logiccode.DOC_KEYWORDS.keys())) if hasattr(logiccode, 'DOC_KEYWORDS') else []
            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )
            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")
    verify_btn = gr.Button("π Verify Documents", variant="primary", size="lg")
    gr.Markdown("---")
    # Results Area
    with gr.Row():
        # Summary Box
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")
        # Detailed Tabs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("π Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )
                with gr.TabItem("πΌοΈ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )
                with gr.TabItem("π System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )
    # Event Trigger: outputs map 1:1 to the 4-tuple that
    # process_documents returns (summary html, gallery, dataframe, logs).
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )
if __name__ == "__main__":
    # Increase max file size if needed, allow sharing
    # 0.0.0.0 binds all interfaces (required inside Docker / HF Spaces);
    # share=False keeps the app from opening a public Gradio tunnel.
    demo.launch(share=False, server_name="0.0.0.0")