# Source: Hugging Face Space file upload ("Upload 2 files", commit a081bdc, user triflix).
import gradio as gr
import os
import pandas as pd
import shutil
import sys
# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory
try:
    import logiccode
except ImportError as e:
    # Fail fast: every processing path below depends on logiccode's OCR and
    # classification helpers, so the app cannot run without it.
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)
# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
class MockArgs:
    """Stand-in for the argparse namespace that logiccode expects."""

    def __init__(self):
        # Mirror the attribute set an argparse parser would produce.
        defaults = {
            "debug": False,       # verbose diagnostics off by default
            "pages": 3,           # max PDF pages sent to OCR
            "file": [],           # no pre-selected input files
            "inputkeywords": "",  # no CLI keyword string
            "required": [],       # no mandatory document types
            "fuzzy": True,        # approximate keyword matching enabled
            "visualize": False,   # no debug visualisations
        }
        self.__dict__.update(defaults)
# Initialize args in logiccode if not already present
# (hasattr guard keeps a namespace logiccode built itself at import time).
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()
# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """
    Process uploaded files using the imported logiccode module.

    Args:
        files: list of Gradio file objects (each exposes a ``.name`` path), or None.
        keywords_input: keywords to verify, separated by spaces and/or commas.
        required_docs: list of document-type names that must be present.
        fuzzy_match_enabled: enable approximate keyword matching in logiccode.
        debug_enabled: include tracebacks and extra diagnostics in the logs.

    Returns:
        Tuple of (summary_html, gallery_images, results_dataframe, log_text)
        matching the four Gradio output components.
    """
    # Guard the empty case first so a no-op call never touches logiccode.
    if not files:
        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    # 1. Update global args in logiccode based on UI inputs
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    results = []          # one row per file for the results table
    gallery_images = []   # (path, caption) tuples for the gallery
    logs = []             # human-readable processing log lines

    # Accept commas as well as whitespace as separators (the UI placeholder
    # shows comma-separated examples); space-separated input still works.
    user_keywords = [kw.strip() for kw in keywords_input.replace(",", " ").split() if kw.strip()]

    found_documents = set()               # doc types seen, for the "required" check
    all_matched_keywords_per_file = []    # per-file sets of matched keywords

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)
        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate Previews for Gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))
        elif file_path.lower().endswith('.pdf'):
            try:
                # Use logiccode's utility to get a preview of the 1st page
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                # Preview failure is non-fatal; text extraction below may still work.
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text Extraction & Analysis ---
        try:
            # Extract text (logiccode handles PDF vs Image internally)
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": "0.0%",
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize text
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify Document
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify Keywords
            # logiccode.verify_keywords returns [{'keyword': 'x', 'matched': True/False, ...}]
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Per-file status: all keywords -> VERIFIED, some -> PARTIAL,
            # none -> FAILED; without keywords the row is informational only.
            if user_keywords:
                if len(matched_kws) == len(user_keywords):
                    file_status = "VERIFIED"
                elif matched_kws:
                    file_status = "PARTIAL"
                else:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"
            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")

            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })
        except Exception as e:
            error_msg = f"Error processing {filename}: {str(e)}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())
            results.append({
                "File": filename, "Type": "Error", "Score": "0.0%",
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Calculate Summary Logic: which required doc types / keywords are missing?
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)
    missing_keywords = set(user_keywords) - keywords_found_across_all_files

    # 4. Build HTML Report
    return build_html_summary(required_set, missing_docs, missing_keywords), gallery_images, pd.DataFrame(results), "\n".join(logs)
def build_html_summary(required_set, missing_docs, missing_keywords):
    """
    Render the overall verification summary as an HTML card.

    Args:
        required_set: set of document types the user marked as mandatory.
        missing_docs: subset of required_set that was not found in any file.
        missing_keywords: user keywords that matched in no file.

    Returns:
        HTML string ending in a large VERIFIED / ACTION REQUIRED banner.
    """
    html = """
    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
    <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
    """
    # Document Status (only meaningful when specific types were required).
    # NOTE: the original '✅' glyphs were mojibake ("βœ…"); restored here.
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Documents:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>"
        else:
            html += "<div style='margin-bottom: 8px;'>✅ <b>Documents:</b> All required types found.</div>"
    else:
        html += "<div style='margin-bottom: 8px; color: #666;'>ℹ️ No specific document types required.</div>"

    # Keyword Status
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Keywords:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>"
    else:
        html += "<div style='margin-bottom: 8px;'>✅ <b>Keywords:</b> All keywords found.</div>"

    # Final Status: green VERIFIED only when both checks pass.
    verified = doc_status_bool and kw_status_bool
    overall_color = "#10b981" if verified else "#ef4444"
    overall_text = "VERIFIED" if verified else "ACTION REQUIRED"
    html += "<hr style='margin: 15px 0; border-color: #eee;'>"
    html += f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>"
    html += "</div>"
    return html
# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
# Soft blue/slate theme: light-grey page background with white, bordered cards.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate").set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px",
)
# Build the Gradio interface. Mojibake emoji in the user-facing labels
# ("πŸ“„", "πŸ”", "πŸ“Š", "πŸ–ΌοΈ", "πŸ“") are restored to their intended glyphs.
with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )
    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )
            # NOTE(review): placeholder shows comma-separated examples while the
            # info text says space separated — confirm the intended format against
            # how process_documents tokenizes this field.
            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                placeholder="Name, ID Number, Date of Birth...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )
        # Right Column: Configuration
        with gr.Column(scale=3):
            # Fetch doc types dynamically from logiccode
            available_types = sorted(list(logiccode.DOC_KEYWORDS.keys())) if hasattr(logiccode, 'DOC_KEYWORDS') else []
            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )
            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")
    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")
    gr.Markdown("---")
    # Results Area
    with gr.Row():
        # Summary Box
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")
        # Detailed Tabs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )
                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )
                with gr.TabItem("📝 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )
    # Event Trigger: wire the button to the processing function.
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )
if __name__ == "__main__":
    # Increase max file size if needed, allow sharing
    # server_name="0.0.0.0" binds all interfaces (required inside containers /
    # HF Spaces); share=False keeps the app off Gradio's public tunnel.
    demo.launch(share=False, server_name="0.0.0.0")