Spaces:

Hamza4100
/

t1-tax-pdf-processor

Build error

App Files Files Community

t1-tax-pdf-processor / app_gradio.py

Hamza4100

Upload 23 files

aa8e38b verified about 1 month ago

raw

history blame contribute delete

6.72 kB

	"""
	Gradio UI for T1 Tax PDF Processor - Hugging Face Spaces Version
	"""
	import gradio as gr
	import tempfile
	import json
	from pathlib import Path

	from app.extractor import PDFExtractor
	from app.ocr import OCRProcessor
	from app.mapper import FieldMapper
	from app.filler import PDFFiller
	from app.utils.logging import get_logger

	logger = get_logger(__name__)

	# Initialize components
	# These module-level singletons are shared by every request handled below.
	extractor = PDFExtractor()
	mapper = FieldMapper()
	filler = PDFFiller()

	# Try to initialize OCR.
	# OCR is treated as optional: if it cannot be set up, the app keeps running
	# and process_pdfs() falls back to plain text extraction.
	try:
	    ocr = OCRProcessor()
	    ocr_available = ocr.is_available()
	except Exception as exc:
	    # Catch Exception rather than using a bare ``except`` so that
	    # KeyboardInterrupt / SystemExit still propagate, and record why OCR
	    # was disabled instead of failing silently.
	    logger.warning(f"OCR initialization failed; continuing without OCR: {exc}")
	    ocr = None
	    ocr_available = False


	def _upload_to_path(upload):
	    """Return the on-disk location of an uploaded file as a ``Path``."""
	    if isinstance(upload, (str, Path)):
	        return Path(upload)
	    # Anything else is expected to be a file-like wrapper that exposes its
	    # path through ``.name`` (the form the original code assumed).
	    return Path(upload.name)


	def process_pdfs(source_pdf, template_pdf, use_ocr=False, flatten_output=False):
	    """
	    Process PDFs: extract from source and fill template.

	    Args:
	        source_pdf: Uploaded source T1 PDF. Either a filesystem path (the
	            File inputs in this file are declared with ``type="filepath"``)
	            or a file-like object exposing its path as ``.name``.
	        template_pdf: Uploaded template PDF form; same accepted forms.
	        use_ocr: Force OCR processing even if the source has a text layer.
	            Only honoured when OCR was initialised successfully.
	        flatten_output: Flatten output PDF.

	    Returns:
	        tuple: (output_pdf_path, extraction_table_data, status_message).
	        ``output_pdf_path`` is ``None`` and the table is empty whenever no
	        filled PDF was produced (missing upload, nothing extracted, error).
	    """
	    output_path = None
	    try:
	        if not source_pdf or not template_pdf:
	            return None, [], "❌ Please upload both PDFs"

	        source_path = _upload_to_path(source_pdf)
	        template_path = _upload_to_path(template_pdf)

	        # Step 1: Extract data
	        has_text = extractor.has_text_content(source_path)

	        if has_text and not use_ocr:
	            result = extractor.extract_all_data(source_path)
	            method = "text"
	        elif ocr_available:
	            text = ocr.process_pdf(source_path)
	            result = {
	                "text": text,
	                "line_values": extractor.extract_line_values(text),
	                "has_text": False,
	                "extraction_method": "ocr",
	            }
	            method = "ocr"
	        else:
	            # OCR was needed or requested but is unavailable: best effort.
	            result = extractor.extract_all_data(source_path)
	            method = "text_fallback"

	        line_values = result["line_values"]

	        if not line_values:
	            return None, [], "⚠️ No T1 line values found in source PDF"

	        # Step 2: Map values
	        mapped = mapper.map_values(line_values)

	        # Step 3: Fill template
	        # Only the unique name is needed; the handle is closed before the
	        # filler writes to that path. delete=False keeps the file around so
	        # it can be handed back to the UI for download.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as output_file:
	            output_path = Path(output_file.name)

	        filler.fill_form(
	            template_path=template_path,
	            output_path=output_path,
	            field_values=mapped,
	            flatten=flatten_output,
	        )

	        # Create table data (rows with falsy values are skipped)
	        table_data = [
	            [line_num, mapper.get_field_name(line_num) or "unmapped", value]
	            for line_num, value in line_values.items()
	            if value
	        ]

	        status = f"✅ Success! Extracted {len(line_values)} lines using {method} method"

	        return str(output_path), table_data, status

	    except Exception as e:
	        logger.error(f"Processing error: {e}")
	        if output_path is not None:
	            # The temp file was created with delete=False; remove it so a
	            # failed run does not leave an orphaned file behind.
	            try:
	                output_path.unlink()
	            except OSError:
	                pass  # best-effort cleanup; the original error is reported below
	        return None, [], f"❌ Error: {str(e)}"


	def get_current_mappings():
	    """Get current field mappings as formatted text.

	    Reads every mapping from the module-level ``mapper`` and renders the
	    first 20 (sorted by line number) as a Markdown table, followed by a
	    note giving the number of mappings that were left out.

	    Returns:
	        str: Markdown text containing a heading and the table.
	    """
	    max_rows = 20  # Show first 20
	    mappings = mapper.get_all_mappings()

	    # Table delimiters must be plain "|": "\|" is not a valid Python escape
	    # sequence and would put a literal backslash in front of every pipe.
	    parts = [
	        "## Current Field Mappings\n\n",
	        "| Line # | Field Name | Description |\n",
	        "|--------|------------|-------------|\n",
	    ]

	    for line_num, info in sorted(mappings.items())[:max_rows]:
	        field = info.get("field", "")
	        desc = info.get("description", "")
	        parts.append(f"| {line_num} | {field} | {desc} |\n")

	    if len(mappings) > max_rows:
	        parts.append(f"\n... and {len(mappings) - max_rows} more mappings")

	    return "".join(parts)


	# Create Gradio Interface
	# NOTE(review): the nesting of the layout containers below was reconstructed
	# from statement order; confirm the Row/Column grouping against the intended
	# layout.
	with gr.Blocks(theme=gr.themes.Soft(), title="T1 Tax PDF Processor") as demo:

	    # Page header and usage instructions
	    gr.Markdown("""
	    # 📄 T1 Tax PDF Processor

	    Extract data from Canadian T1 tax return PDFs and automatically fill PDF forms.

	    How it works:
	    1. Upload your T1 tax return (source) and target form template (template)
	    2. Click "Process PDFs" to extract and fill
	    3. Download the filled PDF

	    ⚠️ Demo Only - Do not upload real sensitive tax documents to public demos!
	    """)

	    # Inputs: the two PDFs, processing options and the trigger button
	    with gr.Row():
	        with gr.Column():
	            source_pdf = gr.File(
	                label="📄 Source T1 PDF (with data)",
	                file_types=[".pdf"],
	                type="filepath"
	            )

	            template_pdf = gr.File(
	                label="📝 Target Template PDF (to fill)",
	                file_types=[".pdf"],
	                type="filepath"
	            )

	            with gr.Row():
	                use_ocr = gr.Checkbox(
	                    label="Force OCR (for scanned PDFs)",
	                    value=False,
	                    info="Enable if source PDF is scanned/image-based"
	                )

	                flatten_output = gr.Checkbox(
	                    label="Flatten output PDF",
	                    value=False,
	                    info="Make filled fields non-editable"
	                )

	            process_btn = gr.Button("🚀 Process PDFs", variant="primary", size="lg")

	            status_text = gr.Markdown("Ready to process...")

	    # Outputs: the filled PDF and a table of what was extracted
	    with gr.Row():
	        output_pdf = gr.File(
	            label="📥 Download Filled PDF",
	            type="filepath"
	        )

	    with gr.Row():
	        extraction_table = gr.Dataframe(
	            headers=["Line Number", "Field Name", "Value"],
	            label="📊 Extracted T1 Data",
	            wrap=True
	        )

	    # Process button action
	    # The three outputs line up with the tuple returned by process_pdfs:
	    # (output_pdf_path, extraction_table_data, status_message).
	    process_btn.click(
	        fn=process_pdfs,
	        inputs=[source_pdf, template_pdf, use_ocr, flatten_output],
	        outputs=[output_pdf, extraction_table, status_text]
	    )

	    # Mappings section
	    with gr.Accordion("🗺️ Field Mappings Configuration", open=False):
	        gr.Markdown("""
	        View current T1 line number to PDF field mappings.
	        To modify mappings, update `config/t1_mapping.json` in the repository.
	        """)

	        # Rendered once, when the interface is built.
	        mappings_display = gr.Markdown(get_current_mappings())

	    # Footer: reference table, tech stack and privacy notice.
	    # Table delimiters are plain "|"; "\|" is not a valid Python escape
	    # sequence and leaves a literal backslash in front of every pipe.
	    gr.Markdown("""
	    ---

	    ### 📚 Common T1 Line Numbers

	    | Line | Description |
	    |------|-------------|
	    | 10100 | Employment income |
	    | 15000 | Total income |
	    | 23600 | Net income |
	    | 26000 | Taxable income |
	    | 42000 | Net federal tax |
	    | 48400 | Refund |
	    | 48500 | Balance owing |

	    ### 🔧 Tech Stack

	    Built with: FastAPI, pdfplumber, PyMuPDF, Tesseract OCR, Gradio

	    ### ⚠️ Privacy Notice

	    This is a demonstration application. Files are processed temporarily and not stored permanently.
	    Do not upload real tax documents containing sensitive personal information.
	    """)

	# Launch
	# Start the web server only when this file is executed as a script, not
	# when it is imported.
	if __name__ == "__main__":
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo.launch(**launch_options)