# t1-tax-pdf-processor / app_gradio.py
# Hamza4100's picture
# Upload 23 files
# aa8e38b verified
"""
Gradio UI for T1 Tax PDF Processor - Hugging Face Spaces Version
"""
import gradio as gr
import tempfile
import json
from pathlib import Path
from app.extractor import PDFExtractor
from app.ocr import OCRProcessor
from app.mapper import FieldMapper
from app.filler import PDFFiller
from app.utils.logging import get_logger
logger = get_logger(__name__)

# Initialize components once at import time; process_pdfs() and
# get_current_mappings() read these module-level instances.
extractor = PDFExtractor()
mapper = FieldMapper()
filler = PDFFiller()

# OCR is optional. If the processor cannot be constructed or queried, the app
# keeps running with OCR disabled instead of failing at import.
try:
    ocr = OCRProcessor()
    ocr_available = ocr.is_available()
except Exception as exc:
    # ``Exception`` (not a bare ``except``) so KeyboardInterrupt/SystemExit
    # still propagate; log the cause so a disabled OCR path is diagnosable.
    logger.warning(f"OCR initialization failed; continuing without OCR: {exc}")
    ocr = None
    ocr_available = False
def _upload_path(upload):
    """Return the filesystem path of an uploaded file as a ``Path``."""
    # Plain paths (including str subclasses) are used directly; anything else
    # is expected to expose its path through ``.name``, as the original code
    # assumed.
    if isinstance(upload, (str, Path)):
        return Path(upload)
    return Path(upload.name)


def process_pdfs(source_pdf, template_pdf, use_ocr=False, flatten_output=False):
    """
    Process PDFs: extract line values from the source and fill the template.

    Args:
        source_pdf: Source T1 PDF, given as a path or as an object whose
            ``name`` attribute is the path.
        template_pdf: Template PDF form, in the same accepted forms.
        use_ocr: Request OCR even when the source has a text layer. If OCR is
            not available, plain text extraction is used instead.
        flatten_output: Passed to the filler as ``flatten``.

    Returns:
        tuple: (output_pdf_path, extraction_table_data, status_message).
        ``output_pdf_path`` is ``None`` and the table is empty when an input
        is missing, no line values are found, or processing fails. Errors are
        reported through ``status_message`` rather than raised.
    """
    if not source_pdf or not template_pdf:
        return None, [], "❌ Please upload both PDFs"

    output_path = None
    try:
        source_path = _upload_path(source_pdf)
        template_path = _upload_path(template_pdf)

        # Step 1: Extract data
        has_text = extractor.has_text_content(source_path)
        if has_text and not use_ocr:
            line_values = extractor.extract_all_data(source_path)["line_values"]
            method = "text"
        elif ocr_available:
            text = ocr.process_pdf(source_path)
            line_values = extractor.extract_line_values(text)
            method = "ocr"
        else:
            # OCR was needed or requested but is unavailable.
            line_values = extractor.extract_all_data(source_path)["line_values"]
            method = "text_fallback"

        if not line_values:
            return None, [], "⚠️ No T1 line values found in source PDF"

        # Step 2: Map values
        mapped = mapper.map_values(line_values)

        # Step 3: Fill template
        # Reserve a unique output name and close the handle before the filler
        # writes to that path. delete=False keeps the file for the caller.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as output_file:
            output_path = Path(output_file.name)
        filler.fill_form(
            template_path=template_path,
            output_path=output_path,
            field_values=mapped,
            flatten=flatten_output,
        )

        # Table rows: only lines with a truthy value are shown.
        table_data = [
            [line_num, mapper.get_field_name(line_num) or "unmapped", value]
            for line_num, value in line_values.items()
            if value
        ]

        status = f"✅ Success! Extracted {len(line_values)} lines using {method} method"
        return str(output_path), table_data, status
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Processing error: {e}")
        # No output is returned on failure, so do not leave the temp file behind.
        if output_path is not None:
            try:
                output_path.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; the original error is what matters
        return None, [], f"❌ Error: {e}"
def get_current_mappings():
    """Render the configured field mappings as a Markdown table.

    Only the first 20 entries (in sorted key order) are listed; when more
    exist, a trailing line reports how many were left out.
    """
    all_mappings = mapper.get_all_mappings()

    pieces = [
        "## Current Field Mappings\n\n",
        "| Line # | Field Name | Description |\n",
        "|--------|------------|-------------|\n",
    ]

    shown = sorted(all_mappings.items())[:20]  # Show first 20
    pieces.extend(
        f"| {key} | {entry.get('field', '')} | {entry.get('description', '')} |\n"
        for key, entry in shown
    )

    remaining = len(all_mappings) - 20
    if remaining > 0:
        pieces.append(f"\n... and {remaining} more mappings")

    return "".join(pieces)
# Create Gradio Interface
# NOTE(review): the layout nesting below was reconstructed from a copy of this
# file that had lost its indentation — confirm against the deployed Space.
with gr.Blocks(theme=gr.themes.Soft(), title="T1 Tax PDF Processor") as demo:
    # Intro / usage text shown at the top of the page.
    gr.Markdown("""
# 📄 T1 Tax PDF Processor
Extract data from Canadian T1 tax return PDFs and automatically fill PDF forms.
**How it works:**
1. Upload your T1 tax return (source) and target form template (template)
2. Click "Process PDFs" to extract and fill
3. Download the filled PDF
⚠️ **Demo Only** - Do not upload real sensitive tax documents to public demos!
""")

    # Inputs: the two PDF uploads.
    with gr.Row():
        with gr.Column():
            source_pdf = gr.File(
                label="📄 Source T1 PDF (with data)",
                file_types=[".pdf"],
                type="filepath"
            )
            template_pdf = gr.File(
                label="📝 Target Template PDF (to fill)",
                file_types=[".pdf"],
                type="filepath"
            )

    # Processing options.
    with gr.Row():
        use_ocr = gr.Checkbox(
            label="Force OCR (for scanned PDFs)",
            value=False,
            info="Enable if source PDF is scanned/image-based"
        )
        flatten_output = gr.Checkbox(
            label="Flatten output PDF",
            value=False,
            info="Make filled fields non-editable"
        )

    process_btn = gr.Button("🚀 Process PDFs", variant="primary", size="lg")
    status_text = gr.Markdown("Ready to process...")

    # Outputs: the filled PDF and a table of what was extracted.
    with gr.Row():
        output_pdf = gr.File(
            label="📥 Download Filled PDF",
            type="filepath"
        )
    with gr.Row():
        extraction_table = gr.Dataframe(
            headers=["Line Number", "Field Name", "Value"],
            label="📊 Extracted T1 Data",
            wrap=True
        )

    # Process button action: the output order matches the tuple returned by
    # process_pdfs (output path, table rows, status message).
    process_btn.click(
        fn=process_pdfs,
        inputs=[source_pdf, template_pdf, use_ocr, flatten_output],
        outputs=[output_pdf, extraction_table, status_text]
    )

    # Mappings section
    with gr.Accordion("🗺️ Field Mappings Configuration", open=False):
        gr.Markdown("""
View current T1 line number to PDF field mappings.
To modify mappings, update `config/t1_mapping.json` in the repository.
""")
        # Rendered once, when the interface is built.
        mappings_display = gr.Markdown(get_current_mappings())

    # Static reference material and privacy notice.
    gr.Markdown("""
---
### 📚 Common T1 Line Numbers
| Line | Description |
|------|-------------|
| 10100 | Employment income |
| 15000 | Total income |
| 23600 | Net income |
| 26000 | Taxable income |
| 42000 | Net federal tax |
| 48400 | Refund |
| 48500 | Balance owing |
### 🔧 Tech Stack
Built with: FastAPI, pdfplumber, PyMuPDF, Tesseract OCR, Gradio
### ⚠️ Privacy Notice
This is a demonstration application. Files are processed temporarily and not stored permanently.
**Do not upload real tax documents containing sensitive personal information.**
""")
# Launch the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on all interfaces, port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)