Spaces:
Build error
Build error
| """ | |
| Gradio UI for T1 Tax PDF Processor - Hugging Face Spaces Version | |
| """ | |
| import gradio as gr | |
| import tempfile | |
| import json | |
| from pathlib import Path | |
| from app.extractor import PDFExtractor | |
| from app.ocr import OCRProcessor | |
| from app.mapper import FieldMapper | |
| from app.filler import PDFFiller | |
| from app.utils.logging import get_logger | |
logger = get_logger(__name__)

# Initialize components
# These are module-level singletons shared by every request handled below.
extractor = PDFExtractor()
mapper = FieldMapper()
filler = PDFFiller()

# Try to initialize OCR
# OCR is optional: any failure while constructing or probing the processor
# disables the OCR path instead of preventing the app from starting.
try:
    ocr = OCRProcessor()
    ocr_available = ocr.is_available()
except Exception as exc:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate, and record why OCR
    # was disabled so the cause is visible in the logs.
    logger.warning(f"OCR unavailable, continuing without it: {exc}")
    ocr = None
    ocr_available = False
def process_pdfs(source_pdf, template_pdf, use_ocr=False, flatten_output=False):
    """
    Process PDFs: extract from source and fill template.

    Args:
        source_pdf: Uploaded source T1 PDF, given either as a filesystem path
            (str / Path) or as a file-like object exposing the path as ``.name``
        template_pdf: Uploaded template PDF form, in the same forms as above
        use_ocr: Force OCR processing (only honoured when OCR is available)
        flatten_output: Flatten output PDF

    Returns:
        tuple: (output_pdf_path, extraction_table_data, status_message).
        On any failure the path is None, the table is empty and the status
        message describes the problem; exceptions are not propagated.
    """

    def _to_path(upload):
        # The file inputs are declared with type="filepath", in which case the
        # handler receives a plain path string; a tempfile-like wrapper carries
        # the path in .name instead. Path objects are checked explicitly
        # because Path.name would return only the final path component.
        if isinstance(upload, (str, Path)):
            return Path(upload)
        return Path(upload.name)

    # Tracked outside the try so a partially written output can be removed.
    output_path = None

    # NOTE(review): the leading symbols in the status strings look like
    # encoding-damaged emoji; they are left exactly as found - confirm intent.
    try:
        if not source_pdf or not template_pdf:
            return None, [], "β Please upload both PDFs"

        source_path = _to_path(source_pdf)
        template_path = _to_path(template_pdf)

        # Step 1: Extract data
        has_text = extractor.has_text_content(source_path)
        if has_text and not use_ocr:
            result = extractor.extract_all_data(source_path)
            method = "text"
        elif ocr_available:
            text = ocr.process_pdf(source_path)
            line_values = extractor.extract_line_values(text)
            result = {
                "text": text,
                "line_values": line_values,
                "has_text": False,
                "extraction_method": "ocr"
            }
            method = "ocr"
        else:
            # OCR was needed or requested but is not available: fall back to
            # plain text extraction and report that in the status message.
            result = extractor.extract_all_data(source_path)
            method = "text_fallback"

        line_values = result["line_values"]
        if not line_values:
            return None, [], "β οΈ No T1 line values found in source PDF"

        # Step 2: Map values
        mapped = mapper.map_values(line_values)

        # Step 3: Fill template
        # delete=False keeps the file on disk after the handle is closed so
        # that its path can be returned to the caller for download.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as output_file:
            output_path = Path(output_file.name)

        # The handle is closed before filling so the filler can reopen the
        # path on platforms that forbid a second open of the same file.
        filler.fill_form(
            template_path=template_path,
            output_path=output_path,
            field_values=mapped,
            flatten=flatten_output
        )

        # Create table data (rows with a falsy value are skipped)
        table_data = []
        for line_num, value in line_values.items():
            if value:
                field_name = mapper.get_field_name(line_num) or "unmapped"
                table_data.append([line_num, field_name, value])

        status = f"β Success! Extracted {len(line_values)} lines using {method} method"
        return str(output_path), table_data, status
    except Exception as e:
        logger.error(f"Processing error: {e}")
        # No path is returned on failure, so do not leave the temporary
        # output behind; cleanup is best-effort and must not mask the error.
        if output_path is not None:
            try:
                output_path.unlink()
            except OSError:
                pass
        return None, [], f"β Error: {str(e)}"
def get_current_mappings():
    """Render the mapper's field mappings as a Markdown table.

    Only the first 20 entries (in sorted key order) are listed; when more
    exist, a trailing line states how many were left out.
    """
    all_mappings = mapper.get_all_mappings()
    preview_limit = 20  # Show first 20

    pieces = [
        "## Current Field Mappings\n\n",
        "| Line # | Field Name | Description |\n",
        "|--------|------------|-------------|\n",
    ]

    ordered = sorted(all_mappings.items())
    for line_num, info in ordered[:preview_limit]:
        field = info.get("field", "")
        desc = info.get("description", "")
        pieces.append(f"| {line_num} | {field} | {desc} |\n")

    hidden = len(all_mappings) - preview_limit
    if hidden > 0:
        pieces.append(f"\n... and {hidden} more mappings")

    return "".join(pieces)
# Create Gradio Interface
# Builds the single-page UI: two PDF uploads, two option checkboxes, a process
# button, a status line, the downloadable result, a table of extracted values,
# and a collapsible view of the configured field mappings.
# NOTE(review): the source this was recovered from had lost its indentation,
# so the nesting of the Row/Column containers below (and the indentation inside
# the Markdown literals) is a reconstruction - confirm against the intended
# layout.
with gr.Blocks(theme=gr.themes.Soft(), title="T1 Tax PDF Processor") as demo:
    # Intro banner with usage steps and a privacy warning.
    gr.Markdown("""
    # π T1 Tax PDF Processor
    Extract data from Canadian T1 tax return PDFs and automatically fill PDF forms.
    **How it works:**
    1. Upload your T1 tax return (source) and target form template (template)
    2. Click "Process PDFs" to extract and fill
    3. Download the filled PDF
    β οΈ **Demo Only** - Do not upload real sensitive tax documents to public demos!
    """)
    with gr.Row():
        with gr.Column():
            # Both uploads are limited to .pdf files and declared with
            # type="filepath".
            source_pdf = gr.File(
                label="π Source T1 PDF (with data)",
                file_types=[".pdf"],
                type="filepath"
            )
            template_pdf = gr.File(
                label="π Target Template PDF (to fill)",
                file_types=[".pdf"],
                type="filepath"
            )
            # Processing options; both default to off.
            with gr.Row():
                use_ocr = gr.Checkbox(
                    label="Force OCR (for scanned PDFs)",
                    value=False,
                    info="Enable if source PDF is scanned/image-based"
                )
                flatten_output = gr.Checkbox(
                    label="Flatten output PDF",
                    value=False,
                    info="Make filled fields non-editable"
                )
            process_btn = gr.Button("π Process PDFs", variant="primary", size="lg")
    # Receives the status message returned by process_pdfs.
    status_text = gr.Markdown("Ready to process...")
    with gr.Row():
        # Receives the path of the filled PDF returned by process_pdfs.
        output_pdf = gr.File(
            label="π₯ Download Filled PDF",
            type="filepath"
        )
    with gr.Row():
        # Receives the [line number, field name, value] rows from process_pdfs.
        extraction_table = gr.Dataframe(
            headers=["Line Number", "Field Name", "Value"],
            label="π Extracted T1 Data",
            wrap=True
        )
    # Process button action
    # The input order matches process_pdfs' parameters and the output order
    # matches its (path, table, status) return tuple.
    process_btn.click(
        fn=process_pdfs,
        inputs=[source_pdf, template_pdf, use_ocr, flatten_output],
        outputs=[output_pdf, extraction_table, status_text]
    )
    # Mappings section
    with gr.Accordion("πΊοΈ Field Mappings Configuration", open=False):
        gr.Markdown("""
        View current T1 line number to PDF field mappings.
        To modify mappings, update `config/t1_mapping.json` in the repository.
        """)
        # get_current_mappings() is called here, while the UI is being built,
        # so the table shows the mappings as they were at that moment.
        mappings_display = gr.Markdown(get_current_mappings())
    # Static footer: reference table of line numbers, tech stack, privacy note.
    gr.Markdown("""
    ---
    ### π Common T1 Line Numbers
    | Line | Description |
    |------|-------------|
    | 10100 | Employment income |
    | 15000 | Total income |
    | 23600 | Net income |
    | 26000 | Taxable income |
    | 42000 | Net federal tax |
    | 48400 | Refund |
    | 48500 | Balance owing |
    ### π§ Tech Stack
    Built with: FastAPI, pdfplumber, PyMuPDF, Tesseract OCR, Gradio
    ### β οΈ Privacy Notice
    This is a demonstration application. Files are processed temporarily and not stored permanently.
    **Do not upload real tax documents containing sensitive personal information.**
    """)
# Launch
# The server is started only when this file is executed directly, not when
# the module is imported.
if __name__ == "__main__":
    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")