Spaces:
Build error
Build error
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| import traceback | |
| from pathlib import Path | |
| from extractor import FinancialStatementExtractor | |
def _uploaded_path(file):
    """Return the filesystem path of one uploaded item as a string.

    Accepts a plain path (``str`` / ``os.PathLike``) or an object that exposes
    its path through a ``.name`` attribute.
    """
    # Check for path-like values first: ``Path.name`` is only the basename, so
    # falling through to ``.name`` would silently drop the directory part.
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _write_formatted_excel(df, output_path):
    """Write ``df`` to ``output_path`` as a styled single-sheet workbook.

    The header row gets a solid fill with bold white centred text, every
    column is sized to its longest value (capped at 50), and numeric cells in
    all columns but the first get a thousands-separated two-decimal format.
    """
    # Imported here, as in the original, so the module does not need
    # openpyxl.styles until a workbook is actually written.
    from openpyxl.styles import Alignment, Font, PatternFill

    # Built once per workbook instead of once per header cell.
    header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
    header_font = Font(bold=True, color='FFFFFF')
    header_alignment = Alignment(horizontal='center', vertical='center')

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Financial Statements', index=False)
        worksheet = writer.sheets['Financial Statements']

        for cell in worksheet[1]:
            cell.fill = header_fill
            cell.font = header_font
            cell.alignment = header_alignment

        for column in worksheet.columns:
            # Empty cells contribute nothing; previously they were measured as
            # the four-character string "None".
            widest = max(
                (len(str(cell.value)) for cell in column if cell.value is not None),
                default=0,
            )
            column_letter = column[0].column_letter
            worksheet.column_dimensions[column_letter].width = min(widest + 2, 50)

        for row in worksheet.iter_rows(min_row=2):
            # Skip first column (Particulars)
            for cell in row[1:]:
                if isinstance(cell.value, (int, float)):
                    cell.number_format = '#,##0.00'


def process_documents(files):
    """Process multiple uploaded documents and extract financial statements.

    Each upload is passed to ``FinancialStatementExtractor.extract_from_file``;
    every successful extraction is written to its own ``.xlsx`` file in the
    current working directory. A failure on one file is recorded in the
    summary and does not stop the remaining files.

    Args:
        files: Sequence of uploads. Each item is either a path (``str`` /
            ``os.PathLike``) or an object whose ``.name`` attribute is the path.

    Returns:
        A ``(output_files, summary)`` tuple. ``output_files`` is the list of
        written Excel paths, or ``None`` when nothing was uploaded, nothing
        could be processed, or an unexpected error occurred. ``summary`` is a
        Markdown string describing the outcome.
    """
    print(f"DEBUG: Received {len(files) if files else 0} files")
    if not files:
        return None, "Please upload at least one file"

    try:
        print("DEBUG: Initializing extractor...")
        extractor = FinancialStatementExtractor()
        print("DEBUG: Extractor initialized successfully")

        output_files = []
        summary_parts = []

        for idx, file in enumerate(files):
            # Bound before the try so the error handler below always has a
            # label, even when resolving the upload's path is what failed.
            file_name = f"file {idx + 1}"
            try:
                file_path = _uploaded_path(file)
                file_name = Path(file_path).name
                print(f"DEBUG: Processing file {idx + 1}: {file_name}")
                summary_parts.append(f"\n### File {idx + 1}: {file_name}")

                print(f"DEBUG: Extracting from {file_path}")
                result = extractor.extract_from_file(file_path)
                print(f"DEBUG: Extraction result status: {result['status']}")

                if result['status'] == 'error':
                    error_msg = f"❌ Error: {result['message']}"
                    print(f"DEBUG: {error_msg}")
                    summary_parts.append(error_msg)
                    continue

                output_path = f"financial_statements_{idx + 1}_{Path(file_name).stem}.xlsx"
                df = result['dataframe']

                # Column labels may be non-strings (e.g. integer years), so
                # convert before joining. The summary is built before the file
                # is counted so success and failure totals stay consistent.
                fiscal_years = ', '.join(
                    str(col) for col in df.columns if col != 'Particulars'
                )
                success_msg = (
                    "\n**Extraction successful!**\n"
                    f"- Line items extracted: {len(df)}\n"
                    f"- Fiscal years found: {fiscal_years}\n"
                    f"- Extraction method: {result.get('method', 'Pattern matching')}\n"
                )

                print(f"DEBUG: Creating Excel file: {output_path}")
                _write_formatted_excel(df, output_path)

                output_files.append(output_path)
                print(f"DEBUG: Successfully created {output_path}")
                summary_parts.append(success_msg)
            except Exception as file_error:
                # Per-file boundary: report the failure and move on.
                error_msg = f"❌ Failed to process {file_name}: {str(file_error)}"
                print(f"DEBUG ERROR: {error_msg}")
                print(f"DEBUG TRACEBACK: {traceback.format_exc()}")
                summary_parts.append(error_msg)

        if not output_files:
            error_summary = (
                "❌ Failed to process any files. Please check file formats and content.\n\n"
                + "\n".join(summary_parts)
            )
            print("DEBUG: No files processed successfully")
            return None, error_summary

        # Blank lines keep each statistic in its own paragraph and stop the
        # "---" rule from being parsed as a heading underline for the line
        # above it.
        final_summary = (
            "\n# Batch Extraction Complete\n\n"
            f"**Total files processed:** {len(files)}\n\n"
            f"**Successfully extracted:** {len(output_files)}\n\n"
            f"**Failed:** {len(files) - len(output_files)}\n\n"
            "---\n"
        ) + "\n".join(summary_parts)

        print(f"DEBUG: Returning {len(output_files)} output files")
        # Return all files and summary
        return output_files, final_summary
    except Exception as e:
        # Top-level boundary: surface the failure in the UI instead of raising.
        error_msg = f"Error processing files: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(f"DEBUG CRITICAL ERROR: {error_msg}")
        return None, error_msg
# Markdown copy shown in the interface, kept apart from the layout code.
_INTRO_MD = """
# Financial Statement Extraction Tool (Multi-Document)
Upload **multiple** financial documents to automatically extract income statement line items into Excel files.
**Supported formats:** PDF, DOCX, DOC, PNG, JPG, JPEG, TXT
**Features:**
- AI-powered semantic matching for line item normalization
- Batch processing - upload multiple documents at once
- Individual Excel output for each document
- Deterministic fallback for reliability
"""

_TIPS_MD = """
### 💡 Tips:
- Upload multiple files to process them in one batch
- Each file will generate a separate Excel output
- Supported: Annual reports, quarterly statements, scanned documents
"""

_HOW_IT_WORKS_MD = """
---
### How it works:
1. **Document Processing:** Extracts text from PDFs, Word docs, images (OCR), and text files
2. **AI Pattern Recognition:** Uses semantic similarity AI to identify and normalize financial line items
3. **Intelligent Matching:** Handles variations like "Revenue from Ops" vs "Operating Revenue"
4. **Data Extraction:** Reliably extracts numeric values using deterministic parsing
5. **Excel Generation:** Creates professionally formatted spreadsheets for each document
### Sample line items recognized:
- Revenue from operations, Other income, Total revenue
- Cost of materials consumed, Employee expenses, Depreciation
- EBITDA, EBIT, Profit before tax, Profit after tax
- And many more standard financial statement items...
### AI Technology:
- **Model:** Sentence-BERT (all-MiniLM-L6-v2) for semantic similarity
- **Method:** Cosine similarity matching between extracted items and standard terms
- **Fallback:** Rule-based normalization if AI confidence < 50%
- **Reliability:** Deterministic numeric extraction prevents hallucination
"""

# NOTE(review): the nesting below is reconstructed from the order of the
# calls; the captured source carried no indentation. Confirm the layout
# (upload controls in the first column, results in the second).
with gr.Blocks(title="Financial Statement Extraction Tool") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Upload controls, the trigger button and usage tips.
        with gr.Column():
            file_input = gr.File(
                label="Upload Financial Documents (Multiple files supported)",
                file_types=[".pdf", ".docx", ".doc", ".png", ".jpg", ".jpeg", ".txt"],
                file_count="multiple",
            )
            submit_btn = gr.Button(
                "Extract Financial Data from All Files",
                variant="primary",
                size="lg",
            )
            gr.Markdown(_TIPS_MD)

        # Generated workbooks and the Markdown summary.
        with gr.Column():
            output_files = gr.File(
                label="Download Excel Outputs",
                file_count="multiple",
            )
            output_text = gr.Markdown(label="Extraction Summary")

    # The two return values of process_documents feed the two outputs in order.
    submit_btn.click(
        fn=process_documents,
        inputs=file_input,
        outputs=[output_files, output_text],
    )

    gr.Markdown(_HOW_IT_WORKS_MD)


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()