"""Gradio front end that batch-extracts financial statements into Excel files."""

import gradio as gr
import pandas as pd
import os
import traceback
from pathlib import Path
from extractor import FinancialStatementExtractor

# Worksheet formatting constants.
_SHEET_NAME = 'Financial Statements'
_HEADER_COLOR = '366092'
_MAX_COLUMN_WIDTH = 50
_NUMBER_FORMAT = '#,##0.00'

# Static Markdown shown in the UI. Kept flush-left at module level so the
# rendered text never depends on how the surrounding code is indented.
_INTRO_MD = """
# Financial Statement Extraction Tool (Multi-Document)

Upload **multiple** financial documents to automatically extract income statement line items into Excel files.

**Supported formats:** PDF, DOCX, DOC, PNG, JPG, JPEG, TXT

**Features:**
- AI-powered semantic matching for line item normalization
- Batch processing - upload multiple documents at once
- Individual Excel output for each document
- Deterministic fallback for reliability
"""

_TIPS_MD = """
### 💡 Tips:
- Upload multiple files to process them in one batch
- Each file will generate a separate Excel output
- Supported: Annual reports, quarterly statements, scanned documents
"""

_HOW_IT_WORKS_MD = """
---
### How it works:
1. **Document Processing:** Extracts text from PDFs, Word docs, images (OCR), and text files
2. **AI Pattern Recognition:** Uses semantic similarity AI to identify and normalize financial line items
3. **Intelligent Matching:** Handles variations like "Revenue from Ops" vs "Operating Revenue"
4. **Data Extraction:** Reliably extracts numeric values using deterministic parsing
5. **Excel Generation:** Creates professionally formatted spreadsheets for each document

### Sample line items recognized:
- Revenue from operations, Other income, Total revenue
- Cost of materials consumed, Employee expenses, Depreciation
- EBITDA, EBIT, Profit before tax, Profit after tax
- And many more standard financial statement items...

### AI Technology:
- **Model:** Sentence-BERT (all-MiniLM-L6-v2) for semantic similarity
- **Method:** Cosine similarity matching between extracted items and standard terms
- **Fallback:** Rule-based normalization if AI confidence < 50%
- **Reliability:** Deterministic numeric extraction prevents hallucination
"""


def _upload_path(file):
    """Return the filesystem path of one uploaded item as a string.

    Plain strings and ``os.PathLike`` objects are returned via ``os.fspath``;
    anything else is expected to expose the path in a ``name`` attribute
    (the access the original code relied on).
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return str(file.name)


def _format_worksheet(worksheet):
    """Style the header row, size the columns and format numeric cells."""
    # Imported lazily, as before, so importing this module does not itself
    # require openpyxl's style classes.
    from openpyxl.styles import Alignment, Font, PatternFill

    header_fill = PatternFill(
        start_color=_HEADER_COLOR, end_color=_HEADER_COLOR, fill_type='solid'
    )
    header_font = Font(bold=True, color='FFFFFF')
    header_alignment = Alignment(horizontal='center', vertical='center')
    for cell in worksheet[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = header_alignment

    # Fit each column to its longest rendered value, capped at the maximum.
    # Empty cells count as zero-length instead of as the text "None".
    for column in worksheet.columns:
        longest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        letter = column[0].column_letter
        worksheet.column_dimensions[letter].width = min(
            longest + 2, _MAX_COLUMN_WIDTH
        )

    # Apply the thousands/decimal format to numeric cells, skipping the
    # first column (Particulars).
    for row in worksheet.iter_rows(min_row=2):
        for cell in row[1:]:
            if isinstance(cell.value, (int, float)):
                cell.number_format = _NUMBER_FORMAT


def _write_excel(df, output_path):
    """Write *df* to *output_path* as a single formatted worksheet."""
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name=_SHEET_NAME, index=False)
        _format_worksheet(writer.sheets[_SHEET_NAME])


def _success_summary(df, result):
    """Build the Markdown summary for one successfully extracted file."""
    # Column labels are converted with str() because join() raises TypeError
    # on non-string labels such as integer years.
    fiscal_years = ', '.join(
        str(col) for col in df.columns if col != 'Particulars'
    )
    return (
        "\n"
        "**Extraction successful!**\n"
        f"- Line items extracted: {len(df)}\n"
        f"- Fiscal years found: {fiscal_years}\n"
        f"- Extraction method: {result.get('method', 'Pattern matching')}\n"
    )


def process_documents(files):
    """Process multiple uploaded documents and extract financial statements.

    Each upload is passed to ``FinancialStatementExtractor.extract_from_file``
    and, when extraction succeeds, its dataframe is written to a formatted
    ``.xlsx`` file in the current working directory. A failure on one file is
    recorded in the summary and does not stop the rest of the batch.

    Args:
        files: Uploaded items from the file input; each is a path string or
            an object whose ``name`` attribute holds the path. May be
            ``None`` or empty.

    Returns:
        A ``(output_files, summary)`` tuple. ``output_files`` is the list of
        generated Excel paths, or ``None`` when nothing was uploaded, no file
        could be processed, or an unexpected error occurred. ``summary`` is
        Markdown text describing the outcome.
    """
    print(f"DEBUG: Received {len(files) if files else 0} files")

    if not files:
        return None, "Please upload at least one file"

    try:
        print("DEBUG: Initializing extractor...")
        extractor = FinancialStatementExtractor()
        print("DEBUG: Extractor initialized successfully")

        output_files = []
        summary_parts = []

        for idx, file in enumerate(files):
            # Bound before the try so the except handler can always name the
            # file, even when resolving its path is what failed.
            file_name = f"file {idx + 1}"
            try:
                file_path = _upload_path(file)
                file_name = Path(file_path).name
                print(f"DEBUG: Processing file {idx + 1}: {file_name}")
                summary_parts.append(f"\n### File {idx + 1}: {file_name}")

                print(f"DEBUG: Extracting from {file_path}")
                result = extractor.extract_from_file(file_path)
                print(f"DEBUG: Extraction result status: {result['status']}")

                if result['status'] == 'error':
                    error_msg = f"❌ Error: {result['message']}"
                    print(f"DEBUG: {error_msg}")
                    summary_parts.append(error_msg)
                    continue

                output_path = (
                    f"financial_statements_{idx + 1}_{Path(file_name).stem}.xlsx"
                )
                df = result['dataframe']
                # Build the summary before writing so that a file is either
                # fully reported as successful or fully reported as failed.
                success_msg = _success_summary(df, result)

                print(f"DEBUG: Creating Excel file: {output_path}")
                _write_excel(df, output_path)

                output_files.append(output_path)
                print(f"DEBUG: Successfully created {output_path}")
                summary_parts.append(success_msg)

            except Exception as file_error:
                # Per-file boundary: log, record, and move on to the next file.
                error_msg = f"❌ Failed to process {file_name}: {str(file_error)}"
                print(f"DEBUG ERROR: {error_msg}")
                print(f"DEBUG TRACEBACK: {traceback.format_exc()}")
                summary_parts.append(error_msg)
                continue

        if not output_files:
            error_summary = (
                "❌ Failed to process any files. Please check file formats and content.\n\n"
                + "\n".join(summary_parts)
            )
            print("DEBUG: No files processed successfully")
            return None, error_summary

        final_summary = (
            "\n"
            "# Batch Extraction Complete\n"
            "\n"
            f"**Total files processed:** {len(files)}\n"
            "\n"
            f"**Successfully extracted:** {len(output_files)}\n"
            "\n"
            f"**Failed:** {len(files) - len(output_files)}\n"
            "\n"
            "---\n"
        ) + "\n".join(summary_parts)

        print(f"DEBUG: Returning {len(output_files)} output files")
        # Return all files and summary
        return output_files, final_summary

    except Exception as e:
        # Top-level boundary: surface the failure in the UI instead of raising.
        error_msg = (
            f"Error processing files: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        )
        print(f"DEBUG CRITICAL ERROR: {error_msg}")
        return None, error_msg


with gr.Blocks(title="Financial Statement Extraction Tool") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Financial Documents (Multiple files supported)",
                file_types=[".pdf", ".docx", ".doc", ".png", ".jpg", ".jpeg", ".txt"],
                file_count="multiple",
            )
            submit_btn = gr.Button(
                "Extract Financial Data from All Files",
                variant="primary",
                size="lg",
            )
            gr.Markdown(_TIPS_MD)

        with gr.Column():
            output_files = gr.File(
                label="Download Excel Outputs", file_count="multiple"
            )
            output_text = gr.Markdown(label="Extraction Summary")

    # Clicking the button runs the batch and fills both output components.
    submit_btn.click(
        fn=process_documents,
        inputs=file_input,
        outputs=[output_files, output_text],
    )

    gr.Markdown(_HOW_IT_WORKS_MD)


if __name__ == "__main__":
    demo.launch()