Amerue's picture
Updated app.py gradio markdown
9bd4f90 verified
import gradio as gr
import pandas as pd
import os
import traceback
from pathlib import Path
from extractor import FinancialStatementExtractor
def _uploaded_path(file):
    """Return the filesystem path of one uploaded item as a string.

    Accepts either an object that exposes its path as ``.name`` (what this
    module originally assumed) or a plain path string / path-like object.
    """
    return str(getattr(file, "name", file))


def _style_worksheet(worksheet):
    """Apply header styling, column widths and number formats in place."""
    # Imported lazily so openpyxl's style module is only loaded once a
    # workbook is actually being written.
    from openpyxl.styles import Alignment, Font, PatternFill

    header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
    header_font = Font(bold=True, color='FFFFFF')
    header_alignment = Alignment(horizontal='center', vertical='center')
    for cell in worksheet[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = header_alignment

    # Fit each column to its longest rendered value, capped at 50 characters.
    # Empty cells count as zero width rather than as the text "None".
    for column in worksheet.columns:
        longest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        worksheet.column_dimensions[column[0].column_letter].width = min(longest + 2, 50)

    # Thousands-separated, two-decimal format for numeric data cells.
    # min_col=2 skips the first column (Particulars); min_row=2 skips the header.
    for row in worksheet.iter_rows(min_row=2, min_col=2):
        for cell in row:
            if isinstance(cell.value, (int, float)):
                cell.number_format = '#,##0.00'


def _write_workbook(df, output_path):
    """Write ``df`` to ``output_path`` as a single styled Excel sheet."""
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Financial Statements', index=False)
        _style_worksheet(writer.sheets['Financial Statements'])


def process_documents(files):
    """Process multiple uploaded documents and extract financial statements.

    Each upload is passed to ``FinancialStatementExtractor.extract_from_file``.
    The returned mapping is read for ``'status'``, ``'message'`` (on error),
    ``'dataframe'`` and, optionally, ``'method'``. Every successful extraction
    is written to ``financial_statements_<n>_<stem>.xlsx`` in the current
    working directory.

    Args:
        files: Sequence of uploaded items (objects with a ``.name`` path
            attribute, or plain path strings). May be ``None`` or empty.

    Returns:
        A ``(paths, summary)`` tuple. ``paths`` is the list of Excel files
        written, or ``None`` when nothing was produced. ``summary`` is a
        Markdown report (or an error message).

    No exception is propagated to the caller: per-file failures are recorded
    in the summary, and any other failure is returned as the summary text.
    """
    print(f"DEBUG: Received {len(files) if files else 0} files")
    if not files:
        return None, "Please upload at least one file"

    try:
        print("DEBUG: Initializing extractor...")
        extractor = FinancialStatementExtractor()
        print("DEBUG: Extractor initialized successfully")

        output_files = []
        summary_parts = []

        for number, file in enumerate(files, start=1):
            # Fallback label so the error handler below never touches an
            # unbound (or stale, previous-iteration) file name.
            file_name = f"file {number}"
            try:
                source_path = _uploaded_path(file)
                file_name = Path(source_path).name
                print(f"DEBUG: Processing file {number}: {file_name}")
                summary_parts.append(f"\n### File {number}: {file_name}")

                print(f"DEBUG: Extracting from {source_path}")
                result = extractor.extract_from_file(source_path)
                print(f"DEBUG: Extraction result status: {result['status']}")

                if result['status'] == 'error':
                    error_msg = f"❌ Error: {result['message']}"
                    print(f"DEBUG: {error_msg}")
                    summary_parts.append(error_msg)
                    continue

                output_path = f"financial_statements_{number}_{Path(file_name).stem}.xlsx"
                df = result['dataframe']
                print(f"DEBUG: Creating Excel file: {output_path}")
                _write_workbook(df, output_path)

                # Column labels are not guaranteed to be strings (e.g. integer
                # years), so convert before joining.
                fiscal_years = ', '.join(
                    str(col) for col in df.columns if col != 'Particulars'
                )
                success_summary = (
                    "\n**Extraction successful!**\n"
                    f"- Line items extracted: {len(df)}\n"
                    f"- Fiscal years found: {fiscal_years}\n"
                    f"- Extraction method: {result.get('method', 'Pattern matching')}\n"
                )

                # Record the success only after everything that can fail has
                # run, so a file is never counted as both failed and extracted.
                output_files.append(output_path)
                print(f"DEBUG: Successfully created {output_path}")
                summary_parts.append(success_summary)
            except Exception as file_error:
                # Best-effort batch: report this file and move on to the next.
                error_msg = f"❌ Failed to process {file_name}: {str(file_error)}"
                print(f"DEBUG ERROR: {error_msg}")
                print(f"DEBUG TRACEBACK: {traceback.format_exc()}")
                summary_parts.append(error_msg)

        if not output_files:
            error_summary = "❌ Failed to process any files. Please check file formats and content.\n\n" + "\n".join(summary_parts)
            print("DEBUG: No files processed successfully")
            return None, error_summary

        # The blank line before '---' is deliberate: directly under a text
        # line, '---' is a setext heading underline, not a horizontal rule.
        final_summary = (
            "\n# Batch Extraction Complete\n"
            f"**Total files processed:** {len(files)}\n"
            f"**Successfully extracted:** {len(output_files)}\n"
            f"**Failed:** {len(files) - len(output_files)}\n"
            "\n---\n"
        ) + "\n".join(summary_parts)
        print(f"DEBUG: Returning {len(output_files)} output files")
        # Return all files and summary
        return output_files, final_summary
    except Exception as e:
        # Top-level boundary for the UI callback: surface the failure as text
        # instead of letting the exception escape.
        error_msg = f"Error processing files: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(f"DEBUG CRITICAL ERROR: {error_msg}")
        return None, error_msg
# Build the Gradio interface. Components appear in the order they are created
# inside the nested layout context managers.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost — confirm it against the running app.
with gr.Blocks(title="Financial Statement Extraction Tool") as demo:
    # Page header: title, supported formats and feature list.
    gr.Markdown("""
# Financial Statement Extraction Tool (Multi-Document)
Upload **multiple** financial documents to automatically extract income statement line items into Excel files.
**Supported formats:** PDF, DOCX, DOC, PNG, JPG, JPEG, TXT
**Features:**
- AI-powered semantic matching for line item normalization
- Batch processing - upload multiple documents at once
- Individual Excel output for each document
- Deterministic fallback for reliability
""")
    with gr.Row():
        # Input side: multi-file upload, the trigger button and usage tips.
        with gr.Column():
            file_input = gr.File(
                label="Upload Financial Documents (Multiple files supported)",
                file_types=[".pdf", ".docx", ".doc", ".png", ".jpg", ".jpeg", ".txt"],
                file_count="multiple"
            )
            submit_btn = gr.Button("Extract Financial Data from All Files", variant="primary", size="lg")
            gr.Markdown("""
### 💡 Tips:
- Upload multiple files to process them in one batch
- Each file will generate a separate Excel output
- Supported: Annual reports, quarterly statements, scanned documents
""")
        # Output side: downloadable Excel files and the Markdown summary.
        with gr.Column():
            output_files = gr.File(label="Download Excel Outputs", file_count="multiple")
            output_text = gr.Markdown(label="Extraction Summary")
    # Clicking the button calls process_documents with the uploaded files; its
    # two return values fill the download component and the summary component.
    submit_btn.click(
        fn=process_documents,
        inputs=file_input,
        outputs=[output_files, output_text]
    )
    # Static explanatory footer.
    gr.Markdown("""
---
### How it works:
1. **Document Processing:** Extracts text from PDFs, Word docs, images (OCR), and text files
2. **AI Pattern Recognition:** Uses semantic similarity AI to identify and normalize financial line items
3. **Intelligent Matching:** Handles variations like "Revenue from Ops" vs "Operating Revenue"
4. **Data Extraction:** Reliably extracts numeric values using deterministic parsing
5. **Excel Generation:** Creates professionally formatted spreadsheets for each document
### Sample line items recognized:
- Revenue from operations, Other income, Total revenue
- Cost of materials consumed, Employee expenses, Depreciation
- EBITDA, EBIT, Profit before tax, Profit after tax
- And many more standard financial statement items...
### AI Technology:
- **Model:** Sentence-BERT (all-MiniLM-L6-v2) for semantic similarity
- **Method:** Cosine similarity matching between extracted items and standard terms
- **Fallback:** Rule-based normalization if AI confidence < 50%
- **Reliability:** Deterministic numeric extraction prevents hallucination
""")
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()