Spaces:

Amerue
/

financial-statement-extractor

Build error

App Files Files Community

financial-statement-extractor / app.py

Amerue

Updated app.py gradio markdown

9bd4f90 verified 4 months ago

raw

history blame contribute delete

8.08 kB

	import gradio as gr
	import pandas as pd
	import os
	import traceback
	from pathlib import Path
	from extractor import FinancialStatementExtractor

	def process_documents(files):
	    """Extract financial statements from uploaded documents into Excel files.

	    Each upload is handed to ``FinancialStatementExtractor.extract_from_file``.
	    The result is read as a mapping with a ``'status'`` key, a ``'message'``
	    key when the status is ``'error'``, a ``'dataframe'`` key otherwise and an
	    optional ``'method'`` key. Every successful extraction is written to
	    ``financial_statements_<n>_<stem>.xlsx`` (a relative path, so it lands in
	    the current working directory).

	    Args:
	        files: Uploaded files, or ``None``/empty when nothing was uploaded.
	            Each entry may be a path (``str``/``os.PathLike``) or an object
	            exposing the path as ``.name``.

	    Returns:
	        A ``(output_files, summary)`` tuple. ``output_files`` is the list of
	        workbook paths that were written, or ``None`` when nothing was
	        uploaded, no file succeeded, or an unexpected error occurred.
	        ``summary`` is the Markdown/plain-text report shown to the user.
	    """
	    print(f"DEBUG: Received {len(files) if files else 0} files")

	    if not files:
	        return None, "Please upload at least one file"

	    try:
	        print("DEBUG: Initializing extractor...")
	        extractor = FinancialStatementExtractor()
	        print("DEBUG: Extractor initialized successfully")

	        output_files = []
	        summary_parts = []

	        for idx, file in enumerate(files):
	            # Bind a fallback label before anything can fail, so the error
	            # handler below never reports an unbound name or the name of
	            # the previous file.
	            file_name = f"file {idx + 1}"
	            try:
	                file_path = _uploaded_path(file)
	                file_name = Path(file_path).name
	                print(f"DEBUG: Processing file {idx + 1}: {file_name}")
	                summary_parts.append(f"\n### File {idx + 1}: {file_name}")

	                print(f"DEBUG: Extracting from {file_path}")
	                result = extractor.extract_from_file(file_path)
	                print(f"DEBUG: Extraction result status: {result['status']}")

	                if result['status'] == 'error':
	                    error_msg = f"❌ Error: {result['message']}"
	                    print(f"DEBUG: {error_msg}")
	                    summary_parts.append(error_msg)
	                    continue

	                output_path = f"financial_statements_{idx + 1}_{Path(file_name).stem}.xlsx"
	                df = result['dataframe']
	                print(f"DEBUG: Creating Excel file: {output_path}")
	                _write_formatted_workbook(df, output_path)

	                # Column labels are not guaranteed to be strings (e.g. integer
	                # years); str.join would raise TypeError on them.
	                year_columns = [str(col) for col in df.columns if col != 'Particulars']
	                success_summary = f"""
	Extraction successful!
	- Line items extracted: {len(df)}
	- Fiscal years found: {', '.join(year_columns)}
	- Extraction method: {result.get('method', 'Pattern matching')}
	"""

	                # Register the file only once everything for it succeeded, so
	                # the success/failure counts below stay consistent.
	                output_files.append(output_path)
	                print(f"DEBUG: Successfully created {output_path}")
	                summary_parts.append(success_summary)

	            except Exception as file_error:
	                # Per-file boundary: one bad document must not abort the batch.
	                error_msg = f"❌ Failed to process {file_name}: {str(file_error)}"
	                print(f"DEBUG ERROR: {error_msg}")
	                print(f"DEBUG TRACEBACK: {traceback.format_exc()}")
	                summary_parts.append(error_msg)
	                continue

	        if not output_files:
	            error_summary = "❌ Failed to process any files. Please check file formats and content.\n\n" + "\n".join(summary_parts)
	            print(f"DEBUG: No files processed successfully")
	            return None, error_summary

	        final_summary = f"""
	# Batch Extraction Complete

	Total files processed: {len(files)}
	Successfully extracted: {len(output_files)}
	Failed: {len(files) - len(output_files)}

	---
	""" + "\n".join(summary_parts)

	        print(f"DEBUG: Returning {len(output_files)} output files")
	        # Return all files and summary
	        return output_files, final_summary

	    except Exception as e:
	        # Top-level boundary: report the failure in the UI instead of raising.
	        error_msg = f"Error processing files: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
	        print(f"DEBUG CRITICAL ERROR: {error_msg}")
	        return None, error_msg


	def _uploaded_path(file):
	    """Return the filesystem path of one uploaded file as a string."""
	    # NOTE(review): depending on the Gradio version/configuration, uploads are
	    # presumably delivered either as path strings or as file-like objects
	    # carrying the path in ``.name`` - confirm against the deployed version.
	    if isinstance(file, (str, os.PathLike)):
	        return os.fspath(file)
	    return file.name


	def _write_formatted_workbook(df, output_path):
	    """Write *df* to *output_path* as a styled, single-sheet Excel workbook."""
	    # Local import: openpyxl is only needed once a workbook is written.
	    from openpyxl.styles import Font, PatternFill, Alignment

	    header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
	    header_font = Font(bold=True, color='FFFFFF')
	    header_alignment = Alignment(horizontal='center', vertical='center')

	    # All styling happens inside the writer context so that it is part of
	    # the workbook when the context manager saves the file on exit.
	    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
	        df.to_excel(writer, sheet_name='Financial Statements', index=False)
	        worksheet = writer.sheets['Financial Statements']

	        # Header row: white bold text on a blue fill, centred.
	        for cell in worksheet[1]:
	            cell.fill = header_fill
	            cell.font = header_font
	            cell.alignment = header_alignment

	        # Size each column to its longest value (plus padding), capped at 50.
	        # Empty cells are skipped rather than measured as the text "None".
	        for column in worksheet.columns:
	            max_length = max(
	                (len(str(cell.value)) for cell in column if cell.value is not None),
	                default=0,
	            )
	            column_letter = column[0].column_letter
	            worksheet.column_dimensions[column_letter].width = min(max_length + 2, 50)

	        # Thousands separator and two decimals for numeric data cells.
	        for row in worksheet.iter_rows(min_row=2):
	            for cell in row[1:]:  # Skip first column (Particulars)
	                if isinstance(cell.value, (int, float)):
	                    cell.number_format = '#,##0.00'

	# Gradio UI: an upload column and a results column side by side, followed by
	# a static explanation of the pipeline.
	# NOTE(review): the nesting of the ``with`` blocks was reconstructed from a
	# copy of this file whose indentation had been lost; in particular the
	# placement of the tips panel inside the upload column is inferred - confirm
	# against the deployed app.
	with gr.Blocks(title="Financial Statement Extraction Tool") as demo:
	    gr.Markdown("""
	# Financial Statement Extraction Tool (Multi-Document)

	Upload multiple financial documents to automatically extract income statement line items into Excel files.

	Supported formats: PDF, DOCX, DOC, PNG, JPG, JPEG, TXT

	Features:
	- AI-powered semantic matching for line item normalization
	- Batch processing - upload multiple documents at once
	- Individual Excel output for each document
	- Deterministic fallback for reliability
	""")

	    with gr.Row():
	        # Upload column: file picker, submit button and usage tips.
	        with gr.Column():
	            file_input = gr.File(
	                label="Upload Financial Documents (Multiple files supported)",
	                file_types=[".pdf", ".docx", ".doc", ".png", ".jpg", ".jpeg", ".txt"],
	                file_count="multiple"
	            )
	            submit_btn = gr.Button("Extract Financial Data from All Files", variant="primary", size="lg")

	            gr.Markdown("""
	### 💡 Tips:
	- Upload multiple files to process them in one batch
	- Each file will generate a separate Excel output
	- Supported: Annual reports, quarterly statements, scanned documents
	""")

	        # Results column: generated workbooks and the extraction summary.
	        with gr.Column():
	            output_files = gr.File(label="Download Excel Outputs", file_count="multiple")
	            output_text = gr.Markdown(label="Extraction Summary")

	    # process_documents returns (files, summary), matching the two outputs.
	    submit_btn.click(
	        fn=process_documents,
	        inputs=file_input,
	        outputs=[output_files, output_text]
	    )

	    gr.Markdown("""
	---
	### How it works:

	1. Document Processing: Extracts text from PDFs, Word docs, images (OCR), and text files
	2. AI Pattern Recognition: Uses semantic similarity AI to identify and normalize financial line items
	3. Intelligent Matching: Handles variations like "Revenue from Ops" vs "Operating Revenue"
	4. Data Extraction: Reliably extracts numeric values using deterministic parsing
	5. Excel Generation: Creates professionally formatted spreadsheets for each document

	### Sample line items recognized:
	- Revenue from operations, Other income, Total revenue
	- Cost of materials consumed, Employee expenses, Depreciation
	- EBITDA, EBIT, Profit before tax, Profit after tax
	- And many more standard financial statement items...

	### AI Technology:
	- Model: Sentence-BERT (all-MiniLM-L6-v2) for semantic similarity
	- Method: Cosine similarity matching between extracted items and standard terms
	- Fallback: Rule-based normalization if AI confidence < 50%
	- Reliability: Deterministic numeric extraction prevents hallucination
	""")

	if __name__ == "__main__":
	    # Launch the app only when this file is executed as a script, so that
	    # importing the module does not start a server.
	    demo.launch()