Spaces:

Amerue
/

financial-statement-extractor

Build error

App Files Files Community

financial-statement-extractor / test_extractor.py

Amerue

Uploading main application file and supporter files as well

ee27e09 verified 4 months ago

raw

history blame contribute delete

7.35 kB

	"""
	Test script for Financial Statement Extractor
	Tests all functionality and validates output
	"""

	import sys
	import os
	from pathlib import Path
	import pandas as pd
	from extractor import FinancialStatementExtractor

	def test_pdf_extraction(test_file=None):
	    """Run the extractor on a PDF file and report what it produced.

	    Args:
	        test_file: Path to a PDF to extract from. May be None.

	    Returns:
	        True when the extractor reports ``status == 'success'``, False when
	        it reports anything else, and None (treated as "skipped") when no
	        path was given or the path does not exist.
	    """
	    rule = "=" * 60
	    print("\n" + rule)
	    print("TEST 1: PDF Extraction")
	    print(rule)

	    # Guard clause: nothing to extract from, so the test is skipped.
	    if not (test_file and os.path.exists(test_file)):
	        print("⚠️ No test PDF file provided or file not found")
	        return None

	    outcome = FinancialStatementExtractor().extract_from_file(test_file)

	    if outcome['status'] != 'success':
	        print(f"❌ PDF extraction failed: {outcome['message']}")
	        return False

	    print("✅ PDF extraction successful!")
	    frame = outcome['dataframe']
	    print(f" Line items found: {len(frame)}")
	    print(f" Columns: {list(frame.columns)}")
	    print("\nFirst 5 rows:")
	    print(frame.head())
	    return True

	def test_normalization():
	    """Check that line-item names are normalized to the expected form.

	    Each case pairs a raw line-item name with the canonical name it is
	    expected to resemble. A case passes when the normalizer returns a
	    non-empty name that, ignoring case, contains or is contained in either
	    the expected canonical name or the original input.

	    Returns:
	        True when every case passes, False otherwise.
	    """
	    print("\n" + "="*60)
	    print("TEST 2: Line Item Normalization")
	    print("="*60)

	    extractor = FinancialStatementExtractor()

	    test_cases = [
	        ("Revenue from ops", "Revenue From Operations"),
	        ("Cost of Material Consumed", "Cost Of Materials Consumed"),
	        ("Employee benefits expense", "Employee Benefit Expenses"),
	        ("Profit before tax", "Profit Before Tax"),
	        ("EBITDA", "Ebitda"),
	    ]

	    passed = 0
	    for original, expected_similar in test_cases:
	        normalized = extractor._normalize_item_name(original)
	        print(f" {original:30} → {normalized}")

	        # An empty (or None) result must not pass: "" is a substring of
	        # every string, so the containment checks below would accept it.
	        if not normalized:
	            continue

	        got = normalized.lower()
	        want = expected_similar.lower()
	        source = original.lower()

	        # Primary criterion: the output resembles the expected canonical name.
	        matches_expected = got in want or want in got
	        # Secondary criterion, kept from the earlier version of this test so
	        # that a normalizer which only re-cases the input is still accepted.
	        echoes_input = source in got or got in source

	        if matches_expected or echoes_input:
	            passed += 1

	    all_passed = passed == len(test_cases)
	    # The marker reflects the real outcome instead of always showing success.
	    marker = "✅" if all_passed else "❌"
	    print(f"\n{marker} Normalization working: {passed}/{len(test_cases)} cases handled")
	    return all_passed

	def test_number_extraction():
	    """Check that numeric values are pulled out of text correctly.

	    Each case pairs an input string with the list of numbers expected from
	    ``_extract_numbers``. A case passes only when the result has the same
	    length as the expected list and every value matches (within floating
	    point tolerance); comparing lengths alone would accept wrong values.

	    Returns:
	        True when every case passes, False otherwise.
	    """
	    import math

	    print("\n" + "="*60)
	    print("TEST 3: Numeric Value Extraction")
	    print("="*60)

	    extractor = FinancialStatementExtractor()

	    test_cases = [
	        ("123,456.78", [123456.78]),
	        ("1,234 5,678 9,012", [1234.0, 5678.0, 9012.0]),
	        # Should handle parentheses.
	        # NOTE(review): the expected value here is positive; accounting
	        # notation usually means negative - confirm against the extractor.
	        ("(1,234.56)", [1234.56]),
	        ("-500.25", [-500.25]),
	    ]

	    passed = 0
	    for text, expected in test_cases:
	        result = extractor._extract_numbers(text)

	        try:
	            values_match = len(result) == len(expected) and all(
	                math.isclose(float(got), want, rel_tol=1e-9, abs_tol=1e-9)
	                for got, want in zip(result, expected)
	            )
	        except (TypeError, ValueError):
	            # A non-sequence or non-numeric result is a failed case,
	            # not a reason to abort the whole test run.
	            values_match = False

	        if values_match:
	            print(f" ✅ '{text}' → {result}")
	            passed += 1
	        else:
	            print(f" ❌ '{text}' → {result} (expected {expected})")

	    all_passed = passed == len(test_cases)
	    # The marker reflects the real outcome instead of always showing success.
	    marker = "✅" if all_passed else "❌"
	    print(f"\n{marker} Number extraction: {passed}/{len(test_cases)} cases passed")
	    return all_passed

	def test_year_extraction():
	    """Check that fiscal years are detected in a sample text.

	    Feeds a short text mentioning several fiscal years to
	    ``_extract_years`` and passes when at least one year comes back.

	    Returns:
	        True when one or more years are detected, False otherwise.
	    """
	    rule = "=" * 60
	    print("\n" + rule)
	    print("TEST 4: Fiscal Year Detection")
	    print(rule)

	    extractor = FinancialStatementExtractor()

	    test_text = """
	Financial Results for FY 25, FY 24, and FY 23
	Year ended March 31, 2025
	Comparative data for 2024 and 2023
	"""

	    years = extractor._extract_years(test_text)
	    print(f" Detected years: {years}")

	    found = len(years)
	    # Guard clause: nothing detected means the test failed.
	    if found == 0:
	        print("❌ Year extraction failed")
	        return False

	    print(f"✅ Year extraction working: {found} years found")
	    return True

	def test_excel_generation():
	    """Check that a DataFrame can be written to Excel and read back.

	    Writes a small sample frame to an ``.xlsx`` file with the openpyxl
	    engine, reads it back, and passes when the shape read back equals the
	    shape written.

	    The file lives in a temporary directory that is removed on every exit
	    path. Writing to the current working directory would overwrite (and
	    then delete) an existing ``test_output.xlsx`` and would leave the file
	    behind whenever the check failed.

	    Returns:
	        True when the round trip succeeds, False on a mismatch or any error.
	    """
	    import tempfile

	    print("\n" + "="*60)
	    print("TEST 5: Excel File Generation")
	    print("="*60)

	    # Create sample data
	    sample_data = {
	        'Particulars': ['Revenue', 'Expenses', 'Profit'],
	        'FY 25': [100000, 60000, 40000],
	        'FY 24': [90000, 55000, 35000],
	    }

	    df = pd.DataFrame(sample_data)

	    try:
	        with tempfile.TemporaryDirectory() as scratch_dir:
	            output_path = os.path.join(scratch_dir, "test_output.xlsx")
	            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
	                df.to_excel(writer, sheet_name='Test', index=False)

	            # Verify file exists and has data
	            if os.path.exists(output_path):
	                test_df = pd.read_excel(output_path)
	                # Expected shape comes from the frame that was written.
	                if test_df.shape == df.shape:
	                    print("✅ Excel generation successful!")
	                    print(f" File created: {output_path}")
	                    print(f" Rows: {len(test_df)}, Columns: {len(test_df.columns)}")
	                    return True

	        print("❌ Excel generation failed")
	        return False

	    except Exception as e:
	        # Broad catch is deliberate: this is a reporting boundary, and a
	        # missing engine or I/O error should show up as a failed test.
	        print(f"❌ Excel generation error: {e}")
	        return False

	def test_llm_availability():
	    """Check whether the optional LLM is loaded and can normalize a name.

	    Returns:
	        None (treated as "skipped") when ``llm_available`` is falsy,
	        True when ``_llm_normalize`` returns a non-empty result, and
	        False when it raises or returns an empty result.
	    """
	    print("\n" + "="*60)
	    print("TEST 6: LLM Availability")
	    print("="*60)

	    extractor = FinancialStatementExtractor()

	    if not extractor.llm_available:
	        print("⚠️ LLM not available - using rule-based fallback only")
	        print(" This is OK - system will still work with deterministic methods")
	        return None

	    print("✅ LLM loaded successfully (google/flan-t5-small)")
	    print(" Model will be used for normalization")

	    # Test LLM normalization. Only the call itself is guarded so that a
	    # failure in the reporting code below is not mistaken for an LLM error.
	    try:
	        test_result = extractor._llm_normalize("Revenue from operations")
	    except Exception as e:
	        print(f" ⚠️ LLM loaded but normalization failed: {e}")
	        return False

	    # An empty result is an explicit failure; without this branch the
	    # function could fall off the end and be reported as "skipped".
	    if not test_result:
	        print(" ⚠️ LLM loaded but normalization returned an empty result")
	        return False

	    print(f" Test normalization: 'Revenue from operations' → '{test_result}'")
	    return True

	def run_all_tests(pdf_file=None):
	    """Execute every test in order and print a pass/fail/skip summary.

	    Args:
	        pdf_file: Optional PDF path forwarded to the PDF extraction test.

	    Returns:
	        True when no test returned False (skipped tests do not count as
	        failures), False otherwise.
	    """
	    hash_rule = "#" * 60
	    rule = "=" * 60

	    print("\n" + hash_rule)
	    print("# FINANCIAL STATEMENT EXTRACTOR - TEST SUITE")
	    print(hash_rule)

	    # Dict literal order is the execution order of the tests.
	    results = {
	        'PDF Extraction': test_pdf_extraction(pdf_file),
	        'Normalization': test_normalization(),
	        'Number Extraction': test_number_extraction(),
	        'Year Detection': test_year_extraction(),
	        'Excel Generation': test_excel_generation(),
	        'LLM Availability': test_llm_availability(),
	    }

	    print("\n" + rule)
	    print("TEST SUMMARY")
	    print(rule)

	    # Each test returns True (pass), False (fail) or None (skipped);
	    # identity checks keep the three outcomes distinct.
	    outcomes = list(results.values())
	    passed = len([o for o in outcomes if o is True])
	    failed = len([o for o in outcomes if o is False])
	    skipped = len([o for o in outcomes if o is None])

	    for test, result in results.items():
	        if result is True:
	            status = "✅ PASS"
	        elif result is False:
	            status = "❌ FAIL"
	        else:
	            status = "⚠️ SKIP"
	        print(f"{status:12} {test}")

	    print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped")

	    no_failures = failed == 0
	    if no_failures:
	        print("\n🎉 All critical tests passed! System is ready for deployment.")
	    else:
	        print("\n⚠️ Some tests failed. Please review errors above.")

	    print(rule)

	    return no_failures

	if __name__ == "__main__":
	    # The first command-line argument, when present, is the PDF to test with.
	    cli_args = sys.argv[1:]
	    test_pdf = cli_args[0] if cli_args else None
	    if cli_args:
	        print(f"Using test PDF: {test_pdf}")

	    # Exit status 0 means no test failed; 1 means at least one did.
	    success = run_all_tests(test_pdf)
	    sys.exit(0 if success else 1)