# Page header captured together with this file (not part of the script):
# Spaces: Build error | File size: 7,345 Bytes | ee27e09
"""
Test script for Financial Statement Extractor
Tests all functionality and validates output
"""
import sys
import os
from pathlib import Path
import pandas as pd
from extractor import FinancialStatementExtractor
def test_pdf_extraction(test_file=None):
    """Run the extractor on a PDF file and report whether it succeeded.

    Args:
        test_file: Path of the PDF to extract from. When it is omitted, empty,
            or does not exist on disk, the test is skipped.

    Returns:
        True when the extractor reports ``status == 'success'``, False for any
        other status, and None when the test was skipped.
    """
    print("\n" + "=" * 60)
    print("TEST 1: PDF Extraction")
    print("=" * 60)

    # Skip (rather than fail) when there is no usable input file, so the
    # suite can still run without a sample PDF.
    if not (test_file and os.path.exists(test_file)):
        print("⚠️ No test PDF file provided or file not found")
        return None

    extractor = FinancialStatementExtractor()
    result = extractor.extract_from_file(test_file)

    if result['status'] != 'success':
        print(f"❌ PDF extraction failed: {result['message']}")
        return False

    dataframe = result['dataframe']
    print("✅ PDF extraction successful!")
    print(f" Line items found: {len(dataframe)}")
    print(f" Columns: {list(dataframe.columns)}")
    print("\nFirst 5 rows:")
    print(dataframe.head())
    return True
def test_normalization():
    """Check that line-item names are normalized to a recognisable form.

    Each case pairs a raw label with the canonical label it is expected to
    resemble. A case is accepted when the normalized text and one of those two
    labels contain each other, ignoring case.

    Returns:
        True when every case is accepted, False otherwise.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Line Item Normalization")
    print("=" * 60)

    extractor = FinancialStatementExtractor()
    test_cases = [
        ("Revenue from ops", "Revenue From Operations"),
        ("Cost of Material Consumed", "Cost Of Materials Consumed"),
        ("Employee benefits expense", "Employee Benefit Expenses"),
        ("Profit before tax", "Profit Before Tax"),
        ("EBITDA", "Ebitda"),
    ]

    passed = 0
    for original, expected_similar in test_cases:
        normalized = extractor._normalize_item_name(original)
        print(f" {original:30} → {normalized}")

        # Compare against the expected label as well as the raw input: a
        # correct expansion ("ops" -> "Operations") no longer contains the raw
        # text, so checking the raw input alone would reject it.
        got = normalized.lower()
        references = (original.lower(), expected_similar.lower())
        if any(ref in got or got in ref for ref in references):
            passed += 1

    print(f"\n✅ Normalization working: {passed}/{len(test_cases)} cases handled")
    return passed == len(test_cases)
def test_number_extraction():
    """Check that numeric values are pulled out of formatted text.

    A case passes when the extractor returns the expected count of numbers and
    each one has the expected magnitude.

    Returns:
        True when every case passes, False otherwise.
    """
    import math  # local import: only this check needs it

    print("\n" + "=" * 60)
    print("TEST 3: Numeric Value Extraction")
    print("=" * 60)

    extractor = FinancialStatementExtractor()
    test_cases = [
        ("123,456.78", [123456.78]),
        ("1,234 5,678 9,012", [1234.0, 5678.0, 9012.0]),
        ("(1,234.56)", [1234.56]),  # Should handle parentheses
        ("-500.25", [-500.25]),
    ]

    passed = 0
    for text, expected in test_cases:
        result = extractor._extract_numbers(text)

        # Comparing lengths alone would accept any wrong value, so the values
        # are checked too.
        # NOTE(review): only magnitudes are compared because the expected data
        # does not say whether "(1,234.56)" should come back negative; tighten
        # this to a signed comparison once that convention is confirmed.
        values_match = len(result) == len(expected) and all(
            math.isclose(abs(got), abs(want))
            for got, want in zip(result, expected)
        )

        if values_match:
            print(f" ✅ '{text}' → {result}")
            passed += 1
        else:
            print(f" ❌ '{text}' → {result} (expected {expected})")

    print(f"\n✅ Number extraction: {passed}/{len(test_cases)} cases passed")
    return passed == len(test_cases)
def test_year_extraction():
    """Check that fiscal years are detected in a block of sample text.

    Returns:
        True when the extractor reports at least one year, False when it
        reports none (an empty or None result).
    """
    print("\n" + "=" * 60)
    print("TEST 4: Fiscal Year Detection")
    print("=" * 60)

    extractor = FinancialStatementExtractor()
    test_text = """
    Financial Results for FY 25, FY 24, and FY 23
    Year ended March 31, 2025
    Comparative data for 2024 and 2023
    """

    years = extractor._extract_years(test_text)
    print(f" Detected years: {years}")

    # Truthiness also covers a None result, which len() would crash on.
    if not years:
        print("❌ Year extraction failed")
        return False

    print(f"✅ Year extraction working: {len(years)} years found")
    return True
def test_excel_generation():
    """Write a small DataFrame to an .xlsx file and read it back.

    The file ``test_output.xlsx`` is created in the current working directory
    and is always removed again, whether the check passes, fails or raises.

    Returns:
        True when the file read back has the same number of rows and columns
        as the sample data, False on a mismatch or on any error.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Excel File Generation")
    print("=" * 60)

    # Create sample data
    sample_data = {
        'Particulars': ['Revenue', 'Expenses', 'Profit'],
        'FY 25': [100000, 60000, 40000],
        'FY 24': [90000, 55000, 35000],
    }
    df = pd.DataFrame(sample_data)
    output_path = "test_output.xlsx"

    try:
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Test', index=False)

        # Verify file exists and has data
        if os.path.exists(output_path):
            test_df = pd.read_excel(output_path)
            # Compare with the source frame instead of hard-coded sizes so the
            # check stays correct if the sample data changes.
            if test_df.shape == df.shape:
                print("✅ Excel generation successful!")
                print(f" File created: {output_path}")
                print(f" Rows: {len(test_df)}, Columns: {len(test_df.columns)}")
                return True

        print("❌ Excel generation failed")
        return False
    except Exception as e:
        # Top-level test boundary: report any failure (missing engine, I/O
        # error, ...) as a failed test instead of aborting the suite.
        print(f"❌ Excel generation error: {e}")
        return False
    finally:
        # Cleanup runs on every path so a failed run does not leave the
        # workbook behind.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove {output_path}: {cleanup_error}")
def test_llm_availability():
    """Report whether the extractor's LLM is loaded and can normalize a label.

    Returns:
        True when the LLM is available and returns a non-empty normalization,
        False when the LLM is available but the call raises or returns
        nothing, and None (skipped) when the extractor reports no LLM.
    """
    print("\n" + "=" * 60)
    print("TEST 6: LLM Availability")
    print("=" * 60)

    extractor = FinancialStatementExtractor()

    if not extractor.llm_available:
        print("⚠️ LLM not available - using rule-based fallback only")
        print(" This is OK - system will still work with deterministic methods")
        return None

    print("✅ LLM loaded successfully (google/flan-t5-small)")
    print(" Model will be used for normalization")

    # Test LLM normalization
    try:
        test_result = extractor._llm_normalize("Revenue from operations")
    except Exception as e:
        print(f" ⚠️ LLM loaded but normalization failed: {e}")
        return False

    # Every path returns explicitly, so an empty result can never fall through
    # to an implicit None and be reported as a skip.
    if not test_result:
        print(" ⚠️ LLM loaded but normalization returned no result")
        return False

    print(f" Test normalization: 'Revenue from operations' → '{test_result}'")
    return True
def run_all_tests(pdf_file=None):
    """Run every test in the suite and print a pass/fail/skip summary.

    Args:
        pdf_file: Optional path of a PDF, forwarded to the PDF extraction test.

    Returns:
        True when no test returned False (skipped tests do not count as
        failures), False otherwise.
    """
    print("\n" + "#" * 60)
    print("# FINANCIAL STATEMENT EXTRACTOR - TEST SUITE")
    print("#" * 60)

    # Each test returns True (pass), False (fail) or None (skipped).
    results = {
        'PDF Extraction': test_pdf_extraction(pdf_file),
        'Normalization': test_normalization(),
        'Number Extraction': test_number_extraction(),
        'Year Detection': test_year_extraction(),
        'Excel Generation': test_excel_generation(),
        'LLM Availability': test_llm_availability(),
    }

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    passed = sum(1 for v in results.values() if v is True)
    failed = sum(1 for v in results.values() if v is False)
    skipped = sum(1 for v in results.values() if v is None)

    for test, result in results.items():
        # Identity checks keep the three outcomes distinct; anything that is
        # neither True nor False is shown as a skip.
        if result is True:
            status = "✅ PASS"
        elif result is False:
            status = "❌ FAIL"
        else:
            status = "⚠️ SKIP"
        print(f"{status:12} {test}")

    print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped")

    if failed == 0:
        print("\n🎉 All critical tests passed! System is ready for deployment.")
    else:
        print("\n⚠️ Some tests failed. Please review errors above.")
    print("=" * 60)

    return failed == 0
if __name__ == "__main__":
    # The first command-line argument, when present, is the PDF handed to the
    # extraction test; without it that test is skipped.
    cli_args = sys.argv[1:]
    test_pdf = cli_args[0] if cli_args else None
    if cli_args:
        print(f"Using test PDF: {test_pdf}")

    # Exit status 0 means no test failed; 1 means at least one did.
    exit_code = 0 if run_all_tests(test_pdf) else 1
    sys.exit(exit_code)
# (stray table delimiter from the page capture removed)