Spaces:
Build error
Build error
| """ | |
| Test script for Financial Statement Extractor | |
| Tests all functionality and validates output | |
| """ | |
| import sys | |
| import os | |
| from pathlib import Path | |
| import pandas as pd | |
| from extractor import FinancialStatementExtractor | |
def test_pdf_extraction(test_file=None):
    """Run the extractor against a PDF on disk and report the outcome.

    Args:
        test_file: Path of the PDF to extract from. The test is skipped when
            this is falsy or the path does not exist.

    Returns:
        True when the extractor's result has ``status == 'success'``, False
        for any other status, and None when the test was skipped.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 1: PDF Extraction")
    print(rule)

    # Guard clause: without a readable input file there is nothing to run.
    if not test_file or not os.path.exists(test_file):
        print("β οΈ No test PDF file provided or file not found")
        return None

    outcome = FinancialStatementExtractor().extract_from_file(test_file)
    if outcome['status'] != 'success':
        print(f"β PDF extraction failed: {outcome['message']}")
        return False

    frame = outcome['dataframe']
    print("β PDF extraction successful!")
    print(f" Line items found: {len(frame)}")
    print(f" Columns: {list(frame.columns)}")
    print("\nFirst 5 rows:")
    print(frame.head())
    return True
def _normalization_matches(original, normalized, expected):
    """Decide whether *normalized* is an acceptable result for *original*.

    Comparison ignores case and runs of whitespace. A result is accepted when
    it is a non-empty string that either equals *expected* or contains / is
    contained in *original*.
    """
    if not isinstance(normalized, str) or not normalized.strip():
        # "" is a substring of every string, so an empty result has to be
        # rejected explicitly instead of slipping through the containment test.
        return False

    def canonical(label):
        return " ".join(label.split()).casefold()

    got = canonical(normalized)
    source = canonical(original)
    return got == canonical(expected) or source in got or got in source


def test_normalization():
    """Check line-item name normalization against a table of sample labels.

    Each case pairs a raw label with the label it is expected to normalize
    to. A case passes when the extractor's output matches the expected label
    (case-insensitively) or still overlaps the raw label by containment;
    empty or non-string outputs never pass.

    Returns:
        True when every case passed, otherwise False.
    """
    print("\n" + "="*60)
    print("TEST 2: Line Item Normalization")
    print("="*60)
    extractor = FinancialStatementExtractor()
    test_cases = [
        ("Revenue from ops", "Revenue From Operations"),
        ("Cost of Material Consumed", "Cost Of Materials Consumed"),
        ("Employee benefits expense", "Employee Benefit Expenses"),
        ("Profit before tax", "Profit Before Tax"),
        ("EBITDA", "Ebitda"),
    ]
    passed = 0
    for original, expected_similar in test_cases:
        normalized = extractor._normalize_item_name(original)
        print(f" {original:30} β {normalized}")
        # Consult the expected label too: a correct canonical mapping such as
        # "Revenue from ops" -> "Revenue From Operations" shares no containment
        # with the raw label and would otherwise be counted as a failure.
        if _normalization_matches(original, normalized, expected_similar):
            passed += 1
    print(f"\nβ Normalization working: {passed}/{len(test_cases)} cases handled")
    return passed == len(test_cases)
def test_number_extraction():
    """Check numeric value extraction against a table of sample strings.

    Each case pairs an input string with the list of numbers expected from
    it. A case passes only when the extractor returns the same count of
    numbers AND each value agrees with the expected one (floating-point
    tolerant comparison), in order.

    Returns:
        True when every case passed, otherwise False.
    """
    import math  # local import so the block does not depend on file-level edits

    print("\n" + "="*60)
    print("TEST 3: Numeric Value Extraction")
    print("="*60)
    extractor = FinancialStatementExtractor()
    test_cases = [
        ("123,456.78", [123456.78]),
        ("1,234 5,678 9,012", [1234.0, 5678.0, 9012.0]),
        ("(1,234.56)", [1234.56]),  # Should handle parentheses
        ("-500.25", [-500.25]),
    ]
    passed = 0
    for text, expected in test_cases:
        result = extractor._extract_numbers(text)
        # A None result is treated as "nothing extracted" rather than crashing
        # the whole suite on len(None).
        values = [] if result is None else list(result)
        # Comparing lengths alone would accept wrong numbers of the right
        # count, so every value is checked against its expected counterpart.
        matches = len(values) == len(expected) and all(
            math.isclose(got, want, rel_tol=1e-9, abs_tol=1e-9)
            for got, want in zip(values, expected)
        )
        if matches:
            print(f" β '{text}' β {result}")
            passed += 1
        else:
            print(f" β '{text}' β {result} (expected {expected})")
    print(f"\nβ Number extraction: {passed}/{len(test_cases)} cases passed")
    return passed == len(test_cases)
def test_year_extraction():
    """Check that fiscal years are detected in a short sample text.

    Returns:
        True when the extractor reports at least one year, otherwise False.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 4: Fiscal Year Detection")
    print(rule)

    sample_text = """
    Financial Results for FY 25, FY 24, and FY 23
    Year ended March 31, 2025
    Comparative data for 2024 and 2023
    """
    detected = FinancialStatementExtractor()._extract_years(sample_text)
    print(f" Detected years: {detected}")

    # Only the presence of a result is checked, not which years were found.
    if len(detected) == 0:
        print("β Year extraction failed")
        return False

    print(f"β Year extraction working: {len(detected)} years found")
    return True
def test_excel_generation():
    """Write a small DataFrame to an .xlsx file and read it back.

    The workbook is created inside a temporary directory that is removed on
    every exit path, so the test neither leaves a file behind on failure nor
    overwrites an existing ``test_output.xlsx`` in the working directory.

    Returns:
        True when the file round-trips with the same number of rows and
        columns as the sample data; False when the check fails or any
        exception is raised while writing or reading.
    """
    import tempfile  # local import so the block does not depend on file-level edits

    print("\n" + "="*60)
    print("TEST 5: Excel File Generation")
    print("="*60)
    # Create sample data
    sample_data = {
        'Particulars': ['Revenue', 'Expenses', 'Profit'],
        'FY 25': [100000, 60000, 40000],
        'FY 24': [90000, 55000, 35000],
    }
    df = pd.DataFrame(sample_data)
    try:
        with tempfile.TemporaryDirectory() as scratch_dir:
            output_path = os.path.join(scratch_dir, "test_output.xlsx")
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Test', index=False)
            # Verify file exists and has data
            if os.path.exists(output_path):
                test_df = pd.read_excel(output_path)
                # Expected shape comes from the sample frame itself rather
                # than a hard-coded 3x3.
                if test_df.shape == df.shape:
                    print("β Excel generation successful!")
                    print(f" File created: {output_path}")
                    print(f" Rows: {len(test_df)}, Columns: {len(test_df.columns)}")
                    return True
        print("β Excel generation failed")
        return False
    except Exception as e:
        # Test boundary: report any writer/reader error as a failed test
        # instead of aborting the remaining tests.
        print(f"β Excel generation error: {e}")
        return False
def test_llm_availability():
    """Report whether the extractor's LLM is loaded and can normalize a label.

    Returns:
        None when ``llm_available`` is falsy (the test is skipped), False
        when the trial normalization raises, and True otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 6: LLM Availability")
    print(rule)

    extractor = FinancialStatementExtractor()

    # Guard clause: without the LLM there is nothing to exercise.
    if not extractor.llm_available:
        print("β οΈ LLM not available - using rule-based fallback only")
        print(" This is OK - system will still work with deterministic methods")
        return None

    print("β LLM loaded successfully (google/flan-t5-small)")
    print(" Model will be used for normalization")
    # Test LLM normalization
    try:
        sample_output = extractor._llm_normalize("Revenue from operations")
        # An empty result is not reported as a failure; only an exception is.
        if sample_output:
            print(f" Test normalization: 'Revenue from operations' β '{sample_output}'")
        return True
    except Exception as e:
        print(f" β οΈ LLM loaded but normalization failed: {e}")
        return False
def run_all_tests(pdf_file=None):
    """Run every test in order and print a pass/fail/skip summary.

    Args:
        pdf_file: Optional PDF path handed to the PDF extraction test.

    Returns:
        True when no test returned False (skipped tests do not count as
        failures), otherwise False.
    """
    frame = "#" * 60
    print("\n" + frame)
    print("# FINANCIAL STATEMENT EXTRACTOR - TEST SUITE")
    print(frame)

    # Tests run in insertion order; each stores True, False or None.
    results = {}
    results['PDF Extraction'] = test_pdf_extraction(pdf_file)
    results['Normalization'] = test_normalization()
    results['Number Extraction'] = test_number_extraction()
    results['Year Detection'] = test_year_extraction()
    results['Excel Generation'] = test_excel_generation()
    results['LLM Availability'] = test_llm_availability()

    rule = "=" * 60
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    passed = failed = skipped = 0
    for test, result in results.items():
        if result is True:
            status = "β PASS"
            passed += 1
        elif result is False:
            status = "β FAIL"
            failed += 1
        else:
            status = "β οΈ SKIP"
            # Only an explicit None is tallied as skipped.
            if result is None:
                skipped += 1
        print(f"{status:12} {test}")

    print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped")
    if failed:
        print("\nβ οΈ Some tests failed. Please review errors above.")
    else:
        print("\nπ All critical tests passed! System is ready for deployment.")
    print(rule)
    return failed == 0
if __name__ == "__main__":
    # An optional first command-line argument names the PDF to test with.
    test_pdf = sys.argv[1] if len(sys.argv) > 1 else None
    if test_pdf is not None:
        print(f"Using test PDF: {test_pdf}")

    # Exit status mirrors the suite outcome: 0 when nothing failed, else 1.
    success = run_all_tests(test_pdf)
    sys.exit(0 if success else 1)