File size: 7,345 Bytes
ee27e09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""
Test script for Financial Statement Extractor
Tests all functionality and validates output
"""

import sys
import os
from pathlib import Path
import pandas as pd
from extractor import FinancialStatementExtractor

def test_pdf_extraction(test_file=None):
    """Run the extractor against a real PDF and report the outcome.

    Args:
        test_file: Path of the PDF to extract from. When it is omitted, empty,
            or does not exist on disk, the test is skipped.

    Returns:
        True when the extractor reports ``status == 'success'``, False when it
        reports any other status, and None when the test was skipped.
    """
    print("\n" + "="*60)
    print("TEST 1: PDF Extraction")
    print("="*60)

    # Guard clause: without a usable file there is nothing to test.
    if not test_file or not os.path.exists(test_file):
        print("⚠️  No test PDF file provided or file not found")
        return None

    extractor = FinancialStatementExtractor()
    result = extractor.extract_from_file(test_file)

    if result['status'] != 'success':
        # A failure payload may not carry a 'message' entry; fall back to a
        # placeholder instead of raising KeyError while reporting the failure.
        # NOTE(review): assumes `result` is a dict - confirm against extractor.
        message = result.get('message', 'no error message returned')
        print(f"❌ PDF extraction failed: {message}")
        return False

    frame = result['dataframe']
    print("✅ PDF extraction successful!")
    print(f"   Line items found: {len(frame)}")
    print(f"   Columns: {list(frame.columns)}")
    print("\nFirst 5 rows:")
    print(frame.head())
    return True

def test_normalization():
    """Check that raw line-item labels are normalized sensibly.

    Each case pairs a raw label with the canonical label it is expected to
    resemble. A case passes when the normalized result is non-empty and
    either equals the expected label (ignoring case and surrounding
    whitespace) or contains / is contained in the raw label.

    Returns:
        True when every case passes, otherwise False.
    """
    print("\n" + "="*60)
    print("TEST 2: Line Item Normalization")
    print("="*60)

    extractor = FinancialStatementExtractor()

    test_cases = [
        ("Revenue from ops", "Revenue From Operations"),
        ("Cost of Material Consumed", "Cost Of Materials Consumed"),
        ("Employee benefits expense", "Employee Benefit Expenses"),
        ("Profit before tax", "Profit Before Tax"),
        ("EBITDA", "Ebitda"),
    ]

    passed = 0
    for original, expected_similar in test_cases:
        normalized = extractor._normalize_item_name(original)
        print(f"   {original:30} → {normalized}")

        # Coerce defensively so a None result counts as a failed case
        # instead of crashing on .lower().
        got = str(normalized or "").strip().lower()
        if not got:
            # "" is a substring of every string, so an empty result would
            # otherwise be counted as a pass by the containment check below.
            continue

        raw = original.lower()
        matches_expected = got == expected_similar.strip().lower()
        if matches_expected or raw in got or got in raw:
            passed += 1

    all_passed = passed == len(test_cases)
    if all_passed:
        print(f"\n✅ Normalization working: {passed}/{len(test_cases)} cases handled")
    else:
        print(f"\n❌ Normalization incomplete: {passed}/{len(test_cases)} cases handled")
    return all_passed

def test_number_extraction():
    """Check that numeric values are parsed out of text correctly.

    Each case pairs an input string with the list of numbers expected from
    ``_extract_numbers``. A case passes only when the result has the same
    length as the expected list AND every value matches within a small
    absolute tolerance.

    Returns:
        True when every case passes, otherwise False.
    """
    print("\n" + "="*60)
    print("TEST 3: Numeric Value Extraction")
    print("="*60)

    extractor = FinancialStatementExtractor()

    test_cases = [
        ("123,456.78", [123456.78]),
        ("1,234 5,678 9,012", [1234.0, 5678.0, 9012.0]),
        # NOTE(review): accounting notation usually reads parentheses as a
        # negative amount; the expected value is kept as originally written -
        # confirm which sign the extractor is meant to produce.
        ("(1,234.56)", [1234.56]),  # Should handle parentheses
        ("-500.25", [-500.25]),
    ]

    tolerance = 1e-6  # absorbs float rounding without hiding real mismatches

    passed = 0
    for text, expected in test_cases:
        result = extractor._extract_numbers(text)
        try:
            # Compare the values themselves, not just how many were found.
            matches = len(result) == len(expected) and all(
                abs(got - want) <= tolerance
                for got, want in zip(result, expected)
            )
        except TypeError:
            # None or non-numeric results count as a failed case rather than
            # aborting the remaining cases.
            matches = False

        if matches:
            print(f"   ✅ '{text}' → {result}")
            passed += 1
        else:
            print(f"   ❌ '{text}' → {result} (expected {expected})")

    all_passed = passed == len(test_cases)
    marker = "✅" if all_passed else "❌"
    print(f"\n{marker} Number extraction: {passed}/{len(test_cases)} cases passed")
    return all_passed

def test_year_extraction():
    """Check that fiscal years are detected in a sample text.

    The sample mentions fiscal years in both "FY 25" and four-digit forms.
    The test passes when ``_extract_years`` returns at least one year.

    Returns:
        True when one or more years were detected, otherwise False.
    """
    print("\n" + "="*60)
    print("TEST 4: Fiscal Year Detection")
    print("="*60)

    extractor = FinancialStatementExtractor()

    test_text = """
    Financial Results for FY 25, FY 24, and FY 23
    Year ended March 31, 2025
    Comparative data for 2024 and 2023
    """

    years = extractor._extract_years(test_text)
    print(f"   Detected years: {years}")

    # Truthiness covers both an empty collection and None, so a missing
    # result is reported as a failure instead of raising on len(None).
    if not years:
        print("❌ Year extraction failed")
        return False

    print(f"✅ Year extraction working: {len(years)} years found")
    return True

def test_excel_generation():
    """Round-trip a small DataFrame through an .xlsx file.

    The workbook is written into a private temporary directory, so the test
    never overwrites or deletes a file in the current working directory, and
    the directory is removed on every exit path (success, mismatch or error).

    Returns:
        True when the file is written and reads back with the same columns
        and cell values; False on any mismatch or exception (including a
        missing Excel engine).
    """
    # Local import: only this test needs it.
    import tempfile

    print("\n" + "="*60)
    print("TEST 5: Excel File Generation")
    print("="*60)

    # Create sample data
    sample_data = {
        'Particulars': ['Revenue', 'Expenses', 'Profit'],
        'FY 25': [100000, 60000, 40000],
        'FY 24': [90000, 55000, 35000],
    }

    df = pd.DataFrame(sample_data)

    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_path = os.path.join(tmp_dir, "test_output.xlsx")
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Test', index=False)

            # Verify file exists and has data
            if os.path.exists(output_path):
                test_df = pd.read_excel(output_path)
                same_shape = test_df.shape == df.shape
                same_columns = list(test_df.columns) == list(df.columns)
                # Compare plain Python values so dtype differences introduced
                # by the Excel round trip do not cause false failures.
                same_values = test_df.values.tolist() == df.values.tolist()
                if same_shape and same_columns and same_values:
                    print("✅ Excel generation successful!")
                    print(f"   File created: {output_path}")
                    print(f"   Rows: {len(test_df)}, Columns: {len(test_df.columns)}")
                    return True

        print("❌ Excel generation failed")
        return False

    except Exception as e:
        # Test-harness boundary: report any failure as a failed test.
        print(f"❌ Excel generation error: {e}")
        return False

def test_llm_availability():
    """Report whether the optional LLM is loaded and can be invoked.

    Returns:
        None when ``extractor.llm_available`` is falsy (treated as a skip,
        since the rule-based fallback still works), True when the LLM is
        available and a sample normalization call completes, and False when
        that call raises.
    """
    print("\n" + "="*60)
    print("TEST 6: LLM Availability")
    print("="*60)

    extractor = FinancialStatementExtractor()

    if not extractor.llm_available:
        print("⚠️  LLM not available - using rule-based fallback only")
        print("   This is OK - system will still work with deterministic methods")
        return None

    print("✅ LLM loaded successfully (google/flan-t5-small)")
    print("   Model will be used for normalization")

    # Test LLM normalization; keep the try body to the one call that can fail.
    try:
        test_result = extractor._llm_normalize("Revenue from operations")
    except Exception as e:
        print(f"   ⚠️  LLM loaded but normalization failed: {e}")
        return False

    if test_result:
        print(f"   Test normalization: 'Revenue from operations' → '{test_result}'")
    else:
        # Surface the empty result instead of silently ignoring it. It still
        # counts as a pass because the call itself completed.
        print("   ⚠️  LLM normalization returned an empty result")
    return True

def run_all_tests(pdf_file=None):
    """Run every test in order and print a pass/fail/skip summary.

    Each test returns True (pass), False (fail) or None (skip). A test that
    raises an unexpected exception is recorded as a failure so the remaining
    tests still run and the summary is always printed.

    Args:
        pdf_file: Optional path to a PDF, forwarded to the PDF extraction test.

    Returns:
        True when no test failed (skips do not count as failures), otherwise
        False.
    """
    print("\n" + "#"*60)
    print("# FINANCIAL STATEMENT EXTRACTOR - TEST SUITE")
    print("#"*60)

    # Ordered (label, zero-argument callable) pairs; order is execution order.
    suite = [
        ('PDF Extraction', lambda: test_pdf_extraction(pdf_file)),
        ('Normalization', test_normalization),
        ('Number Extraction', test_number_extraction),
        ('Year Detection', test_year_extraction),
        ('Excel Generation', test_excel_generation),
        ('LLM Availability', test_llm_availability),
    ]

    results = {}
    for label, run in suite:
        try:
            results[label] = run()
        except Exception as exc:
            # Harness boundary: one crashing test must not hide the others.
            print(f"❌ {label} raised {type(exc).__name__}: {exc}")
            results[label] = False

    print("\n" + "="*60)
    print("TEST SUMMARY")
    print("="*60)

    outcomes = list(results.values())
    passed = sum(1 for v in outcomes if v is True)
    failed = sum(1 for v in outcomes if v is False)
    skipped = sum(1 for v in outcomes if v is None)

    for test, result in results.items():
        if result is True:
            status = "✅ PASS"
        elif result is False:
            status = "❌ FAIL"
        else:
            status = "⚠️  SKIP"
        print(f"{status:12} {test}")

    print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped")

    if failed == 0:
        print("\n🎉 All critical tests passed! System is ready for deployment.")
    else:
        print("\n⚠️  Some tests failed. Please review errors above.")

    print("="*60)

    return failed == 0

if __name__ == "__main__":
    # An optional first command-line argument names the PDF to test against.
    cli_args = sys.argv[1:]
    test_pdf = cli_args[0] if cli_args else None
    if test_pdf is not None:
        print(f"Using test PDF: {test_pdf}")

    # The exit status mirrors the suite outcome: 0 if nothing failed, else 1.
    success = run_all_tests(test_pdf)
    exit_code = 0 if success else 1
    sys.exit(exit_code)