File size: 38,783 Bytes
4e0f742
 
18398c9
34d4317
b6a0723
82fa835
437698e
34d4317
79acbc2
793d450
4700923
327cf9f
 
 
4700923
 
79acbc2
7aaecfe
437698e
 
 
 
 
7447ee1
 
6fa1f33
d8ec638
6fa1f33
03aab8f
34d4317
 
 
437698e
0f013a7
437698e
34d4317
437698e
be22fa9
4700923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f170645
 
34d4317
437698e
34d4317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db9da2f
437698e
 
 
 
 
 
 
 
f7b11e7
437698e
 
 
 
 
 
 
 
 
 
 
82fa835
db9da2f
437698e
 
34d4317
437698e
 
 
 
 
34d4317
 
 
437698e
 
34d4317
437698e
34d4317
437698e
 
 
 
 
 
 
 
34d4317
437698e
 
 
 
 
 
 
 
34d4317
437698e
 
34d4317
437698e
327cf9f
437698e
 
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437698e
327cf9f
 
 
 
 
 
437698e
327cf9f
 
 
 
 
 
437698e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7aaecfe
 
 
 
 
ba3a429
7aaecfe
 
 
 
 
 
 
 
 
 
ba3a429
dae25de
34d4317
7aaecfe
 
 
82fa835
eeb3dd2
7aaecfe
d8ec638
ba3a429
4700923
82fa835
 
 
f170645
4700923
ba3a429
4700923
 
 
 
 
f170645
82fa835
 
4700923
82fa835
 
ba3a429
4700923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba3a429
82fa835
94f9b7b
4700923
 
 
 
327cf9f
4700923
327cf9f
 
 
 
 
ba3a429
327cf9f
 
 
4700923
327cf9f
 
 
4700923
327cf9f
 
 
 
 
 
4700923
 
327cf9f
4700923
327cf9f
 
 
 
4700923
 
 
 
 
 
 
 
 
 
 
7aaecfe
fe614c0
aff8f4e
fe614c0
437698e
d8ec638
 
 
 
 
 
7aaecfe
5790e20
fe614c0
ba3a429
d8ec638
7aaecfe
 
 
 
fe614c0
7aaecfe
 
 
0315c1f
fe614c0
7aaecfe
d8ec638
7aaecfe
 
437698e
fe614c0
82fa835
fe614c0
0315c1f
7aaecfe
4700923
b6c48af
82fa835
 
 
 
 
327cf9f
82fa835
eeb3dd2
6d3e84c
dae25de
 
 
 
 
6d3e84c
 
 
 
 
 
 
 
 
4700923
6d3e84c
 
 
 
 
 
dae25de
eeb3dd2
7aaecfe
 
82fa835
 
 
327cf9f
4700923
7aaecfe
 
327cf9f
a33d8b8
7aaecfe
d8ec638
7530d87
 
 
 
 
 
4700923
7530d87
 
 
dae25de
7aaecfe
437698e
fe614c0
a33d8b8
dae25de
7aaecfe
82fa835
7aaecfe
437698e
 
5790e20
437698e
 
 
34d4317
7aaecfe
437698e
 
82fa835
437698e
db9da2f
437698e
82fa835
437698e
82fa835
327cf9f
82fa835
75ebbc3
327cf9f
 
 
 
 
 
82fa835
 
75ebbc3
 
82fa835
7aaecfe
82fa835
 
327cf9f
75ebbc3
327cf9f
82fa835
 
 
75ebbc3
327cf9f
82fa835
 
75ebbc3
327cf9f
82fa835
 
 
 
 
 
4700923
 
327cf9f
 
82fa835
 
75ebbc3
327cf9f
82fa835
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82fa835
f7b11e7
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4700923
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82fa835
 
 
 
 
 
 
 
 
 
327cf9f
82fa835
 
 
f7b11e7
82fa835
 
 
 
 
7aaecfe
75ebbc3
 
327cf9f
 
82fa835
 
 
 
 
 
 
327cf9f
 
 
 
 
82fa835
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae25de
327cf9f
 
 
 
 
82fa835
75ebbc3
e27e418
bd852bc
e27e418
 
e1ae56e
ba3a429
 
 
e27e418
7447ee1
 
488956c
eeb3dd2
 
bd852bc
eeb3dd2
 
e1ae56e
4700923
e1ae56e
437698e
7447ee1
437698e
e27e418
7447ee1
e1ae56e
 
 
437698e
e1ae56e
793d450
eeb3dd2
 
 
793d450
 
 
 
e27e418
aff8f4e
e27e418
aff8f4e
dae25de
e27e418
 
 
d280c62
5bf91d3
3a97ce5
76f6a35
 
 
 
 
 
 
 
 
 
3a97ce5
 
76f6a35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7b11e7
76f6a35
 
 
 
 
 
 
 
 
 
 
 
3a97ce5
76f6a35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a97ce5
ba3a429
e1ae56e
 
ba3a429
 
437698e
ba3a429
793d450
e1ae56e
 
ba3a429
4700923
 
 
 
 
e1ae56e
 
0908876
e1ae56e
 
eeb3dd2
dae25de
d8ec638
a33d8b8
 
d8ec638
7447ee1
eeb3dd2
e27e418
 
 
 
 
 
e1ae56e
4e0f742
 
e1ae56e
437698e
e1ae56e
82fa835
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
import gradio as gr
import pandas as pd
import os
import logging
import shutil
from datetime import datetime
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import tempfile
import io
import pdfplumber  # Added for PDF fallback processing
from fuzzywuzzy import process  # Added for fuzzy matching
import numpy as np
from scipy import stats  # For statistical analysis
import plotly.express as px  # For Plotly charts
import openpyxl  # Ensure XLSX dependency

# Import all backends
from nebius_backend import (
    NebiusFinanceProcessor,
    process_transactions_nebius,
    generate_financial_report_nebius,
    batch_process_transactions_nebius
)

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Check if running on Hugging Face Spaces
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
NEBIUS_API_KEY = os.getenv("NEBIUS_API_KEY")

# Initialize models for local processing
finbert_model = None
financial_classifier = None

# Fallback PDFTransactionProcessor if custom module is unavailable
try:
    from pdf_processor import PDFTransactionProcessor
except ImportError:
    class PDFTransactionProcessor:
        """Fallback PDF processor using pdfplumber to extract tables."""
        def process_pdf(self, pdf_path):
            try:
                logger.debug(f"Processing PDF with pdfplumber: {pdf_path}")
                data = []
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        tables = page.extract_tables()
                        for table in tables:
                            if table and len(table) > 1:  # Skip empty or header-only tables
                                data.extend(table[1:])  # Skip header row
                if not data:
                    raise ValueError("No tables found in PDF")
                # Assume first row has headers; adjust if needed
                df = pd.DataFrame(data[1:], columns=data[0])
                # Clean column names
                df.columns = [str(col).lower().strip() if col else f"col_{i}" for i, col in enumerate(df.columns)]
                return df
            except Exception as e:
                logger.error(f"PDF processing failed: {str(e)}")
                raise ValueError(f"Failed to process PDF: {str(e)}")

pdf_processor = PDFTransactionProcessor()

def initialize_finbert():
    """Initialize FinBERT model for financial sentiment analysis."""
    global finbert_model
    if finbert_model is None:
        try:
            logger.info("Initializing FinBERT model...")
            finbert_model = pipeline(
                'sentiment-analysis',
                model='ProsusAI/finbert',
                use_auth_token=HF_TOKEN,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("FinBERT model initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize FinBERT: {str(e)}")
            try:
                finbert_model = pipeline(
                    'sentiment-analysis',
                    model='cardiffnlp/twitter-roberta-base-sentiment-latest',
                    device=0 if torch.cuda.is_available() else -1
                )
                logger.info("Fallback sentiment model initialized")
            except Exception as fallback_error:
                logger.error(f"Failed to initialize fallback model: {str(fallback_error)}")
                finbert_model = None

def initialize_financial_classifier():
    """Initialize a financial classification model."""
    global financial_classifier
    if financial_classifier is None:
        try:
            logger.info("Initializing financial classifier...")
            financial_classifier = pipeline(
                'text-classification',
                model='nlptown/bert-base-multilingual-uncased-sentiment',
                use_auth_token=HF_TOKEN,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("Financial classifier initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize financial classifier: {str(e)}")
            financial_classifier = None

# Check API availability
nebius_available = bool(NEBIUS_API_KEY)

if nebius_available:
    try:
        test_processor_nebius = NebiusFinanceProcessor(NEBIUS_API_KEY)
        logger.info("Nebius AI Studio API available and initialized")
    except Exception as e:
        logger.warning(f"Nebius initialization failed: {str(e)}")
        nebius_available = False

def enhanced_categorize_transaction_local(description, amount):
    """Enhanced categorization using multiple Hugging Face models."""
    try:
        if finbert_model is None:
            initialize_finbert()
        if financial_classifier is None:
            initialize_financial_classifier()
        
        category = rule_based_categorization(description, amount)
        
        if finbert_model is not None:
            try:
                sentiment = finbert_model(description)[0]
                confidence = sentiment['score']
                if confidence > 0.8:
                    category = refine_category_with_sentiment(category, description, amount, sentiment)
            except Exception as e:
                logger.debug(f"Sentiment analysis failed: {str(e)}")
        
        if financial_classifier is not None:
            try:
                classification = financial_classifier(description)[0]
                category = refine_category_with_classification(category, classification)
            except Exception as e:
                logger.debug(f"Financial classification failed: {str(e)}")
        
        return category
    except Exception as e:
        logger.error(f"Error in enhanced categorization: {str(e)}")
        return rule_based_categorization(description, amount)

def rule_based_categorization(description, amount):
    """Enhanced rule-based categorization with fuzzy matching."""
    description = description.lower()
    
    # Fuzzy matching for better category detection
    categories = {
        'Salary Income': ['salary', 'wage', 'payroll', 'pay', 'income', 'bonus'],
        'Refunds & Returns': ['refund', 'return', 'cashback', 'rebate'],
        'Investment Income': ['interest', 'dividend', 'investment', 'stock', 'bond'],
        'Freelance Income': ['freelance', 'consulting', 'contract', 'side hustle'],
        'Other Income': ['miscellaneous', 'other'],
        'Groceries & Food': ['grocery', 'supermarket', 'food store', 'market', 'walmart', 'target', 'costco'],
        'Transportation - Fuel': ['gas', 'fuel', 'petrol', 'station', 'shell', 'bp', 'exxon'],
        'Transportation - Public/Ride': ['uber', 'lyft', 'taxi', 'bus', 'train', 'metro'],
        'Transportation - Vehicle': ['car payment', 'auto loan', 'vehicle', 'maintenance', 'repair'],
        'Utilities': ['electric', 'water', 'gas bill', 'internet', 'phone', 'cable'],
        'Housing': ['rent', 'mortgage', 'housing', 'apartment', 'condo'],
        'Dining Out': ['restaurant', 'dining', 'cafe', 'fast food', 'coffee', 'bar'],
        'Healthcare': ['medical', 'doctor', 'hospital', 'pharmacy', 'dental', 'health'],
        'Insurance': ['insurance', 'premium', 'coverage'],
        'Shopping - Clothing': ['clothing', 'apparel', 'shoes', 'retail', 'store'],
        'Shopping - Electronics': ['electronics', 'computer', 'phone', 'tech', 'gadget'],
        'Entertainment': ['entertainment', 'movie', 'netflix', 'spotify', 'hulu', 'game'],
        'Travel': ['travel', 'hotel', 'airbnb', 'flight', 'vacation'],
        'Education': ['education', 'school', 'tuition', 'course', 'book'],
        'Fitness': ['gym', 'fitness', 'sports', 'yoga', 'membership'],
        'Charity': ['charity', 'donation', 'nonprofit', 'giving'],
        'Taxes': ['tax', 'irs', 'federal', 'state', 'property tax'],
        'Miscellaneous Expenses': ['miscellaneous', 'other']
    }
    
    if amount > 0:
        for category, keywords in categories.items():
            if category in ['Salary Income', 'Refunds & Returns', 'Investment Income', 'Freelance Income', 'Other Income']:
                for keyword in keywords:
                    if process.extractOne(keyword, [description])[1] > 85:  # Fuzzy match threshold
                        return category
        return 'Other Income'
    else:
        for category, keywords in categories.items():
            if category not in ['Salary Income', 'Refunds & Returns', 'Investment Income', 'Freelance Income', 'Other Income']:
                for keyword in keywords:
                    if process.extractOne(keyword, [description])[1] > 85:  # Fuzzy match threshold
                        return category
        return 'Miscellaneous Expenses'

def refine_category_with_sentiment(current_category, description, amount, sentiment):
    """Refine category based on sentiment analysis results."""
    sentiment_label = sentiment['label'].lower()
    confidence = sentiment['score']
    
    if confidence > 0.8:
        if sentiment_label in ['negative', 'bearish']:
            if current_category in ['Dining Out', 'Entertainment', 'Shopping']:
                return f"Discretionary - {current_category}"
            elif current_category == 'Miscellaneous Expenses':
                return "Questionable Expense"
        elif sentiment_label in ['positive', 'bullish']:
            if current_category == 'Miscellaneous Expenses' and amount > 0:
                return "Unexpected Income"
            elif current_category in ['Groceries & Food', 'Healthcare']:
                return f"Essential - {current_category}"
    
    return current_category

def refine_category_with_classification(current_category, classification):
    """Refine category based on financial classification results."""
    label = classification['label'].lower()
    confidence = classification['score']
    
    if confidence > 0.8:
        if 'investment' in label:
            return 'Investment Related'
        elif 'loan' in label or 'credit' in label:
            return 'Debt Payment'
        elif 'subscription' in label:
            return 'Subscriptions'
        elif 'business' in label:
            return 'Business Expense' if current_category not in ['Salary Income', 'Freelance Income'] else current_category
    
    return current_category

def process_transactions(transactions, backend="local"):
    """Process transactions locally by categorizing them."""
    processed = []
    for tx in transactions:
        try:
            category = enhanced_categorize_transaction_local(tx['description'], tx['amount'])
            tx_copy = tx.copy()
            tx_copy['category'] = category
            processed.append(tx_copy)
        except Exception as e:
            logger.warning(f"Failed to categorize transaction {tx}: {str(e)}")
            tx_copy = tx.copy()
            tx_copy['category'] = 'Uncategorized'
            processed.append(tx_copy)
    return processed

def process_file(file_obj, backend="auto", currency="USD"):
    """Process an uploaded file and return categorized transactions, financial report, and expense chart."""
    try:
        if file_obj is None:
            raise ValueError("No file provided for processing")

        filename = file_obj.name
        logger.debug(f"Processing file: {filename} with backend: {backend}, currency: {currency}")

        # Process based on file type
        if filename.endswith('.pdf'):
            # Use a temporary file with proper cleanup
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmpfile:
                shutil.copyfileobj(file_obj, tmpfile)
                tmp_path = tmpfile.name
            try:
                logger.debug(f"Reading PDF: {tmp_path}")
                df = pdf_processor.process_pdf(tmp_path)
                if df.empty:
                    raise ValueError("No data extracted from PDF")
                logger.debug(f"PDF DataFrame columns: {list(df.columns)}")
            except Exception as e:
                raise ValueError(f"Failed to process PDF file: {str(e)}")
            finally:
                try:
                    os.unlink(tmp_path)
                    logger.debug(f"Deleted temp PDF file: {tmp_path}")
                except Exception as e:
                    logger.warning(f"Failed to delete temp PDF file: {str(e)}")
        elif filename.endswith('.csv'):
            try:
                logger.debug(f"Reading CSV: {filename}")
                df = pd.read_csv(file_obj)
                if df.empty:
                    raise ValueError("CSV file is empty")
                logger.debug(f"CSV DataFrame columns: {list(df.columns)}")
            except Exception as e:
                raise ValueError(f"Failed to process CSV file: {str(e)}")
        elif filename.endswith(('.xlsx', '.xls')):
            try:
                logger.debug(f"Reading Excel: {filename}")
                # Use openpyxl explicitly and read into memory to handle file-like objects
                file_content = file_obj.read()
                df = pd.read_excel(io.BytesIO(file_content), engine='openpyxl')
                if df.empty:
                    raise ValueError("Excel file is empty")
                logger.debug(f"Excel DataFrame columns: {list(df.columns)}")
            except Exception as e:
                raise ValueError(f"Failed to process Excel file: {str(e)}")
        else:
            raise ValueError("Unsupported file format. Please upload PDF, CSV, or Excel.")

        # Validate DataFrame
        if df is None or len(df) == 0:
            raise ValueError("No data extracted from file")

        # Standardize and validate columns with fuzzy matching
        df.columns = [str(col).lower().strip() for col in df.columns]
        column_mappings = {
            'date': ['date', 'transaction_date', 'posted_date', 'dt', 'transaction_dt'],
            'description': ['description', 'desc', 'memo', 'notes', 'transaction_description'],
            'amount': ['amount', 'amt', 'value', 'transaction_amount', 'total']
        }
        required_columns = ['date', 'description', 'amount']
        mapped_columns = {}
        for req_col in required_columns:
            for col in df.columns:
                if any(process.extractOne(col, column_mappings[req_col])[1] > 90 for col in [col.lower()]):
                    mapped_columns[req_col] = col
                    break
            if req_col not in mapped_columns:
                raise ValueError(f"Missing required column similar to '{req_col}'. Found columns: {list(df.columns)}")

        # Rename columns to standard names
        df = df.rename(columns={v: k for k, v in mapped_columns.items()})

        # Handle date parsing with multiple formats
        df['date'] = pd.to_datetime(df['date'], errors='coerce', infer_datetime_format=True)
        if df['date'].isna().all():
            raise ValueError("No valid dates found in 'date' column")
        if df['date'].isna().any():
            logger.warning(f"Dropping {df['date'].isna().sum()} rows with invalid dates")
            df = df.dropna(subset=['date'])

        # Ensure amount is numeric
        df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
        if df['amount'].isna().all():
            raise ValueError("No valid amounts found in 'amount' column")
        if df['amount'].isna().any():
            logger.warning(f"Dropping {df['amount'].isna().sum()} rows with invalid amounts")
            df = df.dropna(subset=['amount'])

        # Ensure description is string
        df['description'] = df['description'].astype(str)

        if df.empty:
            raise ValueError("No valid transactions remain after data cleaning")

        transactions = df.to_dict('records')
        logger.debug(f"Extracted {len(transactions)} transactions: {transactions[:2]}")

        if backend == "auto":
            if not nebius_available:
                logger.warning("Nebius API key not set; falling back to local processing")
                backend = "local"
            else:
                backend = "nebius"
        logger.debug(f"Selected backend: {backend}")

        if backend == "nebius" and nebius_available:
            with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as tmpfile:
                df.to_csv(tmpfile.name, index=False)
                logger.debug(f"Created temporary CSV file: {tmpfile.name}")
                if not os.path.exists(tmpfile.name):
                    raise ValueError(f"Temporary CSV file {tmpfile.name} was not created")
                if os.path.getsize(tmpfile.name) == 0:
                    raise ValueError(f"Temporary CSV file {tmpfile.name} is empty")
                processed_df = process_transactions_nebius(tmpfile.name)
                if processed_df.empty or 'error' in processed_df.columns:
                    raise ValueError(f"Nebius processing failed: {processed_df.get('error', ['Unknown error'])[0]}")
                report = generate_financial_report_nebius(processed_df.to_dict('records'))
                logger.debug(f"Nebius report: {report}")
                try:
                    os.unlink(tmpfile.name)
                    logger.debug(f"Deleted temporary CSV file: {tmpfile.name}")
                except Exception as e:
                    logger.warning(f"Failed to delete temp CSV file: {str(e)}")
        else:
            processed = process_transactions(transactions, "local")
            report = generate_report(processed, "local")
            processed_df = pd.DataFrame(processed)
            logger.debug(f"Local report: {report}")

        # Currency conversion rates (approximate as of June 09, 2025)
        currency_rates = {
            "USD": 1.0,
            "GBP": 0.79,
            "JPY": 157.5,
            "KES": 129.0
        }
        rate = currency_rates.get(currency, 1.0)
        processed_df['Amount'] = processed_df['amount'] * rate

        # Generate expense chart data using Plotly
        expense_chart = None
        if not processed_df.empty:
            expense_df = processed_df[processed_df['amount'] < 0].groupby('category')['Amount'].sum() * -1
            if not expense_df.empty:
                top_expenses = expense_df.sort_values(ascending=False).head(5)
                chart_df = pd.DataFrame({
                    'Category': top_expenses.index,
                    'Amount': top_expenses.values
                })
                expense_chart = px.pie(
                    chart_df,
                    values='Amount',
                    names='Category',
                    title=f'Top 5 Expense Categories ({currency})',
                    color_discrete_sequence=['#FF6384', '#36A2EB', '#FFCE56', '#4BC0C0', '#9966FF']
                )
                expense_chart.update_layout(
                    margin=dict(t=50, b=10, l=10, r=10),
                    legend=dict(orientation="h", yanchor="top", y=1.1, xanchor="center", x=0.5),
                    font=dict(color="#333333")
                )

        # Format the summary with currency symbol
        if isinstance(report, dict) and 'error' not in report:
            summary = report.get('summary', report)
            summary['total_income'] = round(summary.get('total_income', 0) * rate, 2)
            summary['total_expenses'] = round(summary.get('total_expenses', 0) * rate, 2)
            summary['net_balance'] = round(summary.get('net_balance', 0) * rate, 2)
            formatted_summary = format_summary(summary, currency, processed_df)
            error_message = ""
        else:
            summary = {"error": str(report.get('error', 'Unknown error in report generation'))}
            formatted_summary = format_summary(summary, currency, pd.DataFrame())
            error_message = summary.get('error', "Unknown error")

        logger.debug("File processing completed successfully")
        # Prepare temporary files for CSV and XLSX downloads
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as tmp_csv:
            tmp_csv.write(processed_df.to_csv(index=False).encode())
            csv_path = tmp_csv.name
        with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp_xlsx:
            output = io.BytesIO()
            processed_df.to_excel(output, index=False, engine='openpyxl')
            tmp_xlsx.write(output.getvalue())
            xlsx_path = tmp_xlsx.name

        return processed_df, formatted_summary, error_message, csv_path, xlsx_path, expense_chart

    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        error_message = str(e)
        return pd.DataFrame({"error": [str(e)]}), format_summary({"error": str(e)}, currency, pd.DataFrame()), error_message, None, None, None

def generate_report(transactions, backend="auto"):
    """Generate a financial report from processed transactions."""
    try:
        if backend == "auto":
            if nebius_available:
                backend = "nebius"
            else:
                backend = "local"
        
        if backend == "nebius" and nebius_available:
            return generate_financial_report_nebius(transactions)
        else:
            return generate_local_report(transactions)
            
    except Exception as e:
        logger.error(f"Error in generate_report: {str(e)}")
        return generate_local_report(transactions)

def generate_local_report(transactions):
    """Generate a detailed financial report with enhanced AI insights."""
    if not transactions:
        return {
            'summary': {
                'total_income': 0,
                'total_expenses': 0,
                'net_balance': 0,
                'num_transactions': 0
            },
            'trends': [],
            'insights': ["No transactions provided for analysis."]
        }
    
    df = pd.DataFrame(transactions)
    
    # Convert date strings to datetime for analysis
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date'])
    
    # Calculate basic metrics
    income = df[df['amount'] > 0]['amount'].sum()
    expenses = df[df['amount'] < 0]['amount'].sum() * -1
    net = income - expenses
    
    # Group by categories
    expense_categories = df[df['amount'] < 0].groupby('category')['amount'].sum().sort_values()
    income_categories = df[df['amount'] > 0].groupby('category')['amount'].sum().sort_values(ascending=False)
    
    # Time-based analysis
    df['month'] = df['date'].dt.to_period('M')
    df['year'] = df['date'].dt.to_period('Y')
    monthly_expenses = df[df['amount'] < 0].groupby('month')['amount'].sum() * -1
    monthly_income = df[df['amount'] > 0].groupby('month')['amount'].sum()
    ytd_expenses = df[df['amount'] < 0].groupby('year')['amount'].sum() * -1
    ytd_income = df[df['amount'] > 0].groupby('year')['amount'].sum()
    current_month = pd.to_datetime('2025-06-09').to_period('M')
    current_year = pd.to_datetime('2025-06-09').to_period('Y')
    mtd_expenses = df[(df['amount'] < 0) & (df['month'] == current_month)]['amount'].sum() * -1
    mtd_income = df[(df['amount'] > 0) & (df['month'] == current_month)]['amount'].sum()
    ytd_expenses_total = ytd_expenses.get(current_year, 0) * -1
    ytd_income_total = ytd_income.get(current_year, 0)
    
    # Advanced AI Insights
    insights = []
    
    # 1. Spending Behavior Analysis
    frequent_categories = df[df['amount'] < 0].groupby('category').size().sort_values(ascending=False).head(5)
    if not frequent_categories.empty:
        top_categories = [f"{cat} ({count} transactions, {df[(df['category'] == cat) & (df['amount'] < 0)]['amount'].sum() * -1:.2f})"
                         for cat, count in frequent_categories.items()]
        insights.append(f"**Top Spending Categories**: {', '.join(top_categories)} account for the majority of your transactions.")
    
    # 2. Anomaly Detection (using IQR method)
    amounts = df['amount'].abs()
    q1, q3 = amounts.quantile([0.25, 0.75])
    iqr = q3 - q1
    anomaly_threshold = q3 + 1.5 * iqr
    anomalies = df[amounts > anomaly_threshold]
    if not anomalies.empty:
        for _, anomaly in anomalies.iterrows():
            insights.append(f"**Anomaly Detected**: Unusual {('income' if anomaly['amount'] > 0 else 'expense')} of {anomaly['amount']:.2f} {currency_symbols.get(currency, '$')} on {anomaly['date'].date()} for '{anomaly['description']}' (category: {anomaly['category']}).")
    
    # 3. Income Stability and Volatility
    if len(monthly_income) > 1:
        income_mean = monthly_income.mean()
        income_std = monthly_income.std()
        income_cv = income_std / income_mean if income_mean > 0 else 0
        if income_cv > 0.3:
            insights.append(f"**Income Volatility**: Your income varies significantly (coefficient of variation: {income_cv:.2%}), indicating irregular earnings. Consider stabilizing income sources.")
        else:
            insights.append(f"Income Stability: Your monthly income appears consistent with low variability.")
    
    # 4. Spending Trends (Seasonality Analysis)
    if len(monthly_expenses) > 3:
        expense_trend = stats.linregress(range(len(monthly_expenses)), monthly_expenses.values)
        if expense_trend.slope > 0:
            insights.append(f"**Spending Trend**: Your monthly expenses are increasing by approximately {expense_trend.slope:.2f} {currency_symbols.get(currency, '$')} per month. Review discretionary spending.")
        elif expense_trend.slope < 0:
            insights.append(f"**Spending Trend**: Your monthly expenses are decreasing by approximately {-expense_trend.slope:.2f} {currency_symbols.get(currency, '$')} per month. Good job controlling costs!")
    
    # 5. Financial Health
    if income > 0:
        expense_to_income_ratio = expenses / income
        financial_health_score = max(0, min(10, 10 - (expense_to_income_ratio * 10)))
        health_assessment = "Healthy" if financial_health_score >= 7 else "Needs Improvement" if financial_health_score >= 4 else "At Risk"
        insights.append(f"**Financial Health Score**: {financial_health_score:.1f}/10 ({health_assessment})")
        if expense_to_income_ratio > 0.7:
            insights.append(f"**Alert**: Spending is {expense_to_income_ratio:.1%} of income. Aim to reduce discretionary expenses.")
    
    # 6. MTD and YTD Insights
    if mtd_expenses > 0:
        avg_daily_mtd = mtd_expenses / df[df['month'] == current_month]['date'].nunique()
        days_in_month = 30
        projected_mtd = avg_daily_mtd * days_in_month
        insights.append(f"**MTD Spending (as of June 09, 2025)**: {mtd_expenses:.2f} {currency_symbols.get(currency, '$')}. Projected monthly spend: {projected_mtd:.2f} {currency_symbols.get(currency, '$')}.")
    if ytd_income_total > 0 and ytd_expenses_total > 0:
        ytd_savings_rate = 1 - (ytd_expenses_total / ytd_income_total)
        insights.append(f"**YTD Savings Rate**: {ytd_savings_rate:.1%} of income saved in 2025.")
    
    # 7. Predictive Insights
    if len(monthly_expenses) > 1:
        avg_monthly_expense = monthly_expenses.mean()
        insights.append(f"**Forecast**: Based on historical data, expect next month's expenses to be around {avg_monthly_expense:.2f} {currency_symbols.get(currency, '$')} ± {monthly_expenses.std():.2f}.")
    
    # 8. Personalized Recommendations
    if expenses > income * 0.7:
        savings_target = income * 0.2
        insights.append(f"**Recommendation**: Reduce expenses by {savings_target:.2f} {currency_symbols.get(currency, '$')} (20% of income) to improve savings. Focus on discretionary categories like {frequent_categories.index[0] if not frequent_categories.empty else 'miscellaneous'}.")
    elif net > 0 and net < income * 0.1:
        insights.append("**Recommendation**: Consider investing excess funds in low-risk options to grow your wealth.")
    if 'Discretionary -' in df['category'].values:
        discretionary_expenses = df[df['category'].str.contains('Discretionary')]['amount'].sum() * -1
        insights.append(f"**Recommendation**: Discretionary spending totals {discretionary_expenses:.2f} {currency_symbols.get(currency, '$')}. Consider cutting back on non-essential purchases.")
    
    # Fallback if no insights
    if not insights:
        insights.append("No significant patterns detected. Review your spending for potential optimizations.")

    report = {
        'summary': {
            'total_income': round(income, 2),
            'total_expenses': round(expenses, 2),
            'net_balance': round(net, 2),
            'num_transactions': len(df)
        },
        'trends': {
            'top_expense_categories': [
                {'category': cat, 'amount': round(abs(amt), 2), 'count': len(df[df['category'] == cat])}
                for cat, amt in expense_categories.head(5).items()
            ],
            'top_income_categories': [
                {'category': cat, 'amount': round(amt, 2), 'count': len(df[df['category'] == cat])}
                for cat, amt in income_categories.head(3).items()
            ]
        },
        'insights': insights
    }
    
    return report

def format_summary(summary, currency, df):
    """Format financial summary as readable text with enhanced structure."""
    currency_symbols = {"USD": "$", "GBP": "£", "JPY": "¥", "KES": "KSh"}
    symbol = currency_symbols.get(currency, "KSh")

    if 'error' in summary:
        return f"**Error**: {summary['error']}"
    
    lines = ["## Financial Summary"]
    lines.append(f"**Overview** (in {currency})")
    lines.append(f"- **Total Income**: {symbol}{summary.get('total_income', 0):,.2f}")
    lines.append(f"- **Total Expenses**: {symbol}{summary.get('total_expenses', 0):,.2f}")
    lines.append(f"- **Net Balance**: {symbol}{summary.get('net_balance', 0):,.2f}")
    lines.append(f"- **Number of Transactions**: {summary.get('num_transactions', 0)}")
    
    if 'trends' in summary:
        trends = summary['trends']
        if trends.get('top_expense_categories'):
            lines.append("\n**Top Expense Categories**")
            for item in trends['top_expense_categories']:
                lines.append(f"  - {item['category']}: {symbol}{item['amount']:.2f} ({item['count']} transactions)")
        if trends.get('top_income_categories'):
            lines.append("\n**Top Income Categories**")
            for item in trends['top_income_categories']:
                lines.append(f"  - {item['category']}: {symbol}{item['amount']:.2f} ({item['count']} transactions)")
    
    if 'insights' in summary:
        lines.append("\n## AI-Powered Insights")
        for insight in summary['insights']:
            lines.append(f"- {insight}")
    
    # Add simple ASCII chart for expense categories
    if not df.empty:
        expense_df = df[df['amount'] < 0].groupby('category')['amount'].sum().sort_values() * -1
        if not expense_df.empty:
            lines.append("\n**Expense Distribution (ASCII)**")
            max_amount = expense_df.max()
            for cat, amt in expense_df.head(5).items():
                bar_length = int((amt / max_amount) * 20) if max_amount > 0 else 0
                lines.append(f"{cat}: {symbol}{amt:.2f} {'█' * bar_length}")

    return "\n".join(lines)

# Global variable to store the last selected currency
currency = "KSh"
currency_symbols = {"USD": "$", "GBP": "£", "JPY": "¥", "KES": "KSh"}

def create_ui():
    with gr.Blocks(title="Financial Transaction Processor", theme="soft") as demo:
        gr.Markdown("# Financial Transaction Processor")
        gr.Markdown("Upload your bank transactions (PDF, CSV, or Excel) to get categorized spending and financial insights.")

        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Transactions (CSV, PDF, or Excel)")
                currency_selector = gr.Dropdown(
                    choices=["USD", "GBP", "JPY", "KES"],
                    value="KES",
                    label="Select Currency"
                )
                backend_radio = gr.Radio(
                    choices=["auto", "nebius", "local"],
                    value="auto",
                    label="Processing Backend"
                )
                process_btn = gr.Button("Process Transactions", variant="primary")

            with gr.Column():
                processed_table = gr.Dataframe(
                    label="Processed Transactions",
                    headers=["Date", "Description", "Amount", "Category"],
                    datatype=["str", "str", "number", "str"]
                )
                download_csv_btn = gr.DownloadButton(
                    value=None,
                    label="Download Table as CSV"
                )
                download_xlsx_btn = gr.DownloadButton(
                    value=None,
                    label="Download Table as XLSX"
                )

        with gr.Row():
            with gr.Column(elem_classes=["summary-column"], scale=1):
                report_summary = gr.Markdown(label="Financial Summary")
                expense_chart = gr.Plot(label="Expense Distribution Chart")

        with gr.Row():
            error_display = gr.Textbox(label="Error Details", visible=False)

        # CSS definition
        demo.css = """
            /* Container Styles */
            .financial-container {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                max-width: 1200px;
                margin: 0 auto;
                padding: 20px;
                color: #333;
            }
        
            /* Summary Column Styles */
            .summary-column {
                border: 2px solid #4a90e2;
                padding: 15px;
                margin: 10px;
                border-radius: 8px;
                background-color: #f8fafc;
                box-shadow: 0 2px 5px rgba(0,0,0,0.1);
                transition: all 0.3s ease;
            }
        
            .summary-column:hover {
                box-shadow: 0 5px 15px rgba(0,0,0,0.1);
                transform: translateY(-2px);
            }
        
            /* Header Styles */
            .summary-header {
                color: #2c5282;
                font-size: 1.2rem;
                font-weight: 600;
                margin-bottom: 15px;
                padding-bottom: 8px;
                border-bottom: 1px solid #cbd5e0;
            }
        
            /* Data Display Styles */
            .summary-data {
                display: flex;
                justify-content: space-between;
                margin: 8px 0;
            }
        
            .data-label {
                font-weight: 500;
                color: #4a5568;
            }
        
            .data-value {
                font-weight: 600;
            }
        
            .positive-value {
                color: #38a169;
            }
        
            .negative-value {
                color: #e53e3e;
            }
        
            /* Chart Container */
            .chart-container {
                height: 250px;
                margin: 20px 0;
                padding: 15px;
                background: white;
                border-radius: 8px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.05);
            }
        
            /* Responsive Grid */
            .summary-grid {
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
                gap: 15px;
            }
        
            /* Button Styles */
            .action-button {
                background-color: #4299e1;
                color: white;
                border: none;
                padding: 8px 16px;
                border-radius: 4px;
                cursor: pointer;
                font-size: 0.9rem;
                margin-top: 10px;
                transition: background-color 0.2s;
            }
        
            .action-button:hover {
                background-color: #3182ce;
            }
        
            /* Table Styles */
            .transaction-table {
                width: 100;
                border-collapse: collapse;
                margin-top: 20px;
            }
        
            .transaction-table th {
                background-color: #4a90e2;
                color: white;
                padding: 12px;
                text-align: left;
            }
        
            .transaction-table td {
                padding: 10px;
                border-bottom: 1px solid #e2e8f0;
            }
        
            .transaction-table tr:hover {
                background-color: #ebf8ff;
            }
        
            /* Responsive Media Queries */
            @media (max-width: 768px) {
                .summary-grid {
                    grid-template-columns: 1fr;
                }
                
                .financial-container {
                    padding: 10px;
                }
            }
        """
        with gr.Accordion("Example Transaction File Format", open=False):
            gr.Markdown("""
            Your file should include at least these columns (case-insensitive):
            - **Date**: Transaction date (YYYY-MM-DD format preferred)
            - **Description**: Transaction description/text
            - **Amount**: Positive for income, negative for expenses (in USD by default)
            
            Example CSV content:
            ```
            Date,Description,Amount
            2025-06-01,Salary Deposit,3000.00
            2025-06-02,Grocery Store,-125.50
            2025-06-03,Restaurant,-50.00
            2025-06-04,Netflix,-15.00
            2025-06-05,Fuel,-40.00
            ```
            """)

        process_btn.click(
            fn=process_file,
            inputs=[file_input, backend_radio, currency_selector],
            outputs=[processed_table, report_summary, error_display, download_csv_btn, download_xlsx_btn, expense_chart]
        ).then(
            fn=lambda error_msg: gr.update(visible=bool(error_msg)),
            inputs=[error_display],
            outputs=[error_display]
        )

        currency_selector.change(
            fn=lambda x: globals().update(currency=x),
            inputs=[currency_selector],
            outputs=[]
        )

    return demo

if __name__ == "__main__":
    initialize_finbert()
    initialize_financial_classifier()
    demo = create_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860)