Spaces:

smainye
/

Finapp

Sleeping

File size: 38,783 Bytes

4e0f742
 
18398c9
34d4317
b6a0723
82fa835
437698e
34d4317
79acbc2
793d450
4700923
327cf9f
 
 
4700923
 
79acbc2
7aaecfe
437698e
 
 
 
 
7447ee1
 
6fa1f33
d8ec638
6fa1f33
03aab8f
34d4317
 
 
437698e
0f013a7
437698e
34d4317
437698e
be22fa9
4700923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f170645
 
34d4317
437698e
34d4317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db9da2f
437698e
 
 
 
 
 
 
 
f7b11e7
437698e
 
 
 
 
 
 
 
 
 
 
82fa835
db9da2f
437698e
 
34d4317
437698e
 
 
 
 
34d4317
 
 
437698e
 
34d4317
437698e
34d4317
437698e
 
 
 
 
 
 
 
34d4317
437698e
 
 
 
 
 
 
 
34d4317
437698e
 
34d4317
437698e
327cf9f
437698e
 
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437698e
327cf9f
 
 
 
 
 
437698e
327cf9f
 
 
 
 
 
437698e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7aaecfe
 
 
 
 
ba3a429
7aaecfe
 
 
 
 
 
 
 
 
 
ba3a429
dae25de
34d4317
7aaecfe
 
 
82fa835
eeb3dd2
7aaecfe
d8ec638
ba3a429
4700923
82fa835
 
 
f170645
4700923
ba3a429
4700923
 
 
 
 
f170645
82fa835
 
4700923
82fa835
 
ba3a429
4700923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba3a429
82fa835
94f9b7b
4700923
 
 
 
327cf9f
4700923
327cf9f
 
 
 
 
ba3a429
327cf9f
 
 
4700923
327cf9f
 
 
4700923
327cf9f
 
 
 
 
 
4700923
 
327cf9f
4700923
327cf9f
 
 
 
4700923
 
 
 
 
 
 
 
 
 
 
7aaecfe
fe614c0
aff8f4e
fe614c0
437698e
d8ec638
 
 
 
 
 
7aaecfe
5790e20
fe614c0
ba3a429
d8ec638
7aaecfe
 
 
 
fe614c0
7aaecfe
 
 
0315c1f
fe614c0
7aaecfe
d8ec638
7aaecfe
 
437698e
fe614c0
82fa835
fe614c0
0315c1f
7aaecfe
4700923
b6c48af
82fa835
 
 
 
 
327cf9f
82fa835
eeb3dd2
6d3e84c
dae25de
 
 
 
 
6d3e84c
 
 
 
 
 
 
 
 
4700923
6d3e84c
 
 
 
 
 
dae25de
eeb3dd2
7aaecfe
 
82fa835
 
 
327cf9f
4700923
7aaecfe
 
327cf9f
a33d8b8
7aaecfe
d8ec638
7530d87
 
 
 
 
 
4700923
7530d87
 
 
dae25de
7aaecfe
437698e
fe614c0
a33d8b8
dae25de
7aaecfe
82fa835
7aaecfe
437698e
 
5790e20
437698e
 
 
34d4317
7aaecfe
437698e
 
82fa835
437698e
db9da2f
437698e
82fa835
437698e
82fa835
327cf9f
82fa835
75ebbc3
327cf9f
 
 
 
 
 
82fa835
 
75ebbc3
 
82fa835
7aaecfe
82fa835
 
327cf9f
75ebbc3
327cf9f
82fa835
 
 
75ebbc3
327cf9f
82fa835
 
75ebbc3
327cf9f
82fa835
 
 
 
 
 
4700923
 
327cf9f
 
82fa835
 
75ebbc3
327cf9f
82fa835
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82fa835
f7b11e7
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4700923
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82fa835
 
 
 
 
 
 
 
 
 
327cf9f
82fa835
 
 
f7b11e7
82fa835
 
 
 
 
7aaecfe
75ebbc3
 
327cf9f
 
82fa835
 
 
 
 
 
 
327cf9f
 
 
 
 
82fa835
327cf9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dae25de
327cf9f
 
 
 
 
82fa835
75ebbc3
e27e418
bd852bc
e27e418
 
e1ae56e
ba3a429
 
 
e27e418
7447ee1
 
488956c
eeb3dd2
 
bd852bc
eeb3dd2
 
e1ae56e
4700923
e1ae56e
437698e
7447ee1
437698e
e27e418
7447ee1
e1ae56e
 
 
437698e
e1ae56e
793d450
eeb3dd2
 
 
793d450
 
 
 
e27e418
aff8f4e
e27e418
aff8f4e
dae25de
e27e418
 
 
d280c62
5bf91d3
3a97ce5
76f6a35
 
 
 
 
 
 
 
 
 
3a97ce5
 
76f6a35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7b11e7
76f6a35
 
 
 
 
 
 
 
 
 
 
 
3a97ce5
76f6a35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a97ce5
ba3a429
e1ae56e
 
ba3a429
 
437698e
ba3a429
793d450
e1ae56e
 
ba3a429
4700923
 
 
 
 
e1ae56e
 
0908876
e1ae56e
 
eeb3dd2
dae25de
d8ec638
a33d8b8
 
d8ec638
7447ee1
eeb3dd2
e27e418
 
 
 
 
 
e1ae56e
4e0f742
 
e1ae56e
437698e
e1ae56e
82fa835

import gradio as gr
import pandas as pd
import os
import logging
import shutil
from datetime import datetime
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import tempfile
import io
import pdfplumber  # Added for PDF fallback processing
from fuzzywuzzy import process  # Added for fuzzy matching
import numpy as np
from scipy import stats  # For statistical analysis
import plotly.express as px  # For Plotly charts
import openpyxl  # Ensure XLSX dependency

# Import all backends
from nebius_backend import (
    NebiusFinanceProcessor,
    process_transactions_nebius,
    generate_financial_report_nebius,
    batch_process_transactions_nebius
)

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Check if running on Hugging Face Spaces
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
NEBIUS_API_KEY = os.getenv("NEBIUS_API_KEY")

# Initialize models for local processing
finbert_model = None
financial_classifier = None

# Fallback PDFTransactionProcessor if custom module is unavailable
try:
    from pdf_processor import PDFTransactionProcessor
except ImportError:
    class PDFTransactionProcessor:
        """Fallback PDF processor using pdfplumber to extract tables."""
        def process_pdf(self, pdf_path):
            try:
                logger.debug(f"Processing PDF with pdfplumber: {pdf_path}")
                data = []
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        tables = page.extract_tables()
                        for table in tables:
                            if table and len(table) > 1:  # Skip empty or header-only tables
                                data.extend(table[1:])  # Skip header row
                if not data:
                    raise ValueError("No tables found in PDF")
                # Assume first row has headers; adjust if needed
                df = pd.DataFrame(data[1:], columns=data[0])
                # Clean column names
                df.columns = [str(col).lower().strip() if col else f"col_{i}" for i, col in enumerate(df.columns)]
                return df
            except Exception as e:
                logger.error(f"PDF processing failed: {str(e)}")
                raise ValueError(f"Failed to process PDF: {str(e)}")

pdf_processor = PDFTransactionProcessor()

def initialize_finbert():
    """Initialize FinBERT model for financial sentiment analysis."""
    global finbert_model
    if finbert_model is None:
        try:
            logger.info("Initializing FinBERT model...")
            finbert_model = pipeline(
                'sentiment-analysis',
                model='ProsusAI/finbert',
                use_auth_token=HF_TOKEN,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("FinBERT model initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize FinBERT: {str(e)}")
            try:
                finbert_model = pipeline(
                    'sentiment-analysis',
                    model='cardiffnlp/twitter-roberta-base-sentiment-latest',
                    device=0 if torch.cuda.is_available() else -1
                )
                logger.info("Fallback sentiment model initialized")
            except Exception as fallback_error:
                logger.error(f"Failed to initialize fallback model: {str(fallback_error)}")
                finbert_model = None

def initialize_financial_classifier():
    """Initialize a financial classification model."""
    global financial_classifier
    if financial_classifier is None:
        try:
            logger.info("Initializing financial classifier...")
            financial_classifier = pipeline(
                'text-classification',
                model='nlptown/bert-base-multilingual-uncased-sentiment',
                use_auth_token=HF_TOKEN,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("Financial classifier initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize financial classifier: {str(e)}")
            financial_classifier = None

# Check API availability
nebius_available = bool(NEBIUS_API_KEY)

if nebius_available:
    try:
        test_processor_nebius = NebiusFinanceProcessor(NEBIUS_API_KEY)
        logger.info("Nebius AI Studio API available and initialized")
    except Exception as e:
        logger.warning(f"Nebius initialization failed: {str(e)}")
        nebius_available = False

def enhanced_categorize_transaction_local(description, amount):
    """Enhanced categorization using multiple Hugging Face models."""
    try:
        if finbert_model is None:
            initialize_finbert()
        if financial_classifier is None:
            initialize_financial_classifier()
        
        category = rule_based_categorization(description, amount)
        
        if finbert_model is not None:
            try:
                sentiment = finbert_model(description)[0]
                confidence = sentiment['score']
                if confidence > 0.8:
                    category = refine_category_with_sentiment(category, description, amount, sentiment)
            except Exception as e:
                logger.debug(f"Sentiment analysis failed: {str(e)}")
        
        if financial_classifier is not None:
            try:
                classification = financial_classifier(description)[0]
                category = refine_category_with_classification(category, classification)
            except Exception as e:
                logger.debug(f"Financial classification failed: {str(e)}")
        
        return category
    except Exception as e:
        logger.error(f"Error in enhanced categorization: {str(e)}")
        return rule_based_categorization(description, amount)

def rule_based_categorization(description, amount):
    """Enhanced rule-based categorization with fuzzy matching."""
    description = description.lower()
    
    # Fuzzy matching for better category detection
    categories = {
        'Salary Income': ['salary', 'wage', 'payroll', 'pay', 'income', 'bonus'],
        'Refunds & Returns': ['refund', 'return', 'cashback', 'rebate'],
        'Investment Income': ['interest', 'dividend', 'investment', 'stock', 'bond'],
        'Freelance Income': ['freelance', 'consulting', 'contract', 'side hustle'],
        'Other Income': ['miscellaneous', 'other'],
        'Groceries & Food': ['grocery', 'supermarket', 'food store', 'market', 'walmart', 'target', 'costco'],
        'Transportation - Fuel': ['gas', 'fuel', 'petrol', 'station', 'shell', 'bp', 'exxon'],
        'Transportation - Public/Ride': ['uber', 'lyft', 'taxi', 'bus', 'train', 'metro'],
        'Transportation - Vehicle': ['car payment', 'auto loan', 'vehicle', 'maintenance', 'repair'],
        'Utilities': ['electric', 'water', 'gas bill', 'internet', 'phone', 'cable'],
        'Housing': ['rent', 'mortgage', 'housing', 'apartment', 'condo'],
        'Dining Out': ['restaurant', 'dining', 'cafe', 'fast food', 'coffee', 'bar'],
        'Healthcare': ['medical', 'doctor', 'hospital', 'pharmacy', 'dental', 'health'],
        'Insurance': ['insurance', 'premium', 'coverage'],
        'Shopping - Clothing': ['clothing', 'apparel', 'shoes', 'retail', 'store'],
        'Shopping - Electronics': ['electronics', 'computer', 'phone', 'tech', 'gadget'],
        'Entertainment': ['entertainment', 'movie', 'netflix', 'spotify', 'hulu', 'game'],
        'Travel': ['travel', 'hotel', 'airbnb', 'flight', 'vacation'],
        'Education': ['education', 'school', 'tuition', 'course', 'book'],
        'Fitness': ['gym', 'fitness', 'sports', 'yoga', 'membership'],
        'Charity': ['charity', 'donation', 'nonprofit', 'giving'],
        'Taxes': ['tax', 'irs', 'federal', 'state', 'property tax'],
        'Miscellaneous Expenses': ['miscellaneous', 'other']
    }
    
    if amount > 0:
        for category, keywords in categories.items():
            if category in ['Salary Income', 'Refunds & Returns', 'Investment Income', 'Freelance Income', 'Other Income']:
                for keyword in keywords:
                    if process.extractOne(keyword, [description])[1] > 85:  # Fuzzy match threshold
                        return category
        return 'Other Income'
    else:
        for category, keywords in categories.items():
            if category not in ['Salary Income', 'Refunds & Returns', 'Investment Income', 'Freelance Income', 'Other Income']:
                for keyword in keywords:
                    if process.extractOne(keyword, [description])[1] > 85:  # Fuzzy match threshold
                        return category
        return 'Miscellaneous Expenses'

def refine_category_with_sentiment(current_category, description, amount, sentiment):
    """Refine category based on sentiment analysis results."""
    sentiment_label = sentiment['label'].lower()
    confidence = sentiment['score']
    
    if confidence > 0.8:
        if sentiment_label in ['negative', 'bearish']:
            if current_category in ['Dining Out', 'Entertainment', 'Shopping']:
                return f"Discretionary - {current_category}"
            elif current_category == 'Miscellaneous Expenses':
                return "Questionable Expense"
        elif sentiment_label in ['positive', 'bullish']:
            if current_category == 'Miscellaneous Expenses' and amount > 0:
                return "Unexpected Income"
            elif current_category in ['Groceries & Food', 'Healthcare']:
                return f"Essential - {current_category}"
    
    return current_category

def refine_category_with_classification(current_category, classification):
    """Refine category based on financial classification results."""
    label = classification['label'].lower()
    confidence = classification['score']
    
    if confidence > 0.8:
        if 'investment' in label:
            return 'Investment Related'
        elif 'loan' in label or 'credit' in label:
            return 'Debt Payment'
        elif 'subscription' in label:
            return 'Subscriptions'
        elif 'business' in label:
            return 'Business Expense' if current_category not in ['Salary Income', 'Freelance Income'] else current_category
    
    return current_category

def process_transactions(transactions, backend="local"):
    """Process transactions locally by categorizing them."""
    processed = []
    for tx in transactions:
        try:
            category = enhanced_categorize_transaction_local(tx['description'], tx['amount'])
            tx_copy = tx.copy()
            tx_copy['category'] = category
            processed.append(tx_copy)
        except Exception as e:
            logger.warning(f"Failed to categorize transaction {tx}: {str(e)}")
            tx_copy = tx.copy()
            tx_copy['category'] = 'Uncategorized'
            processed.append(tx_copy)
    return processed

def process_file(file_obj, backend="auto", currency="USD"):
    """Process an uploaded file and return categorized transactions, financial report, and expense chart."""
    try:
        if file_obj is None:
            raise ValueError("No file provided for processing")

        filename = file_obj.name
        logger.debug(f"Processing file: {filename} with backend: {backend}, currency: {currency}")

        # Process based on file type
        if filename.endswith('.pdf'):
            # Use a temporary file with proper cleanup
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmpfile:
                shutil.copyfileobj(file_obj, tmpfile)
                tmp_path = tmpfile.name
            try:
                logger.debug(f"Reading PDF: {tmp_path}")
                df = pdf_processor.process_pdf(tmp_path)
                if df.empty:
                    raise ValueError("No data extracted from PDF")
                logger.debug(f"PDF DataFrame columns: {list(df.columns)}")
            except Exception as e:
                raise ValueError(f"Failed to process PDF file: {str(e)}")
            finally:
                try:
                    os.unlink(tmp_path)
                    logger.debug(f"Deleted temp PDF file: {tmp_path}")
                except Exception as e:
                    logger.warning(f"Failed to delete temp PDF file: {str(e)}")
        elif filename.endswith('.csv'):
            try:
                logger.debug(f"Reading CSV: {filename}")
                df = pd.read_csv(file_obj)
                if df.empty:
                    raise ValueError("CSV file is empty")
                logger.debug(f"CSV DataFrame columns: {list(df.columns)}")
            except Exception as e:
                raise ValueError(f"Failed to process CSV file: {str(e)}")
        elif filename.endswith(('.xlsx', '.xls')):
            try:
                logger.debug(f"Reading Excel: {filename}")
                # Use openpyxl explicitly and read into memory to handle file-like objects
                file_content = file_obj.read()
                df = pd.read_excel(io.BytesIO(file_content), engine='openpyxl')
                if df.empty:
                    raise ValueError("Excel file is empty")
                logger.debug(f"Excel DataFrame columns: {list(df.columns)}")
            except Exception as e:
                raise ValueError(f"Failed to process Excel file: {str(e)}")
        else:
            raise ValueError("Unsupported file format. Please upload PDF, CSV, or Excel.")

        # Validate DataFrame
        if df is None or len(df) == 0:
            raise ValueError("No data extracted from file")

        # Standardize and validate columns with fuzzy matching
        df.columns = [str(col).lower().strip() for col in df.columns]
        column_mappings = {
            'date': ['date', 'transaction_date', 'posted_date', 'dt', 'transaction_dt'],
            'description': ['description', 'desc', 'memo', 'notes', 'transaction_description'],
            'amount': ['amount', 'amt', 'value', 'transaction_amount', 'total']
        }
        required_columns = ['date', 'description', 'amount']
        mapped_columns = {}
        for req_col in required_columns:
            for col in df.columns:
                if any(process.extractOne(col, column_mappings[req_col])[1] > 90 for col in [col.lower()]):
                    mapped_columns[req_col] = col
                    break
            if req_col not in mapped_columns:
                raise ValueError(f"Missing required column similar to '{req_col}'. Found columns: {list(df.columns)}")

        # Rename columns to standard names
        df = df.rename(columns={v: k for k, v in mapped_columns.items()})

        # Handle date parsing with multiple formats
        df['date'] = pd.to_datetime(df['date'], errors='coerce', infer_datetime_format=True)
        if df['date'].isna().all():
            raise ValueError("No valid dates found in 'date' column")
        if df['date'].isna().any():
            logger.warning(f"Dropping {df['date'].isna().sum()} rows with invalid dates")
            df = df.dropna(subset=['date'])

        # Ensure amount is numeric
        df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
        if df['amount'].isna().all():
            raise ValueError("No valid amounts found in 'amount' column")
        if df['amount'].isna().any():
            logger.warning(f"Dropping {df['amount'].isna().sum()} rows with invalid amounts")
            df = df.dropna(subset=['amount'])

        # Ensure description is string
        df['description'] = df['description'].astype(str)

        if df.empty:
            raise ValueError("No valid transactions remain after data cleaning")

        transactions = df.to_dict('records')
        logger.debug(f"Extracted {len(transactions)} transactions: {transactions[:2]}")

        if backend == "auto":
            if not nebius_available:
                logger.warning("Nebius API key not set; falling back to local processing")
                backend = "local"
            else:
                backend = "nebius"
        logger.debug(f"Selected backend: {backend}")

        if backend == "nebius" and nebius_available:
            with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as tmpfile:
                df.to_csv(tmpfile.name, index=False)
                logger.debug(f"Created temporary CSV file: {tmpfile.name}")
                if not os.path.exists(tmpfile.name):
                    raise ValueError(f"Temporary CSV file {tmpfile.name} was not created")
                if os.path.getsize(tmpfile.name) == 0:
                    raise ValueError(f"Temporary CSV file {tmpfile.name} is empty")
                processed_df = process_transactions_nebius(tmpfile.name)
                if processed_df.empty or 'error' in processed_df.columns:
                    raise ValueError(f"Nebius processing failed: {processed_df.get('error', ['Unknown error'])[0]}")
                report = generate_financial_report_nebius(processed_df.to_dict('records'))
                logger.debug(f"Nebius report: {report}")
                try:
                    os.unlink(tmpfile.name)
                    logger.debug(f"Deleted temporary CSV file: {tmpfile.name}")
                except Exception as e:
                    logger.warning(f"Failed to delete temp CSV file: {str(e)}")
        else:
            processed = process_transactions(transactions, "local")
            report = generate_report(processed, "local")
            processed_df = pd.DataFrame(processed)
            logger.debug(f"Local report: {report}")

        # Currency conversion rates (approximate as of June 09, 2025)
        currency_rates = {
            "USD": 1.0,
            "GBP": 0.79,
            "JPY": 157.5,
            "KES": 129.0
        }
        rate = currency_rates.get(currency, 1.0)
        processed_df['Amount'] = processed_df['amount'] * rate

        # Generate expense chart data using Plotly
        expense_chart = None
        if not processed_df.empty:
            expense_df = processed_df[processed_df['amount'] < 0].groupby('category')['Amount'].sum() * -1
            if not expense_df.empty:
                top_expenses = expense_df.sort_values(ascending=False).head(5)
                chart_df = pd.DataFrame({
                    'Category': top_expenses.index,
                    'Amount': top_expenses.values
                })
                expense_chart = px.pie(
                    chart_df,
                    values='Amount',
                    names='Category',
                    title=f'Top 5 Expense Categories ({currency})',
                    color_discrete_sequence=['#FF6384', '#36A2EB', '#FFCE56', '#4BC0C0', '#9966FF']
                )
                expense_chart.update_layout(
                    margin=dict(t=50, b=10, l=10, r=10),
                    legend=dict(orientation="h", yanchor="top", y=1.1, xanchor="center", x=0.5),
                    font=dict(color="#333333")
                )

        # Format the summary with currency symbol
        if isinstance(report, dict) and 'error' not in report:
            summary = report.get('summary', report)
            summary['total_income'] = round(summary.get('total_income', 0) * rate, 2)
            summary['total_expenses'] = round(summary.get('total_expenses', 0) * rate, 2)
            summary['net_balance'] = round(summary.get('net_balance', 0) * rate, 2)
            formatted_summary = format_summary(summary, currency, processed_df)
            error_message = ""
        else:
            summary = {"error": str(report.get('error', 'Unknown error in report generation'))}
            formatted_summary = format_summary(summary, currency, pd.DataFrame())
            error_message = summary.get('error', "Unknown error")

        logger.debug("File processing completed successfully")
        # Prepare temporary files for CSV and XLSX downloads
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as tmp_csv:
            tmp_csv.write(processed_df.to_csv(index=False).encode())
            csv_path = tmp_csv.name
        with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp_xlsx:
            output = io.BytesIO()
            processed_df.to_excel(output, index=False, engine='openpyxl')
            tmp_xlsx.write(output.getvalue())
            xlsx_path = tmp_xlsx.name

        return processed_df, formatted_summary, error_message, csv_path, xlsx_path, expense_chart

    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        error_message = str(e)
        return pd.DataFrame({"error": [str(e)]}), format_summary({"error": str(e)}, currency, pd.DataFrame()), error_message, None, None, None

def generate_report(transactions, backend="auto"):
    """Generate a financial report from processed transactions."""
    try:
        if backend == "auto":
            if nebius_available:
                backend = "nebius"
            else:
                backend = "local"
        
        if backend == "nebius" and nebius_available:
            return generate_financial_report_nebius(transactions)
        else:
            return generate_local_report(transactions)
            
    except Exception as e:
        logger.error(f"Error in generate_report: {str(e)}")
        return generate_local_report(transactions)

def generate_local_report(transactions):
    """Generate a detailed financial report with enhanced AI insights."""
    if not transactions:
        return {
            'summary': {
                'total_income': 0,
                'total_expenses': 0,
                'net_balance': 0,
                'num_transactions': 0
            },
            'trends': [],
            'insights': ["No transactions provided for analysis."]
        }
    
    df = pd.DataFrame(transactions)
    
    # Convert date strings to datetime for analysis
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date'])
    
    # Calculate basic metrics
    income = df[df['amount'] > 0]['amount'].sum()
    expenses = df[df['amount'] < 0]['amount'].sum() * -1
    net = income - expenses
    
    # Group by categories
    expense_categories = df[df['amount'] < 0].groupby('category')['amount'].sum().sort_values()
    income_categories = df[df['amount'] > 0].groupby('category')['amount'].sum().sort_values(ascending=False)
    
    # Time-based analysis
    df['month'] = df['date'].dt.to_period('M')
    df['year'] = df['date'].dt.to_period('Y')
    monthly_expenses = df[df['amount'] < 0].groupby('month')['amount'].sum() * -1
    monthly_income = df[df['amount'] > 0].groupby('month')['amount'].sum()
    ytd_expenses = df[df['amount'] < 0].groupby('year')['amount'].sum() * -1
    ytd_income = df[df['amount'] > 0].groupby('year')['amount'].sum()
    current_month = pd.to_datetime('2025-06-09').to_period('M')
    current_year = pd.to_datetime('2025-06-09').to_period('Y')
    mtd_expenses = df[(df['amount'] < 0) & (df['month'] == current_month)]['amount'].sum() * -1
    mtd_income = df[(df['amount'] > 0) & (df['month'] == current_month)]['amount'].sum()
    ytd_expenses_total = ytd_expenses.get(current_year, 0) * -1
    ytd_income_total = ytd_income.get(current_year, 0)
    
    # Advanced AI Insights
    insights = []
    
    # 1. Spending Behavior Analysis
    frequent_categories = df[df['amount'] < 0].groupby('category').size().sort_values(ascending=False).head(5)
    if not frequent_categories.empty:
        top_categories = [f"{cat} ({count} transactions, {df[(df['category'] == cat) & (df['amount'] < 0)]['amount'].sum() * -1:.2f})"
                         for cat, count in frequent_categories.items()]
        insights.append(f"**Top Spending Categories**: {', '.join(top_categories)} account for the majority of your transactions.")
    
    # 2. Anomaly Detection (using IQR method)
    amounts = df['amount'].abs()
    q1, q3 = amounts.quantile([0.25, 0.75])
    iqr = q3 - q1
    anomaly_threshold = q3 + 1.5 * iqr
    anomalies = df[amounts > anomaly_threshold]
    if not anomalies.empty:
        for _, anomaly in anomalies.iterrows():
            insights.append(f"**Anomaly Detected**: Unusual {('income' if anomaly['amount'] > 0 else 'expense')} of {anomaly['amount']:.2f} {currency_symbols.get(currency, '$')} on {anomaly['date'].date()} for '{anomaly['description']}' (category: {anomaly['category']}).")
    
    # 3. Income Stability and Volatility
    if len(monthly_income) > 1:
        income_mean = monthly_income.mean()
        income_std = monthly_income.std()
        income_cv = income_std / income_mean if income_mean > 0 else 0
        if income_cv > 0.3:
            insights.append(f"**Income Volatility**: Your income varies significantly (coefficient of variation: {income_cv:.2%}), indicating irregular earnings. Consider stabilizing income sources.")
        else:
            insights.append(f"Income Stability: Your monthly income appears consistent with low variability.")
    
    # 4. Spending Trends (Seasonality Analysis)
    if len(monthly_expenses) > 3:
        expense_trend = stats.linregress(range(len(monthly_expenses)), monthly_expenses.values)
        if expense_trend.slope > 0:
            insights.append(f"**Spending Trend**: Your monthly expenses are increasing by approximately {expense_trend.slope:.2f} {currency_symbols.get(currency, '$')} per month. Review discretionary spending.")
        elif expense_trend.slope < 0:
            insights.append(f"**Spending Trend**: Your monthly expenses are decreasing by approximately {-expense_trend.slope:.2f} {currency_symbols.get(currency, '$')} per month. Good job controlling costs!")
    
    # 5. Financial Health
    if income > 0:
        expense_to_income_ratio = expenses / income
        financial_health_score = max(0, min(10, 10 - (expense_to_income_ratio * 10)))
        health_assessment = "Healthy" if financial_health_score >= 7 else "Needs Improvement" if financial_health_score >= 4 else "At Risk"
        insights.append(f"**Financial Health Score**: {financial_health_score:.1f}/10 ({health_assessment})")
        if expense_to_income_ratio > 0.7:
            insights.append(f"**Alert**: Spending is {expense_to_income_ratio:.1%} of income. Aim to reduce discretionary expenses.")
    
    # 6. MTD and YTD Insights
    if mtd_expenses > 0:
        avg_daily_mtd = mtd_expenses / df[df['month'] == current_month]['date'].nunique()
        days_in_month = 30
        projected_mtd = avg_daily_mtd * days_in_month
        insights.append(f"**MTD Spending (as of June 09, 2025)**: {mtd_expenses:.2f} {currency_symbols.get(currency, '$')}. Projected monthly spend: {projected_mtd:.2f} {currency_symbols.get(currency, '$')}.")
    if ytd_income_total > 0 and ytd_expenses_total > 0:
        ytd_savings_rate = 1 - (ytd_expenses_total / ytd_income_total)
        insights.append(f"**YTD Savings Rate**: {ytd_savings_rate:.1%} of income saved in 2025.")
    
    # 7. Predictive Insights
    if len(monthly_expenses) > 1:
        avg_monthly_expense = monthly_expenses.mean()
        insights.append(f"**Forecast**: Based on historical data, expect next month's expenses to be around {avg_monthly_expense:.2f} {currency_symbols.get(currency, '$')} ± {monthly_expenses.std():.2f}.")
    
    # 8. Personalized Recommendations
    if expenses > income * 0.7:
        savings_target = income * 0.2
        insights.append(f"**Recommendation**: Reduce expenses by {savings_target:.2f} {currency_symbols.get(currency, '$')} (20% of income) to improve savings. Focus on discretionary categories like {frequent_categories.index[0] if not frequent_categories.empty else 'miscellaneous'}.")
    elif net > 0 and net < income * 0.1:
        insights.append("**Recommendation**: Consider investing excess funds in low-risk options to grow your wealth.")
    if 'Discretionary -' in df['category'].values:
        discretionary_expenses = df[df['category'].str.contains('Discretionary')]['amount'].sum() * -1
        insights.append(f"**Recommendation**: Discretionary spending totals {discretionary_expenses:.2f} {currency_symbols.get(currency, '$')}. Consider cutting back on non-essential purchases.")
    
    # Fallback if no insights
    if not insights:
        insights.append("No significant patterns detected. Review your spending for potential optimizations.")

    report = {
        'summary': {
            'total_income': round(income, 2),
            'total_expenses': round(expenses, 2),
            'net_balance': round(net, 2),
            'num_transactions': len(df)
        },
        'trends': {
            'top_expense_categories': [
                {'category': cat, 'amount': round(abs(amt), 2), 'count': len(df[df['category'] == cat])}
                for cat, amt in expense_categories.head(5).items()
            ],
            'top_income_categories': [
                {'category': cat, 'amount': round(amt, 2), 'count': len(df[df['category'] == cat])}
                for cat, amt in income_categories.head(3).items()
            ]
        },
        'insights': insights
    }
    
    return report

def format_summary(summary, currency, df):
    """Format financial summary as readable text with enhanced structure."""
    currency_symbols = {"USD": "$", "GBP": "£", "JPY": "¥", "KES": "KSh"}
    symbol = currency_symbols.get(currency, "KSh")

    if 'error' in summary:
        return f"**Error**: {summary['error']}"
    
    lines = ["## Financial Summary"]
    lines.append(f"**Overview** (in {currency})")
    lines.append(f"- **Total Income**: {symbol}{summary.get('total_income', 0):,.2f}")
    lines.append(f"- **Total Expenses**: {symbol}{summary.get('total_expenses', 0):,.2f}")
    lines.append(f"- **Net Balance**: {symbol}{summary.get('net_balance', 0):,.2f}")
    lines.append(f"- **Number of Transactions**: {summary.get('num_transactions', 0)}")
    
    if 'trends' in summary:
        trends = summary['trends']
        if trends.get('top_expense_categories'):
            lines.append("\n**Top Expense Categories**")
            for item in trends['top_expense_categories']:
                lines.append(f"  - {item['category']}: {symbol}{item['amount']:.2f} ({item['count']} transactions)")
        if trends.get('top_income_categories'):
            lines.append("\n**Top Income Categories**")
            for item in trends['top_income_categories']:
                lines.append(f"  - {item['category']}: {symbol}{item['amount']:.2f} ({item['count']} transactions)")
    
    if 'insights' in summary:
        lines.append("\n## AI-Powered Insights")
        for insight in summary['insights']:
            lines.append(f"- {insight}")
    
    # Add simple ASCII chart for expense categories
    if not df.empty:
        expense_df = df[df['amount'] < 0].groupby('category')['amount'].sum().sort_values() * -1
        if not expense_df.empty:
            lines.append("\n**Expense Distribution (ASCII)**")
            max_amount = expense_df.max()
            for cat, amt in expense_df.head(5).items():
                bar_length = int((amt / max_amount) * 20) if max_amount > 0 else 0
                lines.append(f"{cat}: {symbol}{amt:.2f} {'█' * bar_length}")

    return "\n".join(lines)

# Global variable to store the last selected currency
currency = "KSh"
currency_symbols = {"USD": "$", "GBP": "£", "JPY": "¥", "KES": "KSh"}

def create_ui():
    with gr.Blocks(title="Financial Transaction Processor", theme="soft") as demo:
        gr.Markdown("# Financial Transaction Processor")
        gr.Markdown("Upload your bank transactions (PDF, CSV, or Excel) to get categorized spending and financial insights.")

        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload Transactions (CSV, PDF, or Excel)")
                currency_selector = gr.Dropdown(
                    choices=["USD", "GBP", "JPY", "KES"],
                    value="KES",
                    label="Select Currency"
                )
                backend_radio = gr.Radio(
                    choices=["auto", "nebius", "local"],
                    value="auto",
                    label="Processing Backend"
                )
                process_btn = gr.Button("Process Transactions", variant="primary")

            with gr.Column():
                processed_table = gr.Dataframe(
                    label="Processed Transactions",
                    headers=["Date", "Description", "Amount", "Category"],
                    datatype=["str", "str", "number", "str"]
                )
                download_csv_btn = gr.DownloadButton(
                    value=None,
                    label="Download Table as CSV"
                )
                download_xlsx_btn = gr.DownloadButton(
                    value=None,
                    label="Download Table as XLSX"
                )

        with gr.Row():
            with gr.Column(elem_classes=["summary-column"], scale=1):
                report_summary = gr.Markdown(label="Financial Summary")
                expense_chart = gr.Plot(label="Expense Distribution Chart")

        with gr.Row():
            error_display = gr.Textbox(label="Error Details", visible=False)

        # CSS definition
        demo.css = """
            /* Container Styles */
            .financial-container {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                max-width: 1200px;
                margin: 0 auto;
                padding: 20px;
                color: #333;
            }
        
            /* Summary Column Styles */
            .summary-column {
                border: 2px solid #4a90e2;
                padding: 15px;
                margin: 10px;
                border-radius: 8px;
                background-color: #f8fafc;
                box-shadow: 0 2px 5px rgba(0,0,0,0.1);
                transition: all 0.3s ease;
            }
        
            .summary-column:hover {
                box-shadow: 0 5px 15px rgba(0,0,0,0.1);
                transform: translateY(-2px);
            }
        
            /* Header Styles */
            .summary-header {
                color: #2c5282;
                font-size: 1.2rem;
                font-weight: 600;
                margin-bottom: 15px;
                padding-bottom: 8px;
                border-bottom: 1px solid #cbd5e0;
            }
        
            /* Data Display Styles */
            .summary-data {
                display: flex;
                justify-content: space-between;
                margin: 8px 0;
            }
        
            .data-label {
                font-weight: 500;
                color: #4a5568;
            }
        
            .data-value {
                font-weight: 600;
            }
        
            .positive-value {
                color: #38a169;
            }
        
            .negative-value {
                color: #e53e3e;
            }
        
            /* Chart Container */
            .chart-container {
                height: 250px;
                margin: 20px 0;
                padding: 15px;
                background: white;
                border-radius: 8px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.05);
            }
        
            /* Responsive Grid */
            .summary-grid {
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
                gap: 15px;
            }
        
            /* Button Styles */
            .action-button {
                background-color: #4299e1;
                color: white;
                border: none;
                padding: 8px 16px;
                border-radius: 4px;
                cursor: pointer;
                font-size: 0.9rem;
                margin-top: 10px;
                transition: background-color 0.2s;
            }
        
            .action-button:hover {
                background-color: #3182ce;
            }
        
            /* Table Styles */
            .transaction-table {
                width: 100;
                border-collapse: collapse;
                margin-top: 20px;
            }
        
            .transaction-table th {
                background-color: #4a90e2;
                color: white;
                padding: 12px;
                text-align: left;
            }
        
            .transaction-table td {
                padding: 10px;
                border-bottom: 1px solid #e2e8f0;
            }
        
            .transaction-table tr:hover {
                background-color: #ebf8ff;
            }
        
            /* Responsive Media Queries */
            @media (max-width: 768px) {
                .summary-grid {
                    grid-template-columns: 1fr;
                }
                
                .financial-container {
                    padding: 10px;
                }
            }
        """
        with gr.Accordion("Example Transaction File Format", open=False):
            gr.Markdown("""
            Your file should include at least these columns (case-insensitive):
            - **Date**: Transaction date (YYYY-MM-DD format preferred)
            - **Description**: Transaction description/text
            - **Amount**: Positive for income, negative for expenses (in USD by default)
            
            Example CSV content:
            ```
            Date,Description,Amount
            2025-06-01,Salary Deposit,3000.00
            2025-06-02,Grocery Store,-125.50
            2025-06-03,Restaurant,-50.00
            2025-06-04,Netflix,-15.00
            2025-06-05,Fuel,-40.00
            ```
            """)

        process_btn.click(
            fn=process_file,
            inputs=[file_input, backend_radio, currency_selector],
            outputs=[processed_table, report_summary, error_display, download_csv_btn, download_xlsx_btn, expense_chart]
        ).then(
            fn=lambda error_msg: gr.update(visible=bool(error_msg)),
            inputs=[error_display],
            outputs=[error_display]
        )

        currency_selector.change(
            fn=lambda x: globals().update(currency=x),
            inputs=[currency_selector],
            outputs=[]
        )

    return demo

if __name__ == "__main__":
    initialize_finbert()
    initialize_financial_classifier()
    demo = create_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860)