import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import re

st.title("Expense Category Prediction")

# Load data from CSV. Fields are separated by runs of two or more whitespace
# characters, which requires the (slower) python parsing engine.
df = pd.read_csv("financial_data.csv", sep=r'\s\s+', engine='python')


# Data Preprocessing
def preprocess_data(df):
    """Clean the raw expense table and build the model inputs.

    The frame passed in is modified in place (rows without a usable date are
    dropped and helper columns are added), so callers that need the raw data
    should pass a copy.

    Expected columns: 'Date', 'Expense_Category', 'Description', 'Amount'.

    Returns:
        X: 2-D array of standardized features (Amount, Month, DayOfWeek and
            up to 100 TF-IDF columns built from the cleaned description).
        y: Series with the integer-encoded expense category.
        label_encoder: the fitted LabelEncoder (``classes_`` holds the
            category names in encoded order).
        df: the cleaned dataframe, including the engineered columns.
    """
    # Clean the date column: keep only the first YYYY-MM-DD token.
    # expand=False makes extract() return a Series rather than a DataFrame.
    df['Date'] = df['Date'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)

    # Forward fill missing dates
    df['Date'] = df['Date'].ffill()

    # Remove rows with missing dates (only leading rows can still be missing
    # after the forward fill)
    df.dropna(subset=['Date'], inplace=True)

    # Convert 'Date' to datetime objects
    df['Date'] = pd.to_datetime(df['Date'])

    # Fill missing values in 'Expense_Category' and 'Description' with 'Unknown'
    df['Expense_Category'] = df['Expense_Category'].fillna('Unknown')
    # astype(str) guards clean_text against non-string cells (e.g. a
    # description that was parsed as a number).
    df['Description'] = df['Description'].fillna('Unknown').astype(str)

    # Convert 'Amount' to numeric, fill missing with 0
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)

    # Date Feature Engineering
    df['Month'] = df['Date'].dt.month
    df['DayOfWeek'] = df['Date'].dt.dayofweek

    # Description Text Processing
    def clean_text(text):
        """Lower-case the text and strip everything but letters, digits and whitespace."""
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return text

    df['Description_Cleaned'] = df['Description'].apply(clean_text)

    # TF-IDF Vectorization
    # NOTE(review): the vectorizer and the scaler below are fitted on the full
    # dataset before the train/test split, so test-set statistics leak into
    # the features. Fixing that requires changing what this function returns.
    tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Limiting features for simplicity
    tfidf_features = tfidf_vectorizer.fit_transform(df['Description_Cleaned']).toarray()
    # Use string column names: scikit-learn rejects frames whose column names
    # mix str and int, which is what the default integer columns would give
    # once concatenated with 'Amount', 'Month' and 'DayOfWeek'.
    tfidf_columns = [f"tfidf_{i}" for i in range(tfidf_features.shape[1])]
    tfidf_df = pd.DataFrame(tfidf_features, index=df.index, columns=tfidf_columns)

    # Combine Features
    features_df = pd.concat([df[['Amount', 'Month', 'DayOfWeek']], tfidf_df], axis=1)

    # Encode the target variable
    label_encoder = LabelEncoder()
    df['Expense_Category_Encoded'] = label_encoder.fit_transform(df['Expense_Category'])

    # Select features and target
    X = features_df
    y = df['Expense_Category_Encoded']

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y, label_encoder, df  # Return the original dataframe


X, y, label_encoder, df = preprocess_data(df.copy())

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Encoded labels and their names, in encoder order. Passing both explicitly to
# the metrics keeps names aligned with labels even when a class is absent from
# the test split.
class_names = [str(name) for name in label_encoder.classes_]
class_labels = list(range(len(class_names)))

# --- Models ---
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# --- Streamlit Tabs ---
tabs = st.tabs(list(models))

for tab, (model_name, model) in zip(tabs, models.items()):
    with tab:
        st.header(model_name)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # --- Confusion Matrix ---
        st.subheader("Confusion Matrix")
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        # Use an explicit figure and close it afterwards so figures do not
        # accumulate in pyplot's global state across Streamlit reruns.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        plt.close(fig)

        # --- Classification Report ---
        st.subheader("Classification Report")
        cr = classification_report(
            y_test, y_pred,
            labels=class_labels,
            target_names=class_names,  # Original category names, in label order
            zero_division=0)
        st.text(cr)

        # --- Remarks ---
        st.subheader("Remarks")
        st.write("Model Performance Analysis:")
        st.write(
            f"The {model_name} model's performance in predicting Expense Categories is shown above.")
        st.write("Key Metrics:")
        st.write(
            "- The model uses a combination of expense amount, time-based features, and text descriptions to predict the expense category."
        )
        st.write(
            "- The classification report provides insights into the model's precision, recall, and F1-score for each expense category."
        )