Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics import confusion_matrix, classification_report | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import re | |
st.title("Expense Category Prediction")

# Load data from CSV.
# Fields are separated by runs of two or more whitespace characters, so the
# separator is a regular expression (which requires the Python parser engine).
# The pattern is a raw string so that ``\s`` reaches the regex engine as-is
# instead of being an invalid string escape sequence.
df = pd.read_csv("financial_data.csv", sep=r'\s\s+', engine='python')
# Data Preprocessing
def preprocess_data(df):
    """Clean the expense table and build the scaled feature matrix and target.

    The frame is modified in place (rows without a usable date are dropped and
    derived columns are added), so pass a copy if the caller needs the raw
    data afterwards. The columns ``Date``, ``Expense_Category``,
    ``Description`` and ``Amount`` are read.

    Returns:
        tuple: ``(X, y, label_encoder, df)`` where

        * ``X`` is the standardized feature array built from ``Amount``,
          ``Month``, ``DayOfWeek`` and up to 100 TF-IDF description features,
        * ``y`` is the integer-encoded ``Expense_Category`` series,
        * ``label_encoder`` is the fitted ``LabelEncoder`` for ``y``,
        * ``df`` is the cleaned frame including the derived columns.
    """
    # Clean the date column: keep only the first YYYY-MM-DD token of each cell.
    # ``expand=False`` returns a Series instead of a one-column DataFrame.
    date_text = df['Date'].astype(str).str.extract(
        r'(\d{4}-\d{2}-\d{2})', expand=False)

    # Convert to datetime. Tokens that look like a date but are not valid
    # (e.g. month 13) become NaT rather than raising, and are then treated
    # exactly like missing dates below.
    df['Date'] = pd.to_datetime(date_text, format='%Y-%m-%d', errors='coerce')

    # Forward fill missing dates
    df['Date'] = df['Date'].ffill()

    # Remove rows that still have no date (leading rows before the first one)
    df.dropna(subset=['Date'], inplace=True)

    # Fill missing values in 'Expense_Category' and 'Description' with 'Unknown'
    df['Expense_Category'] = df['Expense_Category'].fillna('Unknown')
    df['Description'] = df['Description'].fillna('Unknown')

    # Convert 'Amount' to numeric, fill missing with 0
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)

    # Date Feature Engineering
    df['Month'] = df['Date'].dt.month
    df['DayOfWeek'] = df['Date'].dt.dayofweek

    # Description Text Processing
    def clean_text(text):
        """Lower-case the text and strip everything but letters, digits and whitespace."""
        # str() guards against non-string cells (e.g. purely numeric descriptions)
        text = str(text).lower()
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)

    df['Description_Cleaned'] = df['Description'].apply(clean_text)

    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Limiting features for simplicity
    tfidf_features = tfidf_vectorizer.fit_transform(df['Description_Cleaned']).toarray()

    # Give the TF-IDF columns string names. With the default integer names the
    # combined frame would mix int and str column labels, which scikit-learn
    # estimators reject when validating feature names.
    tfidf_columns = [f'tfidf_{i}' for i in range(tfidf_features.shape[1])]
    tfidf_df = pd.DataFrame(tfidf_features, index=df.index, columns=tfidf_columns)

    # Combine Features
    features_df = pd.concat([df[['Amount', 'Month', 'DayOfWeek']], tfidf_df], axis=1)

    # Encode the target variable
    label_encoder = LabelEncoder()
    df['Expense_Category_Encoded'] = label_encoder.fit_transform(df['Expense_Category'])

    # Select features and target
    X = features_df
    y = df['Expense_Category_Encoded']

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y, label_encoder, df  # Return the cleaned dataframe as well
# Preprocess a copy of the loaded frame; ``df`` is rebound to the cleaned
# frame that preprocess_data returns.
X, y, label_encoder, df = preprocess_data(df.copy())

# Split data: 20% of the rows are held out for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Models ---
# Display name -> unfitted classifier, in the order the tabs are shown
models = {}
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
# --- Streamlit Tabs ---
# Take class ids and names from the fitted encoder. LabelEncoder maps class i
# to label_encoder.classes_[i], so these two lists are aligned by construction.
# Passing them explicitly keeps the matrix and report rows labelled correctly
# and avoids a size mismatch when the test split happens to lack a category.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

tabs = st.tabs(list(models))
for tab, (model_name, model) in zip(tabs, models.items()):
    with tab:
        st.header(model_name)

        # Train on the training split and predict the held-out rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # --- Confusion Matrix ---
        st.subheader("Confusion Matrix")
        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
        # Use an explicit figure instead of pyplot's global state, and close it
        # after rendering so figures do not accumulate across script reruns.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        plt.close(fig)

        # --- Classification Report ---
        st.subheader("Classification Report")
        cr = classification_report(y_test, y_pred,
                                   labels=class_ids,
                                   target_names=class_names,  # original category names
                                   zero_division=0)
        st.text(cr)

        # --- Remarks ---
        st.subheader("Remarks")
        st.write("Model Performance Analysis:")
        st.write(
            f"The {model_name} model's performance in predicting Expense Categories is shown above.")
        st.write("Key Metrics:")
        st.write(
            "- The model uses a combination of expense amount, time-based features, and text descriptions to predict the expense category."
        )
        st.write(
            "- The classification report provides insights into the model's precision, recall, and F1-score for each expense category."
        )