import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import re
import string
from collections import Counter
# Set page config (must run before any other st.* call in the script)
st.set_page_config(page_title="Text Classification App", page_icon="📊", layout="wide")
# Custom CSS for better styling
# NOTE(review): the CSS payload between the triple quotes is empty, so this
# markdown call currently injects nothing — confirm whether styles were lost.
st.markdown("""
""", unsafe_allow_html=True)
# Utility functions
def clean_text(text):
    """Normalize raw text for vectorizing.

    Lowercases, drops every character that is not an ASCII letter or
    whitespace, and collapses runs of whitespace to a single space.
    NaN/None inputs yield "" so downstream vectorizers never see NaN.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* into <folder_name>/<file_name>, creating the folder.

    Returns True on success. On any failure the error is surfaced via
    st.error and False is returned (the app keeps running).
    """
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(os.path.join(folder_name, file_name), 'wb') as fh:
            pickle.dump(obj, fh)
    except Exception as e:
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
    return True
def load_artifacts(folder_name, file_name):
    """Load a pickled artifact from <folder_name>/<file_name>.

    Returns the unpickled object, or None (after reporting via st.error)
    when the file is missing or cannot be read.
    """
    result = None
    try:
        artifact_path = os.path.join(folder_name, file_name)
        with open(artifact_path, 'rb') as fh:
            result = pickle.load(fh)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
    except Exception as e:
        st.error(f"Error loading {file_name}: {str(e)}")
    return result
def analyze_data(df, text_col, target_col):
    """Summarize a labelled text dataframe.

    Returns a dict with shape, column list, missing-value counts,
    text-length statistics, and the target's class distribution.

    NOTE: mutates *df* by adding/overwriting a 'text_length' column
    (the visualization step relies on it). Shape/columns/missing-value
    info is captured BEFORE that column is added, matching the original
    behavior for dataframes that do not yet carry it.
    """
    summary = {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'missing_values': df.isnull().sum().to_dict(),
    }
    # Intentional side effect: per-row character counts of the text column.
    df['text_length'] = df[text_col].astype(str).apply(len)
    summary['avg_text_length'] = df['text_length'].mean()
    summary['text_length_stats'] = df['text_length'].describe().to_dict()
    summary['class_distribution'] = df[target_col].value_counts().to_dict()
    summary['num_classes'] = df[target_col].nunique()
    return summary
def create_visualizations(df, text_col, target_col):
    """Build a 2x2 matplotlib figure for the loaded dataset.

    Panels: class counts (bar), text-length histogram, text-length-by-class
    boxplot, and — for numeric targets only — a correlation heatmap.

    Expects *df* to already contain a 'text_length' column (added by
    analyze_data / the preprocessing step in the main app).

    Returns the matplotlib Figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Class distribution
    class_counts = df[target_col].value_counts()
    axes[0, 0].bar(class_counts.index, class_counts.values)
    axes[0, 0].set_title('Class Distribution')
    axes[0, 0].set_xlabel('Classes')
    axes[0, 0].set_ylabel('Count')
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
    # Text length distribution
    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
    axes[0, 1].set_title('Text Length Distribution')
    axes[0, 1].set_xlabel('Text Length')
    axes[0, 1].set_ylabel('Frequency')
    # Box plot of text length by class
    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
    axes[1, 0].set_title('Text Length by Class')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('Text Length')
    # Correlation heatmap only for numeric targets. The previous
    # "fewer than 10 unique values" shortcut sent string-labelled targets
    # into DataFrame.corr(), which raises TypeError on non-numeric columns
    # in pandas >= 2.0 — so gate strictly on dtype.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        correlation = df[['text_length', target_col]].corr()
        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
        axes[1, 1].set_title('Correlation Matrix')
    else:
        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Correlation Analysis')
    plt.tight_layout()
    return fig
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Fit the requested classifier and evaluate it on the test split.

    Side effect: pickles the fitted model into the models/ folder via
    save_artifacts, so the Predictions section can reload it later.

    Returns (model, accuracy, report_dict), or (None, None, None) when
    *model_name* is not one of the supported algorithms.
    """
    available = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
        "SVC": SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB(),
    }
    model = available.get(model_name)
    if model is None:
        return None, None, None
    # GaussianNB cannot consume scipy sparse matrices; densify for it only.
    needs_dense = model_name == "Gaussian Naive Bayes"
    train_features = X_train.toarray() if needs_dense else X_train
    test_features = X_test.toarray() if needs_dense else X_test
    # Fit and score
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    # Persist the fitted estimator under a filename derived from its display name.
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
    save_artifacts(model, "models", model_filename)
    return model, accuracy, report
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify *text* with a previously trained, persisted model.

    Parameters
    ----------
    model_name : display name of a trained model (e.g. "Logistic Regression");
        lowercased/underscored to locate the pickle in models/.
    text : raw input string; run through clean_text before vectorizing.
    vectorizer_type : "tfidf" or "count" — must match the vectorizer the
        model was trained with.

    Returns (predicted_label, probabilities) where probabilities is a 1-D
    array, or None when the model lacks predict_proba (e.g. LinearSVC).
    Returns (None, None) after reporting via st.error on any failure.
    """
    try:
        # Load model
        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
        model = load_artifacts("models", model_filename)
        if model is None:
            return None, None
        # Load vectorizer
        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_filename)
        if vectorizer is None:
            return None, None
        # Load label encoder
        encoder = load_artifacts("artifacts", "label_encoder.pkl")
        if encoder is None:
            return None, None
        # Clean and vectorize exactly as during training.
        clean_text_input = clean_text(text)
        text_vector = vectorizer.transform([clean_text_input])
        # GaussianNB was trained on a dense array, so predict on one too.
        if "gaussian" in model_name.lower():
            text_vector = text_vector.toarray()
        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None
        # Get prediction probabilities if available. text_vector is already
        # dense for GaussianNB at this point, so a single call covers every
        # model (the original had two byte-identical branches here).
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception as e:
                st.warning(f"Could not get prediction probabilities: {str(e)}")
        # Decode prediction back to the original label.
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# ---------------------------------------------------------------------------
# Main app — header, session state, sidebar upload, and data preprocessing.
# ---------------------------------------------------------------------------
# NOTE(review): the HTML wrapper around this header appears to have been
# stripped from the source; only the title text remains.
st.markdown('📊 No Code Text Classification App', unsafe_allow_html=True)
st.markdown("### Analyze your text data and train machine learning models without coding!")
# Initialize session state so reruns remember which vectorizer was used and
# which models were trained this session.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"
if 'trained_models' not in st.session_state:
    st.session_state.trained_models = []
# Sidebar
st.sidebar.markdown("## 📁 Upload Your Dataset")
# File upload with better error handling
try:
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload your training dataset (CSV format)"
    )
    # Encoding selection
    encoding = st.sidebar.selectbox(
        "Select file encoding",
        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
        help="Try different encodings if you get reading errors"
    )
except Exception as e:
    # NOTE(review): if this branch runs, `encoding` stays undefined and the
    # batch-prediction read below would raise NameError — confirm intended.
    st.sidebar.error(f"File upload error: {str(e)}")
    uploaded_file = None
# Navigation
section = st.sidebar.radio(
    "Choose Section",
    ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
    help="Navigate through different sections of the app"
)
# Main content based on section
if uploaded_file is not None:
    try:
        # Load data with selected encoding
        df = pd.read_csv(uploaded_file, encoding=encoding)
        st.sidebar.success(f"✅ Data loaded successfully! Shape: {df.shape}")
        # Column selection
        columns = df.columns.tolist()
        text_column = st.sidebar.selectbox("📝 Select text column:", columns)
        target_column = st.sidebar.selectbox("🎯 Select target column:", columns)
        # Data preprocessing: cleaned text for vectorizing, raw length for EDA.
        df['clean_text'] = df[text_column].apply(clean_text)
        df['text_length'] = df[text_column].astype(str).apply(len)
        # Encode the target labels and persist the encoder so the
        # Predictions section can decode model output later.
        label_encoder = LabelEncoder()
        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
    except Exception as e:
        st.error(f"❌ Error loading data: {str(e)}")
        st.info("💡 Try selecting a different encoding from the sidebar.")
        df = None
# Section: Data Analysis — dataset overview, summary stats, and plots.
if section == "📊 Data Analysis":
    if uploaded_file is not None and df is not None:
        # NOTE(review): the HTML header for this section appears stripped;
        # this call currently renders nothing.
        st.markdown('', unsafe_allow_html=True)
        # Data overview metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("📋 Total Records", df.shape[0])
        with col2:
            st.metric("📊 Features", df.shape[1])
        with col3:
            st.metric("🏷️ Classes", df[target_column].nunique())
        # Data preview
        st.subheader("📖 Data Preview")
        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
        # Analysis results (analyze_data also refreshes df['text_length'])
        analysis = analyze_data(df, text_column, target_column)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("📈 Text Statistics")
            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
            st.write("**Text length distribution:**")
            st.write(pd.DataFrame([analysis['text_length_stats']]).T)
        with col2:
            st.subheader("🏷️ Class Distribution")
            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
                                      columns=['Class', 'Count'])
            st.dataframe(class_dist)
        # Visualizations (plot failures are reported, not fatal)
        st.subheader("📊 Visualizations")
        try:
            fig = create_visualizations(df, text_column, target_column)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Error creating visualizations: {str(e)}")
    else:
        st.warning("📁 Please upload a dataset to analyze.")
# Section: Train Model — choose algorithm + vectorizer, fit, show metrics.
elif section == "🤖 Train Model":
    if uploaded_file is not None and df is not None:
        # NOTE(review): section-header HTML appears stripped (renders nothing).
        st.markdown('', unsafe_allow_html=True)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("🤖 Select Model")
            model_name = st.selectbox(
                "Choose algorithm:",
                ["Logistic Regression", "Decision Tree", "Random Forest",
                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
            )
        with col2:
            st.subheader("🔤 Select Vectorizer")
            vectorizer_choice = st.selectbox(
                "Choose text vectorizer:",
                ["TF-IDF Vectorizer", "Count Vectorizer"]
            )
        # Vectorizer / split parameters
        max_features = st.slider("Max features", 1000, 50000, 10000)
        test_size = st.slider("Test size", 0.1, 0.5, 0.2)
        if st.button("🚀 Start Training", type="primary"):
            with st.spinner("🔄 Training model..."):
                try:
                    # Initialize vectorizer; record which kind in session
                    # state so the Predictions section loads the right pickle.
                    if vectorizer_choice == "TF-IDF Vectorizer":
                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "tfidf"
                    else:
                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "count"
                    # Vectorize the pre-cleaned text
                    X = vectorizer.fit_transform(df['clean_text'])
                    y = df['encoded_target']
                    # Split data (stratified so class ratios survive the split)
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=42, stratify=y
                    )
                    # Save vectorizer for reuse at prediction time
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
                    # Train model (also pickles it into models/)
                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
                    if model is not None:
                        st.success(f"✅ Model trained successfully!")
                        st.session_state.trained_models.append(model_name)
                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("🎯 Accuracy", f"{accuracy:.4f}")
                        with col2:
                            # report holds one entry per class plus
                            # 'accuracy', 'macro avg', 'weighted avg'.
                            st.metric("🏷️ Classes", len(report) - 3)  # Exclude avg metrics
                        # Detailed metrics
                        st.subheader("📊 Detailed Metrics")
                        metrics_df = pd.DataFrame(report).transpose()
                        st.dataframe(metrics_df.round(4))
                except Exception as e:
                    st.error(f"❌ Training failed: {str(e)}")
    else:
        st.warning("📁 Please upload a dataset to train a model.")
# Section: Predictions — single-text and batch inference with saved models.
elif section == "🔮 Predictions":
    # NOTE(review): section-header HTML appears stripped (renders nothing).
    st.markdown('', unsafe_allow_html=True)
    # Check for trained models on disk (survives app reruns and restarts).
    if os.path.exists("models") and os.listdir("models"):
        # Reconstruct display names from filenames. predict_text lowercases
        # them again to find the file, so the round trip is safe even though
        # .title() changes casing (e.g. "Linear SVC" displays as "Linear Svc").
        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
                            for f in os.listdir("models") if f.endswith('.pkl')]
        if available_models:
            # Single prediction
            st.subheader("🔮 Single Text Prediction")
            col1, col2 = st.columns([3, 1])
            with col1:
                text_input = st.text_area(
                    "Enter text to classify:",
                    height=100,
                    placeholder="Type or paste your text here..."
                )
            with col2:
                selected_model = st.selectbox("Select model:", available_models)
            if st.button("🔍 Predict", type="primary"):
                if text_input.strip():
                    with st.spinner("🔄 Making prediction..."):
                        predicted_label, prediction_proba = predict_text(
                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
                        )
                        if predicted_label is not None:
                            st.success("✅ Prediction completed!")
                            # Results
                            st.markdown("### 📋 Results")
                            st.info(f"**Predicted Class:** {predicted_label}")
                            # Probabilities (None for models without predict_proba)
                            if prediction_proba is not None:
                                encoder = load_artifacts("artifacts", "label_encoder.pkl")
                                if encoder is not None:
                                    classes = encoder.classes_
                                    prob_df = pd.DataFrame({
                                        'Class': classes,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)
                                    st.markdown("### 📊 Class Probabilities")
                                    st.bar_chart(prob_df.set_index('Class'))
                else:
                    st.warning("⚠️ Please enter some text to classify.")
            # Batch predictions
            st.markdown("---")
            st.subheader("📦 Batch Predictions")
            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
            if batch_file is not None:
                try:
                    # NOTE(review): `encoding` comes from the sidebar widget and
                    # is undefined if the sidebar try-block failed — confirm.
                    batch_df = pd.read_csv(batch_file, encoding=encoding)
                    st.write("📖 Preview:")
                    st.dataframe(batch_df.head())
                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
                    if st.button("🚀 Run Batch Predictions"):
                        with st.spinner("🔄 Processing batch predictions..."):
                            predictions = []
                            progress_bar = st.progress(0)
                            # Row-by-row inference; note predict_text reloads
                            # the model/vectorizer pickles on every call.
                            for i, text in enumerate(batch_df[batch_text_col]):
                                pred, _ = predict_text(
                                    batch_model, str(text),
                                    st.session_state.get('vectorizer_type', 'tfidf')
                                )
                                predictions.append(pred if pred is not None else "Error")
                                progress_bar.progress((i + 1) / len(batch_df))
                            batch_df['Predicted_Class'] = predictions
                            st.success("✅ Batch predictions completed!")
                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
                            # Download option
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                "📥 Download Results",
                                csv,
                                "batch_predictions.csv",
                                "text/csv"
                            )
                except Exception as e:
                    st.error(f"❌ Batch prediction error: {str(e)}")
        else:
            st.warning("⚠️ No trained models found.")
    else:
        st.warning("⚠️ No models available. Please train a model first.")
# Footer
st.markdown("---")
st.markdown("*Built with Streamlit • Text Classification Made Easy*")