Alamgirapi's picture
Update app.py
0a50c6f verified
raw
history blame
20.6 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import re
import string
from collections import Counter
# Set page config
st.set_page_config(page_title="Text Classification App", page_icon="๐Ÿ“Š", layout="wide")
# Custom CSS for better styling
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.section-header {
font-size: 1.8rem;
color: #ff7f0e;
border-bottom: 2px solid #ff7f0e;
padding-bottom: 0.5rem;
}
</style>
""", unsafe_allow_html=True)
# Utility functions
def clean_text(text):
    """Normalize raw text: lowercase, keep only ASCII letters and spaces,
    collapse runs of whitespace, and trim. NaN/None becomes "".
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed.

    Returns True on success; on failure shows a Streamlit error and
    returns False.
    """
    destination = os.path.join(folder_name, file_name)
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(destination, 'wb') as handle:
            pickle.dump(obj, handle)
    except Exception as e:
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
    return True
def load_artifacts(folder_name, file_name):
    """Unpickle and return folder_name/file_name.

    Returns None (and shows a Streamlit error) when the file is missing
    or cannot be read.
    """
    path = os.path.join(folder_name, file_name)
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
    except Exception as e:
        st.error(f"Error loading {file_name}: {str(e)}")
    return None
def analyze_data(df, text_col, target_col):
    """Summarize a labeled text dataframe into a plain dict.

    Side effect: writes/overwrites a 'text_length' column on *df*.
    Note: 'shape'/'columns'/'missing_values' are captured BEFORE the
    'text_length' column is added, matching the original ordering.
    """
    summary = {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'missing_values': df.isnull().sum().to_dict(),
    }
    # Text analysis (adds the helper column used by the stats below)
    df['text_length'] = df[text_col].astype(str).apply(len)
    summary['avg_text_length'] = df['text_length'].mean()
    summary['text_length_stats'] = df['text_length'].describe().to_dict()
    # Target analysis
    summary['class_distribution'] = df[target_col].value_counts().to_dict()
    summary['num_classes'] = df[target_col].nunique()
    return summary
def create_visualizations(df, text_col, target_col):
    """Build a 2x2 matplotlib figure summarizing the dataset.

    Panels: class-distribution bar chart, text-length histogram,
    text-length-by-class box plot, and — for numeric targets only —
    a correlation heatmap. Assumes *df* already carries a
    'text_length' column (added by analyze_data / the main script).

    Returns the matplotlib Figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Class distribution
    class_counts = df[target_col].value_counts()
    axes[0, 0].bar(class_counts.index, class_counts.values)
    axes[0, 0].set_title('Class Distribution')
    axes[0, 0].set_xlabel('Classes')
    axes[0, 0].set_ylabel('Count')
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
    # Text length distribution
    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
    axes[0, 1].set_title('Text Length Distribution')
    axes[0, 1].set_xlabel('Text Length')
    axes[0, 1].set_ylabel('Frequency')
    # Box plot of text length by class
    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
    axes[1, 0].set_title('Text Length by Class')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('Text Length')
    # Correlation heatmap only makes sense for a numeric target.
    # Bug fix: the old condition also admitted categorical targets with
    # fewer than 10 unique values, which made .corr() fail on strings.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        correlation = df[['text_length', target_col]].corr()
        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
        axes[1, 1].set_title('Correlation Matrix')
    else:
        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Correlation Analysis')
    plt.tight_layout()
    return fig
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Fit the requested classifier, evaluate it, and persist it.

    Returns (model, accuracy, report_dict) on success, or
    (None, None, None) when *model_name* is not recognized. The fitted
    model is pickled under models/ via save_artifacts.
    """
    candidates = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
        "SVC": SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB()
    }
    clf = candidates.get(model_name)
    if clf is None:
        return None, None, None
    # GaussianNB cannot consume sparse matrices, so densify for it only.
    needs_dense = model_name == "Gaussian Naive Bayes"
    train_features = X_train.toarray() if needs_dense else X_train
    test_features = X_test.toarray() if needs_dense else X_test
    # Fit and evaluate
    clf.fit(train_features, y_train)
    predictions = clf.predict(test_features)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    # Persist for the Predictions section
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
    save_artifacts(clf, "models", model_filename)
    return clf, accuracy, report
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify *text* with a previously trained, pickled model.

    Loads the model from models/, plus the matching vectorizer and the
    label encoder from artifacts/. Returns (predicted_label, probabilities)
    where probabilities is None if the model cannot provide them; returns
    (None, None) on any failure (errors are surfaced via Streamlit).
    """
    try:
        # Load model
        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
        model = load_artifacts("models", model_filename)
        if model is None:
            return None, None
        # Load vectorizer (must match the one used at training time)
        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_filename)
        if vectorizer is None:
            return None, None
        # Load label encoder
        encoder = load_artifacts("artifacts", "label_encoder.pkl")
        if encoder is None:
            return None, None
        # Clean and vectorize text
        clean_text_input = clean_text(text)
        text_vector = vectorizer.transform([clean_text_input])
        # GaussianNB was trained on dense arrays, so predict on dense too.
        if "gaussian" in model_name.lower():
            text_vector = text_vector.toarray()
        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None
        # Get prediction probabilities if available.
        # Bug fix: the original branched on "gaussian" here with two
        # identical branches; text_vector is already densified above,
        # so a single call suffices.
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception as e:
                st.warning(f"Could not get prediction probabilities: {str(e)}")
        # Decode prediction back to the original label
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# Main App
st.markdown('<h1 class="main-header">๐Ÿ“Š No Code Text Classification App</h1>', unsafe_allow_html=True)
st.markdown("### Analyze your text data and train machine learning models without coding!")
# Initialize session state
# vectorizer_type remembers which vectorizer the last training run fitted
# ("tfidf" or "count") so the Predictions section loads the matching pickle.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"
# trained_models accumulates the names of models trained this session.
if 'trained_models' not in st.session_state:
    st.session_state.trained_models = []
# Sidebar
st.sidebar.markdown("## ๐Ÿ“ Upload Your Dataset")
# File upload with better error handling
try:
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload your training dataset (CSV format)"
    )
    # Encoding selection (also reused by the batch-prediction uploader)
    encoding = st.sidebar.selectbox(
        "Select file encoding",
        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
        help="Try different encodings if you get reading errors"
    )
except Exception as e:
    st.sidebar.error(f"File upload error: {str(e)}")
    uploaded_file = None
# Navigation
section = st.sidebar.radio(
    "Choose Section",
    ["๐Ÿ“Š Data Analysis", "๐Ÿค– Train Model", "๐Ÿ”ฎ Predictions"],
    help="Navigate through different sections of the app"
)
# Main content based on section
if uploaded_file is not None:
    try:
        # Load data with selected encoding
        df = pd.read_csv(uploaded_file, encoding=encoding)
        st.sidebar.success(f"โœ… Data loaded successfully! Shape: {df.shape}")
        # Column selection
        columns = df.columns.tolist()
        text_column = st.sidebar.selectbox("๐Ÿ“ Select text column:", columns)
        target_column = st.sidebar.selectbox("๐ŸŽฏ Select target column:", columns)
        # Data preprocessing: derived columns consumed by the sections below
        df['clean_text'] = df[text_column].apply(clean_text)
        df['text_length'] = df[text_column].astype(str).apply(len)
        # Process target column: fit the encoder on every rerun and persist it
        # so predict_text can decode labels later.
        label_encoder = LabelEncoder()
        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
    except Exception as e:
        st.error(f"โŒ Error loading data: {str(e)}")
        st.info("๐Ÿ’ก Try selecting a different encoding from the sidebar.")
        # Sentinel checked by the section guards below
        df = None
# Section: Data Analysis
if section == "๐Ÿ“Š Data Analysis":
    # Short-circuit: 'df' only exists when a file was uploaded, so the
    # uploaded_file check must come first.
    if uploaded_file is not None and df is not None:
        st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
        # Data overview metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("๐Ÿ“‹ Total Records", df.shape[0])
        with col2:
            st.metric("๐Ÿ“Š Features", df.shape[1])
        with col3:
            st.metric("๐Ÿท๏ธ Classes", df[target_column].nunique())
        # Data preview (first 10 rows)
        st.subheader("๐Ÿ“– Data Preview")
        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
        # Analysis results (also refreshes df['text_length'] as a side effect)
        analysis = analyze_data(df, text_column, target_column)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("๐Ÿ“ˆ Text Statistics")
            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
            st.write("**Text length distribution:**")
            st.write(pd.DataFrame([analysis['text_length_stats']]).T)
        with col2:
            st.subheader("๐Ÿท๏ธ Class Distribution")
            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
                                      columns=['Class', 'Count'])
            st.dataframe(class_dist)
        # Visualizations (wrapped so a plotting failure doesn't kill the page)
        st.subheader("๐Ÿ“Š Visualizations")
        try:
            fig = create_visualizations(df, text_column, target_column)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Error creating visualizations: {str(e)}")
    else:
        st.warning("๐Ÿ“ Please upload a dataset to analyze.")
# Section: Train Model
elif section == "๐Ÿค– Train Model":
    if uploaded_file is not None and df is not None:
        st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("๐Ÿค– Select Model")
            # Names must match the keys in train_model's model dict.
            model_name = st.selectbox(
                "Choose algorithm:",
                ["Logistic Regression", "Decision Tree", "Random Forest",
                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
            )
        with col2:
            st.subheader("๐Ÿ”ค Select Vectorizer")
            vectorizer_choice = st.selectbox(
                "Choose text vectorizer:",
                ["TF-IDF Vectorizer", "Count Vectorizer"]
            )
        # Vectorizer parameters
        max_features = st.slider("Max features", 1000, 50000, 10000)
        test_size = st.slider("Test size", 0.1, 0.5, 0.2)
        if st.button("๐Ÿš€ Start Training", type="primary"):
            with st.spinner("๐Ÿ”„ Training model..."):
                try:
                    # Initialize vectorizer and record its kind in session
                    # state so predict_text loads the matching pickle.
                    if vectorizer_choice == "TF-IDF Vectorizer":
                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "tfidf"
                    else:
                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "count"
                    # Vectorize text
                    X = vectorizer.fit_transform(df['clean_text'])
                    y = df['encoded_target']
                    # Split data (stratified so class ratios survive the split)
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=42, stratify=y
                    )
                    # Save vectorizer so predictions reuse the same vocabulary
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
                    # Train model
                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
                    if model is not None:
                        st.success(f"โœ… Model trained successfully!")
                        st.session_state.trained_models.append(model_name)
                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("๐ŸŽฏ Accuracy", f"{accuracy:.4f}")
                        with col2:
                            # classification_report adds 'accuracy',
                            # 'macro avg' and 'weighted avg' entries,
                            # hence the -3.
                            st.metric("๐Ÿท๏ธ Classes", len(report) - 3)  # Exclude avg metrics
                        # Detailed metrics
                        st.subheader("๐Ÿ“Š Detailed Metrics")
                        metrics_df = pd.DataFrame(report).transpose()
                        st.dataframe(metrics_df.round(4))
                except Exception as e:
                    st.error(f"โŒ Training failed: {str(e)}")
    else:
        st.warning("๐Ÿ“ Please upload a dataset to train a model.")
# Section: Predictions
elif section == "๐Ÿ”ฎ Predictions":
    st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)
    # Check for trained models persisted to disk by earlier training runs
    if os.path.exists("models") and os.listdir("models"):
        # Reconstruct display names from pickle filenames, e.g.
        # "logistic_regression_model.pkl" -> "Logistic Regression".
        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
                            for f in os.listdir("models") if f.endswith('.pkl')]
        if available_models:
            # Single prediction
            st.subheader("๐Ÿ”ฎ Single Text Prediction")
            col1, col2 = st.columns([3, 1])
            with col1:
                text_input = st.text_area(
                    "Enter text to classify:",
                    height=100,
                    placeholder="Type or paste your text here..."
                )
            with col2:
                selected_model = st.selectbox("Select model:", available_models)
            if st.button("๐Ÿ” Predict", type="primary"):
                if text_input.strip():
                    with st.spinner("๐Ÿ”„ Making prediction..."):
                        # vectorizer_type falls back to "tfidf" if no model
                        # was trained in this session.
                        predicted_label, prediction_proba = predict_text(
                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
                        )
                        if predicted_label is not None:
                            st.success("โœ… Prediction completed!")
                            # Results
                            st.markdown("### ๐Ÿ“‹ Results")
                            st.info(f"**Predicted Class:** {predicted_label}")
                            # Probabilities (only models with predict_proba)
                            if prediction_proba is not None:
                                encoder = load_artifacts("artifacts", "label_encoder.pkl")
                                if encoder is not None:
                                    classes = encoder.classes_
                                    prob_df = pd.DataFrame({
                                        'Class': classes,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)
                                    st.markdown("### ๐Ÿ“Š Class Probabilities")
                                    st.bar_chart(prob_df.set_index('Class'))
                else:
                    st.warning("โš ๏ธ Please enter some text to classify.")
            # Batch predictions
            st.markdown("---")
            st.subheader("๐Ÿ“ฆ Batch Predictions")
            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
            if batch_file is not None:
                try:
                    # NOTE(review): 'encoding' comes from the sidebar widget;
                    # if the sidebar upload block raised, it may be undefined
                    # here — confirm and consider a fallback.
                    batch_df = pd.read_csv(batch_file, encoding=encoding)
                    st.write("๐Ÿ“– Preview:")
                    st.dataframe(batch_df.head())
                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
                    if st.button("๐Ÿš€ Run Batch Predictions"):
                        with st.spinner("๐Ÿ”„ Processing batch predictions..."):
                            predictions = []
                            progress_bar = st.progress(0)
                            # Each row is cleaned, vectorized, and classified
                            # individually; failures become the "Error" label.
                            for i, text in enumerate(batch_df[batch_text_col]):
                                pred, _ = predict_text(
                                    batch_model, str(text),
                                    st.session_state.get('vectorizer_type', 'tfidf')
                                )
                                predictions.append(pred if pred is not None else "Error")
                                progress_bar.progress((i + 1) / len(batch_df))
                            batch_df['Predicted_Class'] = predictions
                            st.success("โœ… Batch predictions completed!")
                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
                            # Download option
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                "๐Ÿ“ฅ Download Results",
                                csv,
                                "batch_predictions.csv",
                                "text/csv"
                            )
                except Exception as e:
                    st.error(f"โŒ Batch prediction error: {str(e)}")
        else:
            st.warning("โš ๏ธ No trained models found.")
    else:
        st.warning("โš ๏ธ No models available. Please train a model first.")
# Footer
st.markdown("---")
st.markdown("*Built with Streamlit โ€ข Text Classification Made Easy*")