Alamgirapi's picture
Update app.py
5ba4816 verified
raw
history blame
21.8 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import tempfile
import re
import string
from collections import Counter
# Text Cleaning Class (replacing the custom module)
class TextCleaner:
    """Normalizes raw text: lowercase, letters-only, single-spaced."""

    def clean_text(self, text):
        """Return a cleaned copy of *text*; missing values (NaN/None) become ''.

        Steps: lowercase -> drop every character outside [a-zA-Z] and
        whitespace -> collapse runs of whitespace into single spaces.
        """
        if pd.isna(text):
            return ""
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        # split()/join collapses tabs, newlines and repeated spaces
        return ' '.join(letters_only.split())
# Information Analysis Class (replacing the custom module)
class TextInformations:
    """Exploratory statistics helper for a text-classification DataFrame."""

    def __init__(self, df, text_col, target_col):
        """Hold the frame plus the names of its text and label columns."""
        self.df = df
        self.text_col = text_col
        self.target_col = target_col

    def shape(self):
        """(rows, columns) of the underlying DataFrame."""
        return self.df.shape

    def missing_values(self):
        """Per-column count of null cells, as a plain dict."""
        return self.df.isnull().sum().to_dict()

    def class_imbalanced(self):
        """Frequency of each target label, as a plain dict."""
        return self.df[self.target_col].value_counts().to_dict()

    def clean_text(self):
        """Series of cleaned text produced by TextCleaner.clean_text."""
        return self.df[self.text_col].apply(TextCleaner().clean_text)

    def text_length(self):
        """Character length of the raw (uncleaned) text column."""
        return self.df[self.text_col].str.len()
# Utility functions
def save_to_session(obj, key):
    """Stash *obj* under *key* in Streamlit session state (no file I/O)."""
    st.session_state[key] = obj
def load_from_session(key):
    """Fetch the object stored under *key*, or None if nothing was saved."""
    return st.session_state.get(key)
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Instantiate the classifier selected in the UI, fit it, and score it.

    Parameters:
        model_name: one of the display names offered by the model selectbox.
        X_train/X_test: feature matrices (scipy sparse from the vectorizers).
        y_train/y_test: encoded label arrays.

    Returns:
        (fitted_model, test_accuracy).

    Raises:
        ValueError: for an unrecognized *model_name* (the original code left
        `model` unbound and crashed with UnboundLocalError instead).
    """
    factories = {
        "Logistic Regression": lambda: LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": lambda: DecisionTreeClassifier(random_state=42),
        "Random Forest": lambda: RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": lambda: LinearSVC(random_state=42, max_iter=1000),
        "SVC": lambda: SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB,
        "Gaussian Naive Bayes": GaussianNB,
    }
    if model_name not in factories:
        raise ValueError(f"Unknown model: {model_name!r}")
    model = factories[model_name]()
    # GaussianNB rejects sparse input, so densify for that model only.
    # NOTE(review): this can be memory-heavy for large corpora — confirm
    # acceptable for the expected dataset sizes.
    if model_name == "Gaussian Naive Bayes" and hasattr(X_train, "toarray"):
        X_train = X_train.toarray()
        X_test = X_test.toarray()
    # Train model
    model.fit(X_train, y_train)
    # Score on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return model, accuracy
def predict_text(text, model, vectorizer, encoder):
    """Classify one raw text string with a previously trained pipeline.

    Parameters:
        text: raw input string (cleaned here with TextCleaner).
        model: fitted classifier.
        vectorizer: FITTED vectorizer — only transform() is called, never fit.
        encoder: fitted LabelEncoder used to decode the class index.

    Returns:
        (predicted_label, probabilities) — probabilities is None when the
        model has no usable predict_proba. On any failure an error is shown
        in the Streamlit UI and (None, None) is returned.
    """
    try:
        # Apply the same cleaning used at training time
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)
        # Transform text using the vectorizer
        text_vector = vectorizer.transform([clean_text])
        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None
        # Get prediction probabilities if available
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # BUGFIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt. Some models expose
                # predict_proba but raise at call time; fall back to None.
                prediction_proba = None
        # Decode prediction back to the original label string
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# Streamlit App Configuration
# NOTE(review): the emoji literals below are mojibake (UTF-8 rendered under a
# different codepage). They are runtime strings, so they are kept byte-identical
# here; re-saving the file as UTF-8 with the intended emoji should be verified
# against the section comparisons further down (the radio values must match).
st.set_page_config(
    page_title="Text Classification App",
    page_icon="๐Ÿ“",
    layout="wide"
)
st.title('๐Ÿ“ No Code Text Classification App')
st.markdown('Analyze your text data and train machine learning models for text classification')
# Initialize session state flags once per session; Streamlit reruns the whole
# script on every interaction, so guard against clobbering existing values.
if 'model_trained' not in st.session_state:
    st.session_state.model_trained = False
if 'training_data_processed' not in st.session_state:
    st.session_state.training_data_processed = False
# Sidebar navigation — `section` drives the big if/elif dispatch below.
st.sidebar.title("Navigation")
section = st.sidebar.radio(
    "Choose Section",
    ["๐Ÿ“Š Data Analysis", "๐Ÿค– Train Model", "๐Ÿ”ฎ Predictions"],
    index=0
)
# Upload Data Section
st.sidebar.markdown("---")
st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
# File uploader with better error handling.
# NOTE(review): if this try fails, `train_data`/`test_data` are never bound
# and the `if train_data is not None:` check below raises NameError — confirm
# whether the uploaders can actually raise here, or pre-initialize to None.
try:
    train_data = st.sidebar.file_uploader(
        "Upload training data (CSV)",
        type=["csv"],
        help="Upload a CSV file with text and labels for training"
    )
    test_data = st.sidebar.file_uploader(
        "Upload test data (CSV, optional)",
        type=["csv"],
        help="Optional: Upload a separate test dataset"
    )
except Exception as e:
    st.sidebar.error(f"File upload error: {str(e)}")
    st.sidebar.info("Try refreshing the page or using a different browser")
# Process uploaded data: parse the CSV(s), let the user pick the text and
# target columns, and cache everything in session state for the sections below.
if train_data is not None:
    try:
        # Add encoding options to handle different CSV formats
        encoding_option = st.sidebar.selectbox(
            "CSV Encoding",
            ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
            help="Try different encodings if you get errors"
        )
        # Re-parsed on every rerun; both files share the chosen encoding.
        train_df = pd.read_csv(train_data, encoding=encoding_option)
        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding=encoding_option)
        else:
            test_df = None
        st.sidebar.success(f"โœ… Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
        # Column selection — note nothing prevents picking the same column
        # for both text and target.
        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("๐Ÿ“ Choose the text column:", columns)
        target = st.sidebar.selectbox("๐ŸŽฏ Choose the target column:", columns)
        # Store processed data in session state
        st.session_state.train_df = train_df
        st.session_state.test_df = test_df
        st.session_state.text_col = text_data
        st.session_state.target_col = target
        st.session_state.training_data_processed = True
    except Exception as e:
        st.sidebar.error(f"โŒ Error loading data: {str(e)}")
        st.sidebar.info("Please check your CSV file format and encoding")
# Data Analysis Section: summary metrics, class distribution, and text-length
# plots for the uploaded training data.
if section == "๐Ÿ“Š Data Analysis":
    st.header("๐Ÿ“Š Data Analysis")
    if st.session_state.get('training_data_processed', False):
        try:
            train_df = st.session_state.train_df
            text_col = st.session_state.text_col
            target_col = st.session_state.target_col
            # Create info object
            info = TextInformations(train_df, text_col, target_col)
            # Data preprocessing — NOTE: this mutates the DataFrame held in
            # st.session_state.train_df in place (train_df is a reference),
            # adding 'clean_text' and 'text_length' columns.
            train_df['clean_text'] = info.clean_text()
            train_df['text_length'] = info.text_length()
            # Display basic information
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Dataset Shape", f"{info.shape()[0]} ร— {info.shape()[1]}")
            with col2:
                missing_vals = sum(info.missing_values().values())
                st.metric("Missing Values", missing_vals)
            with col3:
                unique_classes = len(info.class_imbalanced())
                st.metric("Unique Classes", unique_classes)
            # Data preview
            st.subheader("๐Ÿ“‹ Data Preview")
            st.dataframe(train_df[[text_col, target_col, 'clean_text', 'text_length']].head(10))
            # Class distribution
            st.subheader("๐Ÿ“Š Class Distribution")
            class_counts = info.class_imbalanced()
            col1, col2 = st.columns(2)
            with col1:
                fig, ax = plt.subplots(figsize=(8, 6))
                classes = list(class_counts.keys())
                counts = list(class_counts.values())
                # NOTE(review): only 5 colors supplied — confirm behavior when
                # the target has more than 5 classes.
                ax.bar(classes, counts, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8'])
                ax.set_title('Class Distribution')
                ax.set_xlabel('Classes')
                ax.set_ylabel('Count')
                plt.xticks(rotation=45)
                st.pyplot(fig)
            with col2:
                st.write("**Class Distribution:**")
                for class_name, count in class_counts.items():
                    percentage = (count / len(train_df)) * 100
                    st.write(f"- {class_name}: {count} ({percentage:.1f}%)")
            # Text length analysis (length of the RAW text, not clean_text)
            st.subheader("๐Ÿ“ Text Length Analysis")
            col1, col2 = st.columns(2)
            with col1:
                fig, ax = plt.subplots(figsize=(8, 6))
                ax.hist(train_df['text_length'], bins=50, alpha=0.7, color='#4ECDC4')
                ax.set_title('Text Length Distribution')
                ax.set_xlabel('Text Length (characters)')
                ax.set_ylabel('Frequency')
                st.pyplot(fig)
            with col2:
                st.write("**Text Length Statistics:**")
                length_stats = train_df['text_length'].describe()
                for stat, value in length_stats.items():
                    st.write(f"- {stat.title()}: {value:.1f}")
            # Cache the enriched frame so Train Model can skip reprocessing
            st.session_state.processed_train_df = train_df
        except Exception as e:
            st.error(f"โŒ Error in data analysis: {str(e)}")
    else:
        st.info("๐Ÿ”„ Please upload training data to perform analysis")
# Train Model Section
elif section == "๐Ÿค– Train Model":
st.header("๐Ÿค– Train Model")
if st.session_state.get('training_data_processed', False):
try:
if 'processed_train_df' in st.session_state:
train_df = st.session_state.processed_train_df
else:
# Process data if not already processed
train_df = st.session_state.train_df
text_col = st.session_state.text_col
target_col = st.session_state.target_col
info = TextInformations(train_df, text_col, target_col)
train_df['clean_text'] = info.clean_text()
train_df['text_length'] = info.text_length()
# Model and vectorizer selection
col1, col2 = st.columns(2)
with col1:
st.subheader("๐ŸŽฏ Model Selection")
model_name = st.selectbox("Choose the Model", [
"Logistic Regression", "Decision Tree",
"Random Forest", "Linear SVC", "SVC",
"Multinomial Naive Bayes", "Gaussian Naive Bayes"
])
with col2:
st.subheader("๐Ÿ“Š Vectorizer Selection")
vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count"])
# Training parameters
st.subheader("โš™๏ธ Training Parameters")
col1, col2 = st.columns(2)
with col1:
max_features = st.slider("Max Features", 1000, 20000, 10000, 1000)
test_size = st.slider("Test Size", 0.1, 0.5, 0.2, 0.05)
with col2:
random_state = st.number_input("Random State", 0, 100, 42)
# Training button
if st.button("๐Ÿš€ Start Training", type="primary"):
with st.spinner("Training model... Please wait"):
try:
# Prepare data
X_text = train_df['clean_text'].fillna('')
y = train_df[st.session_state.target_col]
# Label encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Vectorization
if vectorizer_choice == "TF-IDF":
vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
else:
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
X_vectorized = vectorizer.fit_transform(X_text)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X_vectorized, y_encoded,
test_size=test_size,
random_state=random_state,
stratify=y_encoded
)
# Train model
model, accuracy = train_model(model_name, X_train, X_test, y_train, y_test)
# Save to session state
save_to_session(model, 'trained_model')
save_to_session(vectorizer, 'vectorizer')
save_to_session(label_encoder, 'label_encoder')
save_to_session(model_name, 'model_name')
save_to_session(vectorizer_choice, 'vectorizer_type')
st.session_state.model_trained = True
# Display results
st.success(f"โœ… Model training completed!")
col1, col2 = st.columns(2)
with col1:
st.metric("Model Accuracy", f"{accuracy:.4f}")
with col2:
st.metric("Training Samples", len(X_train))
st.info("๐ŸŽ‰ You can now use the 'Predictions' section to classify new text!")
except Exception as e:
st.error(f"โŒ Error during training: {str(e)}")
except Exception as e:
st.error(f"โŒ Error in model training setup: {str(e)}")
else:
st.info("๐Ÿ”„ Please upload and analyze training data first")
# Predictions Section: single-text prediction plus batch prediction over an
# uploaded CSV, using the artifacts saved by the Train Model section.
elif section == "๐Ÿ”ฎ Predictions":
    st.header("๐Ÿ”ฎ Make Predictions")
    if st.session_state.get('model_trained', False):
        # Single text prediction
        st.subheader("๐Ÿ“ Single Text Prediction")
        text_input = st.text_area(
            "Enter text to classify:",
            height=120,
            placeholder="Type or paste your text here..."
        )
        col1, col2 = st.columns([1, 3])
        with col1:
            if st.button("๐Ÿ”ฎ Predict", type="primary"):
                if text_input.strip():
                    try:
                        # Artifacts saved by the Train Model section
                        model = load_from_session('trained_model')
                        vectorizer = load_from_session('vectorizer')
                        encoder = load_from_session('label_encoder')
                        predicted_label, prediction_proba = predict_text(
                            text_input, model, vectorizer, encoder
                        )
                        if predicted_label is not None:
                            st.success("โœ… Prediction completed!")
                            # Display results
                            st.markdown("### ๐Ÿ“Š Results")
                            st.markdown(f"**Predicted Class:** `{predicted_label}`")
                            # Display probabilities if available
                            if prediction_proba is not None:
                                st.markdown("**Class Probabilities:**")
                                classes = encoder.classes_
                                prob_data = pd.DataFrame({
                                    'Class': classes,
                                    'Probability': prediction_proba
                                }).sort_values('Probability', ascending=False)
                                # Show as bar chart
                                st.bar_chart(prob_data.set_index('Class'))
                                # Show as table
                                st.dataframe(prob_data, use_container_width=True)
                    except Exception as e:
                        st.error(f"โŒ Prediction error: {str(e)}")
                else:
                    st.warning("โš ๏ธ Please enter some text to classify")
        # Batch predictions
        st.markdown("---")
        st.subheader("๐Ÿ“ Batch Predictions")
        uploaded_batch = st.file_uploader(
            "Upload CSV file for batch predictions",
            type=['csv'],
            help="Upload a CSV file with text data to classify multiple texts at once"
        )
        if uploaded_batch is not None:
            try:
                # Load batch data (separate widget key so it does not clash
                # with the sidebar encoding selector)
                encoding_option = st.selectbox(
                    "Batch CSV Encoding",
                    ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
                    key="batch_encoding"
                )
                batch_df = pd.read_csv(uploaded_batch, encoding=encoding_option)
                st.write("๐Ÿ“‹ **Batch Data Preview:**")
                st.dataframe(batch_df.head())
                # Select text column
                text_column = st.selectbox(
                    "Select the text column:",
                    batch_df.columns.tolist()
                )
                if st.button("๐Ÿš€ Run Batch Predictions", type="primary"):
                    with st.spinner("Processing batch predictions..."):
                        try:
                            model = load_from_session('trained_model')
                            vectorizer = load_from_session('vectorizer')
                            encoder = load_from_session('label_encoder')
                            predictions = []
                            confidences = []
                            progress_bar = st.progress(0)
                            total_rows = len(batch_df)
                            # Row-by-row prediction (one transform per row —
                            # simple, but O(rows) vectorizer calls)
                            for idx, text in enumerate(batch_df[text_column]):
                                pred, pred_proba = predict_text(
                                    str(text), model, vectorizer, encoder
                                )
                                predictions.append(pred if pred is not None else "Error")
                                # Get confidence (max probability); 0.0 when
                                # the model exposes no probabilities
                                if pred_proba is not None:
                                    confidences.append(max(pred_proba))
                                else:
                                    confidences.append(0.0)
                                progress_bar.progress((idx + 1) / total_rows)
                            batch_df['Predicted_Class'] = predictions
                            batch_df['Confidence'] = confidences
                            st.success("โœ… Batch predictions completed!")
                            # Show results
                            st.write("๐Ÿ“Š **Prediction Results:**")
                            st.dataframe(batch_df[[text_column, 'Predicted_Class', 'Confidence']])
                            # Download results
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                label="๐Ÿ“ฅ Download Results as CSV",
                                data=csv,
                                file_name="batch_predictions.csv",
                                mime="text/csv"
                            )
                        except Exception as e:
                            st.error(f"โŒ Batch prediction error: {str(e)}")
            except Exception as e:
                st.error(f"โŒ Error loading batch file: {str(e)}")
    else:
        st.info("๐Ÿ”„ Please train a model first before making predictions")
        # Show model info if available
        if st.session_state.get('training_data_processed', False):
            st.write("๐Ÿ’ก **Tip:** Go to the 'Train Model' section to train a model first!")
# Footer
st.markdown("---")
st.markdown(
"""
<div style='text-align: center; color: #666; padding: 20px;'>
<p>๐Ÿ“ No Code Text Classification App</p>
<p>Built with Streamlit โ€ข Upload CSV โ†’ Analyze โ†’ Train โ†’ Predict</p>
</div>
""",
unsafe_allow_html=True
)