import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import re
import string
from collections import Counter
# Set page config (must run before any other st.* call in the script)
st.set_page_config(page_title="Text Classification App", page_icon="📊", layout="wide")
# Custom CSS for better styling
# NOTE(review): the CSS payload between the triple quotes is empty, so this
# markdown call currently injects nothing — confirm whether styles were lost.
st.markdown("""
""", unsafe_allow_html=True)
# Utility functions
def clean_text(text):
    """Normalize raw text for vectorizing.

    Lowercases, drops every character that is not an ASCII letter or
    whitespace, and collapses runs of whitespace to a single space.
    NaN/None inputs yield "" so downstream vectorizers never see NaN.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* into <folder_name>/<file_name>, creating the folder.

    Returns True on success. On any failure the error is surfaced via
    st.error and False is returned (the app keeps running).
    """
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(os.path.join(folder_name, file_name), 'wb') as fh:
            pickle.dump(obj, fh)
    except Exception as e:
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
    return True
def load_artifacts(folder_name, file_name):
    """Load a pickled artifact from <folder_name>/<file_name>.

    Returns the unpickled object, or None (after reporting via st.error)
    when the file is missing or cannot be read.
    """
    result = None
    try:
        artifact_path = os.path.join(folder_name, file_name)
        with open(artifact_path, 'rb') as fh:
            result = pickle.load(fh)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
    except Exception as e:
        st.error(f"Error loading {file_name}: {str(e)}")
    return result
def analyze_data(df, text_col, target_col):
    """Summarize a labelled text dataframe.

    Returns a dict with shape, column list, missing-value counts,
    text-length statistics, and the target's class distribution.

    NOTE: mutates *df* by adding/overwriting a 'text_length' column
    (the visualization step relies on it). Shape/columns/missing-value
    info is captured BEFORE that column is added, matching the original
    behavior for dataframes that do not yet carry it.
    """
    summary = {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'missing_values': df.isnull().sum().to_dict(),
    }
    # Intentional side effect: per-row character counts of the text column.
    df['text_length'] = df[text_col].astype(str).apply(len)
    summary['avg_text_length'] = df['text_length'].mean()
    summary['text_length_stats'] = df['text_length'].describe().to_dict()
    summary['class_distribution'] = df[target_col].value_counts().to_dict()
    summary['num_classes'] = df[target_col].nunique()
    return summary
def create_visualizations(df, text_col, target_col):
    """Build a 2x2 matplotlib figure for the loaded dataset.

    Panels: class counts (bar), text-length histogram, text-length-by-class
    boxplot, and — for numeric targets only — a correlation heatmap.

    Expects *df* to already contain a 'text_length' column (added by
    analyze_data / the preprocessing step in the main app).

    Returns the matplotlib Figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Class distribution
    class_counts = df[target_col].value_counts()
    axes[0, 0].bar(class_counts.index, class_counts.values)
    axes[0, 0].set_title('Class Distribution')
    axes[0, 0].set_xlabel('Classes')
    axes[0, 0].set_ylabel('Count')
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
    # Text length distribution
    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
    axes[0, 1].set_title('Text Length Distribution')
    axes[0, 1].set_xlabel('Text Length')
    axes[0, 1].set_ylabel('Frequency')
    # Box plot of text length by class
    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
    axes[1, 0].set_title('Text Length by Class')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('Text Length')
    # Correlation heatmap only for numeric targets. The previous
    # "fewer than 10 unique values" shortcut sent string-labelled targets
    # into DataFrame.corr(), which raises TypeError on non-numeric columns
    # in pandas >= 2.0 — so gate strictly on dtype.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        correlation = df[['text_length', target_col]].corr()
        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
        axes[1, 1].set_title('Correlation Matrix')
    else:
        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Correlation Analysis')
    plt.tight_layout()
    return fig
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Fit the requested classifier and evaluate it on the test split.

    Side effect: pickles the fitted model into the models/ folder via
    save_artifacts, so the Predictions section can reload it later.

    Returns (model, accuracy, report_dict), or (None, None, None) when
    *model_name* is not one of the supported algorithms.
    """
    available = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
        "SVC": SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB(),
    }
    model = available.get(model_name)
    if model is None:
        return None, None, None
    # GaussianNB cannot consume scipy sparse matrices; densify for it only.
    needs_dense = model_name == "Gaussian Naive Bayes"
    train_features = X_train.toarray() if needs_dense else X_train
    test_features = X_test.toarray() if needs_dense else X_test
    # Fit and score
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    # Persist the fitted estimator under a filename derived from its display name.
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
    save_artifacts(model, "models", model_filename)
    return model, accuracy, report
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify *text* with a previously trained, persisted model.

    Parameters
    ----------
    model_name : display name of a trained model (e.g. "Logistic Regression");
        lowercased/underscored to locate the pickle in models/.
    text : raw input string; run through clean_text before vectorizing.
    vectorizer_type : "tfidf" or "count" — must match the vectorizer the
        model was trained with.

    Returns (predicted_label, probabilities) where probabilities is a 1-D
    array, or None when the model lacks predict_proba (e.g. LinearSVC).
    Returns (None, None) after reporting via st.error on any failure.
    """
    try:
        # Load model
        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
        model = load_artifacts("models", model_filename)
        if model is None:
            return None, None
        # Load vectorizer
        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_filename)
        if vectorizer is None:
            return None, None
        # Load label encoder
        encoder = load_artifacts("artifacts", "label_encoder.pkl")
        if encoder is None:
            return None, None
        # Clean and vectorize exactly as during training.
        clean_text_input = clean_text(text)
        text_vector = vectorizer.transform([clean_text_input])
        # GaussianNB was trained on a dense array, so predict on one too.
        if "gaussian" in model_name.lower():
            text_vector = text_vector.toarray()
        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None
        # Get prediction probabilities if available. text_vector is already
        # dense for GaussianNB at this point, so a single call covers every
        # model (the original had two byte-identical branches here).
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception as e:
                st.warning(f"Could not get prediction probabilities: {str(e)}")
        # Decode prediction back to the original label.
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# ---------------------------------------------------------------------------
# Main app — header, session state, sidebar upload, and data preprocessing.
# ---------------------------------------------------------------------------
# NOTE(review): the HTML wrapper around this header appears to have been
# stripped from the source; only the title text remains.
st.markdown('📊 No Code Text Classification App', unsafe_allow_html=True)
st.markdown("### Analyze your text data and train machine learning models without coding!")
# Initialize session state so reruns remember which vectorizer was used and
# which models were trained this session.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"
if 'trained_models' not in st.session_state:
    st.session_state.trained_models = []
# Sidebar
st.sidebar.markdown("## 📁 Upload Your Dataset")
# File upload with better error handling
try:
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload your training dataset (CSV format)"
    )
    # Encoding selection
    encoding = st.sidebar.selectbox(
        "Select file encoding",
        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
        help="Try different encodings if you get reading errors"
    )
except Exception as e:
    # NOTE(review): if this branch runs, `encoding` stays undefined and the
    # batch-prediction read below would raise NameError — confirm intended.
    st.sidebar.error(f"File upload error: {str(e)}")
    uploaded_file = None
# Navigation
section = st.sidebar.radio(
    "Choose Section",
    ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
    help="Navigate through different sections of the app"
)
# Main content based on section
if uploaded_file is not None:
    try:
        # Load data with selected encoding
        df = pd.read_csv(uploaded_file, encoding=encoding)
        st.sidebar.success(f"✅ Data loaded successfully! Shape: {df.shape}")
        # Column selection
        columns = df.columns.tolist()
        text_column = st.sidebar.selectbox("📝 Select text column:", columns)
        target_column = st.sidebar.selectbox("🎯 Select target column:", columns)
        # Data preprocessing: cleaned text for vectorizing, raw length for EDA.
        df['clean_text'] = df[text_column].apply(clean_text)
        df['text_length'] = df[text_column].astype(str).apply(len)
        # Encode the target labels and persist the encoder so the
        # Predictions section can decode model output later.
        label_encoder = LabelEncoder()
        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
    except Exception as e:
        st.error(f"❌ Error loading data: {str(e)}")
        st.info("💡 Try selecting a different encoding from the sidebar.")
        df = None
# Section: Data Analysis — dataset overview, summary stats, and plots.
if section == "📊 Data Analysis":
    if uploaded_file is not None and df is not None:
        # NOTE(review): the HTML header for this section appears stripped;
        # this call currently renders nothing.
        st.markdown('', unsafe_allow_html=True)
        # Data overview metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("📋 Total Records", df.shape[0])
        with col2:
            st.metric("📊 Features", df.shape[1])
        with col3:
            st.metric("🏷️ Classes", df[target_column].nunique())
        # Data preview
        st.subheader("📖 Data Preview")
        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
        # Analysis results (analyze_data also refreshes df['text_length'])
        analysis = analyze_data(df, text_column, target_column)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("📈 Text Statistics")
            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
            st.write("**Text length distribution:**")
            st.write(pd.DataFrame([analysis['text_length_stats']]).T)
        with col2:
            st.subheader("🏷️ Class Distribution")
            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
                                      columns=['Class', 'Count'])
            st.dataframe(class_dist)
        # Visualizations (plot failures are reported, not fatal)
        st.subheader("📊 Visualizations")
        try:
            fig = create_visualizations(df, text_column, target_column)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Error creating visualizations: {str(e)}")
    else:
        st.warning("📁 Please upload a dataset to analyze.")
# Section: Train Model — choose algorithm + vectorizer, fit, show metrics.
elif section == "🤖 Train Model":
    if uploaded_file is not None and df is not None:
        # NOTE(review): section-header HTML appears stripped (renders nothing).
        st.markdown('', unsafe_allow_html=True)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("🤖 Select Model")
            model_name = st.selectbox(
                "Choose algorithm:",
                ["Logistic Regression", "Decision Tree", "Random Forest",
                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
            )
        with col2:
            st.subheader("🔤 Select Vectorizer")
            vectorizer_choice = st.selectbox(
                "Choose text vectorizer:",
                ["TF-IDF Vectorizer", "Count Vectorizer"]
            )
        # Vectorizer / split parameters
        max_features = st.slider("Max features", 1000, 50000, 10000)
        test_size = st.slider("Test size", 0.1, 0.5, 0.2)
        if st.button("🚀 Start Training", type="primary"):
            with st.spinner("🔄 Training model..."):
                try:
                    # Initialize vectorizer; record which kind in session
                    # state so the Predictions section loads the right pickle.
                    if vectorizer_choice == "TF-IDF Vectorizer":
                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "tfidf"
                    else:
                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "count"
                    # Vectorize the pre-cleaned text
                    X = vectorizer.fit_transform(df['clean_text'])
                    y = df['encoded_target']
                    # Split data (stratified so class ratios survive the split)
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=42, stratify=y
                    )
                    # Save vectorizer for reuse at prediction time
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
                    # Train model (also pickles it into models/)
                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
                    if model is not None:
                        st.success(f"✅ Model trained successfully!")
                        st.session_state.trained_models.append(model_name)
                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("🎯 Accuracy", f"{accuracy:.4f}")
                        with col2:
                            # report holds one entry per class plus
                            # 'accuracy', 'macro avg', 'weighted avg'.
                            st.metric("🏷️ Classes", len(report) - 3)  # Exclude avg metrics
                        # Detailed metrics
                        st.subheader("📊 Detailed Metrics")
                        metrics_df = pd.DataFrame(report).transpose()
                        st.dataframe(metrics_df.round(4))
                except Exception as e:
                    st.error(f"❌ Training failed: {str(e)}")
    else:
        st.warning("📁 Please upload a dataset to train a model.")
# Section: Predictions — single-text and batch inference with saved models.
elif section == "🔮 Predictions":
    # NOTE(review): section-header HTML appears stripped (renders nothing).
    st.markdown('', unsafe_allow_html=True)
    # Check for trained models on disk (survives app reruns and restarts).
    if os.path.exists("models") and os.listdir("models"):
        # Reconstruct display names from filenames. predict_text lowercases
        # them again to find the file, so the round trip is safe even though
        # .title() changes casing (e.g. "Linear SVC" displays as "Linear Svc").
        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
                            for f in os.listdir("models") if f.endswith('.pkl')]
        if available_models:
            # Single prediction
            st.subheader("🔮 Single Text Prediction")
            col1, col2 = st.columns([3, 1])
            with col1:
                text_input = st.text_area(
                    "Enter text to classify:",
                    height=100,
                    placeholder="Type or paste your text here..."
                )
            with col2:
                selected_model = st.selectbox("Select model:", available_models)
            if st.button("🔍 Predict", type="primary"):
                if text_input.strip():
                    with st.spinner("🔄 Making prediction..."):
                        predicted_label, prediction_proba = predict_text(
                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
                        )
                        if predicted_label is not None:
                            st.success("✅ Prediction completed!")
                            # Results
                            st.markdown("### 📋 Results")
                            st.info(f"**Predicted Class:** {predicted_label}")
                            # Probabilities (None for models without predict_proba)
                            if prediction_proba is not None:
                                encoder = load_artifacts("artifacts", "label_encoder.pkl")
                                if encoder is not None:
                                    classes = encoder.classes_
                                    prob_df = pd.DataFrame({
                                        'Class': classes,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)
                                    st.markdown("### 📊 Class Probabilities")
                                    st.bar_chart(prob_df.set_index('Class'))
                else:
                    st.warning("⚠️ Please enter some text to classify.")
            # Batch predictions
            st.markdown("---")
            st.subheader("📦 Batch Predictions")
            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
            if batch_file is not None:
                try:
                    # NOTE(review): `encoding` comes from the sidebar widget and
                    # is undefined if the sidebar try-block failed — confirm.
                    batch_df = pd.read_csv(batch_file, encoding=encoding)
                    st.write("📖 Preview:")
                    st.dataframe(batch_df.head())
                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
                    if st.button("🚀 Run Batch Predictions"):
                        with st.spinner("🔄 Processing batch predictions..."):
                            predictions = []
                            progress_bar = st.progress(0)
                            # Row-by-row inference; note predict_text reloads
                            # the model/vectorizer pickles on every call.
                            for i, text in enumerate(batch_df[batch_text_col]):
                                pred, _ = predict_text(
                                    batch_model, str(text),
                                    st.session_state.get('vectorizer_type', 'tfidf')
                                )
                                predictions.append(pred if pred is not None else "Error")
                                progress_bar.progress((i + 1) / len(batch_df))
                            batch_df['Predicted_Class'] = predictions
                            st.success("✅ Batch predictions completed!")
                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
                            # Download option
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                "📥 Download Results",
                                csv,
                                "batch_predictions.csv",
                                "text/csv"
                            )
                except Exception as e:
                    st.error(f"❌ Batch prediction error: {str(e)}")
        else:
            st.warning("⚠️ No trained models found.")
    else:
        st.warning("⚠️ No models available. Please train a model first.")
# Footer
st.markdown("---")
st.markdown("*Built with Streamlit • Text Classification Made Easy*")