"""No-code text classification Streamlit app.

Upload a CSV, explore it (class balance, text lengths, word frequencies),
train a scikit-learn text classifier on a chosen text/target column pair,
and run single or batch predictions — all from the browser.

Artifacts are persisted with pickle under ``models/`` (trained estimators)
and ``artifacts/`` (vectorizer + label encoder) so the Predictions section
can reload them across reruns.
"""

import os
import pickle
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Configure Streamlit page
st.set_page_config(
    page_title="Text Classification App",
    page_icon="📝",
    layout="wide"
)


# Text preprocessing class
class TextCleaner:
    """Minimal text preprocessor: lowercase, strip URLs and non-letters, drop stop words."""

    def __init__(self):
        # Small hand-rolled stop-word list; the vectorizers later add their
        # own full 'english' list on top of this.
        self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on',
                               'at', 'to', 'for', 'of', 'with', 'by'])

    def clean_text(self, text):
        """Clean and preprocess text.

        Returns "" for NaN/missing input so downstream vectorizers never
        see non-string values.
        """
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'http\S+', '', text)      # Remove URLs
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
        text = re.sub(r'\s+', ' ', text)         # Remove extra whitespace
        text = text.strip()
        # Remove stop words (optional)
        words = text.split()
        words = [word for word in words if word not in self.stop_words]
        return ' '.join(words)


# Data analysis functions
def get_data_insights(df, text_col, target_col):
    """Get basic insights from the dataset.

    Returns a dict with shape, per-column missing counts, class counts for
    *target_col*, and raw (uncleaned) text-length statistics for *text_col*.
    """
    insights = {
        'shape': df.shape,
        'missing_values': df.isnull().sum().to_dict(),
        'class_distribution': df[target_col].value_counts().to_dict(),
        'text_length_stats': {
            'mean': df[text_col].str.len().mean(),
            'median': df[text_col].str.len().median(),
            'min': df[text_col].str.len().min(),
            'max': df[text_col].str.len().max()
        }
    }
    return insights


# Model training functions
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Train and evaluate a model.

    Fits the estimator named *model_name*, pickles it under ``models/``,
    and returns ``(model, accuracy, y_pred, model_filename)``.
    Raises KeyError for an unknown *model_name*.
    """
    models = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
        'Linear SVC': LinearSVC(random_state=42, max_iter=1000),
        'SVC': SVC(random_state=42, probability=True),
        'Multinomial Naive Bayes': MultinomialNB(),
        'Gaussian Naive Bayes': GaussianNB()
    }
    model = models[model_name]

    # For Gaussian NB, convert sparse matrix to dense (GaussianNB rejects sparse input)
    if model_name == 'Gaussian Naive Bayes':
        X_train = X_train.toarray()
        X_test = X_test.toarray()

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Save model
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
    with open(os.path.join("models", model_filename), 'wb') as f:
        pickle.dump(model, f)

    return model, accuracy, y_pred, model_filename


# Utility functions
def save_artifacts(obj, folder_name, file_name):
    """Save artifacts like encoders and vectorizers."""
    os.makedirs(folder_name, exist_ok=True)
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        pickle.dump(obj, f)


def load_artifacts(folder_name, file_name):
    """Load saved artifacts; returns None (and shows an error) when missing.

    NOTE(review): pickle.load is only safe because these files are written
    by this app itself — never point it at untrusted files.
    """
    try:
        with open(os.path.join(folder_name, file_name), 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        return None


def predict_text(model_filename, text, vectorizer_type="tfidf"):
    """Make prediction on new text.

    Loads the pickled model, the matching vectorizer and the label encoder,
    cleans and vectorizes *text*, and returns ``(label, probabilities)``.
    Returns ``(None, None)`` on any failure (error is shown in the UI).
    """
    try:
        # Load model
        with open(os.path.join('models', model_filename), 'rb') as f:
            model = pickle.load(f)

        # Load vectorizer
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)

        # Transform text
        text_vector = vectorizer.transform([clean_text])

        # For Gaussian NB, convert to dense (matches the training-time conversion)
        if 'gaussian' in model_filename:
            text_vector = text_vector.toarray()

        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None

        # Get prediction probabilities if available.
        # FIX: was a bare `except:`; narrowed — some estimators expose
        # predict_proba but raise at call time (e.g. SVC trained without
        # probability=True), and we only want to swallow that case.
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                pass

        # Decode prediction
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None


# Streamlit App
st.title('📝 No Code Text Classification App')
st.markdown('---')
st.write('Analyze your text data and train machine learning models without coding!')

# Sidebar
st.sidebar.title("Navigation")
section = st.sidebar.radio("Choose Section",
                           ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"])

# Upload Data
st.sidebar.markdown("---")
st.sidebar.subheader("📁 Upload Your Dataset")
train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])

# Global variables to store data and settings
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"

if train_data is not None:
    try:
        # Try different encodings
        encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
        train_df = None
        for encoding in encodings:
            try:
                train_df = pd.read_csv(train_data, encoding=encoding)
                break
            except UnicodeDecodeError:
                continue

        if train_df is None:
            st.error("Unable to read the CSV file. Please check the file encoding.")
        else:
            # FIX: test_df was left unbound when test_data was provided but
            # every encoding failed; initialize it before the loop.
            test_df = None
            if test_data is not None:
                for encoding in encodings:
                    try:
                        test_df = pd.read_csv(test_data, encoding=encoding)
                        break
                    except UnicodeDecodeError:
                        continue

            # Show data preview
            with st.sidebar.expander("📋 Data Preview", expanded=True):
                st.write("Shape:", train_df.shape)
                st.write(train_df.head(2))

            columns = train_df.columns.tolist()
            text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
            target = st.sidebar.selectbox("🎯 Choose the target column:", columns)

            # Process data
            if text_data and target:
                # Clean text
                text_cleaner = TextCleaner()
                train_df['clean_text'] = train_df[text_data].apply(text_cleaner.clean_text)
                train_df['text_length'] = train_df[text_data].str.len()

                # Handle label encoding
                label_encoder = LabelEncoder()
                train_df['target_encoded'] = label_encoder.fit_transform(train_df[target])

                # Save label encoder so the Predictions section can decode labels
                save_artifacts(label_encoder, "artifacts", "encoder.pkl")
    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        train_df = None

# Data Analysis Section
if section == "📊 Data Analysis":
    if train_data is not None and 'train_df' in locals() and train_df is not None:
        st.header("📊 Data Analysis")

        # Get insights
        insights = get_data_insights(train_df, text_data, target)

        # Display insights in columns
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Total Samples", insights['shape'][0])
        with col2:
            st.metric("Features", insights['shape'][1])
        with col3:
            st.metric("Classes", len(insights['class_distribution']))
        with col4:
            st.metric("Avg Text Length", f"{insights['text_length_stats']['mean']:.1f}")

        st.markdown("---")

        # Data quality section
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("📋 Dataset Overview")
            st.write("**Shape:**", insights['shape'])
            st.write("**Missing Values:**")
            missing_df = pd.DataFrame.from_dict(insights['missing_values'],
                                                orient='index', columns=['Count'])
            st.dataframe(missing_df[missing_df['Count'] > 0])
            st.write("**Sample Data:**")
            st.dataframe(train_df[[text_data, target, 'text_length']].head())
        with col2:
            st.subheader("📊 Class Distribution")
            class_dist = pd.DataFrame.from_dict(insights['class_distribution'],
                                                orient='index', columns=['Count'])
            st.dataframe(class_dist)

            # Plot class distribution
            fig = px.bar(
                x=class_dist.index,
                y=class_dist['Count'],
                title="Class Distribution",
                labels={'x': 'Class', 'y': 'Count'}
            )
            st.plotly_chart(fig, use_container_width=True)

        st.markdown("---")

        # Text analysis section
        st.subheader("📝 Text Analysis")
        col1, col2 = st.columns(2)
        with col1:
            # Text length distribution
            fig = px.histogram(
                train_df,
                x='text_length',
                title="Text Length Distribution",
                nbins=30
            )
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Text length by class
            fig = px.box(
                train_df,
                x=target,
                y='text_length',
                title="Text Length by Class"
            )
            st.plotly_chart(fig, use_container_width=True)

        # Word frequency analysis (over the cleaned text, so stop words are gone)
        st.subheader("🔤 Most Common Words")
        all_text = ' '.join(train_df['clean_text'].astype(str))
        word_freq = Counter(all_text.split())
        top_words = word_freq.most_common(20)
        if top_words:
            words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
            fig = px.bar(
                words_df,
                x='Frequency',
                y='Word',
                orientation='h',
                title="Top 20 Most Common Words"
            )
            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.warning("📁 Please upload training data to perform analysis")

# Train Model Section
elif section == "🤖 Train Model":
    if train_data is not None and 'train_df' in locals() and train_df is not None:
        st.header("🤖 Train Machine Learning Model")

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("⚙️ Model Configuration")
            model_name = st.selectbox("Choose Model", [
                "Logistic Regression", "Decision Tree", "Random Forest",
                "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"
            ])
        with col2:
            st.subheader("📊 Vectorization Method")
            vectorizer_choice = st.selectbox("Choose Vectorizer",
                                             ["TF-IDF", "Count Vectorizer"])

        # Model parameters
        st.subheader("🔧 Parameters")
        col1, col2 = st.columns(2)
        with col1:
            max_features = st.slider("Max Features", 1000, 20000, 10000, step=1000)
            test_size = st.slider("Test Size", 0.1, 0.4, 0.2, step=0.05)
        with col2:
            random_state = st.number_input("Random State", 0, 1000, 42)
            min_df = st.slider("Min Document Frequency", 1, 10, 1)

        # Initialize vectorizer; remember which kind so Predictions can reload it
        if vectorizer_choice == "TF-IDF":
            vectorizer = TfidfVectorizer(
                max_features=max_features,
                min_df=min_df,
                stop_words='english'
            )
            st.session_state.vectorizer_type = "tfidf"
        else:
            vectorizer = CountVectorizer(
                max_features=max_features,
                min_df=min_df,
                stop_words='english'
            )
            st.session_state.vectorizer_type = "count"

        # Show data info
        st.subheader("📋 Training Data Info")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Samples", len(train_df))
        with col2:
            st.metric("Unique Classes", train_df[target].nunique())
        with col3:
            st.metric("Avg Text Length", f"{train_df['text_length'].mean():.1f}")

        if st.button("🚀 Start Training", type="primary"):
            with st.spinner("Training model... This may take a few minutes."):
                try:
                    # Vectorize text data
                    X = vectorizer.fit_transform(train_df['clean_text'])
                    y = train_df['target_encoded']

                    # FIX: stratify requires >= 2 samples per class or
                    # train_test_split raises; fall back to a plain split.
                    stratify_labels = y if y.value_counts().min() >= 2 else None

                    # Split data
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=random_state,
                        stratify=stratify_labels
                    )
                    st.success(f"✅ Data split - Train: {X_train.shape}, Test: {X_test.shape}")

                    # Save vectorizer
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)

                    # Train model
                    model, accuracy, y_pred, model_filename = train_model(
                        model_name, X_train, X_test, y_train, y_test
                    )
                    st.success("🎉 Model training completed!")

                    # Display results
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("🎯 Test Accuracy", f"{accuracy:.4f}")

                        # Classification report
                        st.subheader("📊 Classification Report")
                        report = classification_report(
                            y_test, y_pred,
                            target_names=label_encoder.classes_,
                            output_dict=True
                        )
                        report_df = pd.DataFrame(report).transpose()
                        st.dataframe(report_df.round(4))
                    with col2:
                        # Confusion matrix
                        st.subheader("🔄 Confusion Matrix")
                        cm = confusion_matrix(y_test, y_pred)
                        fig = px.imshow(
                            cm,
                            text_auto=True,
                            aspect="auto",
                            title="Confusion Matrix",
                            labels=dict(x="Predicted", y="Actual"),
                            x=label_encoder.classes_,
                            y=label_encoder.classes_
                        )
                        st.plotly_chart(fig, use_container_width=True)

                    st.info(f"✅ Model saved as: {model_filename}")
                    st.info("🔮 You can now use the 'Predictions' section to classify new text!")
                except Exception as e:
                    st.error(f"❌ Error during training: {str(e)}")
    else:
        st.warning("📁 Please upload training data to train a model")

# Predictions Section
elif section == "🔮 Predictions":
    st.header("🔮 Text Classification Predictions")

    # Check if models exist
    if os.path.exists("models") and os.listdir("models"):
        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
        if available_models:
            # Single prediction
            st.subheader("📝 Single Text Classification")
            col1, col2 = st.columns([2, 1])
            with col1:
                text_input = st.text_area("Enter text to classify:", height=150)
            with col2:
                selected_model = st.selectbox("Choose model:", available_models)
                predict_button = st.button("🔮 Predict", type="primary")

            if predict_button and text_input.strip():
                with st.spinner("Making prediction..."):
                    predicted_label, prediction_proba = predict_text(
                        selected_model, text_input,
                        st.session_state.get('vectorizer_type', 'tfidf')
                    )
                    if predicted_label is not None:
                        st.success("✅ Prediction completed!")

                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.markdown("### 🎯 Results")
                            st.markdown(f"**Input Text:** {text_input[:200]}{'...' if len(text_input) > 200 else ''}")
                            st.markdown(f"**Predicted Class:** `{predicted_label}`")
                        with col2:
                            # Display probabilities if available
                            if prediction_proba is not None:
                                st.markdown("### 📊 Class Probabilities")
                                encoder = load_artifacts("artifacts", "encoder.pkl")
                                if encoder is not None:
                                    prob_df = pd.DataFrame({
                                        'Class': encoder.classes_,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)
                                    fig = px.bar(
                                        prob_df,
                                        x='Probability',
                                        y='Class',
                                        orientation='h',
                                        title="Prediction Confidence"
                                    )
                                    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                                    st.plotly_chart(fig, use_container_width=True)
            elif predict_button:
                st.warning("⚠️ Please enter some text to classify")

            # Batch predictions
            st.markdown("---")
            st.subheader("📊 Batch Predictions")
            uploaded_file = st.file_uploader("Upload CSV file with texts to classify", type=['csv'])
            if uploaded_file is not None:
                try:
                    # Try different encodings for batch file
                    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
                    batch_df = None
                    for encoding in encodings:
                        try:
                            batch_df = pd.read_csv(uploaded_file, encoding=encoding)
                            break
                        except UnicodeDecodeError:
                            continue

                    if batch_df is not None:
                        st.write("📋 Uploaded data preview:")
                        st.dataframe(batch_df.head())

                        col1, col2 = st.columns(2)
                        with col1:
                            text_column = st.selectbox("Select text column:",
                                                       batch_df.columns.tolist())
                        with col2:
                            batch_model = st.selectbox("Choose model:", available_models,
                                                       key="batch_model")

                        if st.button("🚀 Run Batch Predictions", type="primary"):
                            with st.spinner("Processing batch predictions..."):
                                predictions = []
                                confidences = []
                                progress_bar = st.progress(0)
                                total_texts = len(batch_df)

                                for i, text in enumerate(batch_df[text_column]):
                                    pred, proba = predict_text(
                                        batch_model, str(text),
                                        st.session_state.get('vectorizer_type', 'tfidf')
                                    )
                                    predictions.append(pred if pred is not None else "Error")
                                    # Get confidence (max probability)
                                    if proba is not None:
                                        confidences.append(max(proba))
                                    else:
                                        confidences.append(0.0)
                                    progress_bar.progress((i + 1) / total_texts)

                                batch_df['Predicted_Class'] = predictions
                                batch_df['Confidence'] = confidences
                                st.success("✅ Batch predictions completed!")

                                # Show results
                                st.subheader("📊 Results")
                                result_df = batch_df[[text_column, 'Predicted_Class', 'Confidence']]
                                st.dataframe(result_df)

                                # Summary statistics
                                st.subheader("📈 Summary")
                                col1, col2, col3 = st.columns(3)
                                with col1:
                                    st.metric("Total Predictions", len(predictions))
                                with col2:
                                    successful_preds = sum(1 for p in predictions if p != "Error")
                                    st.metric("Successful", successful_preds)
                                with col3:
                                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
                                    st.metric("Avg Confidence", f"{avg_confidence:.3f}")

                                # Class distribution of predictions
                                pred_counts = pd.Series(predictions).value_counts()
                                if len(pred_counts) > 0:
                                    fig = px.pie(
                                        values=pred_counts.values,
                                        names=pred_counts.index,
                                        title="Distribution of Predictions"
                                    )
                                    st.plotly_chart(fig, use_container_width=True)

                                # Download results
                                csv = batch_df.to_csv(index=False)
                                st.download_button(
                                    label="📥 Download Results as CSV",
                                    data=csv,
                                    file_name="batch_predictions.csv",
                                    mime="text/csv"
                                )
                    else:
                        st.error("❌ Unable to read the CSV file. Please check the file encoding.")
                except Exception as e:
                    st.error(f"❌ Error in batch prediction: {str(e)}")
        else:
            st.warning("⚠️ No trained models found. Please train a model first.")
    else:
        st.warning("⚠️ No models directory found. Please go to 'Train Model' section to train a model first.")

# Footer
st.markdown("---")
st.markdown("🚀 Built with Streamlit | 📊 No-Code Text Classification")