"""No-code text classification app built with Streamlit.

The script lets a user upload a CSV file, inspect it, train one of several
scikit-learn classifiers on a text column and then classify new text, either
one snippet at a time or as a batch CSV.
"""
import base64
import html
import io
import os
import pickle
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Download required NLTK data (only when it is not already available locally)
for _resource in ("stopwords", "wordnet"):
    try:
        nltk.data.find(f"corpora/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Set page config (must run before any other Streamlit command)
st.set_page_config(
    page_title="No Code Text Classification",
    page_icon="📝",
    layout="wide",
)

# Initialize session state
_SESSION_DEFAULTS = {
    "trained_model": None,
    "vectorizer": None,
    "label_encoder": None,
    "vectorizer_type": "tfidf",
    "train_df": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default


# Text cleaning class
class TextCleaner:
    """Normalise raw text before it is vectorized.

    The patterns and the punctuation table are built once at class level so
    they are not recreated for every cleaned string.
    """

    _URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _MENTION_RE = re.compile(r'@\w+|#\w+')
    _WHITESPACE_RE = re.compile(r'\s+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return a cleaned, lemmatized version of ``text``.

        Missing values (as detected by ``pd.isna``) yield an empty string.
        Otherwise the text is lower-cased, URLs, ``@mentions`` and
        ``#hashtags`` are removed, punctuation is deleted, whitespace is
        collapsed, English stop words are dropped and the remaining words are
        lemmatized.
        """
        if pd.isna(text):
            return ""

        text = str(text).lower()
        text = self._URL_RE.sub('', text)
        text = self._MENTION_RE.sub('', text)
        # Punctuation is deleted, not replaced by a space ("don't" -> "dont").
        text = text.translate(self._PUNCT_TABLE)
        text = self._WHITESPACE_RE.sub(' ', text).strip()

        return ' '.join(
            self.lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in self.stop_words
        )


# Utility functions
def create_download_link(val, filename):
    """Generate an HTML download link for in-memory file content.

    Args:
        val: File content as ``bytes`` (a ``str`` is encoded as UTF-8).
        filename: Name offered to the browser for the downloaded file.

    Returns:
        An ``<a>`` element embedding the content as a base64 ``data:`` URI.
        NOTE(review): the markup is a reconstruction; the previous version
        computed the base64 payload but returned plain text without using it.
    """
    if isinstance(val, str):
        val = val.encode("utf-8")
    b64 = base64.b64encode(val).decode("ascii")
    # The file name ends up inside HTML, so escape it.
    safe_name = html.escape(str(filename), quote=True)
    return (
        f'<a href="data:application/octet-stream;base64,{b64}" '
        f'download="{safe_name}">Download {safe_name}</a>'
    )


# cp1252 is tried before latin1 because latin1 accepts every byte sequence;
# listed after latin1 it could never be reached.
_CSV_ENCODINGS = ("utf-8", "cp1252", "latin1")


def safe_file_read(uploaded_file):
    """Read an uploaded CSV file, trying several encodings in turn.

    Args:
        uploaded_file: Seekable file-like object holding CSV data.

    Returns:
        The parsed ``DataFrame``, or ``None`` (after showing an error in the
        UI) when no encoding could decode the file.

    Only decoding failures trigger the next encoding; any other error raised
    by ``pd.read_csv`` propagates to the caller.
    """
    last_error = None
    for encoding in _CSV_ENCODINGS:
        try:
            # Rewind before every attempt, including the first: the pointer
            # may have been moved by an earlier read of the same object.
            uploaded_file.seek(0)
            return pd.read_csv(uploaded_file, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc

    st.error(f"Error reading file: {str(last_error)}")
    return None


# Data Analysis Functions
def get_data_insights(df, text_col, target_col):
    """Get basic insights from the data.

    Args:
        df: Data to summarise. It is not modified.
        text_col: Name of the column holding the text.
        target_col: Name of the column holding the class labels.

    Returns:
        A dict with the keys ``shape``, ``missing_values``,
        ``class_distribution``, ``avg_text_length``, ``min_text_length`` and
        ``max_text_length``. Lengths are character counts of the text column
        converted to ``str``.
    """
    text_lengths = df[text_col].astype(str).str.len()
    return {
        'shape': df.shape,
        'missing_values': df.isnull().sum().to_dict(),
        'class_distribution': df[target_col].value_counts().to_dict(),
        'avg_text_length': text_lengths.mean(),
        'min_text_length': text_lengths.min(),
        'max_text_length': text_lengths.max(),
    }


def create_visualizations(df, text_col, target_col):
    """Render the class distribution and text length plots in the app.

    Args:
        df: Data to plot. It is not modified.
        text_col: Name of the column holding the text.
        target_col: Name of the column holding the class labels.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Class distribution bar plot. Labels are converted to strings so the
    # x axis is categorical whatever the label dtype is.
    class_counts = df[target_col].value_counts()
    ax1.bar(class_counts.index.astype(str), class_counts.values)
    ax1.set_title('Class Distribution')
    ax1.set_xlabel('Classes')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)

    # Text length distribution
    text_lengths = df[text_col].astype(str).str.len()
    ax2.hist(text_lengths, bins=30, alpha=0.7)
    ax2.set_title('Text Length Distribution')
    ax2.set_xlabel('Text Length')
    ax2.set_ylabel('Frequency')

    fig.tight_layout()
    st.pyplot(fig)
    # The script is re-run on every interaction; close the figure so pyplot
    # does not keep one more figure alive per run.
    plt.close(fig)


# Model Training Functions
# Factories instead of instances: only the selected estimator is built.
_MODEL_FACTORIES = {
    'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
    'Random Forest': lambda: RandomForestClassifier(random_state=42, n_estimators=100),
    'Linear SVC': lambda: LinearSVC(random_state=42, max_iter=1000),
    'SVC': lambda: SVC(random_state=42, probability=True),
    'Multinomial Naive Bayes': lambda: MultinomialNB(),
    'Gaussian Naive Bayes': lambda: GaussianNB(),
}


def _to_dense(matrix):
    """Return ``matrix`` as a dense array if it offers ``toarray``."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


def train_model(X_train, X_test, y_train, y_test, model_name):
    """Train the selected model and evaluate it on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        model_name: One of the keys of ``_MODEL_FACTORIES``.

    Returns:
        A tuple ``(model, accuracy, y_pred)``: the fitted estimator, its
        accuracy on the test split and its predictions for ``X_test``.

    Raises:
        KeyError: If ``model_name`` is not a known model.
    """
    model = _MODEL_FACTORIES[model_name]()

    # Gaussian NB is given dense input; each matrix is converted on its own.
    if model_name == 'Gaussian Naive Bayes':
        X_train = _to_dense(X_train)
        X_test = _to_dense(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy, y_pred


# Prediction helpers
def _vectorize_texts(texts):
    """Clean ``texts`` and turn them into features for the trained model.

    Uses the vectorizer and model stored in the session state; the features
    are made dense when the trained model is a ``GaussianNB``.
    """
    cleaner = TextCleaner()
    cleaned = [cleaner.clean_text(text) for text in texts]
    features = st.session_state.vectorizer.transform(cleaned)
    if isinstance(st.session_state.trained_model, GaussianNB):
        features = features.toarray()
    return features


# Main App sections
def _render_landing():
    """Show the getting-started hint and an example of the expected data."""
    st.info("👆 Please upload a CSV file to get started")

    # Show example data format
    st.subheader("📋 Expected Data Format")
    example_df = pd.DataFrame({
        'text': [
            "This product is amazing! I love it.",
            "Terrible quality, waste of money.",
            "Good value for the price.",
            "Not what I expected, disappointed.",
        ],
        'sentiment': ['positive', 'negative', 'positive', 'negative'],
    })
    st.dataframe(example_df, use_container_width=True)


def _render_analysis_tab(df, source_columns, text_col, target_col):
    """Render the data analysis tab.

    ``source_columns`` are the columns of the uploaded file; the overview is
    computed on them only, so the helper columns added by the app are not
    counted as features.
    """
    st.header("📊 Data Analysis")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("📈 Dataset Overview")
        insights = get_data_insights(df[source_columns], text_col, target_col)

        st.metric("Total Samples", insights['shape'][0])
        st.metric("Number of Features", insights['shape'][1])
        st.metric("Average Text Length", f"{insights['avg_text_length']:.1f}")

        st.subheader("🎯 Class Distribution")
        class_dist_df = pd.DataFrame(
            list(insights['class_distribution'].items()),
            columns=['Class', 'Count'],
        )
        st.dataframe(class_dist_df, use_container_width=True)

    with col2:
        st.subheader("📋 Data Preview")
        preview_df = df[[text_col, target_col]].head()
        st.dataframe(preview_df, use_container_width=True)

        st.subheader("🧹 Cleaned Text Preview")
        cleaned_preview = df[['clean_text', target_col]].head()
        st.dataframe(cleaned_preview, use_container_width=True)

    st.subheader("📊 Visualizations")
    create_visualizations(df, text_col, target_col)


def _render_training_tab(df, label_encoder):
    """Render the training tab and train a model when requested.

    On success the model, the vectorizer and the label encoder it was trained
    with are stored together in the session state.
    """
    st.header("🤖 Train Model")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("🔧 Model Selection")
        model_name = st.selectbox("Choose a model:", list(_MODEL_FACTORIES))

    with col2:
        st.subheader("📊 Vectorizer Selection")
        vectorizer_type = st.selectbox(
            "Choose vectorizer:",
            ["TF-IDF Vectorizer", "Count Vectorizer"],
        )

    # Training parameters
    st.subheader("⚙️ Training Parameters")
    col3, col4 = st.columns(2)
    with col3:
        test_size = st.slider("Test size", 0.1, 0.5, 0.2, 0.05)
    with col4:
        max_features = st.number_input("Max features", 1000, 20000, 10000, 1000)

    if not st.button("🚀 Train Model", type="primary"):
        return

    try:
        with st.spinner("Training model... This may take a few minutes."):
            # Initialize vectorizer
            if vectorizer_type == "TF-IDF Vectorizer":
                vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                vectorizer_key = 'tfidf'
            else:
                vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                vectorizer_key = 'count'

            # Vectorize text
            X = vectorizer.fit_transform(df['clean_text'])
            y = df['encoded_target']

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=42, stratify=y
            )

            # Train model
            model, accuracy, y_pred = train_model(X_train, X_test, y_train, y_test, model_name)

            # Store everything that belongs to this model in one go, so the
            # prediction tab never mixes a model with a different encoder.
            st.session_state.trained_model = model
            st.session_state.vectorizer = vectorizer
            st.session_state.vectorizer_type = vectorizer_key
            st.session_state.label_encoder = label_encoder

        # Display results
        st.success("🎉 Model training completed!")

        col5, col6 = st.columns(2)
        with col5:
            st.metric("🎯 Accuracy", f"{accuracy:.4f}")
            # The splits are sparse matrices: len() raises TypeError on them,
            # the row count is shape[0].
            st.metric("🏋️ Training Samples", X_train.shape[0])
            st.metric("🧪 Test Samples", X_test.shape[0])

        with col6:
            st.subheader("📊 Classification Report")
            report = classification_report(
                y_test,
                y_pred,
                # Explicit labels keep the names aligned even if a class is
                # absent from the test split.
                labels=np.arange(len(label_encoder.classes_)),
                target_names=[str(name) for name in label_encoder.classes_],
                output_dict=True,
                zero_division=0,
            )
            report_df = pd.DataFrame(report).transpose()
            st.dataframe(report_df.round(3), use_container_width=True)

    except Exception as e:
        st.error(f"❌ Error during training: {str(e)}")


def _render_single_prediction(model, label_encoder):
    """Render the form that classifies one piece of text."""
    st.subheader("📝 Single Text Prediction")
    user_input = st.text_area("Enter text to classify:", height=100)

    if not st.button("🔮 Predict", type="primary"):
        return

    if not user_input.strip():
        st.warning("⚠️ Please enter some text to classify")
        return

    try:
        with st.spinner("Making prediction..."):
            input_vector = _vectorize_texts([user_input])
            prediction = model.predict(input_vector)[0]
            predicted_label = label_encoder.inverse_transform([prediction])[0]

            # Probabilities are optional: fall back to the plain prediction
            # if the model cannot provide them.
            proba = None
            if hasattr(model, 'predict_proba'):
                try:
                    proba = model.predict_proba(input_vector)[0]
                except Exception:
                    proba = None

        st.success("🎉 Prediction completed!")
        st.write(f"**Input:** {user_input}")
        st.write(f"**Predicted Class:** {predicted_label}")

        if proba is not None:
            # Show probabilities, labelled in the model's own class order
            st.subheader("📊 Class Probabilities")
            prob_df = pd.DataFrame({
                'Class': label_encoder.inverse_transform(model.classes_),
                'Probability': proba,
            }).sort_values('Probability', ascending=False)

            st.bar_chart(prob_df.set_index('Class'))
            st.dataframe(prob_df.round(4), use_container_width=True)

    except Exception as e:
        st.error(f"❌ Error during prediction: {str(e)}")


def _render_batch_predictions(model, label_encoder):
    """Render the form that classifies every row of an uploaded CSV."""
    st.subheader("📊 Batch Predictions")
    batch_file = st.file_uploader("Upload CSV for batch predictions", type=["csv"])

    if batch_file is None:
        return

    try:
        batch_df = safe_file_read(batch_file)
        if batch_df is None:
            return

        st.write("**Preview:**")
        st.dataframe(batch_df.head(), use_container_width=True)

        batch_text_col = st.selectbox(
            "Select text column for prediction:",
            batch_df.columns.tolist(),
        )

        if st.button("🚀 Run Batch Predictions"):
            with st.spinner("Processing batch predictions..."):
                if len(batch_df):
                    # One vectorize/predict call for the whole column
                    # instead of one call per row.
                    features = _vectorize_texts(batch_df[batch_text_col])
                    predictions = label_encoder.inverse_transform(model.predict(features))
                else:
                    predictions = []
                batch_df['Predicted_Class'] = predictions

            st.success("🎉 Batch predictions completed!")
            st.dataframe(batch_df, use_container_width=True)

            # Download results
            csv_data = batch_df.to_csv(index=False)
            st.download_button(
                label="📥 Download Results",
                data=csv_data,
                file_name="batch_predictions.csv",
                mime="text/csv",
            )

    except Exception as e:
        st.error(f"❌ Error processing batch file: {str(e)}")


def _render_prediction_tab():
    """Render the prediction tab for the model stored in the session state."""
    st.header("🔍 Make Predictions")

    model = st.session_state.trained_model
    if model is None:
        st.warning("⚠️ No trained model found. Please train a model first in the 'Train Model' tab.")
        return

    label_encoder = st.session_state.label_encoder
    _render_single_prediction(model, label_encoder)
    _render_batch_predictions(model, label_encoder)


def _render_workspace(train_data):
    """Load the uploaded training file and render the three app tabs."""
    with st.spinner("Loading data..."):
        train_df = safe_file_read(train_data)

    if train_df is None:
        return

    st.session_state.train_df = train_df
    st.sidebar.success(f"✅ Data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")

    # Column selection. The target picker starts on the second column so the
    # two pickers do not begin on the same column.
    columns = train_df.columns.tolist()
    text_col = st.sidebar.selectbox("📝 Select text column:", columns, key="text_col")
    target_col = st.sidebar.selectbox(
        "🎯 Select target column:",
        columns,
        index=1 if len(columns) > 1 else 0,
        key="target_col",
    )

    if not (text_col and target_col and text_col != target_col):
        st.warning("⚠️ Please select different columns for text and target.")
        return

    # Clean and prepare data
    with st.spinner("Preprocessing data..."):
        # Rows without a label cannot be used for training; work on a copy so
        # the uploaded frame kept in the session state stays untouched.
        data = train_df.dropna(subset=[target_col]).reset_index(drop=True).copy()
        missing_labels = len(train_df) - len(data)

        text_cleaner = TextCleaner()
        data['clean_text'] = data[text_col].apply(text_cleaner.clean_text)

        # Encode labels
        label_encoder = LabelEncoder()
        data['encoded_target'] = label_encoder.fit_transform(data[target_col])

    if missing_labels:
        st.sidebar.warning(f"⚠️ Ignored {missing_labels} rows without a label")

    # Main sections
    tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "🤖 Train Model", "🔍 Predictions"])

    with tab1:
        _render_analysis_tab(data, columns, text_col, target_col)

    with tab2:
        _render_training_tab(data, label_encoder)

    with tab3:
        _render_prediction_tab()


def main():
    """Render the whole app: header, sidebar upload, content and footer."""
    st.title('🔤 No Code Text Classification App')
    st.markdown('Upload your data, analyze it, train models, and make predictions without writing any code!')

    # Sidebar
    st.sidebar.header("📁 Data Upload")

    # File upload with better error handling
    train_data = st.sidebar.file_uploader(
        "Upload training data (CSV)",
        type=["csv"],
        help="Upload a CSV file with text and labels",
    )

    # Process uploaded data
    if train_data is None:
        _render_landing()
    else:
        try:
            _render_workspace(train_data)
        except Exception as e:
            st.error(f"❌ Error loading file: {str(e)}")
            st.info("💡 Try these solutions:")
            st.write("- Check if the file is a valid CSV")
            st.write("- Ensure the file is not corrupted")
            st.write("- Try saving the file with UTF-8 encoding")

    # Footer
    st.markdown("---")
    st.markdown("Built with ❤️ using Streamlit | No Code Text Classification App")


# Called at module level, as the original script ran top to bottom.
main()