# No-code text classification Streamlit app (originally hosted as a Hugging Face Space).
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.svm import LinearSVC, SVC | |
| from sklearn.naive_bayes import MultinomialNB, GaussianNB | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| import os | |
| import pickle | |
| import tempfile | |
| import re | |
| import string | |
| from collections import Counter | |
class TextCleaner:
    """Lightweight text normaliser applied before vectorisation."""

    def clean_text(self, text):
        """Return *text* lowercased, letters-and-spaces only, whitespace-collapsed.

        Missing values (NaN/None) come back as the empty string so the
        result is always a plain str.
        """
        if pd.isna(text):
            return ""
        lowered = str(text).lower()
        # Keep only ASCII letters and whitespace characters.
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        # Collapse runs of whitespace (including tabs/newlines) to single spaces.
        return " ".join(letters_only.split())
class TextInformations:
    """Quick diagnostics over a text-classification DataFrame.

    Wraps a frame plus the names of its text and target columns and
    exposes the handful of summaries the analysis UI needs.
    """

    def __init__(self, df, text_col, target_col):
        self.df = df
        self.text_col = text_col
        self.target_col = target_col

    def shape(self):
        """(rows, columns) of the underlying frame."""
        return self.df.shape

    def missing_values(self):
        """Per-column count of missing cells, as a plain dict."""
        return self.df.isna().sum().to_dict()

    def class_imbalanced(self):
        """Frequency of each target label, as a plain dict."""
        return self.df[self.target_col].value_counts().to_dict()

    def clean_text(self):
        """The text column normalised through TextCleaner, as a Series."""
        return self.df[self.text_col].map(TextCleaner().clean_text)

    def text_length(self):
        """Character length of each raw (uncleaned) text entry."""
        return self.df[self.text_col].str.len()
# Utility functions
def save_to_session(obj, key):
    """Persist *obj* under *key* in Streamlit's session state.

    Session state survives script reruns, so this stands in for writing
    artifacts (model, vectorizer, encoder) to disk.
    """
    st.session_state[key] = obj
def load_from_session(key):
    """Return the object stored under *key* in session state, or None if absent."""
    return st.session_state.get(key)
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Instantiate, fit and evaluate the classifier named *model_name*.

    Parameters
    ----------
    model_name : str
        One of the names offered in the "Choose the Model" selectbox.
    X_train, X_test :
        Vectorized text features (train / held-out splits).
    y_train, y_test :
        Label-encoded targets for the corresponding splits.

    Returns
    -------
    tuple
        ``(model, accuracy)`` — the fitted estimator and its accuracy on
        the test split.

    Raises
    ------
    ValueError
        If *model_name* is not one of the recognised model names.
    """
    if model_name == "Logistic Regression":
        model = LogisticRegression(random_state=42, max_iter=1000)
    elif model_name == "Decision Tree":
        model = DecisionTreeClassifier(random_state=42)
    elif model_name == "Random Forest":
        model = RandomForestClassifier(random_state=42, n_estimators=100)
    elif model_name == "Linear SVC":
        model = LinearSVC(random_state=42, max_iter=1000)
    elif model_name == "SVC":
        # probability=True is required for predict_proba support downstream.
        model = SVC(random_state=42, probability=True)
    elif model_name == "Multinomial Naive Bayes":
        model = MultinomialNB()
    elif model_name == "Gaussian Naive Bayes":
        model = GaussianNB()
    else:
        # Previously an unrecognised name fell through and crashed with
        # UnboundLocalError at model.fit; fail fast with a clear message.
        raise ValueError(f"Unknown model: {model_name!r}")

    # Fit on the training split, then score on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return model, accuracy
def predict_text(text, model, vectorizer, encoder):
    """Classify a single raw *text* string with a trained pipeline.

    Parameters
    ----------
    text : str
        Raw input text; cleaned with TextCleaner before vectorisation.
    model :
        Fitted classifier (must implement ``predict``).
    vectorizer :
        Fitted text vectorizer (must implement ``transform``).
    encoder :
        Fitted label encoder used to map the integer prediction back to
        its original class label.

    Returns
    -------
    tuple
        ``(predicted_label, prediction_proba)`` where the probabilities
        are None when the model cannot provide them. On any failure an
        error is shown in the UI and ``(None, None)`` is returned.
    """
    try:
        # Normalise the input exactly as the training data was normalised.
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)
        # Vectorizer expects an iterable of documents.
        text_vector = vectorizer.transform([clean_text])
        prediction = model.predict(text_vector)
        prediction_proba = None
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # Was a bare `except:` (also swallowed SystemExit /
                # KeyboardInterrupt). Some estimators expose predict_proba
                # but raise at call time; treat probabilities as
                # unavailable instead of failing the whole prediction.
                prediction_proba = None
        # Map the integer class id back to the original label.
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        # Surface the failure in the UI; callers check for None.
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# --- Streamlit app configuration and shared session state -------------------

# Page-level settings; st.set_page_config must be the first Streamlit call.
st.set_page_config(
    page_title="Text Classification App",
    page_icon="๐",
    layout="wide"
)

st.title('๐ No Code Text Classification App')
st.markdown('Analyze your text data and train machine learning models for text classification')

# Initialize session-state flags so later sections can test them safely
# even on the very first script run.
if 'model_trained' not in st.session_state:
    st.session_state.model_trained = False  # set True after a successful training run
if 'training_data_processed' not in st.session_state:
    st.session_state.training_data_processed = False  # set True once a training CSV loads

# Sidebar navigation between the three app sections.
st.sidebar.title("Navigation")
section = st.sidebar.radio(
    "Choose Section",
    ["๐ Data Analysis", "๐ค Train Model", "๐ฎ Predictions"],
    index=0
)
# --- Dataset upload widgets (sidebar) ---------------------------------------
st.sidebar.markdown("---")
st.sidebar.subheader("๐ Upload Your Dataset")

# File uploaders, wrapped defensively in case the upload widget itself
# raises in some browser/environment combinations.
# NOTE(review): if this except branch ever fires, `train_data` is left
# undefined and the `if train_data is not None` check below would raise
# NameError — confirm whether a `train_data = None` fallback is needed.
try:
    train_data = st.sidebar.file_uploader(
        "Upload training data (CSV)",
        type=["csv"],
        help="Upload a CSV file with text and labels for training"
    )
    test_data = st.sidebar.file_uploader(
        "Upload test data (CSV, optional)",
        type=["csv"],
        help="Optional: Upload a separate test dataset"
    )
except Exception as e:
    st.sidebar.error(f"File upload error: {str(e)}")
    st.sidebar.info("Try refreshing the page or using a different browser")
# --- Parse the uploaded CSVs and remember the user's column choices ---------
if train_data is not None:
    try:
        # Let the user pick an encoding; useful for non-UTF-8 exports.
        encoding_option = st.sidebar.selectbox(
            "CSV Encoding",
            ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
            help="Try different encodings if you get errors"
        )
        train_df = pd.read_csv(train_data, encoding=encoding_option)
        # The test set is optional; downstream code checks for None.
        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding=encoding_option)
        else:
            test_df = None
        st.sidebar.success(f"โ Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
        # Let the user map which columns hold the text and the labels.
        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("๐ Choose the text column:", columns)
        target = st.sidebar.selectbox("๐ฏ Choose the target column:", columns)
        # Stash everything in session state so the three sections below can
        # use the data without re-reading the upload on every rerun.
        st.session_state.train_df = train_df
        st.session_state.test_df = test_df
        st.session_state.text_col = text_data
        st.session_state.target_col = target
        st.session_state.training_data_processed = True
    except Exception as e:
        st.sidebar.error(f"โ Error loading data: {str(e)}")
        st.sidebar.info("Please check your CSV file format and encoding")
# --- Section: Data Analysis -------------------------------------------------
if section == "๐ Data Analysis":
    st.header("๐ Data Analysis")
    if st.session_state.get('training_data_processed', False):
        try:
            train_df = st.session_state.train_df
            text_col = st.session_state.text_col
            target_col = st.session_state.target_col
            # Helper bundling the dataset diagnostics used below.
            info = TextInformations(train_df, text_col, target_col)
            # Derived columns: normalised text and raw character length.
            train_df['clean_text'] = info.clean_text()
            train_df['text_length'] = info.text_length()
            # Headline metrics: size, missingness, number of classes.
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Dataset Shape", f"{info.shape()[0]} ร {info.shape()[1]}")
            with col2:
                missing_vals = sum(info.missing_values().values())
                st.metric("Missing Values", missing_vals)
            with col3:
                unique_classes = len(info.class_imbalanced())
                st.metric("Unique Classes", unique_classes)
            # Preview of raw vs cleaned text side by side.
            st.subheader("๐ Data Preview")
            st.dataframe(train_df[[text_col, target_col, 'clean_text', 'text_length']].head(10))
            # Class distribution: bar chart plus per-class percentages.
            st.subheader("๐ Class Distribution")
            class_counts = info.class_imbalanced()
            col1, col2 = st.columns(2)
            with col1:
                fig, ax = plt.subplots(figsize=(8, 6))
                classes = list(class_counts.keys())
                counts = list(class_counts.values())
                # NOTE(review): only 5 colors listed; presumably matplotlib
                # cycles them for >5 classes — confirm on a wide dataset.
                ax.bar(classes, counts, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8'])
                ax.set_title('Class Distribution')
                ax.set_xlabel('Classes')
                ax.set_ylabel('Count')
                plt.xticks(rotation=45)
                st.pyplot(fig)
            with col2:
                st.write("**Class Distribution:**")
                for class_name, count in class_counts.items():
                    percentage = (count / len(train_df)) * 100
                    st.write(f"- {class_name}: {count} ({percentage:.1f}%)")
            # Text length histogram and summary statistics.
            st.subheader("๐ Text Length Analysis")
            col1, col2 = st.columns(2)
            with col1:
                fig, ax = plt.subplots(figsize=(8, 6))
                ax.hist(train_df['text_length'], bins=50, alpha=0.7, color='#4ECDC4')
                ax.set_title('Text Length Distribution')
                ax.set_xlabel('Text Length (characters)')
                ax.set_ylabel('Frequency')
                st.pyplot(fig)
            with col2:
                st.write("**Text Length Statistics:**")
                length_stats = train_df['text_length'].describe()
                for stat, value in length_stats.items():
                    st.write(f"- {stat.title()}: {value:.1f}")
            # Cache the enriched frame so the training section can reuse it.
            st.session_state.processed_train_df = train_df
        except Exception as e:
            st.error(f"โ Error in data analysis: {str(e)}")
    else:
        st.info("๐ Please upload training data to perform analysis")
# --- Section: Train Model ---------------------------------------------------
elif section == "๐ค Train Model":
    st.header("๐ค Train Model")
    if st.session_state.get('training_data_processed', False):
        try:
            # Reuse the frame enriched by the analysis section when present;
            # otherwise derive the cleaned-text columns here.
            if 'processed_train_df' in st.session_state:
                train_df = st.session_state.processed_train_df
            else:
                train_df = st.session_state.train_df
                text_col = st.session_state.text_col
                target_col = st.session_state.target_col
                info = TextInformations(train_df, text_col, target_col)
                train_df['clean_text'] = info.clean_text()
                train_df['text_length'] = info.text_length()
            # Model and vectorizer pickers.
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("๐ฏ Model Selection")
                model_name = st.selectbox("Choose the Model", [
                    "Logistic Regression", "Decision Tree",
                    "Random Forest", "Linear SVC", "SVC",
                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
                ])
            with col2:
                st.subheader("๐ Vectorizer Selection")
                vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count"])
            # Hyper-parameters for vectorization and the train/test split.
            st.subheader("โ๏ธ Training Parameters")
            col1, col2 = st.columns(2)
            with col1:
                max_features = st.slider("Max Features", 1000, 20000, 10000, 1000)
                test_size = st.slider("Test Size", 0.1, 0.5, 0.2, 0.05)
            with col2:
                random_state = st.number_input("Random State", 0, 100, 42)
            # Kick off training only on explicit button press.
            if st.button("๐ Start Training", type="primary"):
                with st.spinner("Training model... Please wait"):
                    try:
                        # Features: cleaned text with NaN replaced by "".
                        X_text = train_df['clean_text'].fillna('')
                        y = train_df[st.session_state.target_col]
                        # Encode string labels to integer ids.
                        label_encoder = LabelEncoder()
                        y_encoded = label_encoder.fit_transform(y)
                        # Fit the chosen vectorizer on the full corpus.
                        # NOTE(review): fitting before the split leaks test
                        # vocabulary/IDF statistics into training — confirm
                        # this is acceptable for the app's purpose.
                        if vectorizer_choice == "TF-IDF":
                            vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                        else:
                            vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                        X_vectorized = vectorizer.fit_transform(X_text)
                        # Stratified split preserves class ratios per split.
                        X_train, X_test, y_train, y_test = train_test_split(
                            X_vectorized, y_encoded,
                            test_size=test_size,
                            random_state=random_state,
                            stratify=y_encoded
                        )
                        model, accuracy = train_model(model_name, X_train, X_test, y_train, y_test)
                        # Persist the artifacts so Predictions can use them.
                        save_to_session(model, 'trained_model')
                        save_to_session(vectorizer, 'vectorizer')
                        save_to_session(label_encoder, 'label_encoder')
                        save_to_session(model_name, 'model_name')
                        save_to_session(vectorizer_choice, 'vectorizer_type')
                        st.session_state.model_trained = True
                        # Report the result to the user.
                        st.success(f"โ Model training completed!")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("Model Accuracy", f"{accuracy:.4f}")
                        with col2:
                            # NOTE(review): len() on a scipy sparse matrix
                            # raises TypeError ("sparse matrix length is
                            # ambiguous"); this likely should be
                            # X_train.shape[0] — confirm.
                            st.metric("Training Samples", len(X_train))
                        st.info("๐ You can now use the 'Predictions' section to classify new text!")
                    except Exception as e:
                        st.error(f"โ Error during training: {str(e)}")
        except Exception as e:
            st.error(f"โ Error in model training setup: {str(e)}")
    else:
        st.info("๐ Please upload and analyze training data first")
# --- Section: Predictions ---------------------------------------------------
elif section == "๐ฎ Predictions":
    st.header("๐ฎ Make Predictions")
    if st.session_state.get('model_trained', False):
        # Single-text classification.
        st.subheader("๐ Single Text Prediction")
        text_input = st.text_area(
            "Enter text to classify:",
            height=120,
            placeholder="Type or paste your text here..."
        )
        col1, col2 = st.columns([1, 3])
        with col1:
            if st.button("๐ฎ Predict", type="primary"):
                if text_input.strip():
                    try:
                        # Retrieve the artifacts saved by the training section.
                        model = load_from_session('trained_model')
                        vectorizer = load_from_session('vectorizer')
                        encoder = load_from_session('label_encoder')
                        predicted_label, prediction_proba = predict_text(
                            text_input, model, vectorizer, encoder
                        )
                        # predict_text returns None on failure (and shows
                        # its own error message).
                        if predicted_label is not None:
                            st.success("โ Prediction completed!")
                            st.markdown("### ๐ Results")
                            st.markdown(f"**Predicted Class:** `{predicted_label}`")
                            # Per-class probabilities, only for models that
                            # support predict_proba.
                            if prediction_proba is not None:
                                st.markdown("**Class Probabilities:**")
                                classes = encoder.classes_
                                prob_data = pd.DataFrame({
                                    'Class': classes,
                                    'Probability': prediction_proba
                                }).sort_values('Probability', ascending=False)
                                # Show as bar chart, then as a table.
                                st.bar_chart(prob_data.set_index('Class'))
                                st.dataframe(prob_data, use_container_width=True)
                    except Exception as e:
                        st.error(f"โ Prediction error: {str(e)}")
                else:
                    st.warning("โ ๏ธ Please enter some text to classify")
        # Batch classification of an uploaded CSV.
        st.markdown("---")
        st.subheader("๐ Batch Predictions")
        uploaded_batch = st.file_uploader(
            "Upload CSV file for batch predictions",
            type=['csv'],
            help="Upload a CSV file with text data to classify multiple texts at once"
        )
        if uploaded_batch is not None:
            try:
                # Same encoding picker as the training upload (separate key).
                encoding_option = st.selectbox(
                    "Batch CSV Encoding",
                    ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
                    key="batch_encoding"
                )
                batch_df = pd.read_csv(uploaded_batch, encoding=encoding_option)
                st.write("๐ **Batch Data Preview:**")
                st.dataframe(batch_df.head())
                # The batch file may have any schema; ask which column is text.
                text_column = st.selectbox(
                    "Select the text column:",
                    batch_df.columns.tolist()
                )
                if st.button("๐ Run Batch Predictions", type="primary"):
                    with st.spinner("Processing batch predictions..."):
                        try:
                            model = load_from_session('trained_model')
                            vectorizer = load_from_session('vectorizer')
                            encoder = load_from_session('label_encoder')
                            predictions = []
                            confidences = []
                            progress_bar = st.progress(0)
                            total_rows = len(batch_df)
                            # Row-by-row prediction with a progress bar;
                            # failed rows are recorded as "Error".
                            for idx, text in enumerate(batch_df[text_column]):
                                pred, pred_proba = predict_text(
                                    str(text), model, vectorizer, encoder
                                )
                                predictions.append(pred if pred is not None else "Error")
                                # Confidence = highest class probability,
                                # 0.0 when probabilities are unavailable.
                                if pred_proba is not None:
                                    confidences.append(max(pred_proba))
                                else:
                                    confidences.append(0.0)
                                progress_bar.progress((idx + 1) / total_rows)
                            batch_df['Predicted_Class'] = predictions
                            batch_df['Confidence'] = confidences
                            st.success("โ Batch predictions completed!")
                            st.write("๐ **Prediction Results:**")
                            st.dataframe(batch_df[[text_column, 'Predicted_Class', 'Confidence']])
                            # Offer the annotated frame as a CSV download.
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                label="๐ฅ Download Results as CSV",
                                data=csv,
                                file_name="batch_predictions.csv",
                                mime="text/csv"
                            )
                        except Exception as e:
                            st.error(f"โ Batch prediction error: {str(e)}")
            except Exception as e:
                st.error(f"โ Error loading batch file: {str(e)}")
    else:
        st.info("๐ Please train a model first before making predictions")
        # Nudge the user towards training when data is already loaded.
        if st.session_state.get('training_data_processed', False):
            st.write("๐ก **Tip:** Go to the 'Train Model' section to train a model first!")
# --- Footer (rendered as raw HTML) ------------------------------------------
st.markdown("---")
st.markdown(
    """
    <div style='text-align: center; color: #666; padding: 20px;'>
        <p>๐ No Code Text Classification App</p>
        <p>Built with Streamlit โข Upload CSV โ Analyze โ Train โ Predict</p>
    </div>
    """,
    unsafe_allow_html=True
)