Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import seaborn as sns | |
| from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.svm import LinearSVC, SVC | |
| from sklearn.naive_bayes import MultinomialNB, GaussianNB | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| import os | |
| import pickle | |
| import re | |
| import string | |
| from collections import Counter | |
# Set page config — must be the first Streamlit call in the script.
st.set_page_config(page_title="Text Classification App", page_icon="๐", layout="wide")
# Custom CSS for better styling: two reusable classes rendered later via
# st.markdown(..., unsafe_allow_html=True) for the page and section headers.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.section-header {
font-size: 1.8rem;
color: #ff7f0e;
border-bottom: 2px solid #ff7f0e;
padding-bottom: 0.5rem;
}
</style>
""", unsafe_allow_html=True)
| # Utility functions | |
def clean_text(text):
    """Normalize raw text for vectorization.

    Lowercases, strips every character that is not a letter or whitespace,
    collapses runs of whitespace to single spaces, and trims the ends.
    NaN/None inputs become the empty string.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed.

    Returns True on success. On failure, reports the error in the
    Streamlit UI and returns False instead of raising.
    """
    try:
        os.makedirs(folder_name, exist_ok=True)
        destination = os.path.join(folder_name, file_name)
        with open(destination, 'wb') as handle:
            pickle.dump(obj, handle)
    except Exception as e:
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
    return True
def load_artifacts(folder_name, file_name):
    """Unpickle and return folder_name/file_name.

    Returns None (after surfacing an error in the Streamlit UI) when the
    file is missing or cannot be deserialized.
    """
    try:
        with open(os.path.join(folder_name, file_name), 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
    except Exception as e:
        st.error(f"Error loading {file_name}: {str(e)}")
    return None
def analyze_data(df, text_col, target_col):
    """Summarize a labelled text dataset.

    Returns a dict with the frame's shape, column names, per-column
    missing-value counts, text-length statistics, and the target class
    distribution.

    Side effect: adds a 'text_length' column (character count) to *df*.
    """
    summary = {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'missing_values': df.isnull().sum().to_dict(),
    }
    # Character counts per row drive the length statistics below.
    df['text_length'] = df[text_col].astype(str).apply(len)
    summary['avg_text_length'] = df['text_length'].mean()
    summary['text_length_stats'] = df['text_length'].describe().to_dict()
    # Target-side summary.
    summary['class_distribution'] = df[target_col].value_counts().to_dict()
    summary['num_classes'] = df[target_col].nunique()
    return summary
def create_visualizations(df, text_col, target_col):
    """Build a 2x2 matplotlib figure summarizing the dataset.

    Panels: class-distribution bar chart, text-length histogram,
    text-length-by-class boxplot, and — for a numeric target — a
    correlation heatmap (a placeholder message otherwise).

    Expects *df* to already contain a 'text_length' column (added by
    analyze_data / the preprocessing step).
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Class distribution
    class_counts = df[target_col].value_counts()
    axes[0, 0].bar(class_counts.index, class_counts.values)
    axes[0, 0].set_title('Class Distribution')
    axes[0, 0].set_xlabel('Classes')
    axes[0, 0].set_ylabel('Count')
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')

    # Text length distribution
    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
    axes[0, 1].set_title('Text Length Distribution')
    axes[0, 1].set_xlabel('Text Length')
    axes[0, 1].set_ylabel('Frequency')

    # Box plot of text length by class
    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
    axes[1, 0].set_title('Text Length by Class')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('Text Length')

    # Correlation is only defined for a numeric target. The old check also
    # admitted categorical targets with < 10 classes, and DataFrame.corr()
    # raises ValueError on non-numeric columns in pandas >= 2.0 — so gate
    # strictly on dtype instead of cardinality.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        correlation = df[['text_length', target_col]].corr()
        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
        axes[1, 1].set_title('Correlation Matrix')
    else:
        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Correlation Analysis')

    plt.tight_layout()
    return fig
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Fit the classifier named by *model_name* and evaluate it.

    Trains on the given split, computes accuracy and a per-class
    classification report, and pickles the fitted model under models/.

    Returns (model, accuracy, report_dict), or (None, None, None) when
    *model_name* is not recognized.
    """
    available = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
        "SVC": SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB(),
    }
    model = available.get(model_name)
    if model is None:
        return None, None, None

    # GaussianNB cannot consume scipy sparse matrices — densify for it only.
    if model_name == "Gaussian Naive Bayes":
        X_train_model, X_test_model = X_train.toarray(), X_test.toarray()
    else:
        X_train_model, X_test_model = X_train, X_test

    model.fit(X_train_model, y_train)
    y_pred = model.predict(X_test_model)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Persist for the prediction sections; the filename mirrors the UI name.
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
    save_artifacts(model, "models", model_filename)

    return model, accuracy, report
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify a single text with a previously trained model.

    Loads the pickled model, vectorizer, and label encoder from disk,
    cleans and vectorizes *text*, and returns
    (predicted_label, class_probabilities). Probabilities are None when
    the model has no predict_proba (e.g. LinearSVC). Returns (None, None)
    if any artifact is missing or prediction fails.
    """
    try:
        # Load model
        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
        model = load_artifacts("models", model_filename)
        if model is None:
            return None, None

        # Load vectorizer
        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_filename)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "label_encoder.pkl")
        if encoder is None:
            return None, None

        # Apply the same cleaning used at training time before vectorizing.
        clean_text_input = clean_text(text)
        text_vector = vectorizer.transform([clean_text_input])

        # GaussianNB was trained on dense arrays, so densify here as well.
        if "gaussian" in model_name.lower():
            text_vector = text_vector.toarray()

        prediction = model.predict(text_vector)

        # Probabilities are optional; the previous code had a duplicated
        # if/else here with identical branches — a single call suffices.
        prediction_proba = None
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception as e:
                st.warning(f"Could not get prediction probabilities: {str(e)}")

        # Map the encoded class index back to the original label.
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# Main App — page header and tagline.
st.markdown('<h1 class="main-header">๐ No Code Text Classification App</h1>', unsafe_allow_html=True)
st.markdown("### Analyze your text data and train machine learning models without coding!")

# Initialize session state: which vectorizer the last training run used,
# and the list of model names trained in this session.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"
if 'trained_models' not in st.session_state:
    st.session_state.trained_models = []

# Sidebar
st.sidebar.markdown("## ๐ Upload Your Dataset")

# File upload with better error handling
try:
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload your training dataset (CSV format)"
    )
    # Encoding selection (also reused later for batch-prediction CSVs).
    # NOTE(review): if this try block fails before `encoding` is assigned,
    # `encoding` stays unbound and later reads would raise NameError — confirm.
    encoding = st.sidebar.selectbox(
        "Select file encoding",
        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
        help="Try different encodings if you get reading errors"
    )
except Exception as e:
    st.sidebar.error(f"File upload error: {str(e)}")
    uploaded_file = None

# Navigation between the three app sections.
section = st.sidebar.radio(
    "Choose Section",
    ["๐ Data Analysis", "๐ค Train Model", "๐ฎ Predictions"],
    help="Navigate through different sections of the app"
)

# Main content based on section. When no file is uploaded, `df` is never
# defined — downstream sections rely on short-circuit guards
# (`uploaded_file is not None and df is not None`) to avoid touching it.
if uploaded_file is not None:
    try:
        # Load data with selected encoding
        df = pd.read_csv(uploaded_file, encoding=encoding)
        st.sidebar.success(f"โ Data loaded successfully! Shape: {df.shape}")
        # Column selection
        columns = df.columns.tolist()
        text_column = st.sidebar.selectbox("๐ Select text column:", columns)
        target_column = st.sidebar.selectbox("๐ฏ Select target column:", columns)
        # Data preprocessing: cleaned text for vectorization, raw length for stats.
        df['clean_text'] = df[text_column].apply(clean_text)
        df['text_length'] = df[text_column].astype(str).apply(len)
        # Process target column: encode labels and persist the encoder so
        # predictions can decode class indices later.
        label_encoder = LabelEncoder()
        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
    except Exception as e:
        st.error(f"โ Error loading data: {str(e)}")
        st.info("๐ก Try selecting a different encoding from the sidebar.")
        df = None
# Section: Data Analysis — dataset overview, summary stats, and charts.
if section == "๐ Data Analysis":
    if uploaded_file is not None and df is not None:
        st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
        # Data overview metrics.
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("๐ Total Records", df.shape[0])
        with col2:
            st.metric("๐ Features", df.shape[1])
        with col3:
            st.metric("๐ท๏ธ Classes", df[target_column].nunique())
        # Data preview
        st.subheader("๐ Data Preview")
        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
        # Analysis results (also re-adds the 'text_length' column in place).
        analysis = analyze_data(df, text_column, target_column)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("๐ Text Statistics")
            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
            st.write("**Text length distribution:**")
            st.write(pd.DataFrame([analysis['text_length_stats']]).T)
        with col2:
            st.subheader("๐ท๏ธ Class Distribution")
            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
                                      columns=['Class', 'Count'])
            st.dataframe(class_dist)
        # Visualizations
        st.subheader("๐ Visualizations")
        try:
            fig = create_visualizations(df, text_column, target_column)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Error creating visualizations: {str(e)}")
    else:
        st.warning("๐ Please upload a dataset to analyze.")
# Section: Train Model — pick an algorithm + vectorizer, train, show metrics.
elif section == "๐ค Train Model":
    if uploaded_file is not None and df is not None:
        st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("๐ค Select Model")
            model_name = st.selectbox(
                "Choose algorithm:",
                ["Logistic Regression", "Decision Tree", "Random Forest",
                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
            )
        with col2:
            st.subheader("๐ค Select Vectorizer")
            vectorizer_choice = st.selectbox(
                "Choose text vectorizer:",
                ["TF-IDF Vectorizer", "Count Vectorizer"]
            )
        # Vectorizer parameters (vocabulary cap) and train/test split ratio.
        max_features = st.slider("Max features", 1000, 50000, 10000)
        test_size = st.slider("Test size", 0.1, 0.5, 0.2)
        if st.button("๐ Start Training", type="primary"):
            with st.spinner("๐ Training model..."):
                try:
                    # Initialize vectorizer; remember which kind was used so
                    # the Predictions section loads the matching pickle.
                    if vectorizer_choice == "TF-IDF Vectorizer":
                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "tfidf"
                    else:
                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "count"
                    # Vectorize the pre-cleaned text; labels were encoded at load time.
                    X = vectorizer.fit_transform(df['clean_text'])
                    y = df['encoded_target']
                    # Split data (stratified to preserve class proportions).
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=42, stratify=y
                    )
                    # Save vectorizer for reuse at prediction time.
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
                    # Train model (also pickles it under models/).
                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
                    if model is not None:
                        st.success(f"โ Model trained successfully!")
                        st.session_state.trained_models.append(model_name)
                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("๐ฏ Accuracy", f"{accuracy:.4f}")
                        with col2:
                            # report holds one entry per class plus
                            # 'accuracy', 'macro avg', 'weighted avg'.
                            st.metric("๐ท๏ธ Classes", len(report) - 3)  # Exclude avg metrics
                        # Detailed metrics
                        st.subheader("๐ Detailed Metrics")
                        metrics_df = pd.DataFrame(report).transpose()
                        st.dataframe(metrics_df.round(4))
                except Exception as e:
                    st.error(f"โ Training failed: {str(e)}")
    else:
        st.warning("๐ Please upload a dataset to train a model.")
# Section: Predictions — single-text and batch classification with any
# model previously pickled into the models/ folder.
elif section == "๐ฎ Predictions":
    st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)
    # Check for trained models on disk; display names are reconstructed
    # from the pickle filenames (snake_case -> Title Case).
    if os.path.exists("models") and os.listdir("models"):
        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
                            for f in os.listdir("models") if f.endswith('.pkl')]
        if available_models:
            # Single prediction
            st.subheader("๐ฎ Single Text Prediction")
            col1, col2 = st.columns([3, 1])
            with col1:
                text_input = st.text_area(
                    "Enter text to classify:",
                    height=100,
                    placeholder="Type or paste your text here..."
                )
            with col2:
                selected_model = st.selectbox("Select model:", available_models)
            if st.button("๐ Predict", type="primary"):
                if text_input.strip():
                    with st.spinner("๐ Making prediction..."):
                        predicted_label, prediction_proba = predict_text(
                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
                        )
                    if predicted_label is not None:
                        st.success("โ Prediction completed!")
                        # Results
                        st.markdown("### ๐ Results")
                        st.info(f"**Predicted Class:** {predicted_label}")
                        # Probabilities (only for models exposing predict_proba).
                        if prediction_proba is not None:
                            encoder = load_artifacts("artifacts", "label_encoder.pkl")
                            if encoder is not None:
                                classes = encoder.classes_
                                prob_df = pd.DataFrame({
                                    'Class': classes,
                                    'Probability': prediction_proba
                                }).sort_values('Probability', ascending=False)
                                st.markdown("### ๐ Class Probabilities")
                                st.bar_chart(prob_df.set_index('Class'))
                else:
                    st.warning("โ ๏ธ Please enter some text to classify.")
            # Batch predictions
            st.markdown("---")
            st.subheader("๐ฆ Batch Predictions")
            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
            if batch_file is not None:
                try:
                    # Reuses the encoding chosen in the sidebar for the training CSV.
                    batch_df = pd.read_csv(batch_file, encoding=encoding)
                    st.write("๐ Preview:")
                    st.dataframe(batch_df.head())
                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
                    if st.button("๐ Run Batch Predictions"):
                        with st.spinner("๐ Processing batch predictions..."):
                            predictions = []
                            progress_bar = st.progress(0)
                            # NOTE(review): predict_text reloads the model,
                            # vectorizer, and encoder from disk on every row —
                            # O(rows) pickle loads; fine for small files.
                            for i, text in enumerate(batch_df[batch_text_col]):
                                pred, _ = predict_text(
                                    batch_model, str(text),
                                    st.session_state.get('vectorizer_type', 'tfidf')
                                )
                                predictions.append(pred if pred is not None else "Error")
                                progress_bar.progress((i + 1) / len(batch_df))
                            batch_df['Predicted_Class'] = predictions
                            st.success("โ Batch predictions completed!")
                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
                            # Download option
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                "๐ฅ Download Results",
                                csv,
                                "batch_predictions.csv",
                                "text/csv"
                            )
                except Exception as e:
                    st.error(f"โ Batch prediction error: {str(e)}")
        else:
            st.warning("โ ๏ธ No trained models found.")
    else:
        st.warning("โ ๏ธ No models available. Please train a model first.")

# Footer
st.markdown("---")
st.markdown("*Built with Streamlit โข Text Classification Made Easy*")