Spaces:

Alamgirapi
/

NoCodeTextClassifier

Sleeping

File size: 26,396 Bytes

8d810b6

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import re
import string
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go

# Page-level Streamlit settings (browser tab title, tab icon, wide layout)
st.set_page_config(**{
    "page_title": "Text Classification App",
    "page_icon": "📝",
    "layout": "wide",
})

# Text preprocessing class
class TextCleaner:
    """Normalise raw text into lowercase, space-separated ASCII words.

    Cleaning lowercases the input, removes every run that starts with
    ``http`` up to the next whitespace, deletes each character that is not an
    ASCII letter or whitespace (digits, punctuation and accented letters are
    removed, not replaced by a space), collapses whitespace and finally
    filters out stop words.

    Args:
        stop_words: Optional iterable of lowercase words to discard. ``None``
            (the default) selects ``DEFAULT_STOP_WORDS``; pass an empty
            iterable to keep every word.
    """

    DEFAULT_STOP_WORDS = frozenset([
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by',
    ])

    # Compiled once at class creation instead of on every clean_text call.
    _URL_PATTERN = re.compile(r'http\S+')
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self, stop_words=None):
        if stop_words is None:
            stop_words = self.DEFAULT_STOP_WORDS
        # Copy into a fresh set: O(1) membership tests, and later changes to
        # the caller's collection do not affect this cleaner.
        self.stop_words = set(stop_words)

    def clean_text(self, text):
        """Return the cleaned form of ``text``.

        Args:
            text: Any scalar value. Missing values (as judged by ``pd.isna``)
                yield ``""``; everything else is converted with ``str``.

        Returns:
            The remaining words joined by single spaces (possibly ``""``).
        """
        if pd.isna(text):
            return ""

        text = str(text).lower()
        text = self._URL_PATTERN.sub('', text)  # Remove URLs
        text = self._NON_LETTER_PATTERN.sub('', text)  # Remove non-alphabetic characters
        text = self._WHITESPACE_PATTERN.sub(' ', text)  # Collapse runs of whitespace
        text = text.strip()

        # Drop stop words; an empty stop-word set keeps every word
        return ' '.join(word for word in text.split() if word not in self.stop_words)

# Data analysis functions
def get_data_insights(df, text_col, target_col):
    """Summarise a labelled text dataset.

    Args:
        df: DataFrame holding the data.
        text_col: Name of the column containing the text.
        target_col: Name of the column containing the class labels.

    Returns:
        A dict with the keys
        ``shape`` (``df.shape``),
        ``missing_values`` (column name -> number of nulls),
        ``class_distribution`` (label -> row count) and
        ``text_length_stats`` (``mean``/``median``/``min``/``max`` of the
        per-row character count of ``text_col``).
    """
    # Measure each entry through its string form so numeric or mixed columns
    # do not raise (the text cleaner coerces with str() as well). Missing
    # entries are skipped by na_action and therefore stay out of the stats.
    # Computed once and reused for all four statistics.
    text_lengths = pd.to_numeric(
        df[text_col].map(lambda value: len(str(value)), na_action='ignore'),
        errors='coerce',
    )

    return {
        'shape': df.shape,
        'missing_values': df.isnull().sum().to_dict(),
        'class_distribution': df[target_col].value_counts().to_dict(),
        'text_length_stats': {
            'mean': text_lengths.mean(),
            'median': text_lengths.median(),
            'min': text_lengths.min(),
            'max': text_lengths.max(),
        },
    }

# Model training functions
def train_model(model_name, X_train, X_test, y_train, y_test, random_state=42):
    """Fit the named classifier, score it on the test split and pickle it.

    Args:
        model_name: One of 'Logistic Regression', 'Decision Tree',
            'Random Forest', 'Linear SVC', 'SVC', 'Multinomial Naive Bayes'
            or 'Gaussian Naive Bayes'.
        X_train: Training feature matrix (sparse or dense).
        X_test: Test feature matrix (sparse or dense).
        y_train: Training labels.
        y_test: Test labels.
        random_state: Seed handed to the estimators that accept one
            (defaults to 42, the value previously hard-coded).

    Returns:
        ``(model, accuracy, y_pred, model_filename)`` where ``model`` is the
        fitted estimator, ``accuracy`` its accuracy on the test split,
        ``y_pred`` the test predictions and ``model_filename`` the name of
        the pickle written into the ``models`` folder.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Factories rather than instances: only the requested estimator is built.
    model_factories = {
        'Logistic Regression': lambda: LogisticRegression(random_state=random_state, max_iter=1000),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=random_state),
        'Random Forest': lambda: RandomForestClassifier(random_state=random_state, n_estimators=100),
        'Linear SVC': lambda: LinearSVC(random_state=random_state, max_iter=1000),
        'SVC': lambda: SVC(random_state=random_state, probability=True),
        'Multinomial Naive Bayes': MultinomialNB,
        'Gaussian Naive Bayes': GaussianNB,
    }

    try:
        build_model = model_factories[model_name]
    except KeyError:
        supported = ", ".join(model_factories)
        raise ValueError(
            f"Unknown model '{model_name}'. Supported models: {supported}"
        ) from None
    model = build_model()

    # For Gaussian NB, convert sparse matrices to dense; inputs that are
    # already dense (no .toarray) are passed through untouched.
    if model_name == 'Gaussian Naive Bayes':
        if hasattr(X_train, 'toarray'):
            X_train = X_train.toarray()
        if hasattr(X_test, 'toarray'):
            X_test = X_test.toarray()

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Save model as models/<lower_snake_case_name>.pkl
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
    with open(os.path.join("models", model_filename), 'wb') as f:
        pickle.dump(model, f)

    return model, accuracy, y_pred, model_filename

# Utility functions
def save_artifacts(obj, folder_name, file_name):
    """Pickle ``obj`` to ``folder_name/file_name``.

    The folder is created first when it does not exist yet; an existing file
    with the same name is overwritten.
    """
    os.makedirs(folder_name, exist_ok=True)
    target_path = os.path.join(folder_name, file_name)
    serialized = pickle.dumps(obj)
    with open(target_path, 'wb') as out_file:
        out_file.write(serialized)

def load_artifacts(folder_name, file_name):
    """Unpickle and return the object stored at ``folder_name/file_name``.

    When the file is missing, an error is shown in the Streamlit UI and
    ``None`` is returned instead of raising.
    """
    artifact_path = os.path.join(folder_name, file_name)
    try:
        with open(artifact_path, 'rb') as in_file:
            loaded = pickle.load(in_file)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        loaded = None
    return loaded

def predict_text(model_filename, text, vectorizer_type="tfidf"):
    """Classify a single text with a previously saved model.

    The model is read from ``models/<model_filename>``, the vectorizer from
    ``artifacts/<vectorizer_type>_vectorizer.pkl`` and the label encoder from
    ``artifacts/encoder.pkl``. ``text`` is cleaned with ``TextCleaner`` before
    it is vectorized.

    Args:
        model_filename: File name of the pickled model inside ``models``.
        text: Raw text to classify.
        vectorizer_type: Prefix of the vectorizer pickle ("tfidf" or "count").

    Returns:
        ``(predicted_label, prediction_proba)``. ``prediction_proba`` is the
        per-class probability row, or ``None`` when the model cannot provide
        one. ``(None, None)`` is returned when an artifact is missing or any
        step fails; the failure is reported through ``st.error``.
    """
    try:
        # NOTE: pickle.load can execute arbitrary code while loading. Only
        # files written by this app should ever be placed in models/ or
        # artifacts/.
        with open(os.path.join('models', model_filename), 'rb') as f:
            model = pickle.load(f)

        # Load vectorizer
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)
        text_vector = vectorizer.transform([clean_text])

        # For Gaussian NB, convert to dense. Decided from the loaded model's
        # type rather than from a substring of the file name, so a renamed
        # pickle is still handled correctly.
        if isinstance(model, GaussianNB):
            text_vector = text_vector.toarray()

        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None

        # Probabilities are optional: a failure here must not discard the
        # predicted label, so it is deliberately downgraded to "no
        # probabilities" (but no longer swallows KeyboardInterrupt/SystemExit
        # the way a bare except did).
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                prediction_proba = None

        # Decode prediction back to the original label
        predicted_label = encoder.inverse_transform(prediction)[0]

        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None

# Streamlit App: main-page heading
st.title('📝 No Code Text Classification App')
st.markdown('---')
st.write('Analyze your text data and train machine learning models without coding!')

# Sidebar: section navigation followed by the dataset uploaders
with st.sidebar:
    st.title("Navigation")
    section = st.radio(
        "Choose Section",
        ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
    )

    st.markdown("---")
    st.subheader("📁 Upload Your Dataset")
    train_data = st.file_uploader("Upload training data", type=["csv"])
    test_data = st.file_uploader("Upload test data (optional)", type=["csv"])

# Session-wide settings: default to the TF-IDF vectorizer until one is chosen
if not ('vectorizer_type' in st.session_state):
    st.session_state['vectorizer_type'] = "tfidf"

def _read_uploaded_csv(uploaded_file, encodings=('utf-8', 'cp1252', 'latin1')):
    """Read an uploaded CSV, trying each encoding in turn.

    ``latin1`` maps every byte to a character and therefore never fails to
    decode, so it is tried last as the catch-all.

    Returns:
        The parsed DataFrame, or ``None`` if every encoding failed to decode.
    """
    for encoding in encodings:
        # A failed attempt leaves the stream part-way through the file;
        # rewind so every attempt parses from the first byte.
        uploaded_file.seek(0)
        try:
            return pd.read_csv(uploaded_file, encoding=encoding)
        except UnicodeDecodeError:
            continue
    return None


# Bound up front so the sections below can always test them against None
train_df = None
test_df = None

if train_data is not None:
    try:
        train_df = _read_uploaded_csv(train_data)

        if train_df is None:
            st.error("Unable to read the CSV file. Please check the file encoding.")
        else:
            if test_data is not None:
                # Stays None when the optional test file cannot be decoded
                test_df = _read_uploaded_csv(test_data)

            # Show data preview
            with st.sidebar.expander("📋 Data Preview", expanded=True):
                st.write("Shape:", train_df.shape)
                st.write(train_df.head(2))

            columns = train_df.columns.tolist()
            text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
            target = st.sidebar.selectbox("🎯 Choose the target column:", columns)

            # Process data. Compared against None (not truthiness) so a
            # falsy column name such as "" or 0 is still processed.
            if text_data is not None and target is not None:
                # Clean text
                text_cleaner = TextCleaner()
                train_df['clean_text'] = train_df[text_data].apply(text_cleaner.clean_text)
                # Length of the string form, so a non-string column does not
                # raise; missing entries stay missing.
                train_df['text_length'] = train_df[text_data].map(
                    lambda value: len(str(value)), na_action='ignore'
                )

                # Handle label encoding
                label_encoder = LabelEncoder()
                train_df['target_encoded'] = label_encoder.fit_transform(train_df[target])

                # Save label encoder
                # NOTE(review): this runs on every script rerun, so
                # encoder.pkl always reflects the currently selected target
                # column, which may differ from the one an already-trained
                # model was fitted on.
                save_artifacts(label_encoder, "artifacts", "encoder.pkl")

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        train_df = None

# Data Analysis Section
if section == "📊 Data Analysis":
    # Once a file is uploaded, the loading step always binds train_df
    # (to None when loading failed), so a plain None test is sufficient.
    if train_data is not None and train_df is not None:
        st.header("📊 Data Analysis")

        # The loading step appends helper columns to train_df. Leave them out
        # so the "Features" count and the missing-value table describe the
        # uploaded file only. A helper name is kept when the user picked it
        # as the text or target column.
        helper_columns = [
            name for name in ('clean_text', 'text_length', 'target_encoded')
            if name in train_df.columns and name not in (text_data, target)
        ]
        insights = get_data_insights(
            train_df.drop(columns=helper_columns), text_data, target
        )

        # Headline metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Samples", insights['shape'][0])

        with col2:
            st.metric("Features", insights['shape'][1])

        with col3:
            st.metric("Classes", len(insights['class_distribution']))

        with col4:
            st.metric("Avg Text Length", f"{insights['text_length_stats']['mean']:.1f}")

        st.markdown("---")

        # Data quality section
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("📋 Dataset Overview")
            st.write("**Shape:**", insights['shape'])
            st.write("**Missing Values:**")
            missing_df = pd.DataFrame.from_dict(insights['missing_values'], orient='index', columns=['Count'])
            # Only columns that actually contain missing values are listed
            st.dataframe(missing_df[missing_df['Count'] > 0])

            st.write("**Sample Data:**")
            st.dataframe(train_df[[text_data, target, 'text_length']].head())

        with col2:
            st.subheader("📊 Class Distribution")
            class_dist = pd.DataFrame.from_dict(insights['class_distribution'], orient='index', columns=['Count'])
            st.dataframe(class_dist)

            # Plot class distribution
            fig = px.bar(
                x=class_dist.index,
                y=class_dist['Count'],
                title="Class Distribution",
                labels={'x': 'Class', 'y': 'Count'}
            )
            st.plotly_chart(fig, use_container_width=True)

        st.markdown("---")

        # Text analysis section
        st.subheader("📝 Text Analysis")

        col1, col2 = st.columns(2)

        with col1:
            # Text length distribution
            fig = px.histogram(
                train_df,
                x='text_length',
                title="Text Length Distribution",
                nbins=30
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Text length by class
            fig = px.box(
                train_df,
                x=target,
                y='text_length',
                title="Text Length by Class"
            )
            st.plotly_chart(fig, use_container_width=True)

        # Word frequency analysis over the cleaned text
        st.subheader("🔤 Most Common Words")
        all_text = ' '.join(train_df['clean_text'].astype(str))
        word_freq = Counter(all_text.split())
        top_words = word_freq.most_common(20)

        if top_words:
            words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
            fig = px.bar(
                words_df,
                x='Frequency',
                y='Word',
                orientation='h',
                title="Top 20 Most Common Words"
            )
            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
            st.plotly_chart(fig, use_container_width=True)

    else:
        st.warning("📁 Please upload training data to perform analysis")
# Train Model Section
elif section == "🤖 Train Model":
    if train_data is not None and 'train_df' in locals() and train_df is not None:
        st.header("🤖 Train Machine Learning Model")
        
        col1, col2 = st.columns(2)
        
        with col1:
            st.subheader("⚙️ Model Configuration")
            model_name = st.selectbox("Choose Model", [
                "Logistic Regression", "Decision Tree", 
                "Random Forest", "Linear SVC", "SVC",
                "Multinomial Naive Bayes", "Gaussian Naive Bayes"
            ])
        
        with col2:
            st.subheader("📊 Vectorization Method")
            vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count Vectorizer"])
        
        # Model parameters
        st.subheader("🔧 Parameters")
        col1, col2 = st.columns(2)
        
        with col1:
            max_features = st.slider("Max Features", 1000, 20000, 10000, step=1000)
            test_size = st.slider("Test Size", 0.1, 0.4, 0.2, step=0.05)
        
        with col2:
            random_state = st.number_input("Random State", 0, 1000, 42)
            min_df = st.slider("Min Document Frequency", 1, 10, 1)
        
        # Initialize vectorizer
        if vectorizer_choice == "TF-IDF":
            vectorizer = TfidfVectorizer(
                max_features=max_features, 
                min_df=min_df,
                stop_words='english'
            )
            st.session_state.vectorizer_type = "tfidf"
        else:
            vectorizer = CountVectorizer(
                max_features=max_features, 
                min_df=min_df,
                stop_words='english'
            )
            st.session_state.vectorizer_type = "count"
        
        # Show data info
        st.subheader("📋 Training Data Info")
        col1, col2, col3 = st.columns(3)
        
        with col1:
            st.metric("Total Samples", len(train_df))
        
        with col2:
            st.metric("Unique Classes", train_df[target].nunique())
        
        with col3:
            st.metric("Avg Text Length", f"{train_df['text_length'].mean():.1f}")
        
        if st.button("🚀 Start Training", type="primary"):
            with st.spinner("Training model... This may take a few minutes."):
                try:
                    # Vectorize text data
                    X = vectorizer.fit_transform(train_df['clean_text'])
                    y = train_df['target_encoded']
                    
                    # Split data
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, 
                        test_size=test_size, 
                        random_state=random_state,
                        stratify=y
                    )
                    
                    st.success(f"✅ Data split - Train: {X_train.shape}, Test: {X_test.shape}")
                    
                    # Save vectorizer
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
                    
                    # Train model
                    model, accuracy, y_pred, model_filename = train_model(
                        model_name, X_train, X_test, y_train, y_test
                    )
                    
                    st.success("🎉 Model training completed!")
                    
                    # Display results
                    col1, col2 = st.columns(2)
                    
                    with col1:
                        st.metric("🎯 Test Accuracy", f"{accuracy:.4f}")
                        
                        # Classification report
                        st.subheader("📊 Classification Report")
                        report = classification_report(
                            y_test, y_pred, 
                            target_names=label_encoder.classes_,
                            output_dict=True
                        )
                        report_df = pd.DataFrame(report).transpose()
                        st.dataframe(report_df.round(4))
                    
                    with col2:
                        # Confusion matrix
                        st.subheader("🔄 Confusion Matrix")
                        cm = confusion_matrix(y_test, y_pred)
                        fig = px.imshow(
                            cm,
                            text_auto=True,
                            aspect="auto",
                            title="Confusion Matrix",
                            labels=dict(x="Predicted", y="Actual"),
                            x=label_encoder.classes_,
                            y=label_encoder.classes_
                        )
                        st.plotly_chart(fig, use_container_width=True)
                    
                    st.info(f"✅ Model saved as: {model_filename}")
                    st.info("🔮 You can now use the 'Predictions' section to classify new text!")
                    
                except Exception as e:
                    st.error(f"❌ Error during training: {str(e)}")
    
    else:
        st.warning("📁 Please upload training data to train a model")

# Predictions Section
elif section == "🔮 Predictions":
    st.header("🔮 Text Classification Predictions")
    
    # Check if models exist
    if os.path.exists("models") and os.listdir("models"):
        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
        
        if available_models:
            # Single prediction
            st.subheader("📝 Single Text Classification")
            
            col1, col2 = st.columns([2, 1])
            
            with col1:
                text_input = st.text_area("Enter text to classify:", height=150)
            
            with col2:
                selected_model = st.selectbox("Choose model:", available_models)
                predict_button = st.button("🔮 Predict", type="primary")
            
            if predict_button and text_input.strip():
                with st.spinner("Making prediction..."):
                    predicted_label, prediction_proba = predict_text(
                        selected_model, 
                        text_input, 
                        st.session_state.get('vectorizer_type', 'tfidf')
                    )
                    
                    if predicted_label is not None:
                        st.success("✅ Prediction completed!")
                        
                        # Display results
                        col1, col2 = st.columns(2)
                        
                        with col1:
                            st.markdown("### 🎯 Results")
                            st.markdown(f"**Input Text:** {text_input[:200]}{'...' if len(text_input) > 200 else ''}")
                            st.markdown(f"**Predicted Class:** `{predicted_label}`")
                        
                        with col2:
                            # Display probabilities if available
                            if prediction_proba is not None:
                                st.markdown("### 📊 Class Probabilities")
                                
                                encoder = load_artifacts("artifacts", "encoder.pkl")
                                if encoder is not None:
                                    prob_df = pd.DataFrame({
                                        'Class': encoder.classes_,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)
                                    
                                    fig = px.bar(
                                        prob_df, 
                                        x='Probability', 
                                        y='Class',
                                        orientation='h',
                                        title="Prediction Confidence"
                                    )
                                    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                                    st.plotly_chart(fig, use_container_width=True)
            
            elif predict_button:
                st.warning("⚠️ Please enter some text to classify")
            
            # Batch predictions
            st.markdown("---")
            st.subheader("📊 Batch Predictions")
            
            uploaded_file = st.file_uploader("Upload CSV file with texts to classify", type=['csv'])
            
            if uploaded_file is not None:
                try:
                    # Try different encodings for batch file
                    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
                    batch_df = None
                    
                    for encoding in encodings:
                        try:
                            batch_df = pd.read_csv(uploaded_file, encoding=encoding)
                            break
                        except UnicodeDecodeError:
                            continue
                    
                    if batch_df is not None:
                        st.write("📋 Uploaded data preview:")
                        st.dataframe(batch_df.head())
                        
                        col1, col2 = st.columns(2)
                        
                        with col1:
                            text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
                        
                        with col2:
                            batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
                        
                        if st.button("🚀 Run Batch Predictions", type="primary"):
                            with st.spinner("Processing batch predictions..."):
                                predictions = []
                                confidences = []
                                
                                progress_bar = st.progress(0)
                                total_texts = len(batch_df)
                                
                                for i, text in enumerate(batch_df[text_column]):
                                    pred, proba = predict_text(
                                        batch_model, 
                                        str(text), 
                                        st.session_state.get('vectorizer_type', 'tfidf')
                                    )
                                    predictions.append(pred if pred is not None else "Error")
                                    
                                    # Get confidence (max probability)
                                    if proba is not None:
                                        confidences.append(max(proba))
                                    else:
                                        confidences.append(0.0)
                                    
                                    progress_bar.progress((i + 1) / total_texts)
                                
                                batch_df['Predicted_Class'] = predictions
                                batch_df['Confidence'] = confidences
                                
                                st.success("✅ Batch predictions completed!")
                                
                                # Show results
                                st.subheader("📊 Results")
                                result_df = batch_df[[text_column, 'Predicted_Class', 'Confidence']]
                                st.dataframe(result_df)
                                
                                # Summary statistics
                                st.subheader("📈 Summary")
                                col1, col2, col3 = st.columns(3)
                                
                                with col1:
                                    st.metric("Total Predictions", len(predictions))
                                
                                with col2:
                                    successful_preds = sum(1 for p in predictions if p != "Error")
                                    st.metric("Successful", successful_preds)
                                
                                with col3:
                                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
                                    st.metric("Avg Confidence", f"{avg_confidence:.3f}")
                                
                                # Class distribution of predictions
                                pred_counts = pd.Series(predictions).value_counts()
                                if len(pred_counts) > 0:
                                    fig = px.pie(
                                        values=pred_counts.values,
                                        names=pred_counts.index,
                                        title="Distribution of Predictions"
                                    )
                                    st.plotly_chart(fig, use_container_width=True)
                                
                                # Download results
                                csv = batch_df.to_csv(index=False)
                                st.download_button(
                                    label="📥 Download Results as CSV",
                                    data=csv,
                                    file_name="batch_predictions.csv",
                                    mime="text/csv"
                                )
                    else:
                        st.error("❌ Unable to read the CSV file. Please check the file encoding.")
                        
                except Exception as e:
                    st.error(f"❌ Error in batch prediction: {str(e)}")
        else:
            st.warning("⚠️ No trained models found. Please train a model first.")
    else:
        st.warning("⚠️ No models directory found. Please go to 'Train Model' section to train a model first.")

# Footer: a horizontal rule followed by the credit line
for footer_line in ("---", "🚀 Built with Streamlit | 📊 No-Code Text Classification"):
    st.markdown(footer_line)