# Page header captured with this file:
#   Spaces: Alamgirapi / NoCodeTextClassifier (status: Sleeping)
#   File size: 22,400 Bytes

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string
import nltk
import os
import pickle
import io
import base64

# Ensure the NLTK corpora imported below (stopwords, wordnet) are present,
# downloading quietly whichever one the lookup cannot find.
for _resource_path, _package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Page-level configuration: title, icon and wide layout
st.set_page_config(
    layout="wide",
    page_icon="📝",
    page_title="No Code Text Classification",
)

# Seed session state with defaults; keys that already exist are left untouched
_SESSION_DEFAULTS = {
    'trained_model': None,
    'vectorizer': None,
    'label_encoder': None,
    'vectorizer_type': 'tfidf',
    'train_df': None,
}
for _state_key, _default_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value

# Text cleaning class
class TextCleaner:
    """Turn raw text into a space-separated string of lemmatized tokens.

    Instance attributes (set in ``__init__``):
        stop_words: set built from NLTK's English stop-word list; tokens in
            this set are dropped.
        lemmatizer: ``WordNetLemmatizer`` applied to every kept token.
    """

    # Patterns and translation table shared by every clean_text() call
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _MENTION_OR_HASHTAG_PATTERN = re.compile(r'@\w+|#\w+')
    _WHITESPACE_RUN_PATTERN = re.compile(r'\s+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Return the cleaned form of ``text``.

        Steps, in order: lowercase, strip URLs, strip @mentions and
        #hashtags, delete ASCII punctuation, collapse whitespace, drop stop
        words, lemmatize what is left.

        Args:
            text: Any value; it is converted with ``str()`` before cleaning.

        Returns:
            The cleaned tokens joined by single spaces, or ``""`` when
            ``text`` is missing according to ``pd.isna``.
        """
        if pd.isna(text):
            return ""

        lowered = str(text).lower()
        without_urls = self._URL_PATTERN.sub('', lowered)
        without_tags = self._MENTION_OR_HASHTAG_PATTERN.sub('', without_urls)
        without_punctuation = without_tags.translate(self._PUNCTUATION_TABLE)
        collapsed = self._WHITESPACE_RUN_PATTERN.sub(' ', without_punctuation).strip()

        kept_tokens = []
        for token in collapsed.split():
            if token in self.stop_words:
                continue
            kept_tokens.append(self.lemmatizer.lemmatize(token))

        return ' '.join(kept_tokens)

# Utility functions
def create_download_link(val, filename):
    """Build an HTML anchor that downloads ``val`` under the name ``filename``.

    The payload is embedded in the link itself as a base64 ``data:`` URI.

    Args:
        val: File content as bytes-like data. A ``str`` is accepted too and
            is encoded as UTF-8 first.
        filename: Name used for the ``download`` attribute and the link text.

    Returns:
        The ``<a ...>Download <filename></a>`` markup as a string.
    """
    # Local import: the file's top-level imports do not include ``html``
    import html

    if isinstance(val, str):
        val = val.encode('utf-8')
    encoded_payload = base64.b64encode(val).decode('ascii')

    # The name ends up inside an attribute and inside element text, so it must
    # be escaped or a crafted file name could break out of the markup.
    safe_name = html.escape(str(filename), quote=True)

    return (
        f'<a href="data:application/octet-stream;base64,{encoded_payload}" '
        f'download="{safe_name}">Download {safe_name}</a>'
    )

def safe_file_read(uploaded_file):
    """Read a CSV into a DataFrame, trying several text encodings in turn.

    Encodings are tried from strictest to most permissive: UTF-8, then
    cp1252, then latin1. latin1 comes last because it can decode any byte
    sequence, so anything placed after it would never be reached.

    Args:
        uploaded_file: File-like object (or anything ``pd.read_csv``
            accepts). If it has ``seek``, it is rewound before every attempt.

    Returns:
        The parsed ``DataFrame``, or ``None`` (after reporting through
        ``st.error``) if every encoding failed to decode the data.

    Raises:
        Any non-decoding error raised by ``pd.read_csv`` (for example a
        parser error or empty-data error) is propagated to the caller.
    """
    candidate_encodings = ('utf-8', 'cp1252', 'latin1')
    last_error = None

    for encoding in candidate_encodings:
        # Rewind first: a previous attempt (or an earlier read of the same
        # object) may have left the position somewhere past the start.
        if hasattr(uploaded_file, 'seek'):
            uploaded_file.seek(0)
        try:
            return pd.read_csv(uploaded_file, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc

    st.error(f"Error reading file: {str(last_error)}")
    return None

# Data Analysis Functions
def get_data_insights(df, text_col, target_col):
    """Summarize a labelled text dataset without modifying it.

    Args:
        df: DataFrame holding the data.
        text_col: Name of the column containing the text.
        target_col: Name of the column containing the class labels.

    Returns:
        A dict with these keys:
            shape: ``df.shape``.
            missing_values: per-column count of null entries.
            class_distribution: label -> number of rows, from
                ``value_counts``.
            avg_text_length / min_text_length / max_text_length: statistics
                of the character length of ``text_col`` after conversion to
                ``str`` (so a missing value is measured as its string form).
    """
    # Keep the lengths in a local Series: writing a 'text_length' column into
    # ``df`` would alter the caller's frame and overwrite a user column that
    # happens to have the same name.
    text_lengths = df[text_col].astype(str).str.len()

    return {
        'shape': df.shape,
        'missing_values': df.isnull().sum().to_dict(),
        'class_distribution': df[target_col].value_counts().to_dict(),
        'avg_text_length': text_lengths.mean(),
        'min_text_length': text_lengths.min(),
        'max_text_length': text_lengths.max(),
    }

def create_visualizations(df, text_col, target_col):
    """Render a class-count bar chart and a text-length histogram side by side.

    The figure is drawn into the Streamlit page with ``st.pyplot``. ``df`` is
    only read, never modified.

    Args:
        df: DataFrame holding the data.
        text_col: Name of the column containing the text.
        target_col: Name of the column containing the class labels.
    """
    class_counts = df[target_col].value_counts()
    # Local Series instead of a new 'text_length' column, so the caller's
    # frame (and any user column of that name) is left alone.
    text_lengths = df[text_col].astype(str).str.len()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    try:
        # Class distribution bar plot
        ax1.bar(class_counts.index, class_counts.values)
        ax1.set_title('Class Distribution')
        ax1.set_xlabel('Classes')
        ax1.set_ylabel('Count')
        ax1.tick_params(axis='x', rotation=45)

        # Text length distribution
        ax2.hist(text_lengths, bins=30, alpha=0.7)
        ax2.set_title('Text Length Distribution')
        ax2.set_xlabel('Text Length')
        ax2.set_ylabel('Frequency')

        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Figures made through pyplot stay registered until closed; without
        # this, each call leaves another open figure behind.
        plt.close(fig)

# Model Training Functions
def train_model(X_train, X_test, y_train, y_test, model_name):
    """Fit the classifier named ``model_name`` and score it on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        model_name: One of 'Logistic Regression', 'Decision Tree',
            'Random Forest', 'Linear SVC', 'SVC', 'Multinomial Naive Bayes',
            'Gaussian Naive Bayes'.

    Returns:
        Tuple ``(model, accuracy, y_pred)``: the fitted estimator, its
        accuracy on the test split, and its predictions for ``X_test``.

    Raises:
        KeyError: If ``model_name`` is not one of the names listed above.
    """
    # Factories rather than instances, so only the selected estimator is built
    model_factories = {
        'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Random Forest': lambda: RandomForestClassifier(random_state=42, n_estimators=100),
        'Linear SVC': lambda: LinearSVC(random_state=42, max_iter=1000),
        'SVC': lambda: SVC(random_state=42, probability=True),
        'Multinomial Naive Bayes': lambda: MultinomialNB(),
        'Gaussian Naive Bayes': lambda: GaussianNB(),
    }

    model = model_factories[model_name]()

    # Handle sparse matrices for Gaussian NB. Each split is checked on its own
    # so a dense/sparse mix between train and test is converted correctly.
    if model_name == 'Gaussian Naive Bayes':
        if hasattr(X_train, 'toarray'):
            X_train = X_train.toarray()
        if hasattr(X_test, 'toarray'):
            X_test = X_test.toarray()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy, y_pred

# Main App


def _prepare_features(texts, text_cleaner):
    """Clean ``texts`` and vectorize them for the model in session state.

    Uses ``st.session_state.vectorizer`` for the transform and converts the
    result to a dense array when the stored model is a ``GaussianNB``
    (``train_model`` fits that model on dense arrays).
    """
    cleaned = [text_cleaner.clean_text(text) for text in texts]
    features = st.session_state.vectorizer.transform(cleaned)
    if isinstance(st.session_state.trained_model, GaussianNB):
        features = features.toarray()
    return features


def _predict_labels(texts, text_cleaner, chunk_size=512):
    """Return the decoded class label for every item of ``texts``.

    Rows are processed ``chunk_size`` at a time so that the dense conversion
    done for ``GaussianNB`` never materializes the whole batch at once.
    """
    texts = list(texts)
    labels = []
    for start in range(0, len(texts), chunk_size):
        features = _prepare_features(texts[start:start + chunk_size], text_cleaner)
        encoded = st.session_state.trained_model.predict(features)
        labels.extend(st.session_state.label_encoder.inverse_transform(encoded))
    return labels


st.title('🔤 No Code Text Classification App')
st.markdown('Upload your data, analyze it, train models, and make predictions without writing any code!')

# Sidebar
st.sidebar.header("📁 Data Upload")

# File upload with better error handling
train_data = st.sidebar.file_uploader(
    "Upload training data (CSV)",
    type=["csv"],
    help="Upload a CSV file with text and labels"
)

# Process uploaded data
if train_data is not None:
    try:
        with st.spinner("Loading data..."):
            train_df = safe_file_read(train_data)

        if train_df is not None:
            st.session_state.train_df = train_df

            st.sidebar.success(f"✅ Data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")

            # Column selection
            columns = train_df.columns.tolist()
            text_col = st.sidebar.selectbox("📝 Select text column:", columns, key="text_col")
            target_col = st.sidebar.selectbox("🎯 Select target column:", columns, key="target_col")

            if text_col and target_col and text_col != target_col:
                # Clean and prepare data
                with st.spinner("Preprocessing data..."):
                    text_cleaner = TextCleaner()
                    train_df['clean_text'] = train_df[text_col].apply(text_cleaner.clean_text)

                    # Encode labels. The encoder is saved to session state only
                    # when a model is trained, so the stored encoder always
                    # matches the stored model even if the target column is
                    # changed afterwards.
                    label_encoder = LabelEncoder()
                    train_df['encoded_target'] = label_encoder.fit_transform(train_df[target_col])

                # Main sections
                tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "🤖 Train Model", "🔍 Predictions"])

                # Data Analysis Tab
                with tab1:
                    st.header("📊 Data Analysis")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("📈 Dataset Overview")
                        insights = get_data_insights(train_df, text_col, target_col)

                        st.metric("Total Samples", insights['shape'][0])
                        st.metric("Number of Features", insights['shape'][1])
                        st.metric("Average Text Length", f"{insights['avg_text_length']:.1f}")

                        st.subheader("🎯 Class Distribution")
                        class_dist_df = pd.DataFrame(
                            list(insights['class_distribution'].items()),
                            columns=['Class', 'Count']
                        )
                        st.dataframe(class_dist_df, use_container_width=True)

                    with col2:
                        st.subheader("📋 Data Preview")
                        preview_df = train_df[[text_col, target_col]].head()
                        st.dataframe(preview_df, use_container_width=True)

                        st.subheader("🧹 Cleaned Text Preview")
                        cleaned_preview = train_df[['clean_text', target_col]].head()
                        st.dataframe(cleaned_preview, use_container_width=True)

                    st.subheader("📊 Visualizations")
                    create_visualizations(train_df, text_col, target_col)

                # Train Model Tab
                with tab2:
                    st.header("🤖 Train Model")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("🔧 Model Selection")
                        model_name = st.selectbox(
                            "Choose a model:",
                            ["Logistic Regression", "Decision Tree", "Random Forest",
                             "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
                        )

                    with col2:
                        st.subheader("📊 Vectorizer Selection")
                        vectorizer_type = st.selectbox(
                            "Choose vectorizer:",
                            ["TF-IDF Vectorizer", "Count Vectorizer"]
                        )

                    # Training parameters
                    st.subheader("⚙️ Training Parameters")
                    col3, _ = st.columns(2)
                    with col3:
                        test_size = st.slider("Test size", 0.1, 0.5, 0.2, 0.05)
                        max_features = st.number_input("Max features", 1000, 20000, 10000, 1000)

                    if st.button("🚀 Train Model", type="primary"):
                        try:
                            with st.spinner("Training model... This may take a few minutes."):
                                # Initialize vectorizer
                                if vectorizer_type == "TF-IDF Vectorizer":
                                    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                                    st.session_state.vectorizer_type = 'tfidf'
                                else:
                                    vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                                    st.session_state.vectorizer_type = 'count'

                                # Vectorize text
                                X = vectorizer.fit_transform(train_df['clean_text'])
                                y = train_df['encoded_target']

                                # Split data
                                X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=test_size, random_state=42, stratify=y
                                )

                                # Train model
                                model, accuracy, y_pred = train_model(X_train, X_test, y_train, y_test, model_name)

                                # Store model, vectorizer and the encoder that
                                # produced its labels together
                                st.session_state.trained_model = model
                                st.session_state.vectorizer = vectorizer
                                st.session_state.label_encoder = label_encoder

                                # Display results
                                st.success("🎉 Model training completed!")

                                col5, col6 = st.columns(2)
                                with col5:
                                    st.metric("🎯 Accuracy", f"{accuracy:.4f}")
                                    # shape[0], not len(): the vectorizer output
                                    # is a SciPy sparse matrix, and len() on one
                                    # raises TypeError
                                    st.metric("🏋️ Training Samples", X_train.shape[0])
                                    st.metric("🧪 Test Samples", X_test.shape[0])

                                with col6:
                                    st.subheader("📊 Classification Report")
                                    # Report names are strings; labels read from
                                    # the CSV may be numbers
                                    report = classification_report(
                                        y_test, y_pred,
                                        target_names=[str(name) for name in label_encoder.classes_],
                                        output_dict=True
                                    )
                                    report_df = pd.DataFrame(report).transpose()
                                    st.dataframe(report_df.round(3), use_container_width=True)

                        except Exception as e:
                            st.error(f"❌ Error during training: {str(e)}")

                # Predictions Tab
                with tab3:
                    st.header("🔍 Make Predictions")

                    if st.session_state.trained_model is not None:
                        # Single prediction
                        st.subheader("📝 Single Text Prediction")
                        user_input = st.text_area("Enter text to classify:", height=100)

                        if st.button("🔮 Predict", type="primary"):
                            if user_input.strip():
                                try:
                                    with st.spinner("Making prediction..."):
                                        trained_model = st.session_state.trained_model

                                        # Clean and vectorize input
                                        input_vector = _prepare_features([user_input], text_cleaner)

                                        # Make prediction
                                        prediction = trained_model.predict(input_vector)[0]
                                        predicted_label = st.session_state.label_encoder.inverse_transform([prediction])[0]

                                        # Get probabilities if available; they
                                        # are optional, so a failure here only
                                        # hides the probability section
                                        proba = None
                                        if hasattr(trained_model, 'predict_proba'):
                                            try:
                                                proba = trained_model.predict_proba(input_vector)[0]
                                            except Exception:
                                                proba = None

                                        st.success("🎉 Prediction completed!")
                                        st.write(f"**Input:** {user_input}")
                                        st.write(f"**Predicted Class:** {predicted_label}")

                                        if proba is not None:
                                            # Show probabilities
                                            st.subheader("📊 Class Probabilities")
                                            prob_df = pd.DataFrame({
                                                'Class': st.session_state.label_encoder.classes_,
                                                'Probability': proba
                                            }).sort_values('Probability', ascending=False)

                                            st.bar_chart(prob_df.set_index('Class'))
                                            st.dataframe(prob_df.round(4), use_container_width=True)

                                except Exception as e:
                                    st.error(f"❌ Error during prediction: {str(e)}")
                            else:
                                st.warning("⚠️ Please enter some text to classify")

                        # Batch predictions
                        st.subheader("📊 Batch Predictions")
                        batch_file = st.file_uploader("Upload CSV for batch predictions", type=["csv"])

                        if batch_file is not None:
                            try:
                                batch_df = safe_file_read(batch_file)
                                if batch_df is not None:
                                    st.write("**Preview:**")
                                    st.dataframe(batch_df.head(), use_container_width=True)

                                    batch_text_col = st.selectbox(
                                        "Select text column for prediction:",
                                        batch_df.columns.tolist()
                                    )

                                    if st.button("🚀 Run Batch Predictions"):
                                        with st.spinner("Processing batch predictions..."):
                                            # Raw cell values are passed on
                                            # unchanged so the cleaner's own
                                            # missing-value check still applies;
                                            # any failure is reported by the
                                            # handler below
                                            batch_df['Predicted_Class'] = _predict_labels(
                                                batch_df[batch_text_col], text_cleaner
                                            )

                                            st.success("🎉 Batch predictions completed!")
                                            st.dataframe(batch_df, use_container_width=True)

                                            # Download results
                                            csv_data = batch_df.to_csv(index=False)
                                            st.download_button(
                                                label="📥 Download Results",
                                                data=csv_data,
                                                file_name="batch_predictions.csv",
                                                mime="text/csv"
                                            )
                            except Exception as e:
                                st.error(f"❌ Error processing batch file: {str(e)}")
                    else:
                        st.warning("⚠️ No trained model found. Please train a model first in the 'Train Model' tab.")
            else:
                st.warning("⚠️ Please select different columns for text and target.")

    except Exception as e:
        st.error(f"❌ Error loading file: {str(e)}")
        st.info("💡 Try these solutions:")
        st.write("- Check if the file is a valid CSV")
        st.write("- Ensure the file is not corrupted")
        st.write("- Try saving the file with UTF-8 encoding")

else:
    st.info("👆 Please upload a CSV file to get started")

    # Show example data format
    st.subheader("📋 Expected Data Format")
    example_df = pd.DataFrame({
        'text': [
            "This product is amazing! I love it.",
            "Terrible quality, waste of money.",
            "Good value for the price.",
            "Not what I expected, disappointed."
        ],
        'sentiment': ['positive', 'negative', 'positive', 'negative']
    })
    st.dataframe(example_df, use_container_width=True)

# Footer
st.markdown("---")
st.markdown("Built with ❤️ using Streamlit | No Code Text Classification App")