Alamgirapi committed on
Commit
56d15cb
·
verified ·
1 Parent(s): 060249f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +515 -0
app.py CHANGED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import os
6
+ import pickle
7
+ import ssl
8
+ import nltk
9
+ import re
10
+ import string
11
+ from pathlib import Path
12
+ from sklearn.preprocessing import LabelEncoder
13
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
16
+ from sklearn.linear_model import LogisticRegression
17
+ from sklearn.tree import DecisionTreeClassifier
18
+ from sklearn.svm import LinearSVC, SVC
19
+ from sklearn.ensemble import RandomForestClassifier
20
+ from sklearn.naive_bayes import MultinomialNB, GaussianNB
21
+ from nltk.corpus import stopwords
22
+ from nltk.stem import WordNetLemmatizer
23
+
24
# Workaround for SSL certificate errors during NLTK corpus downloads:
# route urllib's default HTTPS context through the unverified one.
# NOTE(review): this disables certificate verification process-wide;
# acceptable only as a download workaround.
if hasattr(ssl, "_create_unverified_context"):
    ssl._create_default_https_context = ssl._create_unverified_context
31
+
32
# Download NLTK data with error handling
@st.cache_resource
def download_nltk_data():
    """Ensure the NLTK corpora used by TextCleaner are installed.

    Each corpus is checked and downloaded independently. Cached with
    st.cache_resource so it runs once per process, not on every rerun.
    """
    # Fix: previously 'omw-1.4' was only fetched when the 'wordnet' lookup
    # failed, so it could stay missing when wordnet was already present.
    for resource, path in [
        ("stopwords", "corpora/stopwords"),
        ("wordnet", "corpora/wordnet"),
        ("omw-1.4", "corpora/omw-1.4"),
    ]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource, quiet=True)

# Download required NLTK data
download_nltk_data()
48
+
49
class TextCleaner:
    """Clean raw text for vectorization.

    Pipeline: lowercase, replace currency symbols with the token
    'currency', strip emoji/HTML/punctuation/digits, collapse whitespace,
    drop English stopwords, and lemmatize each remaining word.
    """

    def __init__(self, currency_symbols=r'[\$\£\€\¥\₹\¢\₽\₩\₪]', stop_words=None, lemmatizer=None):
        # Fix: the currency character class was mojibake (UTF-8 bytes decoded
        # with the wrong codec); restored the intended currency symbols.
        self.currency_symbols = currency_symbols

        if stop_words is None:
            try:
                self.stop_words = set(stopwords.words('english'))
            except LookupError:
                # Corpus missing — fetch it and retry.
                nltk.download('stopwords', quiet=True)
                self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = stop_words

        if lemmatizer is None:
            try:
                self.lemmatizer = WordNetLemmatizer()
                # Trigger lazy corpus loading so a missing corpus fails here,
                # not in the middle of clean_text().
                self.lemmatizer.lemmatize('testing')
            except (AttributeError, LookupError) as e:
                print(f"WordNet lemmatizer initialization failed: {e}")
                nltk.download('wordnet', quiet=True)
                nltk.download('omw-1.4', quiet=True)
                self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = lemmatizer

    def remove_punctuation(self, text):
        """Strip all ASCII punctuation in a single C-level pass."""
        return text.translate(str.maketrans('', '', string.punctuation))

    def clean_text(self, text):
        """Clean the text by removing punctuation, HTML tags, underscores,
        extra whitespace, digits, and stopwords; lemmatize remaining words.

        Non-string input is coerced via str(); None becomes "". On any
        unexpected error the (possibly partially cleaned) text is returned
        as a string rather than raising.
        """
        if not isinstance(text, str):
            text = str(text) if text is not None else ""

        if not text.strip():
            return ""

        try:
            text = text.lower()
            text = re.sub(self.currency_symbols, 'currency', text)

            # Remove any kind of emojis in the text
            emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                "]+", flags=re.UNICODE)
            text = emoji_pattern.sub(r'', text)
            text = self.remove_punctuation(text)
            text = re.compile('<.*?>').sub('', text)
            text = text.replace('_', '')
            text = re.sub(r'[^\w\s]', '', text)
            text = re.sub(r'\d', ' ', text)  # digits become spaces, then collapse
            text = re.sub(r'\s+', ' ', text).strip()
            text = ' '.join(word for word in text.split() if word not in self.stop_words)

            # Lemmatization with error handling — a missing corpus degrades
            # gracefully to the unlemmatized text.
            try:
                text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
            except (AttributeError, LookupError) as e:
                print(f"Lemmatization failed for text: {e}")

            return str(text)

        except Exception as e:
            print(f"Error cleaning text: {e}")
            return str(text)
122
+
123
class DataAnalyzer:
    """Summary statistics and basic Streamlit plots for a labelled text dataset."""

    def __init__(self, df, text_column, target_column):
        self.df = df
        self.text_column = text_column
        self.target_column = target_column

    def get_basic_info(self):
        """Return dataset shape, per-column missing counts, and class counts."""
        frame = self.df
        return {
            'shape': frame.shape,
            'missing_values': frame.isnull().sum().to_dict(),
            'class_distribution': frame[self.target_column].value_counts().to_dict(),
        }

    def plot_class_distribution(self):
        """Render a bar chart of label frequencies into the Streamlit app."""
        figure, axis = plt.subplots(figsize=(10, 6))
        label_counts = self.df[self.target_column].value_counts()
        label_counts.plot(kind='bar', ax=axis)
        axis.set_title('Class Distribution')
        axis.set_xlabel('Classes')
        axis.set_ylabel('Count')
        plt.xticks(rotation=45)
        st.pyplot(figure)

    def plot_text_length_distribution(self):
        """Render a histogram of raw text lengths into the Streamlit app."""
        figure, axis = plt.subplots(figsize=(10, 6))
        lengths = self.df[self.text_column].str.len()
        axis.hist(lengths, bins=50, alpha=0.7)
        axis.set_title('Text Length Distribution')
        axis.set_xlabel('Text Length')
        axis.set_ylabel('Frequency')
        st.pyplot(figure)
155
+
156
# Utility functions
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed."""
    os.makedirs(folder_name, exist_ok=True)
    target_path = os.path.join(folder_name, file_name)
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle)
162
+
163
def load_artifacts(folder_name, file_name):
    """Unpickle a saved artifact.

    Returns the object, or None (after showing a Streamlit error) when the
    file does not exist.
    """
    artifact_path = os.path.join(folder_name, file_name)
    try:
        with open(artifact_path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        return None
171
+
172
def load_model(model_name):
    """Unpickle a trained model from the models/ directory.

    Returns the model object, or None (after showing a Streamlit error)
    when no such file has been saved yet.
    """
    model_path = os.path.join('models', model_name)
    try:
        with open(model_path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"Model {model_name} not found. Please train a model first.")
        return None
180
+
181
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Train the selected classifier, save it under models/, and report accuracy.

    The inputs are a train/test split of vectorized text. Returns the saved
    model's filename, or None when model_name is not supported.
    """
    os.makedirs("models", exist_ok=True)

    models_dict = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Linear SVC": LinearSVC(),
        "SVC": SVC(),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB()
    }

    # Guard clause instead of a long if/else body.
    if model_name not in models_dict:
        st.error(f"Model {model_name} not supported")
        return None

    model = models_dict[model_name]

    # Fix: GaussianNB cannot fit scipy sparse matrices, which is exactly what
    # the Tfidf/Count vectorizers produce — densify for that model only.
    if model_name == "Gaussian Naive Bayes":
        if hasattr(X_train, "toarray"):
            X_train = X_train.toarray()
        if hasattr(X_test, "toarray"):
            X_test = X_test.toarray()

    model.fit(X_train, y_train)

    # Save model
    model_filename = f"{model_name.replace(' ', '')}.pkl"
    save_path = os.path.join("models", model_filename)
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)

    # Evaluate model on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    st.success("Model training completed!")
    st.write(f"**Accuracy**: {accuracy:.4f}")

    return model_filename
216
+
217
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify *text* with a previously trained model.

    Loads the pickled model, the matching vectorizer, and the label encoder;
    cleans the input exactly as at training time; returns
    (predicted_label, class_probabilities_or_None). On any failure the error
    is shown in the Streamlit UI and (None, None) is returned.
    """
    try:
        # Load model
        model = load_model(model_name)
        if model is None:
            return None, None

        # Load the vectorizer fitted at training time — transform() must use
        # the same vocabulary the model was trained on.
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)

        # Transform text using the same vectorizer used during training
        text_vector = vectorizer.transform([clean_text])

        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None

        # Get prediction probabilities if available
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # Fix: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit. Probabilities are optional,
                # so any model-level failure here is deliberately ignored.
                pass

        # Decode prediction back to the original label
        predicted_label = encoder.inverse_transform(prediction)[0]

        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
262
+
263
# Streamlit App
# Fix: emoji in user-facing strings were mojibake (UTF-8 decoded with a
# wrong codec); restored the intended characters.
st.set_page_config(page_title="No Code Text Classifier", page_icon="🤖", layout="wide")

st.title('🤖 No Code Text Classification App')
st.write('Understand the behavior of your text data and train a model to classify text data')

# Sidebar
section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])

# Upload Data
st.sidebar.subheader("📁 Upload Your Dataset")
train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])

# Remember which vectorizer was used for training across reruns,
# so the Predictions section loads the matching artifact.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"

if train_data is not None:
    try:
        # latin1 maps every byte to a character, avoiding UnicodeDecodeError
        # on arbitrary uploads.
        train_df = pd.read_csv(train_data, encoding='latin1')

        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding='latin1')
        else:
            test_df = None

        st.write("**Training Data Preview:**")
        st.dataframe(train_df.head(3))

        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("Choose the text column:", columns)
        target = st.sidebar.selectbox("Choose the target column:", columns)

        # Process data: clean text and record raw lengths for the histogram.
        text_cleaner = TextCleaner()
        train_df['clean_text'] = train_df[text_data].apply(lambda x: text_cleaner.clean_text(x))
        train_df['text_length'] = train_df[text_data].str.len()

        # Handle label encoding
        label_encoder = LabelEncoder()
        train_df['target'] = label_encoder.fit_transform(train_df[target])

        # Save label encoder so predictions can decode class indices later.
        os.makedirs("artifacts", exist_ok=True)
        save_artifacts(label_encoder, "artifacts", "encoder.pkl")

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        train_df = None
313
+
314
# Data Analysis Section
# Fix: restored mojibaked emoji in the section's user-facing strings.
if section == "Data Analysis":
    # Short-circuit keeps train_df from being referenced when no file was uploaded.
    if train_data is not None and train_df is not None:
        try:
            st.subheader("📊 Data Insights")

            analyzer = DataAnalyzer(train_df, text_data, target)
            info = analyzer.get_basic_info()

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total Samples", info['shape'][0])
            with col2:
                st.metric("Features", info['shape'][1])
            with col3:
                st.metric("Classes", len(info['class_distribution']))

            st.write("**Class Distribution:**")
            st.write(info['class_distribution'])

            st.write("**Missing Values:**")
            st.write(info['missing_values'])

            st.write("**Processed Data Preview:**")
            st.dataframe(train_df[['clean_text', 'text_length', 'target']].head())

            st.subheader("📈 Visualizations")

            col1, col2 = st.columns(2)
            with col1:
                st.write("**Class Distribution**")
                analyzer.plot_class_distribution()

            with col2:
                st.write("**Text Length Distribution**")
                analyzer.plot_text_length_distribution()

        except Exception as e:
            st.error(f"Error in data analysis: {str(e)}")
    else:
        st.warning("⚠️ Please upload training data to get insights")
355
+
356
+ # Train Model Section
357
+ elif section == "Train Model":
358
+ if train_data is not None and train_df is not None:
359
+ try:
360
+ st.subheader("๐Ÿš€ Train a Model")
361
+
362
+ col1, col2 = st.columns(2)
363
+
364
+ with col1:
365
+ model = st.selectbox("Choose the Model", [
366
+ "Logistic Regression", "Decision Tree",
367
+ "Random Forest", "Linear SVC", "SVC",
368
+ "Multinomial Naive Bayes", "Gaussian Naive Bayes"
369
+ ])
370
+
371
+ with col2:
372
+ vectorizer_choice = st.selectbox("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
373
+
374
+ # Initialize vectorizer
375
+ if vectorizer_choice == "Tfidf Vectorizer":
376
+ vectorizer = TfidfVectorizer(max_features=10000)
377
+ st.session_state.vectorizer_type = "tfidf"
378
+ else:
379
+ vectorizer = CountVectorizer(max_features=10000)
380
+ st.session_state.vectorizer_type = "count"
381
+
382
+ st.write("**Training Data Preview:**")
383
+ st.dataframe(train_df[['clean_text', 'target']].head())
384
+
385
+ # Vectorize text data
386
+ X = vectorizer.fit_transform(train_df['clean_text'])
387
+ y = train_df['target']
388
+
389
+ # Split data
390
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
391
+ st.write(f"**Data split** - Train: {X_train.shape}, Test: {X_test.shape}")
392
+
393
+ # Save vectorizer for later use
394
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
395
+ save_artifacts(vectorizer, "artifacts", vectorizer_filename)
396
+
397
+ if st.button("๐ŸŽฏ Start Training", type="primary"):
398
+ with st.spinner("Training model..."):
399
+ model_filename = train_model(model, X_train, X_test, y_train, y_test)
400
+ if model_filename:
401
+ st.info("โœ… You can now use the 'Predictions' section to classify new text.")
402
+
403
+ except Exception as e:
404
+ st.error(f"Error in model training: {str(e)}")
405
+ else:
406
+ st.warning("โš ๏ธ Please upload training data to train a model")
407
+
408
+ # Predictions Section
409
+ elif section == "Predictions":
410
+ st.subheader("๐Ÿ”ฎ Perform Predictions on New Text")
411
+
412
+ # Check if models exist
413
+ if os.path.exists("models") and os.listdir("models"):
414
+ # Text input for prediction
415
+ text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type your text here...")
416
+
417
+ # Model selection
418
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
419
+
420
+ if available_models:
421
+ selected_model = st.selectbox("Choose the trained model:", available_models)
422
+
423
+ # Prediction button
424
+ if st.button("๐ŸŽฏ Predict", type="primary"):
425
+ if text_input.strip():
426
+ with st.spinner("Making prediction..."):
427
+ predicted_label, prediction_proba = predict_text(
428
+ selected_model,
429
+ text_input,
430
+ st.session_state.get('vectorizer_type', 'tfidf')
431
+ )
432
+
433
+ if predicted_label is not None:
434
+ st.success("โœ… Prediction completed!")
435
+
436
+ # Display results
437
+ st.markdown("### ๐Ÿ“Š Prediction Results")
438
+ st.markdown(f"**Input Text:** {text_input}")
439
+ st.markdown(f"**Predicted Class:** `{predicted_label}`")
440
+
441
+ # Display probabilities if available
442
+ if prediction_proba is not None:
443
+ st.markdown("**๐Ÿ“ˆ Class Probabilities:**")
444
+
445
+ # Load encoder to get class names
446
+ encoder = load_artifacts("artifacts", "encoder.pkl")
447
+ if encoder is not None:
448
+ classes = encoder.classes_
449
+ prob_df = pd.DataFrame({
450
+ 'Class': classes,
451
+ 'Probability': prediction_proba
452
+ }).sort_values('Probability', ascending=False)
453
+
454
+ st.bar_chart(prob_df.set_index('Class'))
455
+ st.dataframe(prob_df, use_container_width=True)
456
+ else:
457
+ st.warning("โš ๏ธ Please enter some text to classify")
458
+ else:
459
+ st.warning("โš ๏ธ No trained models found. Please train a model first.")
460
+ else:
461
+ st.warning("โš ๏ธ No trained models found. Please go to 'Train Model' section to train a model first.")
462
+
463
+ # Option to classify multiple texts
464
+ st.markdown("---")
465
+ st.subheader("๐Ÿ“Š Batch Predictions")
466
+
467
+ uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
468
+
469
+ if uploaded_file is not None:
470
+ try:
471
+ batch_df = pd.read_csv(uploaded_file, encoding='latin1')
472
+ st.write("**Uploaded data preview:**")
473
+ st.dataframe(batch_df.head())
474
+
475
+ # Select text column
476
+ text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
477
+
478
+ if os.path.exists("models") and os.listdir("models"):
479
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
480
+ batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
481
+
482
+ if st.button("๐Ÿš€ Run Batch Predictions", type="primary"):
483
+ with st.spinner("Processing batch predictions..."):
484
+ predictions = []
485
+ progress_bar = st.progress(0)
486
+
487
+ for i, text in enumerate(batch_df[text_column]):
488
+ pred, _ = predict_text(
489
+ batch_model,
490
+ str(text),
491
+ st.session_state.get('vectorizer_type', 'tfidf')
492
+ )
493
+ predictions.append(pred if pred is not None else "Error")
494
+ progress_bar.progress((i + 1) / len(batch_df))
495
+
496
+ batch_df['Predicted_Class'] = predictions
497
+
498
+ st.success("โœ… Batch predictions completed!")
499
+ st.write("**Results:**")
500
+ st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)
501
+
502
+ # Download results
503
+ csv = batch_df.to_csv(index=False)
504
+ st.download_button(
505
+ label="๐Ÿ’พ Download predictions as CSV",
506
+ data=csv,
507
+ file_name="batch_predictions.csv",
508
+ mime="text/csv"
509
+ )
510
+ except Exception as e:
511
+ st.error(f"Error in batch prediction: {str(e)}")
512
+
513
# Footer
# Fix: restored mojibaked ❤️ and 🤗 characters.
st.markdown("---")
st.markdown("Built with ❤️ using Streamlit | Deploy on 🤗 Hugging Face Spaces")