Alamgirapi committed on
Commit
b97bac9
·
verified ·
1 Parent(s): 56d15cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -309
app.py CHANGED
@@ -4,81 +4,85 @@ import matplotlib.pyplot as plt
4
  import numpy as np
5
  import os
6
  import pickle
7
- import ssl
8
- import nltk
9
  import re
10
  import string
11
  from pathlib import Path
12
  from sklearn.preprocessing import LabelEncoder
13
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
14
  from sklearn.model_selection import train_test_split
15
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
16
  from sklearn.linear_model import LogisticRegression
17
  from sklearn.tree import DecisionTreeClassifier
18
- from sklearn.svm import LinearSVC, SVC
19
  from sklearn.ensemble import RandomForestClassifier
20
- from sklearn.naive_bayes import MultinomialNB, GaussianNB
21
- from nltk.corpus import stopwords
22
- from nltk.stem import WordNetLemmatizer
23
 
24
- # Fix SSL certificate issues for NLTK downloads
25
- try:
26
- _create_unverified_https_context = ssl._create_unverified_context
27
- except AttributeError:
28
- pass
29
- else:
30
- ssl._create_default_https_context = _create_unverified_https_context
31
 
32
- # Download NLTK data with error handling
33
  @st.cache_resource
34
- def download_nltk_data():
35
- try:
36
- nltk.data.find('corpora/stopwords')
37
- except LookupError:
38
- nltk.download('stopwords', quiet=True)
39
-
40
  try:
41
- nltk.data.find('corpora/wordnet')
42
- except LookupError:
43
- nltk.download('wordnet', quiet=True)
44
- nltk.download('omw-1.4', quiet=True)
45
-
46
- # Download required NLTK data
47
- download_nltk_data()
48
-
49
- class TextCleaner:
50
- """Class for cleaning Text"""
51
- def __init__(self, currency_symbols=r'[\$\£\€\¥\₹\¢\₽\₩\₪]', stop_words=None, lemmatizer=None):
52
- self.currency_symbols = currency_symbols
53
-
54
- if stop_words is None:
55
  try:
56
- self.stop_words = set(stopwords.words('english'))
57
- except LookupError:
58
  nltk.download('stopwords', quiet=True)
59
- self.stop_words = set(stopwords.words('english'))
60
- else:
61
- self.stop_words = stop_words
62
-
63
- if lemmatizer is None:
64
- try:
65
- self.lemmatizer = WordNetLemmatizer()
66
- # Test the lemmatizer to ensure it works
67
- test_word = self.lemmatizer.lemmatize('testing')
68
- except (AttributeError, LookupError) as e:
69
- print(f"WordNet lemmatizer initialization failed: {e}")
70
- nltk.download('wordnet', quiet=True)
71
  nltk.download('omw-1.4', quiet=True)
72
- self.lemmatizer = WordNetLemmatizer()
73
- else:
74
- self.lemmatizer = lemmatizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def remove_punctuation(self, text):
77
  return text.translate(str.maketrans('', '', string.punctuation))
78
 
79
  def clean_text(self, text):
80
- """Clean the text by removing punctuations, html tag, underscore,
81
- whitespaces, numbers, stopwords. Lemmatize the words in root format."""
82
  if not isinstance(text, str):
83
  text = str(text) if text is not None else ""
84
 
@@ -86,10 +90,11 @@ class TextCleaner:
86
  return ""
87
 
88
  try:
 
89
  text = text.lower()
90
  text = re.sub(self.currency_symbols, 'currency', text)
91
 
92
- # Remove any kind of emojis in the text
93
  emoji_pattern = re.compile("["
94
  u"\U0001F600-\U0001F64F" # emoticons
95
  u"\U0001F300-\U0001F5FF" # symbols & pictographs
@@ -99,29 +104,34 @@ class TextCleaner:
99
  u"\U000024C2-\U0001F251"
100
  "]+", flags=re.UNICODE)
101
  text = emoji_pattern.sub(r'', text)
 
 
102
  text = self.remove_punctuation(text)
103
  text = re.compile('<.*?>').sub('', text)
104
  text = text.replace('_', '')
105
  text = re.sub(r'[^\w\s]', '', text)
106
  text = re.sub(r'\d', ' ', text)
107
  text = re.sub(r'\s+', ' ', text).strip()
108
- text = ' '.join(word for word in text.split() if word not in self.stop_words)
109
 
110
- # Lemmatization with error handling
111
- try:
112
- text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
113
- except (AttributeError, LookupError) as e:
114
- print(f"Lemmatization failed for text: {e}")
115
- pass
 
 
 
 
 
 
116
 
117
- return str(text)
118
-
119
  except Exception as e:
120
- print(f"Error cleaning text: {e}")
121
  return str(text)
122
 
123
  class DataAnalyzer:
124
- """Class for data analysis and visualization"""
125
  def __init__(self, df, text_column, target_column):
126
  self.df = df
127
  self.text_column = text_column
@@ -136,115 +146,129 @@ class DataAnalyzer:
136
  return info
137
 
138
  def plot_class_distribution(self):
139
- fig, ax = plt.subplots(figsize=(10, 6))
140
- self.df[self.target_column].value_counts().plot(kind='bar', ax=ax)
141
- ax.set_title('Class Distribution')
142
- ax.set_xlabel('Classes')
143
- ax.set_ylabel('Count')
144
- plt.xticks(rotation=45)
145
- st.pyplot(fig)
 
 
 
 
146
 
147
  def plot_text_length_distribution(self):
148
- fig, ax = plt.subplots(figsize=(10, 6))
149
- text_lengths = self.df[self.text_column].str.len()
150
- ax.hist(text_lengths, bins=50, alpha=0.7)
151
- ax.set_title('Text Length Distribution')
152
- ax.set_xlabel('Text Length')
153
- ax.set_ylabel('Frequency')
154
- st.pyplot(fig)
 
 
 
 
155
 
156
- # Utility functions
157
  def save_artifacts(obj, folder_name, file_name):
158
- """Save artifacts like encoders and vectorizers"""
159
- os.makedirs(folder_name, exist_ok=True)
160
- with open(os.path.join(folder_name, file_name), 'wb') as f:
161
- pickle.dump(obj, f)
 
 
 
 
 
162
 
163
  def load_artifacts(folder_name, file_name):
164
- """Load saved artifacts"""
165
  try:
166
  with open(os.path.join(folder_name, file_name), 'rb') as f:
167
  return pickle.load(f)
168
  except FileNotFoundError:
169
- st.error(f"File {file_name} not found in {folder_name} folder")
170
  return None
171
-
172
- def load_model(model_name):
173
- """Load trained model"""
174
- try:
175
- with open(os.path.join('models', model_name), 'rb') as f:
176
- return pickle.load(f)
177
- except FileNotFoundError:
178
- st.error(f"Model {model_name} not found. Please train a model first.")
179
  return None
180
 
181
  def train_model(model_name, X_train, X_test, y_train, y_test):
182
- """Train selected model"""
183
- os.makedirs("models", exist_ok=True)
184
-
185
- models_dict = {
186
- "Logistic Regression": LogisticRegression(),
187
- "Decision Tree": DecisionTreeClassifier(),
188
- "Random Forest": RandomForestClassifier(),
189
- "Linear SVC": LinearSVC(),
190
- "SVC": SVC(),
191
- "Multinomial Naive Bayes": MultinomialNB(),
192
- "Gaussian Naive Bayes": GaussianNB()
193
- }
194
-
195
- if model_name in models_dict:
 
 
 
196
  model = models_dict[model_name]
 
 
197
  model.fit(X_train, y_train)
198
 
199
  # Save model
200
- model_filename = f"{model_name.replace(' ', '')}.pkl"
201
  save_path = os.path.join("models", model_filename)
202
- with open(save_path, 'wb') as f:
203
- pickle.dump(model, f)
204
 
205
- # Evaluate model
206
- y_pred = model.predict(X_test)
207
- accuracy = accuracy_score(y_test, y_pred)
208
-
209
- st.success("Model training completed!")
210
- st.write(f"**Accuracy**: {accuracy:.4f}")
211
-
212
- return model_filename
213
- else:
214
- st.error(f"Model {model_name} not supported")
 
 
 
 
215
  return None
216
 
217
  def predict_text(model_name, text, vectorizer_type="tfidf"):
218
- """Make prediction on new text"""
219
  try:
220
- # Load model
221
- model = load_model(model_name)
222
  if model is None:
223
  return None, None
224
 
225
- # Load vectorizer
226
  vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
227
  vectorizer = load_artifacts("artifacts", vectorizer_file)
228
  if vectorizer is None:
229
  return None, None
230
 
231
- # Load label encoder
232
  encoder = load_artifacts("artifacts", "encoder.pkl")
233
  if encoder is None:
234
  return None, None
235
 
236
- # Clean and vectorize text
237
  text_cleaner = TextCleaner()
238
  clean_text = text_cleaner.clean_text(text)
239
 
240
- # Transform text using the same vectorizer used during training
241
- text_vector = vectorizer.transform([clean_text])
 
242
 
243
- # Make prediction
 
244
  prediction = model.predict(text_vector)
245
- prediction_proba = None
246
 
247
- # Get prediction probabilities if available
 
248
  if hasattr(model, 'predict_proba'):
249
  try:
250
  prediction_proba = model.predict_proba(text_vector)[0]
@@ -257,13 +281,16 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
257
  return predicted_label, prediction_proba
258
 
259
  except Exception as e:
260
- st.error(f"Error during prediction: {str(e)}")
261
  return None, None
262
 
263
- # Streamlit App
264
- st.set_page_config(page_title="No Code Text Classifier", page_icon="🤖", layout="wide")
265
-
266
  st.title('🤖 No Code Text Classification App')
 
 
 
 
 
267
  st.write('Understand the behavior of your text data and train a model to classify text data')
268
 
269
  # Sidebar
@@ -272,155 +299,170 @@ section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "P
272
  # Upload Data
273
  st.sidebar.subheader("📁 Upload Your Dataset")
274
  train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
275
- test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
276
 
277
- # Global variables to store data and settings
278
  if 'vectorizer_type' not in st.session_state:
279
  st.session_state.vectorizer_type = "tfidf"
280
 
 
 
281
  if train_data is not None:
282
  try:
283
- train_df = pd.read_csv(train_data, encoding='latin1')
 
 
 
 
 
 
284
 
285
- if test_data is not None:
286
- test_df = pd.read_csv(test_data, encoding='latin1')
287
  else:
288
- test_df = None
 
289
 
290
- st.write("**Training Data Preview:**")
291
- st.dataframe(train_df.head(3))
292
-
293
- columns = train_df.columns.tolist()
294
- text_data = st.sidebar.selectbox("Choose the text column:", columns)
295
- target = st.sidebar.selectbox("Choose the target column:", columns)
296
 
297
- # Process data
298
- text_cleaner = TextCleaner()
299
- train_df['clean_text'] = train_df[text_data].apply(lambda x: text_cleaner.clean_text(x))
300
- train_df['text_length'] = train_df[text_data].str.len()
301
-
302
- # Handle label encoding
303
- label_encoder = LabelEncoder()
304
- train_df['target'] = label_encoder.fit_transform(train_df[target])
305
-
306
- # Save label encoder for later use
307
- os.makedirs("artifacts", exist_ok=True)
308
- save_artifacts(label_encoder, "artifacts", "encoder.pkl")
309
-
 
 
 
310
  except Exception as e:
311
- st.error(f"Error loading data: {str(e)}")
312
  train_df = None
313
 
314
  # Data Analysis Section
315
  if section == "Data Analysis":
316
- if train_data is not None and train_df is not None:
317
- try:
318
- st.subheader("📊 Data Insights")
319
-
320
- analyzer = DataAnalyzer(train_df, text_data, target)
321
- info = analyzer.get_basic_info()
322
-
323
- col1, col2, col3 = st.columns(3)
324
- with col1:
325
- st.metric("Total Samples", info['shape'][0])
326
- with col2:
327
- st.metric("Features", info['shape'][1])
328
- with col3:
329
- st.metric("Classes", len(info['class_distribution']))
330
-
331
- st.write("**Class Distribution:**")
332
- st.write(info['class_distribution'])
333
-
334
- st.write("**Missing Values:**")
335
- st.write(info['missing_values'])
336
-
337
- st.write("**Processed Data Preview:**")
338
- st.dataframe(train_df[['clean_text', 'text_length', 'target']].head())
339
-
340
- st.subheader("📈 Visualizations")
341
-
342
- col1, col2 = st.columns(2)
343
- with col1:
344
- st.write("**Class Distribution**")
345
- analyzer.plot_class_distribution()
346
-
347
- with col2:
348
- st.write("**Text Length Distribution**")
349
- analyzer.plot_text_length_distribution()
350
-
351
- except Exception as e:
352
- st.error(f"Error in data analysis: {str(e)}")
353
  else:
354
- st.warning("⚠️ Please upload training data to get insights")
355
 
356
  # Train Model Section
357
  elif section == "Train Model":
358
- if train_data is not None and train_df is not None:
359
- try:
360
- st.subheader("🚀 Train a Model")
361
 
362
- col1, col2 = st.columns(2)
363
 
364
- with col1:
365
- model = st.selectbox("Choose the Model", [
366
- "Logistic Regression", "Decision Tree",
367
- "Random Forest", "Linear SVC", "SVC",
368
- "Multinomial Naive Bayes", "Gaussian Naive Bayes"
369
- ])
370
-
371
- with col2:
372
- vectorizer_choice = st.selectbox("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
 
 
 
373
 
 
 
 
 
 
 
 
 
374
  # Initialize vectorizer
 
 
375
  if vectorizer_choice == "Tfidf Vectorizer":
376
- vectorizer = TfidfVectorizer(max_features=10000)
377
  st.session_state.vectorizer_type = "tfidf"
378
  else:
379
- vectorizer = CountVectorizer(max_features=10000)
380
  st.session_state.vectorizer_type = "count"
381
 
382
- st.write("**Training Data Preview:**")
383
- st.dataframe(train_df[['clean_text', 'target']].head())
384
-
385
- # Vectorize text data
386
- X = vectorizer.fit_transform(train_df['clean_text'])
387
- y = train_df['target']
388
-
389
- # Split data
390
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
391
- st.write(f"**Data split** - Train: {X_train.shape}, Test: {X_test.shape}")
392
-
393
- # Save vectorizer for later use
394
- vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
395
- save_artifacts(vectorizer, "artifacts", vectorizer_filename)
396
-
397
  if st.button("🎯 Start Training", type="primary"):
398
  with st.spinner("Training model..."):
399
- model_filename = train_model(model, X_train, X_test, y_train, y_test)
400
- if model_filename:
401
- st.info("✅ You can now use the 'Predictions' section to classify new text.")
402
-
403
- except Exception as e:
404
- st.error(f"Error in model training: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  else:
406
- st.warning("⚠️ Please upload training data to train a model")
407
 
408
  # Predictions Section
409
  elif section == "Predictions":
410
- st.subheader("🔮 Perform Predictions on New Text")
411
 
412
- # Check if models exist
413
  if os.path.exists("models") and os.listdir("models"):
414
- # Text input for prediction
415
- text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type your text here...")
416
-
417
- # Model selection
418
  available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
419
 
420
  if available_models:
421
- selected_model = st.selectbox("Choose the trained model:", available_models)
 
 
 
 
422
 
423
- # Prediction button
424
  if st.button("🎯 Predict", type="primary"):
425
  if text_input.strip():
426
  with st.spinner("Making prediction..."):
@@ -432,17 +474,10 @@ elif section == "Predictions":
432
 
433
  if predicted_label is not None:
434
  st.success("✅ Prediction completed!")
435
-
436
- # Display results
437
- st.markdown("### 📊 Prediction Results")
438
- st.markdown(f"**Input Text:** {text_input}")
439
  st.markdown(f"**Predicted Class:** `{predicted_label}`")
440
 
441
- # Display probabilities if available
442
  if prediction_proba is not None:
443
- st.markdown("**📈 Class Probabilities:**")
444
-
445
- # Load encoder to get class names
446
  encoder = load_artifacts("artifacts", "encoder.pkl")
447
  if encoder is not None:
448
  classes = encoder.classes_
@@ -451,65 +486,14 @@ elif section == "Predictions":
451
  'Probability': prediction_proba
452
  }).sort_values('Probability', ascending=False)
453
 
454
- st.bar_chart(prob_df.set_index('Class'))
455
  st.dataframe(prob_df, use_container_width=True)
456
  else:
457
- st.warning("⚠️ Please enter some text to classify")
458
  else:
459
- st.warning("⚠️ No trained models found. Please train a model first.")
460
  else:
461
- st.warning("⚠️ No trained models found. Please go to 'Train Model' section to train a model first.")
462
-
463
- # Option to classify multiple texts
464
- st.markdown("---")
465
- st.subheader("📊 Batch Predictions")
466
-
467
- uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
468
-
469
- if uploaded_file is not None:
470
- try:
471
- batch_df = pd.read_csv(uploaded_file, encoding='latin1')
472
- st.write("**Uploaded data preview:**")
473
- st.dataframe(batch_df.head())
474
-
475
- # Select text column
476
- text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
477
-
478
- if os.path.exists("models") and os.listdir("models"):
479
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
480
- batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
481
-
482
- if st.button("🚀 Run Batch Predictions", type="primary"):
483
- with st.spinner("Processing batch predictions..."):
484
- predictions = []
485
- progress_bar = st.progress(0)
486
-
487
- for i, text in enumerate(batch_df[text_column]):
488
- pred, _ = predict_text(
489
- batch_model,
490
- str(text),
491
- st.session_state.get('vectorizer_type', 'tfidf')
492
- )
493
- predictions.append(pred if pred is not None else "Error")
494
- progress_bar.progress((i + 1) / len(batch_df))
495
-
496
- batch_df['Predicted_Class'] = predictions
497
-
498
- st.success("✅ Batch predictions completed!")
499
- st.write("**Results:**")
500
- st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)
501
-
502
- # Download results
503
- csv = batch_df.to_csv(index=False)
504
- st.download_button(
505
- label="💾 Download predictions as CSV",
506
- data=csv,
507
- file_name="batch_predictions.csv",
508
- mime="text/csv"
509
- )
510
- except Exception as e:
511
- st.error(f"Error in batch prediction: {str(e)}")
512
 
513
  # Footer
514
  st.markdown("---")
515
- st.markdown("Built with ❤️ using Streamlit | Deploy on 🤗 Hugging Face Spaces")
 
4
  import numpy as np
5
  import os
6
  import pickle
 
 
7
  import re
8
  import string
9
  from pathlib import Path
10
  from sklearn.preprocessing import LabelEncoder
11
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
12
  from sklearn.model_selection import train_test_split
13
+ from sklearn.metrics import accuracy_score
14
  from sklearn.linear_model import LogisticRegression
15
  from sklearn.tree import DecisionTreeClassifier
16
+ from sklearn.svm import LinearSVC
17
  from sklearn.ensemble import RandomForestClassifier
18
+ from sklearn.naive_bayes import MultinomialNB
 
 
19
 
20
+ # Configure Streamlit page
21
+ st.set_page_config(page_title="No Code Text Classifier", page_icon="🤖", layout="wide")
 
 
 
 
 
22
 
23
# Initialize NLTK components with fallbacks
@st.cache_resource
def init_nltk_components():
    """Resolve NLTK resources once, with graceful degradation.

    Returns:
        tuple: (stop_words, lemmatizer, nltk_available) where
        ``stop_words`` is a set of stopword strings, ``lemmatizer`` is a
        WordNetLemmatizer or None, and ``nltk_available`` is True only when
        full NLTK support (corpora + lemmatizer) is usable.

    Fallback order: pre-downloaded NLTK data -> on-the-fly download ->
    built-in basic English stopword set (no lemmatization).
    """
    try:
        import nltk
        # Try to use pre-downloaded data first
        try:
            from nltk.corpus import stopwords
            from nltk.stem import WordNetLemmatizer
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()
            # Probe the lemmatizer so a missing wordnet corpus fails here,
            # not later in the middle of cleaning.
            _ = lemmatizer.lemmatize('test')
            return stop_words, lemmatizer, True
        except Exception:
            # NOTE: was a bare `except:`, which also swallows
            # SystemExit/KeyboardInterrupt; Exception is the right scope here.
            # Fallback: try to download
            try:
                nltk.download('stopwords', quiet=True)
                nltk.download('wordnet', quiet=True)
                nltk.download('omw-1.4', quiet=True)
                from nltk.corpus import stopwords
                from nltk.stem import WordNetLemmatizer
                stop_words = set(stopwords.words('english'))
                lemmatizer = WordNetLemmatizer()
                return stop_words, lemmatizer, True
            except Exception:
                # Final fallback: use basic English stopwords
                basic_stopwords = {
                    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
                    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
                    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
                    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
                    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
                    'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
                    'with', 'through', 'during', 'before', 'after', 'above', 'below',
                    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
                    'further', 'then', 'once'
                }
                return basic_stopwords, None, False
    except ImportError:
        # NLTK not available at all
        return set(), None, False
72
+
73
+ class TextCleaner:
74
+ """Simplified text cleaner with fallbacks"""
75
+ def __init__(self):
76
+ self.currency_symbols = r'[\$\£\€\¥\₹\¢\₽\₩\₪]'
77
+ self.stop_words = STOP_WORDS
78
+ self.lemmatizer = LEMMATIZER
79
+ self.nltk_available = NLTK_AVAILABLE
80
 
81
  def remove_punctuation(self, text):
82
  return text.translate(str.maketrans('', '', string.punctuation))
83
 
84
  def clean_text(self, text):
85
+ """Clean text with robust error handling"""
 
86
  if not isinstance(text, str):
87
  text = str(text) if text is not None else ""
88
 
 
90
  return ""
91
 
92
  try:
93
+ # Basic cleaning
94
  text = text.lower()
95
  text = re.sub(self.currency_symbols, 'currency', text)
96
 
97
+ # Remove emojis
98
  emoji_pattern = re.compile("["
99
  u"\U0001F600-\U0001F64F" # emoticons
100
  u"\U0001F300-\U0001F5FF" # symbols & pictographs
 
104
  u"\U000024C2-\U0001F251"
105
  "]+", flags=re.UNICODE)
106
  text = emoji_pattern.sub(r'', text)
107
+
108
+ # Remove punctuation and clean
109
  text = self.remove_punctuation(text)
110
  text = re.compile('<.*?>').sub('', text)
111
  text = text.replace('_', '')
112
  text = re.sub(r'[^\w\s]', '', text)
113
  text = re.sub(r'\d', ' ', text)
114
  text = re.sub(r'\s+', ' ', text).strip()
 
115
 
116
+ # Remove stopwords if available
117
+ if self.stop_words:
118
+ text = ' '.join(word for word in text.split() if word not in self.stop_words)
119
+
120
+ # Lemmatize if available
121
+ if self.lemmatizer and self.nltk_available:
122
+ try:
123
+ text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
124
+ except:
125
+ pass # Skip lemmatization if it fails
126
+
127
+ return text
128
 
 
 
129
  except Exception as e:
130
+ st.warning(f"Text cleaning warning: {e}")
131
  return str(text)
132
 
133
  class DataAnalyzer:
134
+ """Simplified data analyzer"""
135
  def __init__(self, df, text_column, target_column):
136
  self.df = df
137
  self.text_column = text_column
 
146
  return info
147
 
148
  def plot_class_distribution(self):
149
+ try:
150
+ fig, ax = plt.subplots(figsize=(10, 6))
151
+ self.df[self.target_column].value_counts().plot(kind='bar', ax=ax)
152
+ ax.set_title('Class Distribution')
153
+ ax.set_xlabel('Classes')
154
+ ax.set_ylabel('Count')
155
+ plt.xticks(rotation=45)
156
+ plt.tight_layout()
157
+ st.pyplot(fig)
158
+ except Exception as e:
159
+ st.error(f"Error creating plot: {e}")
160
 
161
  def plot_text_length_distribution(self):
162
+ try:
163
+ fig, ax = plt.subplots(figsize=(10, 6))
164
+ text_lengths = self.df[self.text_column].str.len()
165
+ ax.hist(text_lengths, bins=50, alpha=0.7)
166
+ ax.set_title('Text Length Distribution')
167
+ ax.set_xlabel('Text Length')
168
+ ax.set_ylabel('Frequency')
169
+ plt.tight_layout()
170
+ st.pyplot(fig)
171
+ except Exception as e:
172
+ st.error(f"Error creating plot: {e}")
173
 
174
+ # Utility functions with better error handling
175
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed.

    Returns:
        bool: True on success; False after reporting the failure via the UI.
    """
    target_path = os.path.join(folder_name, file_name)
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(target_path, 'wb') as handle:
            pickle.dump(obj, handle)
    except Exception as e:
        st.error(f"Error saving {file_name}: {e}")
        return False
    return True
185
 
186
def load_artifacts(folder_name, file_name):
    """Unpickle and return folder_name/file_name; None (with a UI error) on failure."""
    source_path = os.path.join(folder_name, file_name)
    try:
        with open(source_path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name}")
    except Exception as e:
        st.error(f"Error loading {file_name}: {e}")
    return None
197
 
198
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Train the selected classifier, persist it under models/, and report accuracy.

    Args:
        model_name: Human-readable key selecting one of the supported models.
        X_train, X_test: Vectorized feature matrices.
        y_train, y_test: Encoded target labels.

    Returns:
        str | None: The saved model's filename on success, otherwise None
        (unsupported model, save failure, or training error).
    """
    try:
        os.makedirs("models", exist_ok=True)

        # Simplified model dictionary
        models_dict = {
            "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "Random Forest": RandomForestClassifier(n_estimators=50, random_state=42),  # Reduced for speed
            "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
            "Multinomial Naive Bayes": MultinomialNB(),
        }

        if model_name not in models_dict:
            st.error(f"Model {model_name} not supported")
            return None

        model = models_dict[model_name]

        # Train model
        model.fit(X_train, y_train)

        # Persist the fitted model. save_artifacts() already builds the path
        # and handles errors, so the previously computed-but-unused
        # `save_path` local has been removed (dead code).
        model_filename = f"{model_name.replace(' ', '_')}.pkl"
        if save_artifacts(model, "models", model_filename):
            # Evaluate on the held-out split
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            st.success(" Model training completed!")
            st.write(f"**Accuracy**: {accuracy:.4f}")

            return model_filename
        return None

    except Exception as e:
        st.error(f"Error training model: {e}")
        return None
240
 
241
  def predict_text(model_name, text, vectorizer_type="tfidf"):
242
+ """Make prediction with better error handling"""
243
  try:
244
+ # Load components
245
+ model = load_artifacts("models", model_name)
246
  if model is None:
247
  return None, None
248
 
 
249
  vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
250
  vectorizer = load_artifacts("artifacts", vectorizer_file)
251
  if vectorizer is None:
252
  return None, None
253
 
 
254
  encoder = load_artifacts("artifacts", "encoder.pkl")
255
  if encoder is None:
256
  return None, None
257
 
258
+ # Process text
259
  text_cleaner = TextCleaner()
260
  clean_text = text_cleaner.clean_text(text)
261
 
262
+ if not clean_text.strip():
263
+ st.warning("Text became empty after cleaning")
264
+ return None, None
265
 
266
+ # Vectorize and predict
267
+ text_vector = vectorizer.transform([clean_text])
268
  prediction = model.predict(text_vector)
 
269
 
270
+ # Get probabilities if possible
271
+ prediction_proba = None
272
  if hasattr(model, 'predict_proba'):
273
  try:
274
  prediction_proba = model.predict_proba(text_vector)[0]
 
281
  return predicted_label, prediction_proba
282
 
283
  except Exception as e:
284
+ st.error(f"Prediction error: {e}")
285
  return None, None
286
 
287
+ # Main Streamlit App
 
 
288
  st.title('🤖 No Code Text Classification App')
289
+
290
+ # Show NLTK status
291
+ if not NLTK_AVAILABLE:
292
+ st.warning("⚠️ NLTK not fully available. Using basic text processing.")
293
+
294
  st.write('Understand the behavior of your text data and train a model to classify text data')
295
 
296
  # Sidebar
 
299
  # Upload Data
300
  st.sidebar.subheader("📁 Upload Your Dataset")
301
  train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
 
302
 
303
+ # Initialize session state
304
  if 'vectorizer_type' not in st.session_state:
305
  st.session_state.vectorizer_type = "tfidf"
306
 
307
+ # Load and process data
308
+ train_df = None
309
  if train_data is not None:
310
  try:
311
+ # Try different encodings
312
+ for encoding in ['utf-8', 'latin1', 'iso-8859-1']:
313
+ try:
314
+ train_df = pd.read_csv(train_data, encoding=encoding)
315
+ break
316
+ except UnicodeDecodeError:
317
+ continue
318
 
319
+ if train_df is None:
320
+ st.error("Could not read the CSV file. Please check the encoding.")
321
  else:
322
+ st.write("**Training Data Preview:**")
323
+ st.dataframe(train_df.head(3))
324
 
325
+ columns = train_df.columns.tolist()
326
+ text_data = st.sidebar.selectbox("Choose the text column:", columns)
327
+ target = st.sidebar.selectbox("Choose the target column:", columns)
 
 
 
328
 
329
+ # Process data
330
+ if text_data and target:
331
+ with st.spinner("Processing data..."):
332
+ text_cleaner = TextCleaner()
333
+ train_df['clean_text'] = train_df[text_data].apply(
334
+ lambda x: text_cleaner.clean_text(x) if pd.notna(x) else ""
335
+ )
336
+ train_df['text_length'] = train_df[text_data].astype(str).str.len()
337
+
338
+ # Handle label encoding
339
+ label_encoder = LabelEncoder()
340
+ train_df['target'] = label_encoder.fit_transform(train_df[target].astype(str))
341
+
342
+ # Save encoder
343
+ save_artifacts(label_encoder, "artifacts", "encoder.pkl")
344
+
345
  except Exception as e:
346
+ st.error(f"Error processing data: {e}")
347
  train_df = None
348
 
349
  # Data Analysis Section
350
  if section == "Data Analysis":
351
+ if train_df is not None:
352
+ st.subheader("📊 Data Insights")
353
+
354
+ analyzer = DataAnalyzer(train_df, text_data, target)
355
+ info = analyzer.get_basic_info()
356
+
357
+ col1, col2, col3 = st.columns(3)
358
+ with col1:
359
+ st.metric("Total Samples", info['shape'][0])
360
+ with col2:
361
+ st.metric("Features", info['shape'][1])
362
+ with col3:
363
+ st.metric("Classes", len(info['class_distribution']))
364
+
365
+ st.write("**Class Distribution:**")
366
+ st.write(info['class_distribution'])
367
+
368
+ # Show sample of processed data
369
+ st.write("**Processed Data Preview:**")
370
+ sample_df = train_df[['clean_text', 'text_length', 'target']].head(10)
371
+ st.dataframe(sample_df)
372
+
373
+ st.subheader("📈 Visualizations")
374
+
375
+ col1, col2 = st.columns(2)
376
+ with col1:
377
+ st.write("**Class Distribution**")
378
+ analyzer.plot_class_distribution()
379
+
380
+ with col2:
381
+ st.write("**Text Length Distribution**")
382
+ analyzer.plot_text_length_distribution()
 
 
 
 
 
383
  else:
384
+ st.warning("⚠️ Please upload training data to see analysis")
385
 
386
  # Train Model Section
387
  elif section == "Train Model":
388
+ if train_df is not None and 'clean_text' in train_df.columns:
389
+ st.subheader("🚀 Train a Model")
 
390
 
391
+ col1, col2 = st.columns(2)
392
 
393
+ with col1:
394
+ model = st.selectbox("Choose the Model", [
395
+ "Logistic Regression",
396
+ "Decision Tree",
397
+ "Random Forest",
398
+ "Linear SVC",
399
+ "Multinomial Naive Bayes"
400
+ ])
401
+
402
+ with col2:
403
+ vectorizer_choice = st.selectbox("Choose Vectorizer",
404
+ ["Tfidf Vectorizer", "Count Vectorizer"])
405
 
406
+ # Filter out empty texts
407
+ valid_data = train_df[train_df['clean_text'].str.len() > 0].copy()
408
+
409
+ if len(valid_data) == 0:
410
+ st.error("No valid text data after cleaning!")
411
+ else:
412
+ st.write(f"**Valid samples**: {len(valid_data)}")
413
+
414
  # Initialize vectorizer
415
+ max_features = min(10000, len(valid_data) * 10) # Adaptive max_features
416
+
417
  if vectorizer_choice == "Tfidf Vectorizer":
418
+ vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
419
  st.session_state.vectorizer_type = "tfidf"
420
  else:
421
+ vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
422
  st.session_state.vectorizer_type = "count"
423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  if st.button("🎯 Start Training", type="primary"):
425
  with st.spinner("Training model..."):
426
+ try:
427
+ # Vectorize
428
+ X = vectorizer.fit_transform(valid_data['clean_text'])
429
+ y = valid_data['target']
430
+
431
+ # Split data
432
+ test_size = min(0.3, max(0.1, len(valid_data) * 0.2 / len(valid_data)))
433
+ X_train, X_test, y_train, y_test = train_test_split(
434
+ X, y, test_size=test_size, random_state=42, stratify=y
435
+ )
436
+
437
+ st.write(f"**Data split** - Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")
438
+
439
+ # Save vectorizer
440
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
441
+ if save_artifacts(vectorizer, "artifacts", vectorizer_filename):
442
+ # Train model
443
+ model_filename = train_model(model, X_train, X_test, y_train, y_test)
444
+ if model_filename:
445
+ st.success("✅ Model ready! Go to 'Predictions' to test it.")
446
+
447
+ except Exception as e:
448
+ st.error(f"Training failed: {e}")
449
  else:
450
+ st.warning("⚠️ Please upload and process training data first")
451
 
452
  # Predictions Section
453
  elif section == "Predictions":
454
+ st.subheader("🔮 Make Predictions")
455
 
 
456
  if os.path.exists("models") and os.listdir("models"):
 
 
 
 
457
  available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
458
 
459
  if available_models:
460
+ selected_model = st.selectbox("Choose trained model:", available_models)
461
+
462
+ text_input = st.text_area("Enter text to classify:",
463
+ height=100,
464
+ placeholder="Type your text here...")
465
 
 
466
  if st.button("🎯 Predict", type="primary"):
467
  if text_input.strip():
468
  with st.spinner("Making prediction..."):
 
474
 
475
  if predicted_label is not None:
476
  st.success("✅ Prediction completed!")
 
 
 
 
477
  st.markdown(f"**Predicted Class:** `{predicted_label}`")
478
 
 
479
  if prediction_proba is not None:
480
+ st.markdown("**Class Probabilities:**")
 
 
481
  encoder = load_artifacts("artifacts", "encoder.pkl")
482
  if encoder is not None:
483
  classes = encoder.classes_
 
486
  'Probability': prediction_proba
487
  }).sort_values('Probability', ascending=False)
488
 
 
489
  st.dataframe(prob_df, use_container_width=True)
490
  else:
491
+ st.warning("⚠️ Please enter some text")
492
  else:
493
+ st.warning("⚠️ No trained models found")
494
  else:
495
+ st.warning("⚠️ No models available. Please train a model first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
  # Footer
498
  st.markdown("---")
499
+ st.markdown("🚀 Built with Streamlit | Ready for 🤗 Hugging Face Spaces")