Alamgirapi committed on
Commit
b97bac9
·
verified ·
1 Parent(s): 56d15cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -309
app.py CHANGED
@@ -4,81 +4,85 @@ import matplotlib.pyplot as plt
4
  import numpy as np
5
  import os
6
  import pickle
7
- import ssl
8
- import nltk
9
  import re
10
  import string
11
  from pathlib import Path
12
  from sklearn.preprocessing import LabelEncoder
13
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
14
  from sklearn.model_selection import train_test_split
15
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
16
  from sklearn.linear_model import LogisticRegression
17
  from sklearn.tree import DecisionTreeClassifier
18
- from sklearn.svm import LinearSVC, SVC
19
  from sklearn.ensemble import RandomForestClassifier
20
- from sklearn.naive_bayes import MultinomialNB, GaussianNB
21
- from nltk.corpus import stopwords
22
- from nltk.stem import WordNetLemmatizer
23
 
24
- # Fix SSL certificate issues for NLTK downloads
25
- try:
26
- _create_unverified_https_context = ssl._create_unverified_context
27
- except AttributeError:
28
- pass
29
- else:
30
- ssl._create_default_https_context = _create_unverified_https_context
31
 
32
- # Download NLTK data with error handling
33
  @st.cache_resource
34
- def download_nltk_data():
35
- try:
36
- nltk.data.find('corpora/stopwords')
37
- except LookupError:
38
- nltk.download('stopwords', quiet=True)
39
-
40
  try:
41
- nltk.data.find('corpora/wordnet')
42
- except LookupError:
43
- nltk.download('wordnet', quiet=True)
44
- nltk.download('omw-1.4', quiet=True)
45
-
46
- # Download required NLTK data
47
- download_nltk_data()
48
-
49
- class TextCleaner:
50
- """Class for cleaning Text"""
51
- def __init__(self, currency_symbols=r'[\$\£\€\¥\₹\¢\₽\₩\₪]', stop_words=None, lemmatizer=None):
52
- self.currency_symbols = currency_symbols
53
-
54
- if stop_words is None:
55
  try:
56
- self.stop_words = set(stopwords.words('english'))
57
- except LookupError:
58
  nltk.download('stopwords', quiet=True)
59
- self.stop_words = set(stopwords.words('english'))
60
- else:
61
- self.stop_words = stop_words
62
-
63
- if lemmatizer is None:
64
- try:
65
- self.lemmatizer = WordNetLemmatizer()
66
- # Test the lemmatizer to ensure it works
67
- test_word = self.lemmatizer.lemmatize('testing')
68
- except (AttributeError, LookupError) as e:
69
- print(f"WordNet lemmatizer initialization failed: {e}")
70
- nltk.download('wordnet', quiet=True)
71
  nltk.download('omw-1.4', quiet=True)
72
- self.lemmatizer = WordNetLemmatizer()
73
- else:
74
- self.lemmatizer = lemmatizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def remove_punctuation(self, text):
77
  return text.translate(str.maketrans('', '', string.punctuation))
78
 
79
  def clean_text(self, text):
80
- """Clean the text by removing punctuations, html tag, underscore,
81
- whitespaces, numbers, stopwords. Lemmatize the words in root format."""
82
  if not isinstance(text, str):
83
  text = str(text) if text is not None else ""
84
 
@@ -86,10 +90,11 @@ class TextCleaner:
86
  return ""
87
 
88
  try:
 
89
  text = text.lower()
90
  text = re.sub(self.currency_symbols, 'currency', text)
91
 
92
- # Remove any kind of emojis in the text
93
  emoji_pattern = re.compile("["
94
  u"\U0001F600-\U0001F64F" # emoticons
95
  u"\U0001F300-\U0001F5FF" # symbols & pictographs
@@ -99,29 +104,34 @@ class TextCleaner:
99
  u"\U000024C2-\U0001F251"
100
  "]+", flags=re.UNICODE)
101
  text = emoji_pattern.sub(r'', text)
 
 
102
  text = self.remove_punctuation(text)
103
  text = re.compile('<.*?>').sub('', text)
104
  text = text.replace('_', '')
105
  text = re.sub(r'[^\w\s]', '', text)
106
  text = re.sub(r'\d', ' ', text)
107
  text = re.sub(r'\s+', ' ', text).strip()
108
- text = ' '.join(word for word in text.split() if word not in self.stop_words)
109
 
110
- # Lemmatization with error handling
111
- try:
112
- text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
113
- except (AttributeError, LookupError) as e:
114
- print(f"Lemmatization failed for text: {e}")
115
- pass
 
 
 
 
 
 
116
 
117
- return str(text)
118
-
119
  except Exception as e:
120
- print(f"Error cleaning text: {e}")
121
  return str(text)
122
 
123
  class DataAnalyzer:
124
- """Class for data analysis and visualization"""
125
  def __init__(self, df, text_column, target_column):
126
  self.df = df
127
  self.text_column = text_column
@@ -136,115 +146,129 @@ class DataAnalyzer:
136
  return info
137
 
138
  def plot_class_distribution(self):
139
- fig, ax = plt.subplots(figsize=(10, 6))
140
- self.df[self.target_column].value_counts().plot(kind='bar', ax=ax)
141
- ax.set_title('Class Distribution')
142
- ax.set_xlabel('Classes')
143
- ax.set_ylabel('Count')
144
- plt.xticks(rotation=45)
145
- st.pyplot(fig)
 
 
 
 
146
 
147
  def plot_text_length_distribution(self):
148
- fig, ax = plt.subplots(figsize=(10, 6))
149
- text_lengths = self.df[self.text_column].str.len()
150
- ax.hist(text_lengths, bins=50, alpha=0.7)
151
- ax.set_title('Text Length Distribution')
152
- ax.set_xlabel('Text Length')
153
- ax.set_ylabel('Frequency')
154
- st.pyplot(fig)
 
 
 
 
155
 
156
- # Utility functions
157
  def save_artifacts(obj, folder_name, file_name):
158
- """Save artifacts like encoders and vectorizers"""
159
- os.makedirs(folder_name, exist_ok=True)
160
- with open(os.path.join(folder_name, file_name), 'wb') as f:
161
- pickle.dump(obj, f)
 
 
 
 
 
162
 
163
  def load_artifacts(folder_name, file_name):
164
- """Load saved artifacts"""
165
  try:
166
  with open(os.path.join(folder_name, file_name), 'rb') as f:
167
  return pickle.load(f)
168
  except FileNotFoundError:
169
- st.error(f"File {file_name} not found in {folder_name} folder")
170
  return None
171
-
172
- def load_model(model_name):
173
- """Load trained model"""
174
- try:
175
- with open(os.path.join('models', model_name), 'rb') as f:
176
- return pickle.load(f)
177
- except FileNotFoundError:
178
- st.error(f"Model {model_name} not found. Please train a model first.")
179
  return None
180
 
181
  def train_model(model_name, X_train, X_test, y_train, y_test):
182
- """Train selected model"""
183
- os.makedirs("models", exist_ok=True)
184
-
185
- models_dict = {
186
- "Logistic Regression": LogisticRegression(),
187
- "Decision Tree": DecisionTreeClassifier(),
188
- "Random Forest": RandomForestClassifier(),
189
- "Linear SVC": LinearSVC(),
190
- "SVC": SVC(),
191
- "Multinomial Naive Bayes": MultinomialNB(),
192
- "Gaussian Naive Bayes": GaussianNB()
193
- }
194
-
195
- if model_name in models_dict:
 
 
 
196
  model = models_dict[model_name]
 
 
197
  model.fit(X_train, y_train)
198
 
199
  # Save model
200
- model_filename = f"{model_name.replace(' ', '')}.pkl"
201
  save_path = os.path.join("models", model_filename)
202
- with open(save_path, 'wb') as f:
203
- pickle.dump(model, f)
204
 
205
- # Evaluate model
206
- y_pred = model.predict(X_test)
207
- accuracy = accuracy_score(y_test, y_pred)
208
-
209
- st.success("Model training completed!")
210
- st.write(f"**Accuracy**: {accuracy:.4f}")
211
-
212
- return model_filename
213
- else:
214
- st.error(f"Model {model_name} not supported")
 
 
 
 
215
  return None
216
 
217
  def predict_text(model_name, text, vectorizer_type="tfidf"):
218
- """Make prediction on new text"""
219
  try:
220
- # Load model
221
- model = load_model(model_name)
222
  if model is None:
223
  return None, None
224
 
225
- # Load vectorizer
226
  vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
227
  vectorizer = load_artifacts("artifacts", vectorizer_file)
228
  if vectorizer is None:
229
  return None, None
230
 
231
- # Load label encoder
232
  encoder = load_artifacts("artifacts", "encoder.pkl")
233
  if encoder is None:
234
  return None, None
235
 
236
- # Clean and vectorize text
237
  text_cleaner = TextCleaner()
238
  clean_text = text_cleaner.clean_text(text)
239
 
240
- # Transform text using the same vectorizer used during training
241
- text_vector = vectorizer.transform([clean_text])
 
242
 
243
- # Make prediction
 
244
  prediction = model.predict(text_vector)
245
- prediction_proba = None
246
 
247
- # Get prediction probabilities if available
 
248
  if hasattr(model, 'predict_proba'):
249
  try:
250
  prediction_proba = model.predict_proba(text_vector)[0]
@@ -257,13 +281,16 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
257
  return predicted_label, prediction_proba
258
 
259
  except Exception as e:
260
- st.error(f"Error during prediction: {str(e)}")
261
  return None, None
262
 
263
- # Streamlit App
264
- st.set_page_config(page_title="No Code Text Classifier", page_icon="🤖", layout="wide")
265
-
266
  st.title('🤖 No Code Text Classification App')
 
 
 
 
 
267
  st.write('Understand the behavior of your text data and train a model to classify text data')
268
 
269
  # Sidebar
@@ -272,155 +299,170 @@ section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "P
272
  # Upload Data
273
  st.sidebar.subheader("📁 Upload Your Dataset")
274
  train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
275
- test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
276
 
277
- # Global variables to store data and settings
278
  if 'vectorizer_type' not in st.session_state:
279
  st.session_state.vectorizer_type = "tfidf"
280
 
 
 
281
  if train_data is not None:
282
  try:
283
- train_df = pd.read_csv(train_data, encoding='latin1')
 
 
 
 
 
 
284
 
285
- if test_data is not None:
286
- test_df = pd.read_csv(test_data, encoding='latin1')
287
  else:
288
- test_df = None
 
289
 
290
- st.write("**Training Data Preview:**")
291
- st.dataframe(train_df.head(3))
292
-
293
- columns = train_df.columns.tolist()
294
- text_data = st.sidebar.selectbox("Choose the text column:", columns)
295
- target = st.sidebar.selectbox("Choose the target column:", columns)
296
 
297
- # Process data
298
- text_cleaner = TextCleaner()
299
- train_df['clean_text'] = train_df[text_data].apply(lambda x: text_cleaner.clean_text(x))
300
- train_df['text_length'] = train_df[text_data].str.len()
301
-
302
- # Handle label encoding
303
- label_encoder = LabelEncoder()
304
- train_df['target'] = label_encoder.fit_transform(train_df[target])
305
-
306
- # Save label encoder for later use
307
- os.makedirs("artifacts", exist_ok=True)
308
- save_artifacts(label_encoder, "artifacts", "encoder.pkl")
309
-
 
 
 
310
  except Exception as e:
311
- st.error(f"Error loading data: {str(e)}")
312
  train_df = None
313
 
314
  # Data Analysis Section
315
  if section == "Data Analysis":
316
- if train_data is not None and train_df is not None:
317
- try:
318
- st.subheader("📊 Data Insights")
319
-
320
- analyzer = DataAnalyzer(train_df, text_data, target)
321
- info = analyzer.get_basic_info()
322
-
323
- col1, col2, col3 = st.columns(3)
324
- with col1:
325
- st.metric("Total Samples", info['shape'][0])
326
- with col2:
327
- st.metric("Features", info['shape'][1])
328
- with col3:
329
- st.metric("Classes", len(info['class_distribution']))
330
-
331
- st.write("**Class Distribution:**")
332
- st.write(info['class_distribution'])
333
-
334
- st.write("**Missing Values:**")
335
- st.write(info['missing_values'])
336
-
337
- st.write("**Processed Data Preview:**")
338
- st.dataframe(train_df[['clean_text', 'text_length', 'target']].head())
339
-
340
- st.subheader("📈 Visualizations")
341
-
342
- col1, col2 = st.columns(2)
343
- with col1:
344
- st.write("**Class Distribution**")
345
- analyzer.plot_class_distribution()
346
-
347
- with col2:
348
- st.write("**Text Length Distribution**")
349
- analyzer.plot_text_length_distribution()
350
-
351
- except Exception as e:
352
- st.error(f"Error in data analysis: {str(e)}")
353
  else:
354
- st.warning("⚠️ Please upload training data to get insights")
355
 
356
  # Train Model Section
357
  elif section == "Train Model":
358
- if train_data is not None and train_df is not None:
359
- try:
360
- st.subheader("🚀 Train a Model")
361
 
362
- col1, col2 = st.columns(2)
363
 
364
- with col1:
365
- model = st.selectbox("Choose the Model", [
366
- "Logistic Regression", "Decision Tree",
367
- "Random Forest", "Linear SVC", "SVC",
368
- "Multinomial Naive Bayes", "Gaussian Naive Bayes"
369
- ])
370
-
371
- with col2:
372
- vectorizer_choice = st.selectbox("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
 
 
 
373
 
 
 
 
 
 
 
 
 
374
  # Initialize vectorizer
 
 
375
  if vectorizer_choice == "Tfidf Vectorizer":
376
- vectorizer = TfidfVectorizer(max_features=10000)
377
  st.session_state.vectorizer_type = "tfidf"
378
  else:
379
- vectorizer = CountVectorizer(max_features=10000)
380
  st.session_state.vectorizer_type = "count"
381
 
382
- st.write("**Training Data Preview:**")
383
- st.dataframe(train_df[['clean_text', 'target']].head())
384
-
385
- # Vectorize text data
386
- X = vectorizer.fit_transform(train_df['clean_text'])
387
- y = train_df['target']
388
-
389
- # Split data
390
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
391
- st.write(f"**Data split** - Train: {X_train.shape}, Test: {X_test.shape}")
392
-
393
- # Save vectorizer for later use
394
- vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
395
- save_artifacts(vectorizer, "artifacts", vectorizer_filename)
396
-
397
  if st.button("🎯 Start Training", type="primary"):
398
  with st.spinner("Training model..."):
399
- model_filename = train_model(model, X_train, X_test, y_train, y_test)
400
- if model_filename:
401
- st.info("✅ You can now use the 'Predictions' section to classify new text.")
402
-
403
- except Exception as e:
404
- st.error(f"Error in model training: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  else:
406
- st.warning("⚠️ Please upload training data to train a model")
407
 
408
  # Predictions Section
409
  elif section == "Predictions":
410
- st.subheader("🔮 Perform Predictions on New Text")
411
 
412
- # Check if models exist
413
  if os.path.exists("models") and os.listdir("models"):
414
- # Text input for prediction
415
- text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type your text here...")
416
-
417
- # Model selection
418
  available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
419
 
420
  if available_models:
421
- selected_model = st.selectbox("Choose the trained model:", available_models)
 
 
 
 
422
 
423
- # Prediction button
424
  if st.button("🎯 Predict", type="primary"):
425
  if text_input.strip():
426
  with st.spinner("Making prediction..."):
@@ -432,17 +474,10 @@ elif section == "Predictions":
432
 
433
  if predicted_label is not None:
434
  st.success("✅ Prediction completed!")
435
-
436
- # Display results
437
- st.markdown("### 📊 Prediction Results")
438
- st.markdown(f"**Input Text:** {text_input}")
439
  st.markdown(f"**Predicted Class:** `{predicted_label}`")
440
 
441
- # Display probabilities if available
442
  if prediction_proba is not None:
443
- st.markdown("**📈 Class Probabilities:**")
444
-
445
- # Load encoder to get class names
446
  encoder = load_artifacts("artifacts", "encoder.pkl")
447
  if encoder is not None:
448
  classes = encoder.classes_
@@ -451,65 +486,14 @@ elif section == "Predictions":
451
  'Probability': prediction_proba
452
  }).sort_values('Probability', ascending=False)
453
 
454
- st.bar_chart(prob_df.set_index('Class'))
455
  st.dataframe(prob_df, use_container_width=True)
456
  else:
457
- st.warning("⚠️ Please enter some text to classify")
458
  else:
459
- st.warning("⚠️ No trained models found. Please train a model first.")
460
  else:
461
- st.warning("⚠️ No trained models found. Please go to 'Train Model' section to train a model first.")
462
-
463
- # Option to classify multiple texts
464
- st.markdown("---")
465
- st.subheader("📊 Batch Predictions")
466
-
467
- uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
468
-
469
- if uploaded_file is not None:
470
- try:
471
- batch_df = pd.read_csv(uploaded_file, encoding='latin1')
472
- st.write("**Uploaded data preview:**")
473
- st.dataframe(batch_df.head())
474
-
475
- # Select text column
476
- text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
477
-
478
- if os.path.exists("models") and os.listdir("models"):
479
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
480
- batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
481
-
482
- if st.button("🚀 Run Batch Predictions", type="primary"):
483
- with st.spinner("Processing batch predictions..."):
484
- predictions = []
485
- progress_bar = st.progress(0)
486
-
487
- for i, text in enumerate(batch_df[text_column]):
488
- pred, _ = predict_text(
489
- batch_model,
490
- str(text),
491
- st.session_state.get('vectorizer_type', 'tfidf')
492
- )
493
- predictions.append(pred if pred is not None else "Error")
494
- progress_bar.progress((i + 1) / len(batch_df))
495
-
496
- batch_df['Predicted_Class'] = predictions
497
-
498
- st.success("✅ Batch predictions completed!")
499
- st.write("**Results:**")
500
- st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)
501
-
502
- # Download results
503
- csv = batch_df.to_csv(index=False)
504
- st.download_button(
505
- label="💾 Download predictions as CSV",
506
- data=csv,
507
- file_name="batch_predictions.csv",
508
- mime="text/csv"
509
- )
510
- except Exception as e:
511
- st.error(f"Error in batch prediction: {str(e)}")
512
 
513
  # Footer
514
  st.markdown("---")
515
- st.markdown("Built with ❤️ using Streamlit | Deploy on 🤗 Hugging Face Spaces")
 
4
  import numpy as np
5
  import os
6
  import pickle
 
 
7
  import re
8
  import string
9
  from pathlib import Path
10
  from sklearn.preprocessing import LabelEncoder
11
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
12
  from sklearn.model_selection import train_test_split
13
+ from sklearn.metrics import accuracy_score
14
  from sklearn.linear_model import LogisticRegression
15
  from sklearn.tree import DecisionTreeClassifier
16
+ from sklearn.svm import LinearSVC
17
  from sklearn.ensemble import RandomForestClassifier
18
+ from sklearn.naive_bayes import MultinomialNB
 
 
19
 
20
+ # Configure Streamlit page
21
+ st.set_page_config(page_title="No Code Text Classifier", page_icon="🤖", layout="wide")
 
 
 
 
 
22
 
23
# Initialize NLTK components with fallbacks
@st.cache_resource
def init_nltk_components():
    """Resolve NLTK resources once, with graceful degradation.

    Returns:
        tuple: (stop_words, lemmatizer, nltk_available) where
        ``stop_words`` is a set of stopword strings, ``lemmatizer`` is a
        WordNetLemmatizer or None, and ``nltk_available`` is True only when
        full NLTK support (corpora + lemmatizer) is usable.

    Fallback order: pre-downloaded NLTK data -> on-the-fly download ->
    built-in basic English stopword set (no lemmatization).
    """
    try:
        import nltk
        # Try to use pre-downloaded data first
        try:
            from nltk.corpus import stopwords
            from nltk.stem import WordNetLemmatizer
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()
            # Probe the lemmatizer so a missing wordnet corpus fails here,
            # not later in the middle of cleaning.
            _ = lemmatizer.lemmatize('test')
            return stop_words, lemmatizer, True
        except Exception:
            # NOTE: was a bare `except:`, which also swallows
            # SystemExit/KeyboardInterrupt; Exception is the right scope here.
            # Fallback: try to download
            try:
                nltk.download('stopwords', quiet=True)
                nltk.download('wordnet', quiet=True)
                nltk.download('omw-1.4', quiet=True)
                from nltk.corpus import stopwords
                from nltk.stem import WordNetLemmatizer
                stop_words = set(stopwords.words('english'))
                lemmatizer = WordNetLemmatizer()
                return stop_words, lemmatizer, True
            except Exception:
                # Final fallback: use basic English stopwords
                basic_stopwords = {
                    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
                    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
                    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
                    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
                    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
                    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
                    'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
                    'with', 'through', 'during', 'before', 'after', 'above', 'below',
                    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
                    'further', 'then', 'once'
                }
                return basic_stopwords, None, False
    except ImportError:
        # NLTK not available at all
        return set(), None, False
72
+
73
+ class TextCleaner:
74
+ """Simplified text cleaner with fallbacks"""
75
+ def __init__(self):
76
+ self.currency_symbols = r'[\$\£\€\¥\₹\¢\₽\₩\₪]'
77
+ self.stop_words = STOP_WORDS
78
+ self.lemmatizer = LEMMATIZER
79
+ self.nltk_available = NLTK_AVAILABLE
80
 
81
  def remove_punctuation(self, text):
82
  return text.translate(str.maketrans('', '', string.punctuation))
83
 
84
  def clean_text(self, text):
85
+ """Clean text with robust error handling"""
 
86
  if not isinstance(text, str):
87
  text = str(text) if text is not None else ""
88
 
 
90
  return ""
91
 
92
  try:
93
+ # Basic cleaning
94
  text = text.lower()
95
  text = re.sub(self.currency_symbols, 'currency', text)
96
 
97
+ # Remove emojis
98
  emoji_pattern = re.compile("["
99
  u"\U0001F600-\U0001F64F" # emoticons
100
  u"\U0001F300-\U0001F5FF" # symbols & pictographs
 
104
  u"\U000024C2-\U0001F251"
105
  "]+", flags=re.UNICODE)
106
  text = emoji_pattern.sub(r'', text)
107
+
108
+ # Remove punctuation and clean
109
  text = self.remove_punctuation(text)
110
  text = re.compile('<.*?>').sub('', text)
111
  text = text.replace('_', '')
112
  text = re.sub(r'[^\w\s]', '', text)
113
  text = re.sub(r'\d', ' ', text)
114
  text = re.sub(r'\s+', ' ', text).strip()
 
115
 
116
+ # Remove stopwords if available
117
+ if self.stop_words:
118
+ text = ' '.join(word for word in text.split() if word not in self.stop_words)
119
+
120
+ # Lemmatize if available
121
+ if self.lemmatizer and self.nltk_available:
122
+ try:
123
+ text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
124
+ except:
125
+ pass # Skip lemmatization if it fails
126
+
127
+ return text
128
 
 
 
129
  except Exception as e:
130
+ st.warning(f"Text cleaning warning: {e}")
131
  return str(text)
132
 
133
  class DataAnalyzer:
134
+ """Simplified data analyzer"""
135
  def __init__(self, df, text_column, target_column):
136
  self.df = df
137
  self.text_column = text_column
 
146
  return info
147
 
148
  def plot_class_distribution(self):
149
+ try:
150
+ fig, ax = plt.subplots(figsize=(10, 6))
151
+ self.df[self.target_column].value_counts().plot(kind='bar', ax=ax)
152
+ ax.set_title('Class Distribution')
153
+ ax.set_xlabel('Classes')
154
+ ax.set_ylabel('Count')
155
+ plt.xticks(rotation=45)
156
+ plt.tight_layout()
157
+ st.pyplot(fig)
158
+ except Exception as e:
159
+ st.error(f"Error creating plot: {e}")
160
 
161
  def plot_text_length_distribution(self):
162
+ try:
163
+ fig, ax = plt.subplots(figsize=(10, 6))
164
+ text_lengths = self.df[self.text_column].str.len()
165
+ ax.hist(text_lengths, bins=50, alpha=0.7)
166
+ ax.set_title('Text Length Distribution')
167
+ ax.set_xlabel('Text Length')
168
+ ax.set_ylabel('Frequency')
169
+ plt.tight_layout()
170
+ st.pyplot(fig)
171
+ except Exception as e:
172
+ st.error(f"Error creating plot: {e}")
173
 
174
+ # Utility functions with better error handling
175
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed.

    Returns:
        bool: True on success; False after reporting the failure via the UI.
    """
    target_path = os.path.join(folder_name, file_name)
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(target_path, 'wb') as handle:
            pickle.dump(obj, handle)
    except Exception as e:
        st.error(f"Error saving {file_name}: {e}")
        return False
    return True
185
 
186
def load_artifacts(folder_name, file_name):
    """Unpickle and return folder_name/file_name; None (with a UI error) on failure."""
    source_path = os.path.join(folder_name, file_name)
    try:
        with open(source_path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name}")
    except Exception as e:
        st.error(f"Error loading {file_name}: {e}")
    return None
197
 
198
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Train the selected classifier, persist it under models/, and report accuracy.

    Args:
        model_name: Human-readable key selecting one of the supported models.
        X_train, X_test: Vectorized feature matrices.
        y_train, y_test: Encoded target labels.

    Returns:
        str | None: The saved model's filename on success, otherwise None
        (unsupported model, save failure, or training error).
    """
    try:
        os.makedirs("models", exist_ok=True)

        # Simplified model dictionary
        models_dict = {
            "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "Random Forest": RandomForestClassifier(n_estimators=50, random_state=42),  # Reduced for speed
            "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
            "Multinomial Naive Bayes": MultinomialNB(),
        }

        if model_name not in models_dict:
            st.error(f"Model {model_name} not supported")
            return None

        model = models_dict[model_name]

        # Train model
        model.fit(X_train, y_train)

        # Persist the fitted model. save_artifacts() already builds the path
        # and handles errors, so the previously computed-but-unused
        # `save_path` local has been removed (dead code).
        model_filename = f"{model_name.replace(' ', '_')}.pkl"
        if save_artifacts(model, "models", model_filename):
            # Evaluate on the held-out split
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            st.success(" Model training completed!")
            st.write(f"**Accuracy**: {accuracy:.4f}")

            return model_filename
        return None

    except Exception as e:
        st.error(f"Error training model: {e}")
        return None
240
 
241
  def predict_text(model_name, text, vectorizer_type="tfidf"):
242
+ """Make prediction with better error handling"""
243
  try:
244
+ # Load components
245
+ model = load_artifacts("models", model_name)
246
  if model is None:
247
  return None, None
248
 
 
249
  vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
250
  vectorizer = load_artifacts("artifacts", vectorizer_file)
251
  if vectorizer is None:
252
  return None, None
253
 
 
254
  encoder = load_artifacts("artifacts", "encoder.pkl")
255
  if encoder is None:
256
  return None, None
257
 
258
+ # Process text
259
  text_cleaner = TextCleaner()
260
  clean_text = text_cleaner.clean_text(text)
261
 
262
+ if not clean_text.strip():
263
+ st.warning("Text became empty after cleaning")
264
+ return None, None
265
 
266
+ # Vectorize and predict
267
+ text_vector = vectorizer.transform([clean_text])
268
  prediction = model.predict(text_vector)
 
269
 
270
+ # Get probabilities if possible
271
+ prediction_proba = None
272
  if hasattr(model, 'predict_proba'):
273
  try:
274
  prediction_proba = model.predict_proba(text_vector)[0]
 
281
  return predicted_label, prediction_proba
282
 
283
  except Exception as e:
284
+ st.error(f"Prediction error: {e}")
285
  return None, None
286
 
287
+ # Main Streamlit App
 
 
288
  st.title('🤖 No Code Text Classification App')
289
+
290
+ # Show NLTK status
291
+ if not NLTK_AVAILABLE:
292
+ st.warning("⚠️ NLTK not fully available. Using basic text processing.")
293
+
294
  st.write('Understand the behavior of your text data and train a model to classify text data')
295
 
296
  # Sidebar
 
299
  # Upload Data
300
  st.sidebar.subheader("📁 Upload Your Dataset")
301
  train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
 
302
 
303
+ # Initialize session state
304
  if 'vectorizer_type' not in st.session_state:
305
  st.session_state.vectorizer_type = "tfidf"
306
 
307
+ # Load and process data
308
+ train_df = None
309
  if train_data is not None:
310
  try:
311
+ # Try different encodings
312
+ for encoding in ['utf-8', 'latin1', 'iso-8859-1']:
313
+ try:
314
+ train_df = pd.read_csv(train_data, encoding=encoding)
315
+ break
316
+ except UnicodeDecodeError:
317
+ continue
318
 
319
+ if train_df is None:
320
+ st.error("Could not read the CSV file. Please check the encoding.")
321
  else:
322
+ st.write("**Training Data Preview:**")
323
+ st.dataframe(train_df.head(3))
324
 
325
+ columns = train_df.columns.tolist()
326
+ text_data = st.sidebar.selectbox("Choose the text column:", columns)
327
+ target = st.sidebar.selectbox("Choose the target column:", columns)
 
 
 
328
 
329
+ # Process data
330
+ if text_data and target:
331
+ with st.spinner("Processing data..."):
332
+ text_cleaner = TextCleaner()
333
+ train_df['clean_text'] = train_df[text_data].apply(
334
+ lambda x: text_cleaner.clean_text(x) if pd.notna(x) else ""
335
+ )
336
+ train_df['text_length'] = train_df[text_data].astype(str).str.len()
337
+
338
+ # Handle label encoding
339
+ label_encoder = LabelEncoder()
340
+ train_df['target'] = label_encoder.fit_transform(train_df[target].astype(str))
341
+
342
+ # Save encoder
343
+ save_artifacts(label_encoder, "artifacts", "encoder.pkl")
344
+
345
  except Exception as e:
346
+ st.error(f"Error processing data: {e}")
347
  train_df = None
348
 
349
  # Data Analysis Section
350
  if section == "Data Analysis":
351
+ if train_df is not None:
352
+ st.subheader("📊 Data Insights")
353
+
354
+ analyzer = DataAnalyzer(train_df, text_data, target)
355
+ info = analyzer.get_basic_info()
356
+
357
+ col1, col2, col3 = st.columns(3)
358
+ with col1:
359
+ st.metric("Total Samples", info['shape'][0])
360
+ with col2:
361
+ st.metric("Features", info['shape'][1])
362
+ with col3:
363
+ st.metric("Classes", len(info['class_distribution']))
364
+
365
+ st.write("**Class Distribution:**")
366
+ st.write(info['class_distribution'])
367
+
368
+ # Show sample of processed data
369
+ st.write("**Processed Data Preview:**")
370
+ sample_df = train_df[['clean_text', 'text_length', 'target']].head(10)
371
+ st.dataframe(sample_df)
372
+
373
+ st.subheader("📈 Visualizations")
374
+
375
+ col1, col2 = st.columns(2)
376
+ with col1:
377
+ st.write("**Class Distribution**")
378
+ analyzer.plot_class_distribution()
379
+
380
+ with col2:
381
+ st.write("**Text Length Distribution**")
382
+ analyzer.plot_text_length_distribution()
 
 
 
 
 
383
  else:
384
+ st.warning("⚠️ Please upload training data to see analysis")
385
 
386
  # Train Model Section
387
  elif section == "Train Model":
388
+ if train_df is not None and 'clean_text' in train_df.columns:
389
+ st.subheader("🚀 Train a Model")
 
390
 
391
+ col1, col2 = st.columns(2)
392
 
393
+ with col1:
394
+ model = st.selectbox("Choose the Model", [
395
+ "Logistic Regression",
396
+ "Decision Tree",
397
+ "Random Forest",
398
+ "Linear SVC",
399
+ "Multinomial Naive Bayes"
400
+ ])
401
+
402
+ with col2:
403
+ vectorizer_choice = st.selectbox("Choose Vectorizer",
404
+ ["Tfidf Vectorizer", "Count Vectorizer"])
405
 
406
+ # Filter out empty texts
407
+ valid_data = train_df[train_df['clean_text'].str.len() > 0].copy()
408
+
409
+ if len(valid_data) == 0:
410
+ st.error("No valid text data after cleaning!")
411
+ else:
412
+ st.write(f"**Valid samples**: {len(valid_data)}")
413
+
414
  # Initialize vectorizer
415
+ max_features = min(10000, len(valid_data) * 10) # Adaptive max_features
416
+
417
  if vectorizer_choice == "Tfidf Vectorizer":
418
+ vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
419
  st.session_state.vectorizer_type = "tfidf"
420
  else:
421
+ vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
422
  st.session_state.vectorizer_type = "count"
423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  if st.button("🎯 Start Training", type="primary"):
425
  with st.spinner("Training model..."):
426
+ try:
427
+ # Vectorize
428
+ X = vectorizer.fit_transform(valid_data['clean_text'])
429
+ y = valid_data['target']
430
+
431
+ # Split data
432
+ test_size = min(0.3, max(0.1, len(valid_data) * 0.2 / len(valid_data)))
433
+ X_train, X_test, y_train, y_test = train_test_split(
434
+ X, y, test_size=test_size, random_state=42, stratify=y
435
+ )
436
+
437
+ st.write(f"**Data split** - Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")
438
+
439
+ # Save vectorizer
440
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
441
+ if save_artifacts(vectorizer, "artifacts", vectorizer_filename):
442
+ # Train model
443
+ model_filename = train_model(model, X_train, X_test, y_train, y_test)
444
+ if model_filename:
445
+ st.success("✅ Model ready! Go to 'Predictions' to test it.")
446
+
447
+ except Exception as e:
448
+ st.error(f"Training failed: {e}")
449
  else:
450
+ st.warning("⚠️ Please upload and process training data first")
451
 
452
  # Predictions Section
453
  elif section == "Predictions":
454
+ st.subheader("🔮 Make Predictions")
455
 
 
456
  if os.path.exists("models") and os.listdir("models"):
 
 
 
 
457
  available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
458
 
459
  if available_models:
460
+ selected_model = st.selectbox("Choose trained model:", available_models)
461
+
462
+ text_input = st.text_area("Enter text to classify:",
463
+ height=100,
464
+ placeholder="Type your text here...")
465
 
 
466
  if st.button("🎯 Predict", type="primary"):
467
  if text_input.strip():
468
  with st.spinner("Making prediction..."):
 
474
 
475
  if predicted_label is not None:
476
  st.success("✅ Prediction completed!")
 
 
 
 
477
  st.markdown(f"**Predicted Class:** `{predicted_label}`")
478
 
 
479
  if prediction_proba is not None:
480
+ st.markdown("**Class Probabilities:**")
 
 
481
  encoder = load_artifacts("artifacts", "encoder.pkl")
482
  if encoder is not None:
483
  classes = encoder.classes_
 
486
  'Probability': prediction_proba
487
  }).sort_values('Probability', ascending=False)
488
 
 
489
  st.dataframe(prob_df, use_container_width=True)
490
  else:
491
+ st.warning("⚠️ Please enter some text")
492
  else:
493
+ st.warning("⚠️ No trained models found")
494
  else:
495
+ st.warning("⚠️ No models available. Please train a model first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
  # Footer
498
  st.markdown("---")
499
+ st.markdown("🚀 Built with Streamlit | Ready for 🤗 Hugging Face Spaces")