Alamgirapi committed on
Commit
b7cf6e8
·
verified ·
1 Parent(s): b97bac9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -278
app.py CHANGED
@@ -7,82 +7,69 @@ import pickle
7
  import re
8
  import string
9
  from pathlib import Path
10
- from sklearn.preprocessing import LabelEncoder
11
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
12
- from sklearn.model_selection import train_test_split
13
- from sklearn.metrics import accuracy_score
14
- from sklearn.linear_model import LogisticRegression
15
- from sklearn.tree import DecisionTreeClassifier
16
- from sklearn.svm import LinearSVC
17
- from sklearn.ensemble import RandomForestClassifier
18
- from sklearn.naive_bayes import MultinomialNB
19
 
20
  # Configure Streamlit page
21
  st.set_page_config(page_title="No Code Text Classifier", page_icon="๐Ÿค–", layout="wide")
22
 
23
- # Initialize NLTK components with fallbacks
24
  @st.cache_resource
25
- def init_nltk_components():
26
- """Initialize NLTK components with fallbacks"""
27
  try:
28
- import nltk
29
- # Try to use pre-downloaded data first
30
- try:
31
- from nltk.corpus import stopwords
32
- from nltk.stem import WordNetLemmatizer
33
- stop_words = set(stopwords.words('english'))
34
- lemmatizer = WordNetLemmatizer()
35
- # Test lemmatizer
36
- _ = lemmatizer.lemmatize('test')
37
- return stop_words, lemmatizer, True
38
- except:
39
- # Fallback: try to download
40
- try:
41
- nltk.download('stopwords', quiet=True)
42
- nltk.download('wordnet', quiet=True)
43
- nltk.download('omw-1.4', quiet=True)
44
- from nltk.corpus import stopwords
45
- from nltk.stem import WordNetLemmatizer
46
- stop_words = set(stopwords.words('english'))
47
- lemmatizer = WordNetLemmatizer()
48
- return stop_words, lemmatizer, True
49
- except:
50
- # Final fallback: use basic English stopwords
51
- basic_stopwords = {
52
- 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
53
- 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
54
- 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
55
- 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
56
- 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
57
- 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
58
- 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
59
- 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
60
- 'with', 'through', 'during', 'before', 'after', 'above', 'below',
61
- 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
62
- 'further', 'then', 'once'
63
- }
64
- return basic_stopwords, None, False
65
- except ImportError:
66
- # NLTK not available at all
67
- basic_stopwords = set()
68
- return basic_stopwords, None, False
69
 
70
- # Initialize NLTK components
71
- STOP_WORDS, LEMMATIZER, NLTK_AVAILABLE = init_nltk_components()
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  class TextCleaner:
74
- """Simplified text cleaner with fallbacks"""
75
  def __init__(self):
76
  self.currency_symbols = r'[\$\ยฃ\โ‚ฌ\ยฅ\โ‚น\ยข\โ‚ฝ\โ‚ฉ\โ‚ช]'
77
- self.stop_words = STOP_WORDS
78
- self.lemmatizer = LEMMATIZER
79
- self.nltk_available = NLTK_AVAILABLE
80
 
81
  def remove_punctuation(self, text):
82
  return text.translate(str.maketrans('', '', string.punctuation))
83
 
84
  def clean_text(self, text):
85
- """Clean text with robust error handling"""
86
  if not isinstance(text, str):
87
  text = str(text) if text is not None else ""
88
 
@@ -94,44 +81,22 @@ class TextCleaner:
94
  text = text.lower()
95
  text = re.sub(self.currency_symbols, 'currency', text)
96
 
97
- # Remove emojis
98
- emoji_pattern = re.compile("["
99
- u"\U0001F600-\U0001F64F" # emoticons
100
- u"\U0001F300-\U0001F5FF" # symbols & pictographs
101
- u"\U0001F680-\U0001F6FF" # transport & map symbols
102
- u"\U0001F1E0-\U0001F1FF" # flags (iOS)
103
- u"\U00002702-\U000027B0"
104
- u"\U000024C2-\U0001F251"
105
- "]+", flags=re.UNICODE)
106
- text = emoji_pattern.sub(r'', text)
107
-
108
- # Remove punctuation and clean
109
- text = self.remove_punctuation(text)
110
- text = re.compile('<.*?>').sub('', text)
111
- text = text.replace('_', '')
112
- text = re.sub(r'[^\w\s]', '', text)
113
- text = re.sub(r'\d', ' ', text)
114
  text = re.sub(r'\s+', ' ', text).strip()
115
 
116
- # Remove stopwords if available
117
- if self.stop_words:
118
- text = ' '.join(word for word in text.split() if word not in self.stop_words)
119
 
120
- # Lemmatize if available
121
- if self.lemmatizer and self.nltk_available:
122
- try:
123
- text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
124
- except:
125
- pass # Skip lemmatization if it fails
126
-
127
- return text
128
 
129
  except Exception as e:
130
  st.warning(f"Text cleaning warning: {e}")
131
- return str(text)
132
 
133
  class DataAnalyzer:
134
- """Simplified data analyzer"""
135
  def __init__(self, df, text_column, target_column):
136
  self.df = df
137
  self.text_column = text_column
@@ -147,36 +112,39 @@ class DataAnalyzer:
147
 
148
  def plot_class_distribution(self):
149
  try:
150
- fig, ax = plt.subplots(figsize=(10, 6))
151
- self.df[self.target_column].value_counts().plot(kind='bar', ax=ax)
152
  ax.set_title('Class Distribution')
153
  ax.set_xlabel('Classes')
154
  ax.set_ylabel('Count')
155
  plt.xticks(rotation=45)
156
  plt.tight_layout()
157
  st.pyplot(fig)
 
158
  except Exception as e:
159
  st.error(f"Error creating plot: {e}")
160
 
161
  def plot_text_length_distribution(self):
162
  try:
163
- fig, ax = plt.subplots(figsize=(10, 6))
164
- text_lengths = self.df[self.text_column].str.len()
165
- ax.hist(text_lengths, bins=50, alpha=0.7)
166
  ax.set_title('Text Length Distribution')
167
- ax.set_xlabel('Text Length')
168
  ax.set_ylabel('Frequency')
169
  plt.tight_layout()
170
  st.pyplot(fig)
 
171
  except Exception as e:
172
  st.error(f"Error creating plot: {e}")
173
 
174
- # Utility functions with better error handling
175
  def save_artifacts(obj, folder_name, file_name):
176
  """Save artifacts with error handling"""
177
  try:
178
  os.makedirs(folder_name, exist_ok=True)
179
- with open(os.path.join(folder_name, file_name), 'wb') as f:
 
180
  pickle.dump(obj, f)
181
  return True
182
  except Exception as e:
@@ -186,7 +154,8 @@ def save_artifacts(obj, folder_name, file_name):
186
  def load_artifacts(folder_name, file_name):
187
  """Load artifacts with error handling"""
188
  try:
189
- with open(os.path.join(folder_name, file_name), 'rb') as f:
 
190
  return pickle.load(f)
191
  except FileNotFoundError:
192
  st.error(f"File {file_name} not found in {folder_name}")
@@ -195,37 +164,38 @@ def load_artifacts(folder_name, file_name):
195
  st.error(f"Error loading {file_name}: {e}")
196
  return None
197
 
198
- def train_model(model_name, X_train, X_test, y_train, y_test):
199
- """Train model with simplified selection"""
200
  try:
201
  os.makedirs("models", exist_ok=True)
202
 
203
- # Simplified model dictionary
204
- models_dict = {
205
- "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
206
- "Decision Tree": DecisionTreeClassifier(random_state=42),
207
- "Random Forest": RandomForestClassifier(n_estimators=50, random_state=42), # Reduced for speed
208
- "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
209
- "Multinomial Naive Bayes": MultinomialNB(),
210
- }
211
-
212
- if model_name not in models_dict:
213
  st.error(f"Model {model_name} not supported")
214
  return None
215
 
216
- model = models_dict[model_name]
 
 
 
 
 
 
 
 
217
 
218
  # Train model
219
- model.fit(X_train, y_train)
 
220
 
221
  # Save model
222
  model_filename = f"{model_name.replace(' ', '_')}.pkl"
223
- save_path = os.path.join("models", model_filename)
224
 
225
  if save_artifacts(model, "models", model_filename):
226
- # Evaluate
227
  y_pred = model.predict(X_test)
228
- accuracy = accuracy_score(y_test, y_pred)
229
 
230
  st.success("โœ… Model training completed!")
231
  st.write(f"**Accuracy**: {accuracy:.4f}")
@@ -238,8 +208,8 @@ def train_model(model_name, X_train, X_test, y_train, y_test):
238
  st.error(f"Error training model: {e}")
239
  return None
240
 
241
- def predict_text(model_name, text, vectorizer_type="tfidf"):
242
- """Make prediction with better error handling"""
243
  try:
244
  # Load components
245
  model = load_artifacts("models", model_name)
@@ -267,7 +237,7 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
267
  text_vector = vectorizer.transform([clean_text])
268
  prediction = model.predict(text_vector)
269
 
270
- # Get probabilities if possible
271
  prediction_proba = None
272
  if hasattr(model, 'predict_proba'):
273
  try:
@@ -285,199 +255,232 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
285
  return None, None
286
 
287
  # Main Streamlit App
288
- st.title('๐Ÿค– No Code Text Classification App')
 
 
289
 
290
- # Show NLTK status
291
- if not NLTK_AVAILABLE:
292
- st.warning("โš ๏ธ NLTK not fully available. Using basic text processing.")
293
 
294
- st.write('Understand the behavior of your text data and train a model to classify text data')
 
 
295
 
296
- # Sidebar
297
- section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
 
298
 
299
- # Upload Data
300
- st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
301
- train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
- # Initialize session state
304
- if 'vectorizer_type' not in st.session_state:
305
- st.session_state.vectorizer_type = "tfidf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
307
- # Load and process data
308
- train_df = None
309
- if train_data is not None:
310
- try:
311
- # Try different encodings
312
- for encoding in ['utf-8', 'latin1', 'iso-8859-1']:
313
- try:
314
- train_df = pd.read_csv(train_data, encoding=encoding)
315
- break
316
- except UnicodeDecodeError:
317
- continue
318
-
319
- if train_df is None:
320
- st.error("Could not read the CSV file. Please check the encoding.")
321
- else:
322
- st.write("**Training Data Preview:**")
323
- st.dataframe(train_df.head(3))
324
 
325
- columns = train_df.columns.tolist()
326
- text_data = st.sidebar.selectbox("Choose the text column:", columns)
327
- target = st.sidebar.selectbox("Choose the target column:", columns)
328
-
329
- # Process data
330
- if text_data and target:
331
- with st.spinner("Processing data..."):
332
- text_cleaner = TextCleaner()
333
- train_df['clean_text'] = train_df[text_data].apply(
334
- lambda x: text_cleaner.clean_text(x) if pd.notna(x) else ""
335
- )
336
- train_df['text_length'] = train_df[text_data].astype(str).str.len()
337
-
338
- # Handle label encoding
339
- label_encoder = LabelEncoder()
340
- train_df['target'] = label_encoder.fit_transform(train_df[target].astype(str))
341
-
342
- # Save encoder
343
- save_artifacts(label_encoder, "artifacts", "encoder.pkl")
344
 
345
- except Exception as e:
346
- st.error(f"Error processing data: {e}")
347
- train_df = None
348
-
349
- # Data Analysis Section
350
- if section == "Data Analysis":
351
- if train_df is not None:
352
- st.subheader("๐Ÿ“Š Data Insights")
353
-
354
- analyzer = DataAnalyzer(train_df, text_data, target)
355
- info = analyzer.get_basic_info()
356
-
357
- col1, col2, col3 = st.columns(3)
358
- with col1:
359
- st.metric("Total Samples", info['shape'][0])
360
- with col2:
361
- st.metric("Features", info['shape'][1])
362
- with col3:
363
- st.metric("Classes", len(info['class_distribution']))
364
-
365
- st.write("**Class Distribution:**")
366
- st.write(info['class_distribution'])
367
-
368
- # Show sample of processed data
369
- st.write("**Processed Data Preview:**")
370
- sample_df = train_df[['clean_text', 'text_length', 'target']].head(10)
371
- st.dataframe(sample_df)
372
-
373
- st.subheader("๐Ÿ“ˆ Visualizations")
374
-
375
- col1, col2 = st.columns(2)
376
- with col1:
377
- st.write("**Class Distribution**")
378
- analyzer.plot_class_distribution()
379
-
380
- with col2:
381
- st.write("**Text Length Distribution**")
382
- analyzer.plot_text_length_distribution()
383
- else:
384
- st.warning("โš ๏ธ Please upload training data to see analysis")
385
 
386
- # Train Model Section
387
- elif section == "Train Model":
388
- if train_df is not None and 'clean_text' in train_df.columns:
389
- st.subheader("๐Ÿš€ Train a Model")
390
 
391
- col1, col2 = st.columns(2)
392
 
393
- with col1:
394
- model = st.selectbox("Choose the Model", [
395
- "Logistic Regression",
396
- "Decision Tree",
397
- "Random Forest",
398
- "Linear SVC",
399
- "Multinomial Naive Bayes"
400
- ])
401
-
402
- with col2:
403
- vectorizer_choice = st.selectbox("Choose Vectorizer",
404
- ["Tfidf Vectorizer", "Count Vectorizer"])
405
-
406
- # Filter out empty texts
407
- valid_data = train_df[train_df['clean_text'].str.len() > 0].copy()
408
-
409
- if len(valid_data) == 0:
410
- st.error("No valid text data after cleaning!")
411
- else:
412
- st.write(f"**Valid samples**: {len(valid_data)}")
413
 
414
- # Initialize vectorizer
415
- max_features = min(10000, len(valid_data) * 10) # Adaptive max_features
 
 
 
 
416
 
417
- if vectorizer_choice == "Tfidf Vectorizer":
418
- vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
419
- st.session_state.vectorizer_type = "tfidf"
420
  else:
421
- vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
422
- st.session_state.vectorizer_type = "count"
 
 
 
 
 
423
 
424
- if st.button("๐ŸŽฏ Start Training", type="primary"):
425
- with st.spinner("Training model..."):
 
 
 
 
 
 
 
 
 
426
  try:
427
  # Vectorize
428
- X = vectorizer.fit_transform(valid_data['clean_text'])
429
- y = valid_data['target']
 
 
 
430
 
431
  # Split data
432
- test_size = min(0.3, max(0.1, len(valid_data) * 0.2 / len(valid_data)))
433
- X_train, X_test, y_train, y_test = train_test_split(
434
  X, y, test_size=test_size, random_state=42, stratify=y
435
  )
436
 
437
- st.write(f"**Data split** - Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")
438
 
439
  # Save vectorizer
440
  vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
441
  if save_artifacts(vectorizer, "artifacts", vectorizer_filename):
442
  # Train model
443
- model_filename = train_model(model, X_train, X_test, y_train, y_test)
444
  if model_filename:
445
- st.success("โœ… Model ready! Go to 'Predictions' to test it.")
 
446
 
447
  except Exception as e:
448
- st.error(f"Training failed: {e}")
449
- else:
450
- st.warning("โš ๏ธ Please upload and process training data first")
451
 
452
- # Predictions Section
453
- elif section == "Predictions":
454
- st.subheader("๐Ÿ”ฎ Make Predictions")
455
-
456
- if os.path.exists("models") and os.listdir("models"):
457
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
458
 
459
- if available_models:
460
- selected_model = st.selectbox("Choose trained model:", available_models)
461
 
462
- text_input = st.text_area("Enter text to classify:",
463
- height=100,
464
- placeholder="Type your text here...")
465
-
466
- if st.button("๐ŸŽฏ Predict", type="primary"):
467
- if text_input.strip():
468
- with st.spinner("Making prediction..."):
 
 
 
 
 
 
 
469
  predicted_label, prediction_proba = predict_text(
470
  selected_model,
471
  text_input,
472
- st.session_state.get('vectorizer_type', 'tfidf')
 
473
  )
474
 
475
  if predicted_label is not None:
476
  st.success("โœ… Prediction completed!")
 
 
 
477
  st.markdown(f"**Predicted Class:** `{predicted_label}`")
478
 
 
479
  if prediction_proba is not None:
480
- st.markdown("**Class Probabilities:**")
481
  encoder = load_artifacts("artifacts", "encoder.pkl")
482
  if encoder is not None:
483
  classes = encoder.classes_
@@ -486,14 +489,21 @@ elif section == "Predictions":
486
  'Probability': prediction_proba
487
  }).sort_values('Probability', ascending=False)
488
 
489
- st.dataframe(prob_df, use_container_width=True)
490
- else:
491
- st.warning("โš ๏ธ Please enter some text")
 
 
 
 
 
 
492
  else:
493
- st.warning("โš ๏ธ No trained models found")
494
- else:
495
- st.warning("โš ๏ธ No models available. Please train a model first.")
 
 
496
 
497
- # Footer
498
- st.markdown("---")
499
- st.markdown("๐Ÿš€ Built with Streamlit | Ready for ๐Ÿค— Hugging Face Spaces")
 
7
  import re
8
  import string
9
  from pathlib import Path
 
 
 
 
 
 
 
 
 
10
 
11
  # Configure Streamlit page
12
  st.set_page_config(page_title="No Code Text Classifier", page_icon="๐Ÿค–", layout="wide")
13
 
14
# Lazy imports to speed up startup
@st.cache_resource
def load_ml_libraries():
    """Import scikit-learn lazily and expose its classes via one lookup dict.

    Returns:
        dict with 'LabelEncoder', 'TfidfVectorizer', 'CountVectorizer',
        'train_test_split', 'accuracy_score' entries plus a 'models'
        sub-dict mapping display names to estimator classes, or None
        (after showing a Streamlit error) when sklearn cannot be imported.
    """
    try:
        from sklearn.preprocessing import LabelEncoder
        from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score
        from sklearn.linear_model import LogisticRegression
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.svm import LinearSVC
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.naive_bayes import MultinomialNB
    except ImportError as e:
        # Surface the missing-dependency problem in the UI instead of crashing.
        st.error(f"Error importing ML libraries: {e}")
        return None

    libs = {
        'LabelEncoder': LabelEncoder,
        'TfidfVectorizer': TfidfVectorizer,
        'CountVectorizer': CountVectorizer,
        'train_test_split': train_test_split,
        'accuracy_score': accuracy_score,
    }
    # Display-name -> estimator class; instantiated later in train_model.
    libs['models'] = {
        "Logistic Regression": LogisticRegression,
        "Decision Tree": DecisionTreeClassifier,
        "Random Forest": RandomForestClassifier,
        "Linear SVC": LinearSVC,
        "Multinomial Naive Bayes": MultinomialNB,
    }
    return libs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
# Basic stopwords (no NLTK dependency): a fixed English function-word list
# used by TextCleaner when filtering tokens.
BASIC_STOPWORDS = set(
    """
    i me my myself we our ours ourselves you
    your yours yourself yourselves he him his
    himself she her hers herself it its itself
    they them their theirs themselves what which
    who whom this that these those am is are
    was were be been being have has had having
    do does did doing a an the and but if
    or because as until while of at by for
    with through during before after above below
    up down in out on off over under again
    further then once
    """.split()
)
61
 
62
  class TextCleaner:
63
+ """Lightweight text cleaner without NLTK dependencies"""
64
  def __init__(self):
65
  self.currency_symbols = r'[\$\ยฃ\โ‚ฌ\ยฅ\โ‚น\ยข\โ‚ฝ\โ‚ฉ\โ‚ช]'
66
+ self.stop_words = BASIC_STOPWORDS
 
 
67
 
68
  def remove_punctuation(self, text):
69
  return text.translate(str.maketrans('', '', string.punctuation))
70
 
71
  def clean_text(self, text):
72
+ """Clean text with basic processing"""
73
  if not isinstance(text, str):
74
  text = str(text) if text is not None else ""
75
 
 
81
  text = text.lower()
82
  text = re.sub(self.currency_symbols, 'currency', text)
83
 
84
+ # Remove emojis (simplified pattern)
85
+ text = re.sub(r'[^\w\s]', ' ', text)
86
+ text = re.sub(r'\d+', ' ', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  text = re.sub(r'\s+', ' ', text).strip()
88
 
89
+ # Remove stopwords
90
+ words = [word for word in text.split() if word not in self.stop_words and len(word) > 2]
 
91
 
92
+ return ' '.join(words)
 
 
 
 
 
 
 
93
 
94
  except Exception as e:
95
  st.warning(f"Text cleaning warning: {e}")
96
+ return str(text).lower()
97
 
98
  class DataAnalyzer:
99
+ """Lightweight data analyzer"""
100
  def __init__(self, df, text_column, target_column):
101
  self.df = df
102
  self.text_column = text_column
 
112
 
113
  def plot_class_distribution(self):
114
  try:
115
+ fig, ax = plt.subplots(figsize=(8, 5))
116
+ self.df[self.target_column].value_counts().plot(kind='bar', ax=ax, color='steelblue')
117
  ax.set_title('Class Distribution')
118
  ax.set_xlabel('Classes')
119
  ax.set_ylabel('Count')
120
  plt.xticks(rotation=45)
121
  plt.tight_layout()
122
  st.pyplot(fig)
123
+ plt.close()
124
  except Exception as e:
125
  st.error(f"Error creating plot: {e}")
126
 
127
  def plot_text_length_distribution(self):
128
  try:
129
+ fig, ax = plt.subplots(figsize=(8, 5))
130
+ text_lengths = self.df[self.text_column].astype(str).str.len()
131
+ ax.hist(text_lengths, bins=30, alpha=0.7, color='lightcoral')
132
  ax.set_title('Text Length Distribution')
133
+ ax.set_xlabel('Text Length (characters)')
134
  ax.set_ylabel('Frequency')
135
  plt.tight_layout()
136
  st.pyplot(fig)
137
+ plt.close()
138
  except Exception as e:
139
  st.error(f"Error creating plot: {e}")
140
 
141
+ # Utility functions
142
  def save_artifacts(obj, folder_name, file_name):
143
  """Save artifacts with error handling"""
144
  try:
145
  os.makedirs(folder_name, exist_ok=True)
146
+ file_path = os.path.join(folder_name, file_name)
147
+ with open(file_path, 'wb') as f:
148
  pickle.dump(obj, f)
149
  return True
150
  except Exception as e:
 
154
  def load_artifacts(folder_name, file_name):
155
  """Load artifacts with error handling"""
156
  try:
157
+ file_path = os.path.join(folder_name, file_name)
158
+ with open(file_path, 'rb') as f:
159
  return pickle.load(f)
160
  except FileNotFoundError:
161
  st.error(f"File {file_name} not found in {folder_name}")
 
164
  st.error(f"Error loading {file_name}: {e}")
165
  return None
166
 
167
+ def train_model(model_name, X_train, X_test, y_train, y_test, ml_libs):
168
+ """Train model with optimized parameters"""
169
  try:
170
  os.makedirs("models", exist_ok=True)
171
 
172
+ # Get model class
173
+ model_class = ml_libs['models'].get(model_name)
174
+ if not model_class:
 
 
 
 
 
 
 
175
  st.error(f"Model {model_name} not supported")
176
  return None
177
 
178
+ # Initialize model with faster parameters
179
+ if model_name == "Logistic Regression":
180
+ model = model_class(max_iter=500, random_state=42, solver='liblinear')
181
+ elif model_name == "Random Forest":
182
+ model = model_class(n_estimators=20, random_state=42, n_jobs=1) # Reduced trees
183
+ elif model_name == "Linear SVC":
184
+ model = model_class(random_state=42, max_iter=500)
185
+ else:
186
+ model = model_class(random_state=42) if 'random_state' in model_class().get_params() else model_class()
187
 
188
  # Train model
189
+ with st.spinner(f"Training {model_name}..."):
190
+ model.fit(X_train, y_train)
191
 
192
  # Save model
193
  model_filename = f"{model_name.replace(' ', '_')}.pkl"
 
194
 
195
  if save_artifacts(model, "models", model_filename):
196
+ # Quick evaluation
197
  y_pred = model.predict(X_test)
198
+ accuracy = ml_libs['accuracy_score'](y_test, y_pred)
199
 
200
  st.success("โœ… Model training completed!")
201
  st.write(f"**Accuracy**: {accuracy:.4f}")
 
208
  st.error(f"Error training model: {e}")
209
  return None
210
 
211
+ def predict_text(model_name, text, vectorizer_type="tfidf", ml_libs=None):
212
+ """Make prediction with error handling"""
213
  try:
214
  # Load components
215
  model = load_artifacts("models", model_name)
 
237
  text_vector = vectorizer.transform([clean_text])
238
  prediction = model.predict(text_vector)
239
 
240
+ # Get probabilities if available
241
  prediction_proba = None
242
  if hasattr(model, 'predict_proba'):
243
  try:
 
255
  return None, None
256
 
257
  # Main Streamlit App
258
+ def main():
259
+ st.title('๐Ÿค– No Code Text Classification App')
260
+ st.write('Build and deploy text classification models without coding!')
261
 
262
+ # Sidebar
263
+ section = st.sidebar.radio("Choose Section", ["๐Ÿ“Š Data Analysis", "๐Ÿš€ Train Model", "๐Ÿ”ฎ Predictions"])
 
264
 
265
+ # Upload Data
266
+ st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
267
+ train_data = st.sidebar.file_uploader("Upload training data (CSV)", type=["csv"])
268
 
269
+ # Initialize session state
270
+ if 'vectorizer_type' not in st.session_state:
271
+ st.session_state.vectorizer_type = "tfidf"
272
 
273
+ # Load and process data
274
+ train_df = None
275
+ if train_data is not None:
276
+ try:
277
+ # Try different encodings
278
+ for encoding in ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']:
279
+ try:
280
+ train_df = pd.read_csv(train_data, encoding=encoding)
281
+ st.success(f"โœ… File loaded successfully with {encoding} encoding")
282
+ break
283
+ except UnicodeDecodeError:
284
+ continue
285
+
286
+ if train_df is None:
287
+ st.error("โŒ Could not read the CSV file. Please check the file format.")
288
+ else:
289
+ st.write("**Training Data Preview:**")
290
+ st.dataframe(train_df.head(3))
291
+
292
+ columns = train_df.columns.tolist()
293
+ text_data = st.sidebar.selectbox("๐Ÿ“ Choose the text column:", columns)
294
+ target = st.sidebar.selectbox("๐ŸŽฏ Choose the target column:", columns)
295
 
296
+ # Process data
297
+ if text_data and target and st.sidebar.button("Process Data"):
298
+ with st.spinner("Processing data..."):
299
+ text_cleaner = TextCleaner()
300
+
301
+ # Clean text with progress
302
+ progress_bar = st.progress(0)
303
+ cleaned_texts = []
304
+ for i, text in enumerate(train_df[text_data]):
305
+ cleaned_texts.append(text_cleaner.clean_text(text) if pd.notna(text) else "")
306
+ progress_bar.progress((i + 1) / len(train_df))
307
+
308
+ train_df['clean_text'] = cleaned_texts
309
+ train_df['text_length'] = train_df[text_data].astype(str).str.len()
310
+
311
+ # Handle label encoding
312
+ ml_libs = load_ml_libraries()
313
+ if ml_libs:
314
+ label_encoder = ml_libs['LabelEncoder']()
315
+ train_df['target'] = label_encoder.fit_transform(train_df[target].astype(str))
316
+
317
+ # Save encoder
318
+ save_artifacts(label_encoder, "artifacts", "encoder.pkl")
319
+ st.success("โœ… Data processed successfully!")
320
+ else:
321
+ st.error("โŒ Could not load ML libraries")
322
+
323
+ except Exception as e:
324
+ st.error(f"โŒ Error processing data: {e}")
325
+ train_df = None
326
 
327
+ # Data Analysis Section
328
+ if section == "๐Ÿ“Š Data Analysis":
329
+ if train_df is not None and 'clean_text' in train_df.columns:
330
+ st.subheader("๐Ÿ“Š Data Insights")
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
+ analyzer = DataAnalyzer(train_df, text_data, target)
333
+ info = analyzer.get_basic_info()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
+ # Metrics
336
+ col1, col2, col3 = st.columns(3)
337
+ with col1:
338
+ st.metric("๐Ÿ“„ Total Samples", f"{info['shape'][0]:,}")
339
+ with col2:
340
+ st.metric("๐Ÿ“Š Features", info['shape'][1])
341
+ with col3:
342
+ st.metric("๐Ÿท๏ธ Classes", len(info['class_distribution']))
343
+
344
+ # Class distribution
345
+ st.write("**Class Distribution:**")
346
+ class_dist_df = pd.DataFrame(list(info['class_distribution'].items()),
347
+ columns=['Class', 'Count'])
348
+ st.dataframe(class_dist_df, use_container_width=True)
349
+
350
+ # Sample data
351
+ st.write("**Processed Data Sample:**")
352
+ if 'clean_text' in train_df.columns:
353
+ sample_df = train_df[['clean_text', 'text_length', target]].head(5)
354
+ st.dataframe(sample_df, use_container_width=True)
355
+
356
+ # Visualizations
357
+ st.subheader("๐Ÿ“ˆ Data Visualizations")
358
+
359
+ col1, col2 = st.columns(2)
360
+ with col1:
361
+ st.write("**Class Distribution**")
362
+ analyzer.plot_class_distribution()
363
+
364
+ with col2:
365
+ st.write("**Text Length Distribution**")
366
+ analyzer.plot_text_length_distribution()
367
+
368
+ else:
369
+ st.info("๐Ÿ“‹ Upload and process your data to see analysis")
 
 
 
 
 
370
 
371
+ # Train Model Section
372
+ elif section == "๐Ÿš€ Train Model":
373
+ if train_df is not None and 'clean_text' in train_df.columns:
374
+ st.subheader("๐Ÿš€ Train Your Classification Model")
375
 
376
+ col1, col2 = st.columns(2)
377
 
378
+ with col1:
379
+ model = st.selectbox("๐Ÿค– Choose Model", [
380
+ "Logistic Regression",
381
+ "Decision Tree",
382
+ "Random Forest",
383
+ "Linear SVC",
384
+ "Multinomial Naive Bayes"
385
+ ])
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ with col2:
388
+ vectorizer_choice = st.selectbox("๐Ÿ“Š Choose Vectorizer",
389
+ ["Tfidf Vectorizer", "Count Vectorizer"])
390
+
391
+ # Filter out empty texts
392
+ valid_data = train_df[train_df['clean_text'].str.len() > 0].copy()
393
 
394
+ if len(valid_data) < 10:
395
+ st.error("โŒ Not enough valid text data after cleaning! Need at least 10 samples.")
 
396
  else:
397
+ st.info(f"โœ… Ready to train with {len(valid_data):,} valid samples")
398
+
399
+ # Load ML libraries when needed
400
+ ml_libs = load_ml_libraries()
401
+ if not ml_libs:
402
+ st.error("โŒ Could not load ML libraries")
403
+ return
404
 
405
+ # Initialize vectorizer
406
+ max_features = min(5000, len(valid_data) * 5) # Conservative limit
407
+
408
+ if vectorizer_choice == "Tfidf Vectorizer":
409
+ vectorizer = ml_libs['TfidfVectorizer'](max_features=max_features, stop_words='english', ngram_range=(1,1))
410
+ st.session_state.vectorizer_type = "tfidf"
411
+ else:
412
+ vectorizer = ml_libs['CountVectorizer'](max_features=max_features, stop_words='english', ngram_range=(1,1))
413
+ st.session_state.vectorizer_type = "count"
414
+
415
+ if st.button("๐ŸŽฏ Start Training", type="primary"):
416
  try:
417
  # Vectorize
418
+ with st.spinner("Vectorizing text data..."):
419
+ X = vectorizer.fit_transform(valid_data['clean_text'])
420
+ y = valid_data['target']
421
+
422
+ st.write(f"๐Ÿ“Š **Feature matrix shape:** {X.shape}")
423
 
424
  # Split data
425
+ test_size = min(0.3, max(0.1, 50 / len(valid_data)))
426
+ X_train, X_test, y_train, y_test = ml_libs['train_test_split'](
427
  X, y, test_size=test_size, random_state=42, stratify=y
428
  )
429
 
430
+ st.write(f"๐Ÿ“ˆ **Data split** - Train: {X_train.shape[0]:,}, Test: {X_test.shape[0]:,}")
431
 
432
  # Save vectorizer
433
  vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
434
  if save_artifacts(vectorizer, "artifacts", vectorizer_filename):
435
  # Train model
436
+ model_filename = train_model(model, X_train, X_test, y_train, y_test, ml_libs)
437
  if model_filename:
438
+ st.balloons()
439
+ st.success("๐ŸŽ‰ Model ready! Go to 'Predictions' to test it.")
440
 
441
  except Exception as e:
442
+ st.error(f"โŒ Training failed: {e}")
443
+ else:
444
+ st.info("๐Ÿ“‹ Please upload and process training data first")
445
 
446
+ # Predictions Section
447
+ elif section == "๐Ÿ”ฎ Predictions":
448
+ st.subheader("๐Ÿ”ฎ Make Predictions")
 
 
 
449
 
450
+ if os.path.exists("models") and os.listdir("models"):
451
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
452
 
453
+ if available_models:
454
+ selected_model = st.selectbox("๐Ÿค– Choose trained model:", available_models)
455
+
456
+ text_input = st.text_area("๐Ÿ“ Enter text to classify:",
457
+ height=120,
458
+ placeholder="Type your text here...")
459
+
460
+ col1, col2 = st.columns([1, 3])
461
+ with col1:
462
+ predict_button = st.button("๐ŸŽฏ Predict", type="primary")
463
+
464
+ if predict_button and text_input.strip():
465
+ ml_libs = load_ml_libraries()
466
+ if ml_libs:
467
  predicted_label, prediction_proba = predict_text(
468
  selected_model,
469
  text_input,
470
+ st.session_state.get('vectorizer_type', 'tfidf'),
471
+ ml_libs
472
  )
473
 
474
  if predicted_label is not None:
475
  st.success("โœ… Prediction completed!")
476
+
477
+ # Show prediction
478
+ st.markdown("### ๐Ÿท๏ธ Prediction Result")
479
  st.markdown(f"**Predicted Class:** `{predicted_label}`")
480
 
481
+ # Show probabilities if available
482
  if prediction_proba is not None:
483
+ st.markdown("### ๐Ÿ“Š Class Probabilities")
484
  encoder = load_artifacts("artifacts", "encoder.pkl")
485
  if encoder is not None:
486
  classes = encoder.classes_
 
489
  'Probability': prediction_proba
490
  }).sort_values('Probability', ascending=False)
491
 
492
+ # Create a nice probability display
493
+ for idx, row in prob_df.iterrows():
494
+ confidence = row['Probability']
495
+ st.write(f"**{row['Class']}**: {confidence:.1%}")
496
+ st.progress(confidence)
497
+ elif predict_button:
498
+ st.warning("โš ๏ธ Please enter some text to classify")
499
+ else:
500
+ st.info("๐Ÿ“‹ No trained models found")
501
  else:
502
+ st.info("๐Ÿ“‹ No models available. Please train a model first in the 'Train Model' section.")
503
+
504
+ # Footer
505
+ st.markdown("---")
506
+ st.markdown("๐Ÿš€ **Built with Streamlit** | Ready for deployment on Hugging Face Spaces")
507
 
508
+ if __name__ == "__main__":
509
+ main()