Alamgirapi committed on
Commit
56d15cb
·
verified ·
1 Parent(s): 060249f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +515 -0
app.py CHANGED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import os
6
+ import pickle
7
+ import ssl
8
+ import nltk
9
+ import re
10
+ import string
11
+ from pathlib import Path
12
+ from sklearn.preprocessing import LabelEncoder
13
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
16
+ from sklearn.linear_model import LogisticRegression
17
+ from sklearn.tree import DecisionTreeClassifier
18
+ from sklearn.svm import LinearSVC, SVC
19
+ from sklearn.ensemble import RandomForestClassifier
20
+ from sklearn.naive_bayes import MultinomialNB, GaussianNB
21
+ from nltk.corpus import stopwords
22
+ from nltk.stem import WordNetLemmatizer
23
+
24
# Workaround for SSL certificate errors during NLTK corpus downloads:
# route urllib's default HTTPS context through the unverified one.
# NOTE(review): this disables certificate verification process-wide;
# acceptable only as a download workaround.
if hasattr(ssl, "_create_unverified_context"):
    ssl._create_default_https_context = ssl._create_unverified_context
31
+
32
# Download NLTK data with error handling
@st.cache_resource
def download_nltk_data():
    """Ensure the NLTK corpora used by TextCleaner are installed.

    Each corpus is checked and downloaded independently. Cached with
    st.cache_resource so it runs once per process, not on every rerun.
    """
    # Fix: previously 'omw-1.4' was only fetched when the 'wordnet' lookup
    # failed, so it could stay missing when wordnet was already present.
    for resource, path in [
        ("stopwords", "corpora/stopwords"),
        ("wordnet", "corpora/wordnet"),
        ("omw-1.4", "corpora/omw-1.4"),
    ]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource, quiet=True)

# Download required NLTK data
download_nltk_data()
48
+
49
class TextCleaner:
    """Clean raw text for vectorization.

    Pipeline: lowercase, replace currency symbols with the token
    'currency', strip emoji/HTML/punctuation/digits, collapse whitespace,
    drop English stopwords, and lemmatize each remaining word.
    """

    def __init__(self, currency_symbols=r'[\$\£\€\¥\₹\¢\₽\₩\₪]', stop_words=None, lemmatizer=None):
        # Fix: the currency character class was mojibake (UTF-8 bytes decoded
        # with the wrong codec); restored the intended currency symbols.
        self.currency_symbols = currency_symbols

        if stop_words is None:
            try:
                self.stop_words = set(stopwords.words('english'))
            except LookupError:
                # Corpus missing — fetch it and retry.
                nltk.download('stopwords', quiet=True)
                self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = stop_words

        if lemmatizer is None:
            try:
                self.lemmatizer = WordNetLemmatizer()
                # Trigger lazy corpus loading so a missing corpus fails here,
                # not in the middle of clean_text().
                self.lemmatizer.lemmatize('testing')
            except (AttributeError, LookupError) as e:
                print(f"WordNet lemmatizer initialization failed: {e}")
                nltk.download('wordnet', quiet=True)
                nltk.download('omw-1.4', quiet=True)
                self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = lemmatizer

    def remove_punctuation(self, text):
        """Strip all ASCII punctuation in a single C-level pass."""
        return text.translate(str.maketrans('', '', string.punctuation))

    def clean_text(self, text):
        """Clean the text by removing punctuation, HTML tags, underscores,
        extra whitespace, digits, and stopwords; lemmatize remaining words.

        Non-string input is coerced via str(); None becomes "". On any
        unexpected error the (possibly partially cleaned) text is returned
        as a string rather than raising.
        """
        if not isinstance(text, str):
            text = str(text) if text is not None else ""

        if not text.strip():
            return ""

        try:
            text = text.lower()
            text = re.sub(self.currency_symbols, 'currency', text)

            # Remove any kind of emojis in the text
            emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                "]+", flags=re.UNICODE)
            text = emoji_pattern.sub(r'', text)
            text = self.remove_punctuation(text)
            text = re.compile('<.*?>').sub('', text)
            text = text.replace('_', '')
            text = re.sub(r'[^\w\s]', '', text)
            text = re.sub(r'\d', ' ', text)  # digits become spaces, then collapse
            text = re.sub(r'\s+', ' ', text).strip()
            text = ' '.join(word for word in text.split() if word not in self.stop_words)

            # Lemmatization with error handling — a missing corpus degrades
            # gracefully to the unlemmatized text.
            try:
                text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
            except (AttributeError, LookupError) as e:
                print(f"Lemmatization failed for text: {e}")

            return str(text)

        except Exception as e:
            print(f"Error cleaning text: {e}")
            return str(text)
122
+
123
class DataAnalyzer:
    """Summary statistics and basic Streamlit plots for a labelled text dataset."""

    def __init__(self, df, text_column, target_column):
        self.df = df
        self.text_column = text_column
        self.target_column = target_column

    def get_basic_info(self):
        """Return dataset shape, per-column missing counts, and class counts."""
        frame = self.df
        return {
            'shape': frame.shape,
            'missing_values': frame.isnull().sum().to_dict(),
            'class_distribution': frame[self.target_column].value_counts().to_dict(),
        }

    def plot_class_distribution(self):
        """Render a bar chart of label frequencies into the Streamlit app."""
        figure, axis = plt.subplots(figsize=(10, 6))
        label_counts = self.df[self.target_column].value_counts()
        label_counts.plot(kind='bar', ax=axis)
        axis.set_title('Class Distribution')
        axis.set_xlabel('Classes')
        axis.set_ylabel('Count')
        plt.xticks(rotation=45)
        st.pyplot(figure)

    def plot_text_length_distribution(self):
        """Render a histogram of raw text lengths into the Streamlit app."""
        figure, axis = plt.subplots(figsize=(10, 6))
        lengths = self.df[self.text_column].str.len()
        axis.hist(lengths, bins=50, alpha=0.7)
        axis.set_title('Text Length Distribution')
        axis.set_xlabel('Text Length')
        axis.set_ylabel('Frequency')
        st.pyplot(figure)
155
+
156
# Utility functions
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed."""
    os.makedirs(folder_name, exist_ok=True)
    target_path = os.path.join(folder_name, file_name)
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle)
162
+
163
def load_artifacts(folder_name, file_name):
    """Unpickle a saved artifact.

    Returns the object, or None (after showing a Streamlit error) when the
    file does not exist.
    """
    artifact_path = os.path.join(folder_name, file_name)
    try:
        with open(artifact_path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        return None
171
+
172
def load_model(model_name):
    """Unpickle a trained model from the models/ directory.

    Returns the model object, or None (after showing a Streamlit error)
    when no such file has been saved yet.
    """
    model_path = os.path.join('models', model_name)
    try:
        with open(model_path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"Model {model_name} not found. Please train a model first.")
        return None
180
+
181
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Train the selected classifier, save it under models/, and report accuracy.

    The inputs are a train/test split of vectorized text. Returns the saved
    model's filename, or None when model_name is not supported.
    """
    os.makedirs("models", exist_ok=True)

    models_dict = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Linear SVC": LinearSVC(),
        "SVC": SVC(),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB()
    }

    # Guard clause instead of a long if/else body.
    if model_name not in models_dict:
        st.error(f"Model {model_name} not supported")
        return None

    model = models_dict[model_name]

    # Fix: GaussianNB cannot fit scipy sparse matrices, which is exactly what
    # the Tfidf/Count vectorizers produce — densify for that model only.
    if model_name == "Gaussian Naive Bayes":
        if hasattr(X_train, "toarray"):
            X_train = X_train.toarray()
        if hasattr(X_test, "toarray"):
            X_test = X_test.toarray()

    model.fit(X_train, y_train)

    # Save model
    model_filename = f"{model_name.replace(' ', '')}.pkl"
    save_path = os.path.join("models", model_filename)
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)

    # Evaluate model on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    st.success("Model training completed!")
    st.write(f"**Accuracy**: {accuracy:.4f}")

    return model_filename
216
+
217
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify *text* with a previously trained model.

    Loads the pickled model, the matching vectorizer, and the label encoder;
    cleans the input exactly as at training time; returns
    (predicted_label, class_probabilities_or_None). On any failure the error
    is shown in the Streamlit UI and (None, None) is returned.
    """
    try:
        # Load model
        model = load_model(model_name)
        if model is None:
            return None, None

        # Load the vectorizer fitted at training time — transform() must use
        # the same vocabulary the model was trained on.
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)

        # Transform text using the same vectorizer used during training
        text_vector = vectorizer.transform([clean_text])

        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None

        # Get prediction probabilities if available
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # Fix: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit. Probabilities are optional,
                # so any model-level failure here is deliberately ignored.
                pass

        # Decode prediction back to the original label
        predicted_label = encoder.inverse_transform(prediction)[0]

        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
262
+
263
# Streamlit App
# Fix: emoji in user-facing strings were mojibake (UTF-8 decoded with a
# wrong codec); restored the intended characters.
st.set_page_config(page_title="No Code Text Classifier", page_icon="🤖", layout="wide")

st.title('🤖 No Code Text Classification App')
st.write('Understand the behavior of your text data and train a model to classify text data')

# Sidebar
section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])

# Upload Data
st.sidebar.subheader("📁 Upload Your Dataset")
train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])

# Remember which vectorizer was used for training across reruns,
# so the Predictions section loads the matching artifact.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"

if train_data is not None:
    try:
        # latin1 maps every byte to a character, avoiding UnicodeDecodeError
        # on arbitrary uploads.
        train_df = pd.read_csv(train_data, encoding='latin1')

        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding='latin1')
        else:
            test_df = None

        st.write("**Training Data Preview:**")
        st.dataframe(train_df.head(3))

        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("Choose the text column:", columns)
        target = st.sidebar.selectbox("Choose the target column:", columns)

        # Process data: clean text and record raw lengths for the histogram.
        text_cleaner = TextCleaner()
        train_df['clean_text'] = train_df[text_data].apply(lambda x: text_cleaner.clean_text(x))
        train_df['text_length'] = train_df[text_data].str.len()

        # Handle label encoding
        label_encoder = LabelEncoder()
        train_df['target'] = label_encoder.fit_transform(train_df[target])

        # Save label encoder so predictions can decode class indices later.
        os.makedirs("artifacts", exist_ok=True)
        save_artifacts(label_encoder, "artifacts", "encoder.pkl")

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        train_df = None
313
+
314
# Data Analysis Section
# Fix: restored mojibaked emoji in the section's user-facing strings.
if section == "Data Analysis":
    # Short-circuit keeps train_df from being referenced when no file was uploaded.
    if train_data is not None and train_df is not None:
        try:
            st.subheader("📊 Data Insights")

            analyzer = DataAnalyzer(train_df, text_data, target)
            info = analyzer.get_basic_info()

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total Samples", info['shape'][0])
            with col2:
                st.metric("Features", info['shape'][1])
            with col3:
                st.metric("Classes", len(info['class_distribution']))

            st.write("**Class Distribution:**")
            st.write(info['class_distribution'])

            st.write("**Missing Values:**")
            st.write(info['missing_values'])

            st.write("**Processed Data Preview:**")
            st.dataframe(train_df[['clean_text', 'text_length', 'target']].head())

            st.subheader("📈 Visualizations")

            col1, col2 = st.columns(2)
            with col1:
                st.write("**Class Distribution**")
                analyzer.plot_class_distribution()

            with col2:
                st.write("**Text Length Distribution**")
                analyzer.plot_text_length_distribution()

        except Exception as e:
            st.error(f"Error in data analysis: {str(e)}")
    else:
        st.warning("⚠️ Please upload training data to get insights")
355
+
356
+ # Train Model Section
357
+ elif section == "Train Model":
358
+ if train_data is not None and train_df is not None:
359
+ try:
360
+ st.subheader("๐Ÿš€ Train a Model")
361
+
362
+ col1, col2 = st.columns(2)
363
+
364
+ with col1:
365
+ model = st.selectbox("Choose the Model", [
366
+ "Logistic Regression", "Decision Tree",
367
+ "Random Forest", "Linear SVC", "SVC",
368
+ "Multinomial Naive Bayes", "Gaussian Naive Bayes"
369
+ ])
370
+
371
+ with col2:
372
+ vectorizer_choice = st.selectbox("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
373
+
374
+ # Initialize vectorizer
375
+ if vectorizer_choice == "Tfidf Vectorizer":
376
+ vectorizer = TfidfVectorizer(max_features=10000)
377
+ st.session_state.vectorizer_type = "tfidf"
378
+ else:
379
+ vectorizer = CountVectorizer(max_features=10000)
380
+ st.session_state.vectorizer_type = "count"
381
+
382
+ st.write("**Training Data Preview:**")
383
+ st.dataframe(train_df[['clean_text', 'target']].head())
384
+
385
+ # Vectorize text data
386
+ X = vectorizer.fit_transform(train_df['clean_text'])
387
+ y = train_df['target']
388
+
389
+ # Split data
390
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
391
+ st.write(f"**Data split** - Train: {X_train.shape}, Test: {X_test.shape}")
392
+
393
+ # Save vectorizer for later use
394
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
395
+ save_artifacts(vectorizer, "artifacts", vectorizer_filename)
396
+
397
+ if st.button("๐ŸŽฏ Start Training", type="primary"):
398
+ with st.spinner("Training model..."):
399
+ model_filename = train_model(model, X_train, X_test, y_train, y_test)
400
+ if model_filename:
401
+ st.info("โœ… You can now use the 'Predictions' section to classify new text.")
402
+
403
+ except Exception as e:
404
+ st.error(f"Error in model training: {str(e)}")
405
+ else:
406
+ st.warning("โš ๏ธ Please upload training data to train a model")
407
+
408
+ # Predictions Section
409
+ elif section == "Predictions":
410
+ st.subheader("๐Ÿ”ฎ Perform Predictions on New Text")
411
+
412
+ # Check if models exist
413
+ if os.path.exists("models") and os.listdir("models"):
414
+ # Text input for prediction
415
+ text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type your text here...")
416
+
417
+ # Model selection
418
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
419
+
420
+ if available_models:
421
+ selected_model = st.selectbox("Choose the trained model:", available_models)
422
+
423
+ # Prediction button
424
+ if st.button("๐ŸŽฏ Predict", type="primary"):
425
+ if text_input.strip():
426
+ with st.spinner("Making prediction..."):
427
+ predicted_label, prediction_proba = predict_text(
428
+ selected_model,
429
+ text_input,
430
+ st.session_state.get('vectorizer_type', 'tfidf')
431
+ )
432
+
433
+ if predicted_label is not None:
434
+ st.success("โœ… Prediction completed!")
435
+
436
+ # Display results
437
+ st.markdown("### ๐Ÿ“Š Prediction Results")
438
+ st.markdown(f"**Input Text:** {text_input}")
439
+ st.markdown(f"**Predicted Class:** `{predicted_label}`")
440
+
441
+ # Display probabilities if available
442
+ if prediction_proba is not None:
443
+ st.markdown("**๐Ÿ“ˆ Class Probabilities:**")
444
+
445
+ # Load encoder to get class names
446
+ encoder = load_artifacts("artifacts", "encoder.pkl")
447
+ if encoder is not None:
448
+ classes = encoder.classes_
449
+ prob_df = pd.DataFrame({
450
+ 'Class': classes,
451
+ 'Probability': prediction_proba
452
+ }).sort_values('Probability', ascending=False)
453
+
454
+ st.bar_chart(prob_df.set_index('Class'))
455
+ st.dataframe(prob_df, use_container_width=True)
456
+ else:
457
+ st.warning("โš ๏ธ Please enter some text to classify")
458
+ else:
459
+ st.warning("โš ๏ธ No trained models found. Please train a model first.")
460
+ else:
461
+ st.warning("โš ๏ธ No trained models found. Please go to 'Train Model' section to train a model first.")
462
+
463
+ # Option to classify multiple texts
464
+ st.markdown("---")
465
+ st.subheader("๐Ÿ“Š Batch Predictions")
466
+
467
+ uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
468
+
469
+ if uploaded_file is not None:
470
+ try:
471
+ batch_df = pd.read_csv(uploaded_file, encoding='latin1')
472
+ st.write("**Uploaded data preview:**")
473
+ st.dataframe(batch_df.head())
474
+
475
+ # Select text column
476
+ text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
477
+
478
+ if os.path.exists("models") and os.listdir("models"):
479
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
480
+ batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
481
+
482
+ if st.button("๐Ÿš€ Run Batch Predictions", type="primary"):
483
+ with st.spinner("Processing batch predictions..."):
484
+ predictions = []
485
+ progress_bar = st.progress(0)
486
+
487
+ for i, text in enumerate(batch_df[text_column]):
488
+ pred, _ = predict_text(
489
+ batch_model,
490
+ str(text),
491
+ st.session_state.get('vectorizer_type', 'tfidf')
492
+ )
493
+ predictions.append(pred if pred is not None else "Error")
494
+ progress_bar.progress((i + 1) / len(batch_df))
495
+
496
+ batch_df['Predicted_Class'] = predictions
497
+
498
+ st.success("โœ… Batch predictions completed!")
499
+ st.write("**Results:**")
500
+ st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)
501
+
502
+ # Download results
503
+ csv = batch_df.to_csv(index=False)
504
+ st.download_button(
505
+ label="๐Ÿ’พ Download predictions as CSV",
506
+ data=csv,
507
+ file_name="batch_predictions.csv",
508
+ mime="text/csv"
509
+ )
510
+ except Exception as e:
511
+ st.error(f"Error in batch prediction: {str(e)}")
512
+
513
# Footer
# Fix: restored mojibaked ❤️ and 🤗 characters.
st.markdown("---")
st.markdown("Built with ❤️ using Streamlit | Deploy on 🤗 Hugging Face Spaces")