Priyanhsu committed on
Commit
1835c4c
·
1 Parent(s): d37e8f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -16
app.py CHANGED
@@ -18,25 +18,85 @@ import nltk
18
  nltk.download('stopwords')
19
  nltk.download('punkt')
20
 
21
- # Load the trained model
22
- model = joblib.load('model.bin')
23
 
 
 
 
 
 
 
 
 
24
  def remove_punctuation(text):
25
  punctuation_free = "".join([i for i in text if i not in string.punctuation])
26
  return punctuation_free
27
-
28
- def vectorize_text(texts):
29
- vectorizer = CountVectorizer()
30
- vectorizer.fit(texts)
31
- text_vectorized = vectorizer.transform(texts)
32
- return text_vectorized, vectorizer
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def test_model(text):
35
  # Convert text to lowercase
36
  text = text.lower()
37
 
38
  # Remove punctuation
39
- text = remove_punctuation(text)
40
 
41
  # Remove numbers
42
  text = re.sub(r'\d+', '', text)
@@ -48,16 +108,16 @@ def test_model(text):
48
 
49
  # Join the filtered tokens back into a string
50
  preprocessed_text = ' '.join(filtered_text)
51
-
52
  # Vectorize the preprocessed text
53
- vectorize_texts = vectorize_text([preprocessed_text])
54
-
55
  # Make prediction on the vectorized text
56
- prediction = model.predict(vectorize_texts[0])[0]
57
 
58
  # Return the prediction
59
  return prediction
60
-
61
  # Create the Gradio interface
62
- iface = gr.Interface(fn=test_model, inputs="text", outputs="text", title="Text Classification")
63
- iface.launch()
 
 
18
  nltk.download('stopwords')
19
  nltk.download('punkt')
20
 
21
+ with open('NewData.json') as file:
22
+ data = json.load(file)
23
 
24
+ df = pd.DataFrame(data)
25
+ # shuffling all our data
26
+ df = df.sample(frac=1)
27
+ # keeping only the content and label columns
28
+ df = df[['content','label']]
29
+ df['clean_msg'] = df['content'].apply(lambda x: x.lower())
30
+ # Remove punctuation
31
+ import string
32
  def remove_punctuation(text):
33
  punctuation_free = "".join([i for i in text if i not in string.punctuation])
34
  return punctuation_free
 
 
 
 
 
 
35
 
36
+ df['clean_msg'] = df['clean_msg'].apply(lambda x: remove_punctuation(x))
37
+ # Tokenization
38
+ from nltk.tokenize import WhitespaceTokenizer
39
+ def tokenization(text):
40
+ tk = WhitespaceTokenizer()
41
+ return tk.tokenize(text)
42
+
43
+ df['tokenized_clean_msg'] = df['clean_msg'].apply(lambda x: tokenization(x))
44
+ # Remove stopwords
45
+ from nltk.corpus import stopwords
46
+ stopwords = set(stopwords.words('english'))
47
+
48
+ def remove_stopwords(text):
49
+ output = [word for word in text if word not in stopwords]
50
+ return output
51
+
52
+ df['cleaned_tokens'] = df['tokenized_clean_msg'].apply(lambda x: remove_stopwords(x))
53
+ # Count word frequencies
54
+ from collections import Counter
55
+ cnt = Counter()
56
+ for text in df['cleaned_tokens'].values:
57
+ for word in text:
58
+ cnt[word] += 1
59
+
60
+ # Select most common words
61
+ FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
62
+
63
+ # Remove frequent words
64
+ def remove_freqwords(text):
65
+ return [word for word in text if word not in FREQWORDS]
66
+
67
+ df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: remove_freqwords(x))
68
+
69
+ # Stemming
70
+ from nltk.stem.porter import PorterStemmer
71
+ porter_stemmer = PorterStemmer()
72
+
73
+ def stemming(text):
74
+ stem_text = [porter_stemmer.stem(word) for word in text]
75
+ return stem_text
76
+
77
+ df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: stemming(x))
78
+
79
+ # Prepare feature matrix and target vector
80
+ X = df['cleaned_tokens'].apply(lambda x: ' '.join(x))
81
+ y = df['label']
82
+ # Split the data into training and testing sets
83
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
84
+ # Vectorize the data
85
+ from sklearn.feature_extraction.text import CountVectorizer
86
+ vectorizer = CountVectorizer()
87
+ X_train_vectorized = vectorizer.fit_transform(X_train)
88
+ X_test_vectorized = vectorizer.transform(X_test)
89
+ # Train the Multinomial Naive Bayes model
90
+ model = MultinomialNB()
91
+ model.fit(X_train_vectorized, y_train)
92
+ # Make predictions on the test set
93
+ y_pred = model.predict(X_test_vectorized)
94
  def test_model(text):
95
  # Convert text to lowercase
96
  text = text.lower()
97
 
98
  # Remove punctuation
99
+ text =remove_punctuation(text)
100
 
101
  # Remove numbers
102
  text = re.sub(r'\d+', '', text)
 
108
 
109
  # Join the filtered tokens back into a string
110
  preprocessed_text = ' '.join(filtered_text)
111
+
112
  # Vectorize the preprocessed text
113
+ text_vectorized = vectorizer.transform([preprocessed_text])
114
+
115
  # Make prediction on the vectorized text
116
+ prediction = model.predict(text_vectorized)[0]
117
 
118
  # Return the prediction
119
  return prediction
 
120
  # Create the Gradio interface
121
+ iface = gr.Interface(fn=test_model, inputs="text", outputs="text")
122
+ # Launch the interface
123
+ iface.launch()