Spaces:

CSharpCorner
/

CSharpGrammer

Runtime error

App Files Files Community

Priyanhsu committed on Jun 1, 2023

Commit

1835c4c

1 Parent(s): d37e8f6

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -16

app.py CHANGED Viewed

@@ -18,25 +18,85 @@ import nltk
 nltk.download('stopwords')
 nltk.download('punkt')
-# Load the trained model
-model = joblib.load('model.bin')
 def remove_punctuation(text):
     punctuation_free = "".join([i for i in text if i not in string.punctuation])
     return punctuation_free
-def vectorize_text(texts):
-    vectorizer = CountVectorizer()
-    vectorizer.fit(texts)
-    text_vectorized = vectorizer.transform(texts)
-    return text_vectorized, vectorizer
 def test_model(text):
     # Convert text to lowercase
     text = text.lower()
     # Remove punctuation
-    text = remove_punctuation(text)
     # Remove numbers
     text = re.sub(r'\d+', '', text)
@@ -48,16 +108,16 @@ def test_model(text):
     # Join the filtered tokens back into a string
     preprocessed_text = ' '.join(filtered_text)
     # Vectorize the preprocessed text
-    vectorize_texts = vectorize_text([preprocessed_text])
     # Make prediction on the vectorized text
-    prediction = model.predict(vectorize_texts[0])[0]
     # Return the prediction
     return prediction
 # Create the Gradio interface
-iface = gr.Interface(fn=test_model, inputs="text", outputs="text", title="Text Classification")
-iface.launch()

 nltk.download('stopwords')
 nltk.download('punkt')
+with open('NewData.json') as file:
+    data = json.load(file)
+df = pd.DataFrame(data)
+# Shuffle all rows of the data
+df = df.sample(frac=1)
+# Keep only the content and label columns
+df = df[['content','label']]
+df['clean_msg'] = df['content'].apply(lambda x: x.lower())
+# Remove punctuation
+import string
 def remove_punctuation(text):
     punctuation_free = "".join([i for i in text if i not in string.punctuation])
     return punctuation_free
+df['clean_msg'] = df['clean_msg'].apply(lambda x: remove_punctuation(x))
+# Tokenization
+from nltk.tokenize import WhitespaceTokenizer
+def tokenization(text):
+    tk = WhitespaceTokenizer()
+    return tk.tokenize(text)
+df['tokenized_clean_msg'] = df['clean_msg'].apply(lambda x: tokenization(x))
+# Remove stopwords
+from nltk.corpus import stopwords
+stopwords = set(stopwords.words('english'))
+def remove_stopwords(text):
+    output = [word for word in text if word not in stopwords]
+    return output
+df['cleaned_tokens'] = df['tokenized_clean_msg'].apply(lambda x: remove_stopwords(x))
+# Count word frequencies
+from collections import Counter
+cnt = Counter()
+for text in df['cleaned_tokens'].values:
+    for word in text:
+        cnt[word] += 1
+# Select most common words
+FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
+# Remove frequent words
+def remove_freqwords(text):
+    return [word for word in text if word not in FREQWORDS]
+df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: remove_freqwords(x))
+# Stemming
+from nltk.stem.porter import PorterStemmer
+porter_stemmer = PorterStemmer()
+def stemming(text):
+    stem_text = [porter_stemmer.stem(word) for word in text]
+    return stem_text
+df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: stemming(x))
+# Prepare feature matrix and target vector
+X = df['cleaned_tokens'].apply(lambda x: ' '.join(x))
+y = df['label']
+# Split the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+# Vectorize the data
+from sklearn.feature_extraction.text import CountVectorizer
+vectorizer = CountVectorizer()
+X_train_vectorized = vectorizer.fit_transform(X_train)
+X_test_vectorized = vectorizer.transform(X_test)
+# Train the Multinomial Naive Bayes model
+model = MultinomialNB()
+model.fit(X_train_vectorized, y_train)
+# Make predictions on the test set
+y_pred = model.predict(X_test_vectorized)
 def test_model(text):
     # Convert text to lowercase
     text = text.lower()
     # Remove punctuation
+    text =remove_punctuation(text)
     # Remove numbers
     text = re.sub(r'\d+', '', text)
     # Join the filtered tokens back into a string
     preprocessed_text = ' '.join(filtered_text)
     # Vectorize the preprocessed text
+    text_vectorized = vectorizer.transform([preprocessed_text])
     # Make prediction on the vectorized text
+    prediction = model.predict(text_vectorized)[0]
     # Return the prediction
     return prediction
 # Create the Gradio interface
+iface = gr.Interface(fn=test_model, inputs="text", outputs="text")
+# Launch the interface
+iface.launch()