Spaces:

RaghuCourage9605
/

Jagriti_News_Classification

Sleeping

App Files Files Community

RaghuCourage9605 committed on Dec 31, 2024

Commit

b7bbf35

verified ·

1 Parent(s): aa31cef

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -99

app.py CHANGED Viewed

@@ -2,34 +2,27 @@ import pickle
 import gradio as gr
 import re
 import spacy
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 import numpy as np
-from transformers import AutoTokenizer
 import subprocess
 import sys
 subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
 with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
     tfidf_vectorizer = pickle.load(vectorizer_file)
-with open('mnb_model.pkl', 'rb') as model_file:
-    mnb = pickle.load(model_file)
-checkpoint = "answerdotai/ModernBERT-base"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tf_idf = TfidfVectorizer()
 nlp = spacy.load("en_core_web_sm")
 class TextPreprocessing:
-    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
-        self.text = text
-        self.tokenizer = tokenizer
-        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()
     @staticmethod
     def Cleaning_text(text: str) -> str:
         """
@@ -44,105 +37,31 @@ class TextPreprocessing:
         text = re.sub(r'\s+', ' ', text).strip()
         return text
-    @staticmethod
-    def Tokenization_text(text: str) -> list:
-        """
-        Tokenizes the text into a list of words, excluding punctuations and spaces.
-        """
-        doc = nlp(text)
-        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
-        return tokens
-    @staticmethod
-    def Lemmatization_text(text: str) -> str:
-        """
-        Performs lemmatization on the text and returns the lemmatized version.
-        """
-        doc = nlp(text)
-        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
-        return lemmatized_text
-    @staticmethod
-    def Stopwords_removal(text: str) -> str:
-        """
-        Removes stopwords from the input text.
-        """
-        doc = nlp(text)
-        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
-        return text_without_stopwords
-    def ModernBert_Tokenization(self) -> dict:
-        """
-        Tokenizes the cleaned text using ModernBERT's tokenizer.
-        """
-        cleaned_text = self.Cleaning_text(self.text)
-        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
-        return tokenized_output
-    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
-        """
-        Applies TF-IDF transformation to a list of texts.
-        Args:
-            texts (list of str): List of text strings to apply the TF-IDF transformation.
-        Returns:
-            np.ndarray: TF-IDF feature matrix.
-        """
-        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
-        return tfidf_matrix.toarray()
-    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
-        """
-        Applies Bag of Words (BoW) transformation to a list of texts.
-        Args:
-            texts (list of str): List of text strings to apply the BoW transformation.
-        Returns:
-            np.ndarray: Bag of Words feature matrix.
-        """
-        vectorizer = CountVectorizer()
-        bow_matrix = vectorizer.fit_transform(texts)
-        return bow_matrix.toarray()
-    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
-        """
-        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.
-        Args:
-            texts (list of str): List of text strings to apply the N-gram transformation.
-            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.
-        Returns:
-            np.ndarray: N-gram feature matrix.
-        """
-        vectorizer = CountVectorizer(ngram_range=ngram_range)
-        ngram_matrix = vectorizer.fit_transform(texts)
-        return ngram_matrix.toarray()
 def preprocess_text(text):
-    text_preprocessor = TextPreprocessing(text=None, tokenizer=None)
-    cleaned_text = text_preprocessor.Cleaning_text(text)
     return cleaned_text
 def predict_news(text):
     cleaned_text = preprocess_text(text)
     X_input = tfidf_vectorizer.transform([cleaned_text])
-    prediction = mnb.predict(X_input)
     return "Fake News" if prediction == 0 else "Real News"
 iface = gr.Interface(
     fn=predict_news,
     inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
     outputs="text",
     title="Fake News Classification",
-    description="Classify news articles as real or fake."
 )
 iface.launch()

 import gradio as gr
 import re
 import spacy
+from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
+from sklearn.linear_model import PassiveAggressiveClassifier
+# Ensure the required spaCy model is installed
 import subprocess
 import sys
 subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
+# Load the saved vectorizer and model
 with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
     tfidf_vectorizer = pickle.load(vectorizer_file)
+with open('pac_model.pkl', 'rb') as model_file:  # Updated to PAC model
+    pac_model = pickle.load(model_file)
+# Load the spaCy language model
 nlp = spacy.load("en_core_web_sm")
 class TextPreprocessing:
     @staticmethod
     def Cleaning_text(text: str) -> str:
         """
         text = re.sub(r'\s+', ' ', text).strip()
         return text
 def preprocess_text(text):
+    """
+    Preprocess the text by cleaning it using the TextPreprocessing class.
+    """
+    cleaned_text = TextPreprocessing.Cleaning_text(text)
     return cleaned_text
 def predict_news(text):
+    """
+    Predict whether the input news text is real or fake.
+    """
     cleaned_text = preprocess_text(text)
     X_input = tfidf_vectorizer.transform([cleaned_text])
+    prediction = pac_model.predict(X_input)
     return "Fake News" if prediction == 0 else "Real News"
+# Gradio Interface
 iface = gr.Interface(
     fn=predict_news,
     inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
     outputs="text",
     title="Fake News Classification",
+    description="Classify news articles as real or fake using a Passive Aggressive Classifier."
 )
 iface.launch()