Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -190,7 +190,100 @@ with tab1:
|
|
| 190 |
classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
|
| 191 |
|
| 192 |
# Preprocess
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
# Classify each article and store the predictions
|
|
|
|
# Hugging Face inference pipeline for the fine-tuned news-category model.
# NOTE(review): instantiating this at module top level reloads the model on
# every Streamlit rerun unless a cache decorator exists upstream — confirm.
classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")

# Preprocess
|
# Lowercase
# Normalize case first so every later cleaning step sees lowercase text.
# NOTE(review): assumes df["content"] holds plain strings (no NaN) — confirm upstream.
df["cleaned_content"] = df["content"].str.lower()
# Remove URLs
def remove_urls(text):
    """Strip http/https URLs from *text* and trim surrounding whitespace.

    The trailing character class makes the greedy match back off punctuation
    glued to the end of a URL (e.g. a comma or closing parenthesis).
    """
    pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
    return pattern.sub(r'', text).strip()
# applying the function
# Pass remove_urls itself — the lambda wrapper added nothing.
df["cleaned_content"] = df["cleaned_content"].apply(remove_urls)
# Remove Emails
def remove_emails(text):
    """Blank out anything that looks like an e-mail address.

    The pattern is deliberately loose: any run of non-space characters
    on either side of an '@'.
    """
    return re.sub(r'\S+@\S+', r'', text)
# applying the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_emails)
# Remove punctuation
def remove_punctuation(text):
    """Delete every ASCII punctuation character from *text*.

    str.translate filters in a single C-level pass — same result as the
    per-character membership test against string.punctuation.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
# applying the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_punctuation)
# Get the list of stop words
# NLTK's English stop-word list, materialized as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))
# define the function
def remove_stopwords(text):
    """Drop English stop words, using the module-level ``stop_words`` set."""
    kept = [word for word in str(text).split() if word not in stop_words]
    return " ".join(kept)
# apply the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_stopwords)
# define the function
# Pre-compiled once at module level; the pattern is loop-invariant.
_NON_ALPHA = re.compile(r'[^A-Za-z\s]')

def remove_special_characters(text):
    """Keep only ASCII letters and whitespace; drop everything else."""
    return _NON_ALPHA.sub('', text)
# apply the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_special_characters)
# Remove frequent words

# Count every token across the whole cleaned corpus.
word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())

# The 10 most frequent tokens, kept as a set for fast membership tests.
common_words = {word for word, count in word_count.most_common(10)}
# define the function
def remove_common_words(text):
    """Drop tokens found in the module-level ``common_words`` set."""
    kept = [tok for tok in str(text).split() if tok not in common_words]
    return " ".join(kept)
# apply the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_common_words)
# Remove rare words
# The 20 least-frequent tokens, taken via a reverse slice of most_common()
# ([:-21:-1] is the same slice as the original [:-20-1:-1], simplified).
# Fix: removed the leftover debug `print(rare_words)` — it only polluted
# stdout on every rerun of the app and had no user-visible purpose.
rare_words = {word for word, count in word_count.most_common()[:-21:-1]}
# define the function
def remove_rare_words(text):
    """Drop tokens found in the module-level ``rare_words`` set."""
    return " ".join(tok for tok in str(text).split() if tok not in rare_words)

# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_rare_words)
# Split the cleaned text into word tokens; pass NLTK's tokenizer directly
# instead of wrapping it in a lambda.
df['tokenized_content'] = df['cleaned_content'].apply(nltk.word_tokenize)
# initialize stemmer
stemmer = PorterStemmer()

# define the function
def stem_tokens(tokens):
    """Porter-stem each token in *tokens*, preserving order."""
    return [stemmer.stem(token) for token in tokens]
# apply the function
df['stemmed_content'] = df['tokenized_content'].apply(stem_tokens)

# Re-join the stems into one string — the classifier expects plain text,
# not token lists. " ".join passed directly replaces the original lambda.
df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)
| 277 |
+
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
|
| 287 |
|
| 288 |
|
| 289 |
# Classify each article and store the predictions
|