Prageeth-1 commited on
Commit
b5f50fd
·
verified ·
1 Parent(s): 93c21e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -20
app.py CHANGED
@@ -33,22 +33,93 @@ def load_qa_model():
33
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
34
 
35
  # Preprocessing function (same as in Section 01)
36
- def preprocess_text(text):
37
  # Lowercase
38
- text = text.lower()
 
39
  # Remove URLs
40
- text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
41
- # Remove special characters and numbers
42
- text = re.sub(r'[^a-zA-Z\s]', '', text)
43
- # Tokenize
44
- tokens = word_tokenize(text)
45
- # Remove stopwords
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  stop_words = set(stopwords.words('english'))
47
- tokens = [token for token in tokens if token not in stop_words]
48
- # Lemmatization
49
- tokens = [lemmatizer.lemmatize(token) for token in tokens]
50
- # Join tokens back to string
51
- return ' '.join(tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  # Function to generate word cloud
54
  def generate_wordcloud(text, title=None):
@@ -117,23 +188,30 @@ with tab1:
117
 
118
  # Load the fine-tuned news classifier
119
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
 
 
 
 
120
 
121
  # Classify each article and store the predictions
122
- df["predicted_category"] = df["content"].apply(lambda text: classifier(text)[0]["label"])
123
-
124
- # Preprocess and classify
125
-
126
-
 
127
  # Show results
128
  st.subheader("Classification Results")
129
  st.write(df)
130
 
131
  # Show distribution
132
  st.subheader("Class Distribution")
133
- class_dist = df['predicted_category'].value_counts()
134
  st.bar_chart(class_dist)
135
 
136
-
 
 
137
 
138
  # Download button
139
  st.subheader("Download Results")
 
33
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
34
 
35
  # Preprocessing function (same as in Section 01)
36
+ def preprocess_text():
37
  # Lowercase
38
+ df["cleaned_content"] = df["content"].str.lower()
39
+
40
  # Remove URLs
41
+ def remove_urls(text):
42
+ url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
43
+ text = url_pattern.sub(r'', text)
44
+ return text.strip()
45
+
46
+ # applying the function
47
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_urls(text))
48
+
49
+ # Remove Emails
50
+ def remove_emails(text):
51
+ email_pattern = re.compile(r'\S+@\S+')
52
+ return email_pattern.sub(r'', text)
53
+
54
+ # applying the function
55
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_emails(text))
56
+
57
+ # Remove punctuation
58
+ def remove_punctuation(text):
59
+ return "".join([char for char in text if char not in string.punctuation])
60
+
61
+ # applying the function
62
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_punctuation(text)
63
+
64
+ # Get the list of stop words
65
  stop_words = set(stopwords.words('english'))
66
+
67
+ # define the function
68
+ def remove_stopwords(text):
69
+ return " ".join([word for word in str(text).split() if word not in stop_words])
70
+
71
+ # apply the function
72
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_stopwords(text))
73
+
74
+ # define the function
75
+ def remove_special_characters(text):
76
+ return re.sub(r'[^A-Za-z\s]', '', text)
77
+
78
+ # apply the function
79
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_special_characters(text))
80
+
81
+ # Remove frequent words
82
+
83
+ # Get the count of each word in cleaned_text
84
+ word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
85
+
86
+ # Get a set of common words
87
+ common_words = set([word for (word,count) in word_count.most_common(10)])
88
+
89
+ # define the function
90
+ def remove_common_words(text):
91
+ return " ".join([word for word in str(text).split() if word not in common_words])
92
+
93
+ # apply the function
94
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_common_words(text))
95
+
96
+ # Remove rare words
97
+ # Get a set of rare words
98
+ rare_words = set([word for (word,count) in word_count.most_common()[:-20-1:-1]])
99
+ print(rare_words)
100
+
101
+ # define the function
102
+ def remove_rare_words(text):
103
+ return " ".join([word for word in str(text).split() if word not in rare_words])
104
+
105
+ df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_rare_words(text))
106
+
107
+ df['tokenized_content'] = df['cleaned_content'].apply(lambda text: nltk.word_tokenize(text))
108
+
109
+ # initialize stemmer
110
+ stemmer = PorterStemmer()
111
+
112
+ # Defining the function
113
+ def stem_tokens(tokens):
114
+ stems = [stemmer.stem(token) for token in tokens]
115
+ return stems
116
+
117
+ # apply the function
118
+ df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
119
+
120
+ df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
121
+
122
+
123
 
124
  # Function to generate word cloud
125
  def generate_wordcloud(text, title=None):
 
188
 
189
  # Load the fine-tuned news classifier
190
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
191
+
192
+ # Preprocess
193
+ preprocess_text()
194
+
195
 
196
  # Classify each article and store the predictions
197
+ df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
198
+
199
+ #Delete Unnecessary columns
200
+ df = df[['content', 'Class']]
201
+
202
+
203
  # Show results
204
  st.subheader("Classification Results")
205
  st.write(df)
206
 
207
  # Show distribution
208
  st.subheader("Class Distribution")
209
+ class_dist = df['Class'].value_counts()
210
  st.bar_chart(class_dist)
211
 
212
+
213
+
214
+
215
 
216
  # Download button
217
  st.subheader("Download Results")