Spaces:

Prageeth-1
/

News_Classification_App

Sleeping

App Files Files Community

Prageeth-1 committed on Mar 28, 2025

Commit

fe09de4

verified ·

1 Parent(s): 7d285e4

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -101

app.py CHANGED Viewed

@@ -44,94 +44,6 @@ def load_classification_model():
 def load_qa_model():
     return pipeline("question-answering", model="deepset/roberta-base-squad2")
-# Preprocessing function (same as in Section 01)
-def preprocess_text():
-    # Lowercase
-    df["cleaned_content"] = df["content"].str.lower()
-    # Remove URLs
-    def remove_urls(text):
-        url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
-        text = url_pattern.sub(r'', text)
-        return text.strip()
-    # applying the function
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_urls(text))
-    # Remove Emails
-    def remove_emails(text):
-        email_pattern = re.compile(r'\S+@\S+')
-        return email_pattern.sub(r'', text)
-    # applying the function
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_emails(text))
-    #Remove punctuations
-    def remove_punctuation(text):
-        return "".join([char for char in text if char not in string.punctuation])
-    # applying the function
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_punctuation(text))
-     # Get the list of stop words
-    stop_words = set(stopwords.words('english'))
-    # define the function
-    def remove_stopwords(text):
-        return " ".join([word for word in str(text).split() if word not in stop_words])
-    # apply the function
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_stopwords(text))
-    # define the function
-    def remove_special_characters(text):
-        return re.sub(r'[^A-Za-z\s]', '', text)
-    # apply the function
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_special_characters(text))
-    #Remove Frequent words
-    # Get the count of each word in cleaned_text
-    word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
-    # Get a set of common words
-    common_words = set([word for (word,count) in word_count.most_common(10)])
-    # define the function
-    def remove_common_words(text):
-        return " ".join([word for word in str(text).split() if word not in common_words])
-    # apply the function
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_common_words(text))
-    #Remove rare words
-    # Get a set of rare words
-    rare_words = set([word for (word,count) in word_count.most_common()[:-20-1:-1]])
-    print(rare_words)
-    # define the function
-    def remove_rare_words(text):
-        return " ".join([word for word in str(text).split() if word not in rare_words])
-    df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_rare_words(text))
-    df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
-    # initialize stemmer
-    stemmer = PorterStemmer()
-    # Defining the function
-    def stem_tokens(tokens):
-        stems = [stemmer.stem(token) for token in tokens]
-        return stems
-    # apply the function
-    df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
-    df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
 # Function to generate word cloud
@@ -285,25 +197,16 @@ with tab1:
         # apply the function
         df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
-        df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
         # Classify each article and store the predictions
         df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
         #Delete Unnecessary columns
-        df = df[['content', 'Class']]
         # Show results
@@ -316,7 +219,8 @@ with tab1:
         st.bar_chart(class_dist)
        # Download button

 def load_qa_model():
     return pipeline("question-answering", model="deepset/roberta-base-squad2")
 # Function to generate word cloud
         # apply the function
         df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
+        df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
         # Classify each article and store the predictions
         df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
         #Delete Unnecessary columns
+        df = df[['content','preprocessed_content','Class']]
         # Show results
         st.bar_chart(class_dist)
+        #Delete Unnecessary columns
+        df = df[['content','Class']]
        # Download button