Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -190,7 +190,100 @@ with tab1:
|
|
| 190 |
classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
|
| 191 |
|
| 192 |
# Preprocess
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
# Classify each article and store the predictions
|
|
|
|
# Hugging Face inference pipeline for the fine-tuned news-category model.
# NOTE(review): instantiating this at module top level reloads the model on
# every Streamlit rerun unless a cache decorator exists upstream — confirm.
classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")

# Preprocess
|
# Lowercase
# Normalize case first so every later cleaning step sees lowercase text.
# NOTE(review): assumes df["content"] holds plain strings (no NaN) — confirm upstream.
df["cleaned_content"] = df["content"].str.lower()
# Remove URLs
def remove_urls(text):
    """Strip http/https URLs from *text* and trim surrounding whitespace.

    The trailing character class makes the greedy match back off punctuation
    glued to the end of a URL (e.g. a comma or closing parenthesis).
    """
    pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
    return pattern.sub(r'', text).strip()
# applying the function
# Pass remove_urls itself — the lambda wrapper added nothing.
df["cleaned_content"] = df["cleaned_content"].apply(remove_urls)
# Remove Emails
def remove_emails(text):
    """Blank out anything that looks like an e-mail address.

    The pattern is deliberately loose: any run of non-space characters
    on either side of an '@'.
    """
    return re.sub(r'\S+@\S+', r'', text)
# applying the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_emails)
# Remove punctuation
def remove_punctuation(text):
    """Delete every ASCII punctuation character from *text*.

    str.translate filters in a single C-level pass — same result as the
    per-character membership test against string.punctuation.
    """
    return text.translate(str.maketrans('', '', string.punctuation))
# applying the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_punctuation)
# Get the list of stop words
# NLTK's English stop-word list, materialized as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))
# define the function
def remove_stopwords(text):
    """Drop English stop words, using the module-level ``stop_words`` set."""
    kept = [word for word in str(text).split() if word not in stop_words]
    return " ".join(kept)
# apply the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_stopwords)
# define the function
# Pre-compiled once at module level; the pattern is loop-invariant.
_NON_ALPHA = re.compile(r'[^A-Za-z\s]')

def remove_special_characters(text):
    """Keep only ASCII letters and whitespace; drop everything else."""
    return _NON_ALPHA.sub('', text)
# apply the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_special_characters)
# Remove frequent words

# Count every token across the whole cleaned corpus.
word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())

# The 10 most frequent tokens, kept as a set for fast membership tests.
common_words = {word for word, count in word_count.most_common(10)}
# define the function
def remove_common_words(text):
    """Drop tokens found in the module-level ``common_words`` set."""
    kept = [tok for tok in str(text).split() if tok not in common_words]
    return " ".join(kept)
# apply the function
# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_common_words)
# Remove rare words
# The 20 least-frequent tokens, taken via a reverse slice of most_common()
# ([:-21:-1] is the same slice as the original [:-20-1:-1], simplified).
# Fix: removed the leftover debug `print(rare_words)` — it only polluted
# stdout on every rerun of the app and had no user-visible purpose.
rare_words = {word for word, count in word_count.most_common()[:-21:-1]}
# define the function
def remove_rare_words(text):
    """Drop tokens found in the module-level ``rare_words`` set."""
    return " ".join(tok for tok in str(text).split() if tok not in rare_words)

# Direct function reference instead of a pass-through lambda.
df["cleaned_content"] = df["cleaned_content"].apply(remove_rare_words)
# Split the cleaned text into word tokens; pass NLTK's tokenizer directly
# instead of wrapping it in a lambda.
df['tokenized_content'] = df['cleaned_content'].apply(nltk.word_tokenize)
# initialize stemmer
stemmer = PorterStemmer()

# define the function
def stem_tokens(tokens):
    """Porter-stem each token in *tokens*, preserving order."""
    return [stemmer.stem(token) for token in tokens]
# apply the function
df['stemmed_content'] = df['tokenized_content'].apply(stem_tokens)

# Re-join the stems into one string — the classifier expects plain text,
# not token lists. " ".join passed directly replaces the original lambda.
df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)
| 277 |
+
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
|
| 287 |
|
| 288 |
|
| 289 |
# Classify each article and store the predictions
|