README / categorizer.py
Ismetdh's picture
Upload 12 files
04cdb6f verified
import numpy as np
from nltk.tokenize import word_tokenize
import re
import pickle
import gensim.downloader as api
class NewsCategorizer:
    """Predict a category label for a piece of text.

    The text is cleaned, tokenized with NLTK's ``word_tokenize``, turned into
    a single vector by averaging the gensim word vectors of its in-vocabulary
    tokens, and passed to a pickled classifier. The predicted label is mapped
    back to a category through a pickled label encoder.

    Attributes:
        gensim_model: Word-vector model returned by ``gensim.downloader.load``.
        model: Unpickled classifier; only its ``predict`` method is used.
        label_encoder: Unpickled encoder; only ``inverse_transform`` is used.
    """

    # Matches every character that is neither an ASCII letter nor whitespace.
    # The class already lists both cases, so no IGNORECASE flag is required.
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, gensim_model_name='word2vec-google-news-300', model_file='logistic_regression_model.pkl', encoder_file='label_encoder.pkl'):
        """Load the word-vector model, the classifier and the label encoder.

        Args:
            gensim_model_name: Name passed to ``gensim.downloader.load``.
            model_file: Path to the pickled classifier.
            encoder_file: Path to the pickled label encoder.

        Raises:
            OSError: If either pickle file cannot be opened.
        """
        # Load gensim model
        self.gensim_model = api.load(gensim_model_name)
        print("New categorizer was called")
        # Load classifier model and label encoder.
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only point these paths at files from a trusted source.
        with open(model_file, 'rb') as f:
            self.model = pickle.load(f)
        with open(encoder_file, 'rb') as le_file:
            self.label_encoder = pickle.load(le_file)

    def clean_text(self, text):
        """Remove every non-alphabetic, non-whitespace character and lowercase.

        The previous implementation passed ``re.I`` as the positional
        ``count`` argument of ``re.sub``, which limited the substitution to
        the first two matches. All matches are removed here.

        Args:
            text: Raw input string.

        Returns:
            The lowercased string containing only ASCII letters and whitespace.
        """
        return self._NON_ALPHA_RE.sub('', text).lower()

    def get_word2vec_embeddings(self, tokens):
        """Average the word vectors of the tokens known to the gensim model.

        Args:
            tokens: Iterable of token strings.

        Returns:
            The element-wise mean of the vectors of all in-vocabulary tokens,
            or a zero vector of length ``gensim_model.vector_size`` when no
            token is in the vocabulary.
        """
        vectors_model = self.gensim_model
        # Out-of-vocabulary tokens are skipped rather than raising KeyError.
        embeddings = [vectors_model[token] for token in tokens if token in vectors_model]
        if not embeddings:
            return np.zeros(vectors_model.vector_size)
        return np.mean(embeddings, axis=0)

    def predict_category(self, text):
        """Predict the category of the given text using the loaded classifier.

        Args:
            text: Raw input string.

        Returns:
            The category produced by ``label_encoder.inverse_transform`` for
            the classifier's predicted label.
        """
        cleaned_text = self.clean_text(text)
        tokens = word_tokenize(cleaned_text)
        # The classifier is given a single-row 2-D array (one sample).
        features = self.get_word2vec_embeddings(tokens).reshape(1, -1)
        predicted_label = self.model.predict(features)[0]
        return self.label_encoder.inverse_transform([predicted_label])[0]
# # Example Usage
# # Initialize the NewsCategorizer class
# categorizer = NewsCategorizer()
# # Example text for prediction
# unknown_text = """A horrifying incident in Sultanpuri, Delhi, has led to the arrest of Neeraj Solanki and four of his family members for allegedly killing and burying his three-day-old twin daughters. The police revealed that the act was driven by Solanki's preference for a male child. Following the birth of the twins on May 30, the newborns were taken to a cremation ground and buried after being killed. The investigation began after the children’s mother reported the crime to the police."""
# # Predict the category for the unknown text
# predicted_category = categorizer.predict_category(unknown_text)
# print(f"The predicted category is: {predicted_category}")