import numpy as np from nltk.tokenize import word_tokenize import re import pickle import gensim.downloader as api class NewsCategorizer: def __init__(self, gensim_model_name='word2vec-google-news-300', model_file='logistic_regression_model.pkl', encoder_file='label_encoder.pkl'): # Load gensim model self.gensim_model = api.load(gensim_model_name) print("New categorizer was called") # Load classifier model and label encoder with open(model_file, 'rb') as f: self.model = pickle.load(f) with open(encoder_file, 'rb') as le_file: self.label_encoder = pickle.load(le_file) def clean_text(self, text): """Cleans the text by removing non-alphabetic characters and converting to lowercase.""" text = re.sub(r'[^a-zA-Z\s]', '', text, re.I) text = text.lower() return text def get_word2vec_embeddings(self, tokens): """Converts tokens to their word2vec embeddings.""" embeddings = [] for token in tokens: if token in self.gensim_model: embeddings.append(self.gensim_model[token]) # Get word vector if embeddings: return np.mean(embeddings, axis=0) else: return np.zeros(self.gensim_model.vector_size) def predict_category(self, text): """Predicts the category of the given text using the pre-trained model.""" cleaned_text = self.clean_text(text) tokens = word_tokenize(cleaned_text) embeddings = self.get_word2vec_embeddings(tokens) embeddings = embeddings.reshape(1, -1) predicted_label = self.model.predict(embeddings)[0] predicted_category = self.label_encoder.inverse_transform([predicted_label])[0] return predicted_category # # Example Usage # # Initialize the NewsCategorizer class # categorizer = NewsCategorizer() # # Example text for prediction # unknown_text = """A horrifying incident in Sultanpuri, Delhi, has led to the arrest of Neeraj Solanki and four of his family members for allegedly killing and burying his three-day-old twin daughters. The police revealed that the act was driven by Solanki's preference for a male child. Following the birth of the twins on May 30, the newborns were taken to a cremation ground and buried after being killed. The investigation began after the children’s mother reported the crime to the police.""" # # Predict the category for the unknown text # predicted_category = categorizer.predict_category(unknown_text) # print(f"The predicted category is: {predicted_category}")