Spaces:
Sleeping
Sleeping
| import numpy as np | |
| from nltk.tokenize import word_tokenize | |
| import re | |
| import pickle | |
| import gensim.downloader as api | |
class NewsCategorizer:
    """Predict a news category for raw text from averaged word embeddings.

    The pipeline is: clean the text, tokenize it, average the word vectors
    of the tokens known to the embedding model, feed that single vector to
    a pre-trained classifier, and decode the predicted label back to its
    category name with a label encoder.
    """

    def __init__(self, gensim_model_name='word2vec-google-news-300', model_file='logistic_regression_model.pkl', encoder_file='label_encoder.pkl'):
        """Load the embedding model, the classifier and the label encoder.

        Args:
            gensim_model_name: Name passed to ``gensim.downloader.load``.
            model_file: Path to the pickled classifier; it must expose
                ``predict`` (presumably a scikit-learn logistic regression,
                judging by the default file name -- confirm).
            encoder_file: Path to the pickled label encoder; it must expose
                ``inverse_transform``.

        Raises:
            OSError: If either pickle file cannot be opened.
        """
        # Load gensim model
        self.gensim_model = api.load(gensim_model_name)
        print("New categorizer was called")
        # Load classifier model and label encoder.
        # SECURITY NOTE: pickle.load can execute arbitrary code while
        # deserializing. Only point these paths at files you trust.
        with open(model_file, 'rb') as f:
            self.model = pickle.load(f)
        with open(encoder_file, 'rb') as le_file:
            self.label_encoder = pickle.load(le_file)

    def clean_text(self, text: str) -> str:
        """Remove every non-alphabetic, non-whitespace character and lowercase.

        Args:
            text: Raw input text.

        Returns:
            The text containing only ASCII letters and whitespace, lowercased.
        """
        # The fourth positional parameter of re.sub is `count`, not `flags`.
        # Passing re.I there (value 2) limited the substitution to the first
        # two matches. The character class already covers both cases, so no
        # flag is needed at all.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.lower()

    def get_word2vec_embeddings(self, tokens) -> np.ndarray:
        """Average the word vectors of all tokens known to the embedding model.

        Args:
            tokens: Iterable of token strings.

        Returns:
            The element-wise mean of the vectors of the in-vocabulary tokens,
            or a zero vector of length ``gensim_model.vector_size`` when no
            token is in the vocabulary.
        """
        # Out-of-vocabulary tokens are skipped rather than raising KeyError.
        vectors = [
            self.gensim_model[token]
            for token in tokens
            if token in self.gensim_model
        ]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(self.gensim_model.vector_size)

    def predict_category(self, text: str):
        """Predict the category of the given text using the pre-trained model.

        Args:
            text: Raw news text.

        Returns:
            The category produced by ``label_encoder.inverse_transform`` for
            the classifier's predicted label.
        """
        cleaned_text = self.clean_text(text)
        tokens = word_tokenize(cleaned_text)
        embeddings = self.get_word2vec_embeddings(tokens)
        # The classifier expects a 2-D batch; reshape to a single-row matrix.
        embeddings = embeddings.reshape(1, -1)
        predicted_label = self.model.predict(embeddings)[0]
        predicted_category = self.label_encoder.inverse_transform([predicted_label])[0]
        return predicted_category
| # # Example Usage | |
| # # Initialize the NewsCategorizer class | |
| # categorizer = NewsCategorizer() | |
| # # Example text for prediction | |
| # unknown_text = """A horrifying incident in Sultanpuri, Delhi, has led to the arrest of Neeraj Solanki and four of his family members for allegedly killing and burying his three-day-old twin daughters. The police revealed that the act was driven by Solanki's preference for a male child. Following the birth of the twins on May 30, the newborns were taken to a cremation ground and buried after being killed. The investigation began after the children’s mother reported the crime to the police.""" | |
| # # Predict the category for the unknown text | |
| # predicted_category = categorizer.predict_category(unknown_text) | |
| # print(f"The predicted category is: {predicted_category}") | |