File size: 2,699 Bytes
04cdb6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import numpy as np
from nltk.tokenize import word_tokenize
import re
import pickle
import gensim.downloader as api


class NewsCategorizer:
    """Classify a piece of text into a news category.

    The pipeline is: clean the text, tokenize it with NLTK, average the
    word vectors of the known tokens, feed that single vector to a
    pickled classifier and map the predicted label back through a pickled
    label encoder.
    """

    # Matches every character that is neither an ASCII letter nor whitespace.
    # Both cases are listed explicitly, so no IGNORECASE flag is needed.
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, gensim_model_name='word2vec-google-news-300', model_file='logistic_regression_model.pkl', encoder_file='label_encoder.pkl'):
        """Load the embedding model, the classifier and the label encoder.

        Args:
            gensim_model_name: Name passed to ``gensim.downloader.load``.
            model_file: Path to the pickled classifier (must expose
                ``predict``).
            encoder_file: Path to the pickled label encoder (must expose
                ``inverse_transform``).
        """
        # Load gensim model
        self.gensim_model = api.load(gensim_model_name)
        print("New categorizer was called")

        # Load classifier model and label encoder.
        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file; only point these paths at artifacts from a trusted source.
        with open(model_file, 'rb') as f:
            self.model = pickle.load(f)

        with open(encoder_file, 'rb') as le_file:
            self.label_encoder = pickle.load(le_file)

    def clean_text(self, text: str) -> str:
        """Return *text* lowercased with all non-alphabetic, non-whitespace characters removed."""
        # The previous implementation passed re.I as the 4th positional
        # argument of re.sub, which is `count`, not `flags`; that limited the
        # substitution to the first two matches. Strip every match instead.
        return self._NON_ALPHA_RE.sub('', text).lower()

    def get_word2vec_embeddings(self, tokens) -> np.ndarray:
        """Return the mean embedding of the tokens known to the gensim model.

        Tokens missing from the model's vocabulary are skipped. If no token
        is known (or *tokens* is empty), a zero vector of length
        ``gensim_model.vector_size`` is returned.
        """
        model = self.gensim_model
        vectors = [model[token] for token in tokens if token in model]
        if not vectors:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    def predict_category(self, text: str):
        """Predict the category of *text* using the pre-trained classifier.

        Returns:
            The first element of ``label_encoder.inverse_transform`` applied
            to the classifier's predicted label.
        """
        tokens = word_tokenize(self.clean_text(text))
        # The classifier's predict is called with a 2-D array holding one row.
        features = self.get_word2vec_embeddings(tokens).reshape(1, -1)
        predicted_label = self.model.predict(features)[0]
        return self.label_encoder.inverse_transform([predicted_label])[0]


# # Example Usage
# # Initialize the NewsCategorizer class
# categorizer = NewsCategorizer()

# # Example text for prediction
# unknown_text = """A horrifying incident in Sultanpuri, Delhi, has led to the arrest of Neeraj Solanki and four of his family members for allegedly killing and burying his three-day-old twin daughters. The police revealed that the act was driven by Solanki's preference for a male child. Following the birth of the twins on May 30, the newborns were taken to a cremation ground and buried after being killed. The investigation began after the children’s mother reported the crime to the police."""

# # Predict the category for the unknown text
# predicted_category = categorizer.predict_category(unknown_text)
# print(f"The predicted category is: {predicted_category}")