File size: 4,873 Bytes
300f197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import nltk
from rake_nltk import Rake
from collections import Counter
import re

class KeywordExtractor:
    """Extract keywords, named entities, and important sentences from text."""

    def __init__(self):
        """Initialize the keyword extractor with a RAKE instance."""
        self.rake = Rake()

    def extract_keywords_rake(self, text, max_keywords=10):
        """
        Extract keywords using the RAKE algorithm.

        Phrases containing digits, very short phrases, short lowercase
        single words, and case-insensitive duplicates are filtered out.

        Args:
            text (str): Input text.
            max_keywords (int): Maximum number of keywords to return.

        Returns:
            list: (score, phrase) tuples, highest-scoring first.
        """
        self.rake.extract_keywords_from_text(text)
        ranked = self.rake.get_ranked_phrases_with_scores()

        clean_keywords = []
        seen = set()

        for score, phrase in ranked:
            # Drop phrases with digits or that are too short to be useful.
            if re.search(r'\d', phrase) or len(phrase) < 4:
                continue

            # Drop short lowercase single words (likely noise).
            if ' ' not in phrase and phrase[0].islower() and len(phrase) < 5:
                continue

            # Skip case-insensitive duplicates.
            key = phrase.lower()
            if key in seen:
                continue

            clean_keywords.append((score, phrase))
            seen.add(key)

            if len(clean_keywords) >= max_keywords:
                break

        return clean_keywords

    def extract_named_entities(self, text):
        """
        Extract potential named entities using a capitalization heuristic.

        Punctuation is stripped from each word BEFORE the capitalization
        and length checks, so quoted or punctuated names (e.g. '"Paris'
        or 'Obama,') are not missed and the length filter is applied
        consistently. Common sentence openers are excluded.

        Args:
            text (str): Input text.

        Returns:
            list: Up to 10 (entity, count) tuples, most frequent first.
        """
        entities = []

        for word in text.split():
            # Strip punctuation first so the checks see the bare word.
            clean_word = re.sub(r'[^\w]', '', word)
            if (clean_word
                    and clean_word[0].isupper()
                    and len(clean_word) > 2
                    and clean_word not in ['The', 'This', 'That', 'These', 'Those']):
                entities.append(clean_word)

        # Count occurrences and return the most frequent.
        return Counter(entities).most_common(10)

    def identify_important_sentences(self, sentences, keywords, top_n=5):
        """
        Score sentences by keyword density and return the top ones.

        Args:
            sentences (list): List of sentences.
            keywords (list): (score, phrase) tuples as produced by
                extract_keywords_rake.
            top_n (int): Number of top sentences to return.

        Returns:
            list: (score, sentence, best_keyword) tuples, best first.
        """
        # Precompute lowercased phrases once instead of per sentence.
        phrases = [(kw[1], kw[1].lower()) for kw in keywords]
        sentence_scores = []

        for sentence in sentences:
            score = 0
            sentence_lower = sentence.lower()
            best_keyword = ""

            # +1 per matching keyword; remember the longest matching phrase.
            for phrase, phrase_lower in phrases:
                if phrase_lower in sentence_lower:
                    score += 1
                    if not best_keyword or len(phrase) > len(best_keyword):
                        best_keyword = phrase

            # Bonus for medium-length sentences (not too short, not too long).
            word_count = len(sentence.split())
            if 8 <= word_count <= 25:
                score += 0.5

            # Bonus for sentences with numbers (often concrete facts).
            if re.search(r'\d+', sentence):
                score += 0.3

            if score > 0:
                sentence_scores.append((score, sentence, best_keyword))

        # Sort by score, highest first, and keep the top_n.
        sentence_scores.sort(key=lambda item: item[0], reverse=True)
        return sentence_scores[:top_n]

    def extract_key_concepts(self, text, sentences, top_n_sentences=5, max_keywords=10):
        """
        Complete keyword and concept extraction pipeline.

        Args:
            text (str): Input text.
            sentences (list): List of sentences.
            top_n_sentences (int): Number of important sentences to return.
            max_keywords (int): Maximum number of keywords to extract.

        Returns:
            dict: Keys 'keywords', 'named_entities', 'important_sentences'.
        """
        keywords = self.extract_keywords_rake(text, max_keywords=max_keywords)
        entities = self.extract_named_entities(text)
        important_sentences = self.identify_important_sentences(
            sentences, keywords, top_n=top_n_sentences)

        return {
            'keywords': keywords,
            'named_entities': entities,
            'important_sentences': important_sentences,
        }