import nltk from rake_nltk import Rake from collections import Counter import re class KeywordExtractor: def __init__(self): """Initialize the keyword extractor.""" self.rake = Rake() def extract_keywords_rake(self, text, max_keywords=10): """ Extract keywords using RAKE algorithm. Args: text (str): Input text max_keywords (int): Maximum number of keywords to extract Returns: list: List of keywords with scores """ self.rake.extract_keywords_from_text(text) keywords_with_scores = self.rake.get_ranked_phrases_with_scores() # Filter keywords clean_keywords = [] seen_keywords = set() for score, keyword in keywords_with_scores: # Remove keywords with digits or special chars if re.search(r'\d', keyword) or len(keyword) < 4: continue # Remove very short single words that are lowercase (likely noise) if ' ' not in keyword and keyword[0].islower() and len(keyword) < 5: continue # Remove duplicates if keyword.lower() in seen_keywords: continue clean_keywords.append((score, keyword)) seen_keywords.add(keyword.lower()) if len(clean_keywords) >= max_keywords: break return clean_keywords def extract_named_entities(self, text): """ Extract named entities (simple approach using capitalization patterns). Args: text (str): Input text Returns: list: List of potential named entities """ # Simple named entity extraction based on capitalization words = text.split() entities = [] for word in words: # Look for capitalized words that aren't at sentence start if word[0].isupper() and len(word) > 2: # Remove punctuation clean_word = re.sub(r'[^\w]', '', word) if clean_word and clean_word not in ['The', 'This', 'That', 'These', 'Those']: entities.append(clean_word) # Count occurrences and return most frequent entity_counts = Counter(entities) return entity_counts.most_common(10) def identify_important_sentences(self, sentences, keywords, top_n=5): """ Identify important sentences based on keyword density. Args: sentences (list): List of sentences keywords (list): List of important keywords top_n (int): Number of top sentences to return Returns: list: List of important sentences with scores """ keyword_phrases = [kw[1] for kw in keywords] # Extract phrases from (score, phrase) tuples sentence_scores = [] for sentence in sentences: score = 0 sentence_lower = sentence.lower() best_keyword = "" # Score based on keyword presence for keyword in keyword_phrases: if keyword.lower() in sentence_lower: score += 1 if not best_keyword or len(keyword) > len(best_keyword): best_keyword = keyword # Bonus for sentence length (not too short, not too long) word_count = len(sentence.split()) if 8 <= word_count <= 25: score += 0.5 # Bonus for sentences with numbers or specific terms if re.search(r'\d+', sentence): score += 0.3 if score > 0: sentence_scores.append((score, sentence, best_keyword)) # Sort by score and return top sentences sentence_scores.sort(key=lambda x: x[0], reverse=True) return sentence_scores[:top_n] def extract_key_concepts(self, text, sentences, top_n_sentences=5): """ Complete keyword and concept extraction pipeline. Args: text (str): Input text sentences (list): List of sentences Returns: dict: Extracted keywords, entities, and important sentences """ keywords = self.extract_keywords_rake(text) entities = self.extract_named_entities(text) important_sentences = self.identify_important_sentences(sentences, keywords, top_n=top_n_sentences) return { 'keywords': keywords, 'named_entities': entities, 'important_sentences': important_sentences }