import nltk
from rake_nltk import Rake
from collections import Counter
import re
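
# rake_nltk builds on NLTK's stopword list and sentence tokenizer, so the
# 'stopwords' and 'punkt' data must be available at runtime. The guarded
# downloads below are an assumption about the deployment environment, not
# part of the original file; drop them if the data is provisioned elsewhere.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)  # recent NLTK releases may need 'punkt_tab' instead
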
class KeywordExtractor:
    def __init__(self):
        """Initialize the keyword extractor."""
        self.rake = Rake()
    def extract_keywords_rake(self, text, max_keywords=10):
        """
        Extract keywords using the RAKE algorithm.

        Args:
            text (str): Input text
            max_keywords (int): Maximum number of keywords to extract

        Returns:
            list: List of (score, keyword) tuples
        """
        self.rake.extract_keywords_from_text(text)
        keywords_with_scores = self.rake.get_ranked_phrases_with_scores()

        # Filter keywords
        clean_keywords = []
        seen_keywords = set()
        for score, keyword in keywords_with_scores:
            # Skip keywords containing digits or shorter than 4 characters
            if re.search(r'\d', keyword) or len(keyword) < 4:
                continue
            # Skip very short lowercase single words (likely noise)
            if ' ' not in keyword and keyword[0].islower() and len(keyword) < 5:
                continue
            # Skip duplicates (case-insensitive)
            if keyword.lower() in seen_keywords:
                continue
            clean_keywords.append((score, keyword))
            seen_keywords.add(keyword.lower())
            if len(clean_keywords) >= max_keywords:
                break
        return clean_keywords
    def extract_named_entities(self, text):
        """
        Extract named entities (simple approach using capitalization patterns).

        Args:
            text (str): Input text

        Returns:
            list: List of (entity, count) tuples for potential named entities
        """
        # Simple named entity extraction based on capitalization.
        # Note: this also picks up capitalized words at sentence starts;
        # common determiners are filtered out explicitly below.
        words = text.split()
        entities = []
        for word in words:
            # Look for capitalized words of three or more characters
            if word[0].isupper() and len(word) > 2:
                # Strip punctuation
                clean_word = re.sub(r'[^\w]', '', word)
                if clean_word and clean_word not in ['The', 'This', 'That', 'These', 'Those']:
                    entities.append(clean_word)
        # Count occurrences and return the most frequent
        entity_counts = Counter(entities)
        return entity_counts.most_common(10)
    def identify_important_sentences(self, sentences, keywords, top_n=5):
        """
        Identify important sentences based on keyword density.

        Args:
            sentences (list): List of sentences
            keywords (list): List of (score, phrase) keyword tuples
            top_n (int): Number of top sentences to return

        Returns:
            list: List of (score, sentence, best_keyword) tuples
        """
        keyword_phrases = [kw[1] for kw in keywords]  # Extract phrases from (score, phrase) tuples
        sentence_scores = []
        for sentence in sentences:
            score = 0
            sentence_lower = sentence.lower()
            best_keyword = ""
            # Score based on keyword presence; remember the longest matching keyword
            for keyword in keyword_phrases:
                if keyword.lower() in sentence_lower:
                    score += 1
                    if not best_keyword or len(keyword) > len(best_keyword):
                        best_keyword = keyword
            # Bonus for moderate sentence length (not too short, not too long)
            word_count = len(sentence.split())
            if 8 <= word_count <= 25:
                score += 0.5
            # Bonus for sentences containing numbers
            if re.search(r'\d+', sentence):
                score += 0.3
            if score > 0:
                sentence_scores.append((score, sentence, best_keyword))
        # Sort by score and return the top sentences
        sentence_scores.sort(key=lambda x: x[0], reverse=True)
        return sentence_scores[:top_n]
    def extract_key_concepts(self, text, sentences, top_n_sentences=5):
        """
        Complete keyword and concept extraction pipeline.

        Args:
            text (str): Input text
            sentences (list): List of sentences
            top_n_sentences (int): Number of important sentences to return

        Returns:
            dict: Extracted keywords, entities, and important sentences
        """
        keywords = self.extract_keywords_rake(text)
        entities = self.extract_named_entities(text)
        important_sentences = self.identify_important_sentences(
            sentences, keywords, top_n=top_n_sentences
        )
        return {
            'keywords': keywords,
            'named_entities': entities,
            'important_sentences': important_sentences,
        }
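
# Minimal usage sketch (illustrative, not part of the original file): the
# sample text below is hypothetical, and sentences are split with NLTK's
# sent_tokenize to feed identify_important_sentences.
if __name__ == '__main__':
    sample_text = (
        "Python is a popular programming language created by Guido van Rossum. "
        "Python emphasizes code readability, and it is widely used for machine "
        "learning, web development, and automation."
    )
    sample_sentences = nltk.tokenize.sent_tokenize(sample_text)

    extractor = KeywordExtractor()
    results = extractor.extract_key_concepts(sample_text, sample_sentences, top_n_sentences=2)

    print("Keywords:", results['keywords'])
    print("Named entities:", results['named_entities'])
    print("Important sentences:", results['important_sentences'])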