"""Lightweight keyword extraction."""

import logging
from typing import List, Dict
import re
from collections import Counter

logger = logging.getLogger(__name__)


class KeywordExtractor:
    """Extract important keywords from documents (lightweight, no heavy NLP).

    Tokenisation is purely regex based: a token is a run of lowercase ASCII
    letters and underscores delimited by word boundaries. Ranking is by raw
    frequency; ties keep first-seen order (``Counter.most_common``).
    """

    # Compiled once at class creation instead of on every call.
    _WORD_RE = re.compile(r'\b[a-z_]+\b')
    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

    # Immutable defaults; every instance gets its own mutable copy so that
    # customising one extractor's ``stopwords`` never leaks into another.
    _DEFAULT_STOPWORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'been', 'be',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
        'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
        'when', 'where', 'why', 'how', 'all', 'each', 'every', 'both',
        'more', 'most', 'other', 'some', 'any', 'such', 'no', 'nor', 'not',
        'only', 'same', 'so', 'than', 'too', 'very', 'just', 'about', 'also',
        'our',
    })

    def __init__(self) -> None:
        """Initialize keyword extractor with the default stopword set."""
        # Common stopwords (a plain ``set`` so callers may add/remove entries).
        self.stopwords = set(self._DEFAULT_STOPWORDS)

    def extract_keywords(
        self,
        text: str,
        top_k: int = 10,
        min_length: int = 3
    ) -> List[str]:
        """
        Extract top keywords from text (simple TF approach).

        Args:
            text: Input text
            top_k: Number of keywords to extract
            min_length: Minimum keyword length

        Returns:
            List of top keywords, most frequent first
        """
        # Lowercase first so the lowercase-only token pattern sees every word.
        words = self._WORD_RE.findall(text.lower())

        # Drop stopwords and words shorter than ``min_length``.
        filtered_words = [
            w for w in words
            if w not in self.stopwords and len(w) >= min_length
        ]

        word_freq = Counter(filtered_words)
        return [word for word, _ in word_freq.most_common(top_k)]

    def extract_phrases(
        self,
        text: str,
        top_k: int = 5,
        phrase_len: int = 2
    ) -> List[str]:
        """
        Extract key phrases (multi-word terms).

        Phrases are word n-grams that never cross a sentence boundary
        (``.``, ``!`` or ``?``) and contain no stopword.

        Args:
            text: Input text
            top_k: Number of phrases to extract
            phrase_len: Length of phrases (2-3 words)

        Returns:
            List of top phrases, most frequent first. Empty when
            ``phrase_len`` is smaller than 1.
        """
        # A non-positive n-gram length would yield empty or nonsensical
        # "phrases" from the slicing below, so there is nothing to extract.
        if phrase_len < 1:
            return []

        phrases = []
        for sentence in self._SENTENCE_SPLIT_RE.split(text):
            words = self._WORD_RE.findall(sentence.lower())

            # Slide a window of ``phrase_len`` words across the sentence.
            for start in range(len(words) - phrase_len + 1):
                ngram = words[start:start + phrase_len]

                # Skip if the window contains any stopword.
                if not any(w in self.stopwords for w in ngram):
                    phrases.append(' '.join(ngram))

        phrase_freq = Counter(phrases)
        return [phrase for phrase, _ in phrase_freq.most_common(top_k)]

    def extract_all(
        self,
        text: str,
        keywords_k: int = 10,
        phrases_k: int = 5
    ) -> Dict[str, List[str]]:
        """
        Extract both keywords and phrases.

        Args:
            text: Input text
            keywords_k: Number of keywords
            phrases_k: Number of phrases

        Returns:
            Dictionary with ``'keywords'`` and ``'key_phrases'`` lists
        """
        return {
            'keywords': self.extract_keywords(text, top_k=keywords_k),
            'key_phrases': self.extract_phrases(text, top_k=phrases_k)
        }

    def score_keywords(
        self,
        text: str,
        keywords: List[str]
    ) -> Dict[str, float]:
        """
        Score keywords based on frequency and position.

        The score is the number of case-insensitive substring occurrences
        per 100 whitespace-separated words of ``text``, multiplied by 1.5
        when the first occurrence starts within the first quarter of the
        text, and capped at 10.0. Matching is by substring, so ``"cat"``
        is also counted inside ``"category"``.

        Args:
            text: Input text
            keywords: List of keywords to score

        Returns:
            Dictionary mapping each keyword (as given) to its score. Every
            score is 0.0 when ``text`` contains no words; empty keywords
            and keywords absent from the text also score 0.0.
        """
        text_lower = text.lower()
        # Normaliser: hundreds of words in the text.
        per_hundred_words = len(text.split()) / 100
        early_cutoff = len(text_lower) / 4

        scores = {}
        for keyword in keywords:
            # Match case-insensitively: the haystack is already lowercased.
            needle = keyword.lower()

            # Empty text would divide by zero; an empty needle "matches"
            # between every character, which is meaningless as a frequency.
            if per_hundred_words == 0 or not needle:
                scores[keyword] = 0.0
                continue

            count = text_lower.count(needle)
            first_index = text_lower.find(needle)

            # Boost only keywords that are actually present (find() returns
            # -1 when absent) and first appear in the opening quarter.
            position_score = 1.0
            if 0 <= first_index < early_cutoff:
                position_score = 1.5

            score = (count * position_score) / per_hundred_words
            scores[keyword] = min(score, 10.0)  # Cap at 10

        return scores