"""Keyword extraction tool using TF-IDF.""" import re from collections import Counter from typing import Dict, Any, List, Tuple import math from .base_tool import BaseTool class KeywordExtractor(BaseTool): """Extracts keywords from text using TF-IDF approach.""" def __init__(self): super().__init__() # Common English stop words self.stop_words = { 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how' } @property def description(self) -> str: return ( "Extracts important keywords from the text using TF-IDF scoring. " "Returns the top keywords with their relevance scores. " "Use this when you need to identify key topics or themes." ) def run(self, text: str) -> Dict[str, Any]: """Extract keywords from text. Args: text: Input text to analyze Returns: Dictionary with extracted keywords and scores """ # Tokenize and clean words = re.findall(r'\b[a-zA-Z]+\b', text.lower()) # Remove stop words and short words filtered_words = [ w for w in words if w not in self.stop_words and len(w) > 2 ] if not filtered_words: return { "keywords": [], "num_keywords": 0 } # Calculate TF (Term Frequency) word_count = Counter(filtered_words) total_words = len(filtered_words) tf_scores = { word: count / total_words for word, count in word_count.items() } # Simple IDF approximation (treating text as multiple sentences) sentences = re.split(r'[.!?]+', text) idf_scores = {} for word in word_count: # Count how many sentences contain the word containing_sentences = sum( 1 for sent in sentences if word in sent.lower() ) if containing_sentences > 0: idf_scores[word] = math.log(len(sentences) / containing_sentences) else: idf_scores[word] = 0 # Calculate TF-IDF tfidf_scores = { word: tf_scores[word] * idf_scores.get(word, 0) for word in tf_scores } # Sort by score and get top keywords top_keywords = sorted( tfidf_scores.items(), key=lambda x: x[1], reverse=True )[:10] return { "keywords": [ {"word": word, "score": round(score, 4)} for word, score in top_keywords ], "num_keywords": len(top_keywords) }