Spaces:
Sleeping
Sleeping
| """Keyword extraction tool using TF-IDF.""" | |
| import re | |
| from collections import Counter | |
| from typing import Dict, Any, List, Tuple | |
| import math | |
| from .base_tool import BaseTool | |
class KeywordExtractor(BaseTool):
    """Extract keywords from text using a sentence-level TF-IDF score.

    The input text is treated as a small corpus in which every sentence is a
    "document": term frequency is measured over the whole text, and inverse
    document frequency over the sentences of that text.
    """

    def __init__(self):
        super().__init__()
        # Common English stop words that are never reported as keywords.
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
            'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how',
        }

    def description(self) -> str:
        """Return the human-readable description of this tool."""
        return (
            "Extracts important keywords from the text using TF-IDF scoring. "
            "Returns the top keywords with their relevance scores. "
            "Use this when you need to identify key topics or themes."
        )

    def run(self, text: str, top_k: int = 10) -> Dict[str, Any]:
        """Extract the highest-scoring keywords from ``text``.

        Candidate keywords are lowercase alphabetic tokens longer than two
        characters that are not stop words. Each candidate is scored as
        ``tf * idf`` where ``tf`` is its share of all candidate tokens and
        ``idf`` is the smoothed value ``log((1 + N) / (1 + df)) + 1`` with
        ``N`` the number of sentences and ``df`` the number of sentences
        containing the word as a whole token.

        Args:
            text: Input text to analyze. ``None`` or an empty string yields
                an empty result.
            top_k: Maximum number of keywords to return (default 10).
                Negative values are treated as 0.

        Returns:
            Dictionary with two keys: ``"keywords"``, a list of
            ``{"word": str, "score": float}`` entries sorted by descending
            score (scores rounded to 4 decimals), and ``"num_keywords"``,
            the length of that list.
        """
        if not text:
            return {
                "keywords": [],
                "num_keywords": 0
            }

        # Tokenize sentence by sentence so the same tokens drive both TF and
        # IDF. Segments without any word (e.g. the empty string after a
        # trailing period, or the pieces of "3.14") are not sentences and
        # must not inflate the sentence count.
        sentences: List[List[str]] = []
        for segment in re.split(r'[.!?]+', text.lower()):
            tokens = re.findall(r'\b[a-zA-Z]+\b', segment)
            if tokens:
                sentences.append(tokens)

        # Remove stop words and short words
        filtered_words = [
            w
            for tokens in sentences
            for w in tokens
            if w not in self.stop_words and len(w) > 2
        ]

        if not filtered_words:
            return {
                "keywords": [],
                "num_keywords": 0
            }

        # Term frequency: occurrences relative to all candidate tokens.
        word_count = Counter(filtered_words)
        total_words = len(filtered_words)

        # Sentence (document) frequency, counted on whole tokens so that
        # e.g. "art" is not found inside "start".
        sentence_freq: Counter = Counter()
        for tokens in sentences:
            sentence_freq.update(w for w in set(tokens) if w in word_count)

        # Smoothed IDF keeps scores positive even when a word occurs in every
        # sentence (including the single-sentence case), so ranking then
        # falls back to term frequency instead of collapsing to all zeros.
        num_sentences = len(sentences)
        tfidf_scores = {
            word: (count / total_words)
            * (math.log((1 + num_sentences) / (1 + sentence_freq[word])) + 1)
            for word, count in word_count.items()
        }

        # Sort by score (stable: ties keep first-occurrence order) and keep
        # the requested number of keywords.
        top_keywords = sorted(
            tfidf_scores.items(),
            key=lambda item: item[1],
            reverse=True
        )[:max(top_k, 0)]

        return {
            "keywords": [
                {"word": word, "score": round(score, 4)}
                for word, score in top_keywords
            ],
            "num_keywords": len(top_keywords)
        }