ReAct-Text-Analyzer / src /tools /keyword_extractor.py
Easonwangzk's picture
Initial commit
a01e687
"""Keyword extraction tool using TF-IDF."""
import re
from collections import Counter
from typing import Dict, Any, List, Tuple
import math
from .base_tool import BaseTool
class KeywordExtractor(BaseTool):
    """Extracts keywords from text using a TF-IDF approach.

    The input text is treated as a small corpus whose "documents" are its
    sentences: term frequency is measured over the whole text, while
    inverse document frequency is measured across the sentences.
    """

    def __init__(self):
        super().__init__()
        # Common English stop words
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
            'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how',
        }

    @property
    def description(self) -> str:
        """Short description of what the tool does and when to use it."""
        return (
            "Extracts important keywords from the text using TF-IDF scoring. "
            "Returns the top keywords with their relevance scores. "
            "Use this when you need to identify key topics or themes."
        )

    def run(self, text: str, top_k: int = 10) -> Dict[str, Any]:
        """Extract keywords from text.

        Args:
            text: Input text to analyze. Empty or ``None`` input yields an
                empty result.
            top_k: Maximum number of keywords to return (default 10).
                Values below zero are treated as zero.

        Returns:
            Dictionary with two keys: ``"keywords"``, a list of
            ``{"word": str, "score": float}`` entries sorted by descending
            score (scores rounded to 4 decimals), and ``"num_keywords"``,
            the length of that list.
        """
        if not text:
            return {"keywords": [], "num_keywords": 0}

        # Lowercase once; both the word list and the sentences reuse it.
        lowered = text.lower()

        # Tokenize, then remove stop words and short words
        filtered_words = [
            w for w in re.findall(r'\b[a-zA-Z]+\b', lowered)
            if w not in self.stop_words and len(w) > 2
        ]

        if not filtered_words:
            return {"keywords": [], "num_keywords": 0}

        # Term frequency over the whole text
        word_count = Counter(filtered_words)
        total_words = len(filtered_words)

        # Split into sentences, dropping the empty/whitespace-only fragments
        # that re.split leaves behind (e.g. after the final period), so they
        # do not inflate the sentence count.
        sentences = [
            fragment
            for fragment in re.split(r'[.!?]+', lowered)
            if fragment.strip()
        ]
        num_sentences = len(sentences)

        # Sentence frequency per word, using whole-token matching: each
        # sentence is tokenized once and contributes each distinct word once.
        # (A substring test would count "art" as present in "start".)
        sentence_freq = Counter()
        for sentence in sentences:
            sentence_freq.update(set(re.findall(r'\b[a-zA-Z]+\b', sentence)))

        # Smoothed IDF: log((1 + N) / (1 + df)) + 1. Unlike plain log(N / df),
        # this stays positive for words that occur in every sentence and for
        # single-sentence input, so scores never all collapse to zero.
        tfidf_scores = {
            word: (count / total_words)
            * (math.log((1 + num_sentences) / (1 + sentence_freq[word])) + 1.0)
            for word, count in word_count.items()
        }

        # Sort by score (stable, so ties keep first-occurrence order) and
        # keep the top entries.
        top_keywords = sorted(
            tfidf_scores.items(),
            key=lambda item: item[1],
            reverse=True,
        )[:max(top_k, 0)]

        return {
            "keywords": [
                {"word": word, "score": round(score, 4)}
                for word, score in top_keywords
            ],
            "num_keywords": len(top_keywords),
        }