Spaces:

Easonwangzk
/

ReAct-Text-Analyzer

Sleeping

App Files Files Community

ReAct-Text-Analyzer / src /tools /keyword_extractor.py

Easonwangzk

Initial commit

a01e687 4 months ago

raw

history blame contribute delete

2.95 kB

	"""Keyword extraction tool using TF-IDF."""

	import re
	from collections import Counter
	from typing import Dict, Any, List, Tuple
	import math

	from .base_tool import BaseTool


	class KeywordExtractor(BaseTool):
	    """Extract keywords from a text with a sentence-level TF-IDF score.

	    The input text is treated as a small corpus whose "documents" are its
	    sentences (split on runs of ``.``, ``!`` and ``?``). A word scores high
	    when it is frequent in the whole text (TF) but concentrated in few
	    sentences (IDF).
	    """

	    # Compiled once: both patterns are applied on every call to ``run``.
	    _WORD_RE = re.compile(r'\b[a-zA-Z]+\b')
	    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

	    def __init__(self):
	        super().__init__()
	        # Common English stop words; these never become keywords.
	        self.stop_words = {
	            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
	            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
	            'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
	            'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how',
	        }

	    @property
	    def description(self) -> str:
	        """Human-readable summary of what this tool does."""
	        return (
	            "Extracts important keywords from the text using TF-IDF scoring. "
	            "Returns the top keywords with their relevance scores. "
	            "Use this when you need to identify key topics or themes."
	        )

	    def run(self, text: str, top_k: int = 10) -> Dict[str, Any]:
	        """Extract the highest-scoring keywords from ``text``.

	        Candidate words are lower-cased runs of ASCII letters that are
	        longer than two characters and not in ``self.stop_words``.

	        Scoring:
	            * TF  = occurrences of the word / number of candidate words.
	            * IDF = ``log((1 + N) / (1 + df)) + 1`` where ``N`` is the number
	              of sentences containing at least one word and ``df`` is the
	              number of sentences containing the word as a whole token.

	        The smoothed IDF keeps every score strictly positive, so a text
	        made of a single sentence is still ranked by term frequency
	        instead of collapsing to all-zero scores.

	        Args:
	            text: Input text to analyze. ``None`` or an empty string yields
	                an empty result.
	            top_k: Maximum number of keywords to return (default 10).

	        Returns:
	            Dictionary with ``"keywords"`` (a list of
	            ``{"word": str, "score": float}`` entries sorted by descending
	            score, scores rounded to 4 decimals) and ``"num_keywords"``
	            (length of that list).

	        Raises:
	            ValueError: If ``top_k`` is negative.
	        """
	        if top_k < 0:
	            raise ValueError("top_k must be non-negative")

	        if not text:
	            return {
	                "keywords": [],
	                "num_keywords": 0
	            }

	        # Tokenize each sentence once. Fragments without any word (for
	        # example the empty string left after a trailing period) are
	        # dropped so they do not inflate the sentence count.
	        find_words = self._WORD_RE.findall
	        sentences = [
	            tokens
	            for tokens in (
	                find_words(fragment)
	                for fragment in self._SENTENCE_SPLIT_RE.split(text.lower())
	            )
	            if tokens
	        ]

	        stop_words = self.stop_words
	        candidates = [
	            word
	            for tokens in sentences
	            for word in tokens
	            if word not in stop_words and len(word) > 2
	        ]

	        if not candidates:
	            return {
	                "keywords": [],
	                "num_keywords": 0
	            }

	        term_counts = Counter(candidates)
	        total_terms = len(candidates)

	        # Document frequency counts whole tokens per sentence, so "art" is
	        # not considered present in a sentence that only contains "start".
	        sentence_freq = Counter(
	            word
	            for tokens in sentences
	            for word in set(tokens)
	            if word in term_counts
	        )
	        num_sentences = len(sentences)

	        scores = {
	            word: (count / total_terms) * (
	                math.log((1 + num_sentences) / (1 + sentence_freq[word])) + 1.0
	            )
	            for word, count in term_counts.items()
	        }

	        # ``sorted`` is stable, so equal scores keep first-occurrence order.
	        ranked = sorted(
	            scores.items(),
	            key=lambda item: item[1],
	            reverse=True
	        )[:top_k]

	        return {
	            "keywords": [
	                {"word": word, "score": round(score, 4)}
	                for word, score in ranked
	            ],
	            "num_keywords": len(ranked)
	        }