"""Keyword extraction tool using TF-IDF."""

import re
from collections import Counter
from typing import Dict, Any, List, Tuple
import math

from .base_tool import BaseTool


class KeywordExtractor(BaseTool):
    """Extracts keywords from text using a sentence-level TF-IDF approach.

    The input text is treated as a small corpus whose "documents" are its
    sentences: term frequency is measured over the whole text, while
    inverse document frequency is measured across sentences.
    """

    # Maximum number of keywords returned by run().
    _MAX_KEYWORDS = 10

    # Words shorter than this many characters are never keyword candidates.
    _MIN_WORD_LENGTH = 3

    # A token is a maximal run of ASCII letters delimited by word boundaries.
    _WORD_RE = re.compile(r'\b[a-zA-Z]+\b')

    # Sentences end at one or more '.', '!' or '?' characters.
    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

    def __init__(self):
        super().__init__()
        # Common English stop words; these never become keywords.
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
            'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how',
        }

    @property
    def description(self) -> str:
        """Human-readable summary of what this tool does and when to use it."""
        return (
            "Extracts important keywords from the text using TF-IDF scoring. "
            "Returns the top keywords with their relevance scores. "
            "Use this when you need to identify key topics or themes."
        )

    def run(self, text: str) -> Dict[str, Any]:
        """Extract keywords from text.

        Args:
            text: Input text to analyze. Empty or ``None`` input yields an
                empty result.

        Returns:
            Dictionary with two keys: ``"keywords"``, a list of
            ``{"word": ..., "score": ...}`` dicts ordered by descending
            score (at most 10 entries, scores rounded to 4 decimals), and
            ``"num_keywords"``, the length of that list.
        """
        if not text:
            return {"keywords": [], "num_keywords": 0}

        # Lowercasing does not affect the punctuation used for sentence
        # splitting, so do it once up front for both passes.
        lowered = text.lower()

        # Tokenize, then drop stop words and very short words.
        candidates = [
            token for token in self._WORD_RE.findall(lowered)
            if len(token) >= self._MIN_WORD_LENGTH
            and token not in self.stop_words
        ]

        if not candidates:
            return {"keywords": [], "num_keywords": 0}

        # Term frequency over the whole text.
        term_counts = Counter(candidates)
        total_terms = len(candidates)

        # Document frequency: in how many sentences does each word occur?
        # Sentences are compared as sets of whole tokens, so "art" is not
        # counted for a sentence that merely contains "start". Fragments
        # without any word (e.g. the empty string after a trailing period)
        # are not sentences and must not inflate the sentence count.
        sentence_frequency = Counter()
        num_sentences = 0
        for fragment in self._SENTENCE_SPLIT_RE.split(lowered):
            vocabulary = set(self._WORD_RE.findall(fragment))
            if not vocabulary:
                continue
            num_sentences += 1
            sentence_frequency.update(vocabulary)

        # Smoothed IDF, ln((1 + N) / (1 + df)) + 1. Plain log(N / df) is 0
        # for any word present in every sentence, which zeroes out every
        # score for single-sentence input and makes the ranking meaningless.
        tfidf_scores = {
            word: (count / total_terms) * (
                math.log((1 + num_sentences) / (1 + sentence_frequency[word]))
                + 1.0
            )
            for word, count in term_counts.items()
        }

        # Highest score first; the sort is stable, so ties keep the order
        # in which the words first appear in the text.
        top_keywords = sorted(
            tfidf_scores.items(),
            key=lambda item: item[1],
            reverse=True,
        )[:self._MAX_KEYWORDS]

        return {
            "keywords": [
                {"word": word, "score": round(score, 4)}
                for word, score in top_keywords
            ],
            "num_keywords": len(top_keywords),
        }