from typing import Any, Dict, Iterable

import wordcloud
from pydantic import BaseModel
# NOTE: the vectorizer lives in sklearn.feature_extraction.text and is
# spelled "TfidfVectorizer" — `from sklearn import TfIdfVectorizer` raises
# ImportError at module load.
from sklearn.feature_extraction.text import TfidfVectorizer
class WordCloudExtractor:
    """Build a word cloud from a text corpus via TF-IDF term weighting.

    Attributes:
        tfidf_params: Keyword arguments forwarded to
            ``sklearn.feature_extraction.text.TfidfVectorizer``
            (e.g. ``stop_words``, ``lowercase``).
    """

    tfidf_params: Dict[str, Any]

    def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
        """Fit TF-IDF on ``texts`` and render a cloud of the top terms.

        Args:
            texts: Documents to vectorize; consumed once (materialized to a
                list so a generator input is safe).
            n_words: Maximum number of terms to keep; passed to the
                vectorizer as ``max_features`` and overrides any
                ``max_features`` present in ``tfidf_params``.

        Returns:
            A ``wordcloud.WordCloud`` generated from the mean TF-IDF score
            of each term across all documents.
        """
        params = dict(self.tfidf_params)  # copy: don't mutate shared config
        params["max_features"] = n_words  # n_words takes precedence
        vectorizer = TfidfVectorizer(**params)
        matrix = vectorizer.fit_transform(list(texts))
        # Mean TF-IDF per term across documents; .mean on a sparse matrix
        # returns a 1 x n_terms matrix, so flatten to a 1-D array.
        scores = np.asarray(matrix.mean(axis=0)).ravel()
        frequencies = dict(zip(vectorizer.get_feature_names_out(), scores))
        return wordcloud.WordCloud().generate_from_frequencies(frequencies)
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from wordcloud import WordCloud | |
| import numpy as np | |
class TextVisualization:
    """Helpers that turn text corpora into visualization-ready data."""

    # @staticmethod is required: without it, an instance call
    # TextVisualization().extract_from_corpus(texts) would bind the
    # instance to `texts`. Class-level calls keep working unchanged.
    @staticmethod
    def extract_from_corpus(texts, max_features=100):
        """
        Extract word frequencies from a corpus using TF-IDF vectorization
        and generate word cloud frequencies.

        Args:
            texts: List of text documents
            max_features: Maximum number of words to include

        Returns:
            Dictionary of word frequencies suitable for WordCloud
            (term -> mean TF-IDF score across documents)
        """
        tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            lowercase=True,
        )
        tfidf_matrix = tfidf.fit_transform(texts)
        feature_names = tfidf.get_feature_names_out()
        # .mean on a sparse matrix yields a 1 x n_terms matrix; flatten
        # to a plain 1-D array before zipping with the feature names.
        mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
        return dict(zip(feature_names, mean_tfidf))