Commit
·
42de6bd
1
Parent(s):
21d27ae
feat: Implement `extract_from_corpus` in `text_visualization.py`
Browse files- requirements.txt +2 -0
- text_visualization.py +38 -0
requirements.txt
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
gradio==3.48.0
|
| 2 |
plotly==5.24.1
|
|
|
|
|
|
|
|
|
| 1 |
gradio==3.48.0
|
| 2 |
plotly==5.24.1
|
| 3 |
+
scikit-learn==1.3.0
|
| 4 |
+
wordcloud==1.9.2
|
text_visualization.py
CHANGED
|
@@ -9,3 +9,41 @@ class WordCloudExtractor:
|
|
| 9 |
|
| 10 |
def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
|
| 11 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
|
| 11 |
pass
|
| 12 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 13 |
+
from wordcloud import WordCloud
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
class TextVisualization:
    """Helpers that turn raw text documents into data for word-cloud rendering."""

    @staticmethod
    def extract_from_corpus(texts: Iterable[str], max_features: int = 100) -> dict:
        """Score the terms of a corpus by their mean TF-IDF weight.

        The documents are vectorized with a lowercasing, English-stop-word
        filtering ``TfidfVectorizer`` limited to ``max_features`` terms. Each
        term's score is its TF-IDF weight averaged over all documents.

        Args:
            texts: Iterable of text documents. It is consumed once, so a
                generator is acceptable. A bare string is rejected.
            max_features: Maximum number of terms kept by the vectorizer.

        Returns:
            Mapping of term -> mean TF-IDF score (plain ``float`` values).
            An empty corpus yields an empty dict.

        Raises:
            ValueError: If ``texts`` is a single string instead of an
                iterable of documents. The vectorizer may also raise
                ``ValueError`` when no term survives tokenization (for
                example when every document contains only stop words).
        """
        # A str is itself iterable; without this guard list() below would
        # silently treat every character as a separate document.
        if isinstance(texts, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )

        # Materialize once so emptiness can be checked even for generators.
        documents = list(texts)
        if not documents:
            # Nothing to score; avoid the vectorizer's "empty vocabulary" error.
            return {}

        vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            lowercase=True,
        )

        # Rows are documents, columns are the retained terms.
        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # The column-wise mean of a sparse matrix is a 1 x n_terms matrix;
        # flatten it to a 1-D array and convert to built-in floats.
        mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()

        return dict(zip(feature_names, mean_tfidf))
|