Commit
·
f5c0c01
1
Parent(s):
869e1b8
wordclouds
Browse files
- app.py +14 -1
- pyproject.toml +11 -0
- text_visualization.py +16 -16
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import re
|
|
| 5 |
from task_visualizations import TaskVisualizations
|
| 6 |
import plotly.graph_objects as go
|
| 7 |
from functools import partial
|
|
|
|
| 8 |
|
| 9 |
logging.basicConfig(level=logging.INFO)
|
| 10 |
|
|
@@ -43,9 +44,21 @@ def display_representations(repo, representation1, representation2):
|
|
| 43 |
return text1, text2
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def setup_repository_representations_tab(repos, representation_types):
|
| 47 |
-
gr.Markdown("Select a repository and two representation types to compare them.")
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
with gr.Row():
|
| 50 |
repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
|
| 51 |
representation1 = gr.Dropdown(
|
|
|
|
| 5 |
from task_visualizations import TaskVisualizations
|
| 6 |
import plotly.graph_objects as go
|
| 7 |
from functools import partial
|
| 8 |
+
from text_visualization import WordCloudExtractor
|
| 9 |
|
| 10 |
logging.basicConfig(level=logging.INFO)
|
| 11 |
|
|
|
|
| 44 |
return text1, text2
|
| 45 |
|
| 46 |
|
| 47 |
+
def get_representation_wordclouds(representations, repos_df):
|
| 48 |
+
wordclouds = dict()
|
| 49 |
+
for representation in representations:
|
| 50 |
+
texts = list(repos_df[repos_df["representation"] == representation]["text"])
|
| 51 |
+
wordclouds[representation] = WordCloudExtractor().extract_wordcloud_image(texts)
|
| 52 |
+
return wordclouds
|
| 53 |
+
|
| 54 |
+
|
| 55 |
def setup_repository_representations_tab(repos, representation_types):
|
|
|
|
| 56 |
|
| 57 |
+
wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
|
| 58 |
+
gr.Markdown("## Wordclouds")
|
| 59 |
+
gr.Gallery([(wordcloud, representation_type) for representation_type, wordcloud in wordcloud_dict.items()])
|
| 60 |
+
|
| 61 |
+
gr.Markdown("Select a repository and two representation types to compare them.")
|
| 62 |
with gr.Row():
|
| 63 |
repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
|
| 64 |
representation1 = gr.Dropdown(
|
pyproject.toml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "github-search-visualizations"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"pydantic>=2.9.2",
|
| 9 |
+
"scikit-learn>=1.5.2",
|
| 10 |
+
"wordcloud>=1.9.3",
|
| 11 |
+
]
|
text_visualization.py
CHANGED
|
@@ -1,22 +1,23 @@
|
|
| 1 |
from typing import Dict, Any, Iterable
|
| 2 |
-
from sklearn import
|
| 3 |
import wordcloud
|
| 4 |
-
from pydantic import BaseModel
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
-
class WordCloudExtractor:
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
def
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
import numpy as np
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def extract_from_corpus(texts, max_features=100):
|
| 20 |
"""
|
| 21 |
Extract word frequencies from a corpus using TF-IDF vectorization
|
| 22 |
and generate word cloud frequencies.
|
|
@@ -30,9 +31,8 @@ class TextVisualization:
|
|
| 30 |
"""
|
| 31 |
# Initialize TF-IDF vectorizer
|
| 32 |
tfidf = TfidfVectorizer(
|
| 33 |
-
max_features=
|
| 34 |
-
|
| 35 |
-
lowercase=True
|
| 36 |
)
|
| 37 |
|
| 38 |
# Fit and transform the texts
|
|
|
|
| 1 |
from typing import Dict, Any, Iterable
|
| 2 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 3 |
import wordcloud
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
import numpy as np
|
| 6 |
+
import PIL
|
| 7 |
|
|
|
|
| 8 |
|
| 9 |
+
class WordCloudExtractor(BaseModel):
|
| 10 |
+
max_words: int = 50
|
| 11 |
+
wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
|
| 12 |
+
tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})
|
| 13 |
|
| 14 |
+
def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
|
| 15 |
+
frequencies = self._extract_frequencies(texts, self.max_words, tfidf_params=self.tfidf_params)
|
| 16 |
+
wc = wordcloud.WordCloud(**self.wordcloud_params).generate_from_frequencies(frequencies)
|
| 17 |
+
return wc.to_image()
|
|
|
|
| 18 |
|
| 19 |
+
@classmethod
|
| 20 |
+
def _extract_frequencies(cls, texts, max_words=100, tfidf_params: dict={}) -> Dict[str, float]:
|
|
|
|
| 21 |
"""
|
| 22 |
Extract word frequencies from a corpus using TF-IDF vectorization
|
| 23 |
and generate word cloud frequencies.
|
|
|
|
| 31 |
"""
|
| 32 |
# Initialize TF-IDF vectorizer
|
| 33 |
tfidf = TfidfVectorizer(
|
| 34 |
+
max_features=max_words,
|
| 35 |
+
**tfidf_params
|
|
|
|
| 36 |
)
|
| 37 |
|
| 38 |
# Fit and transform the texts
|