Spaces:

lambdaofgod
/

github_search_visualizations

Sleeping

lambdaofgod committed on Oct 31, 2024

Commit

f5c0c01

1 Parent(s): 869e1b8

wordclouds

Files changed (3) hide show

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import re
 from task_visualizations import TaskVisualizations
 import plotly.graph_objects as go
 from functools import partial
 logging.basicConfig(level=logging.INFO)
@@ -43,9 +44,21 @@ def display_representations(repo, representation1, representation2):
     return text1, text2
 def setup_repository_representations_tab(repos, representation_types):
-    gr.Markdown("Select a repository and two representation types to compare them.")
     with gr.Row():
         repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
         representation1 = gr.Dropdown(

 from task_visualizations import TaskVisualizations
 import plotly.graph_objects as go
 from functools import partial
+from text_visualization import WordCloudExtractor
 logging.basicConfig(level=logging.INFO)
     return text1, text2
+def get_representation_wordclouds(representations, repos_df):
+    """
+    Build one word cloud image per representation type.
+    For every entry of `representations`, selects the rows of `repos_df`
+    whose "representation" column equals that entry, collects their "text"
+    values into a list and hands them to a new `WordCloudExtractor`.
+    Returns a dict mapping each representation to the value returned by
+    `extract_wordcloud_image` for its texts.
+    """
+    images_by_representation = {}
+    for representation_type in representations:
+        # Boolean mask over the rows belonging to this representation type
+        is_match = repos_df["representation"] == representation_type
+        matching_texts = list(repos_df[is_match]["text"])
+        extractor = WordCloudExtractor()
+        images_by_representation[representation_type] = extractor.extract_wordcloud_image(matching_texts)
+    return images_by_representation
 def setup_repository_representations_tab(repos, representation_types):
+    wordcloud_dict = get_representation_wordclouds(representation_types, repos_df)
+    gr.Markdown("## Wordclouds")
+    gr.Gallery([(wordcloud, representation_type) for representation_type, wordcloud in wordcloud_dict.items()])
+    gr.Markdown("Select a repository and two representation types to compare them.")
     with gr.Row():
         repo = gr.Dropdown(choices=repos, label="Repository", value=repos[0])
         representation1 = gr.Dropdown(

pyproject.toml ADDED Viewed

+[project]
+name = "github-search-visualizations"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "pydantic>=2.9.2",
+    "scikit-learn>=1.5.2",
+    "wordcloud>=1.9.3",
+]

text_visualization.py CHANGED Viewed

@@ -1,22 +1,23 @@
 from typing import Dict, Any, Iterable
-from sklearn import TfIdfVectorizer
 import wordcloud
-from pydantic import BaseModel
-class WordCloudExtractor:
-    tfidf_params: Dict[str, Any]
-    def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
-        pass
-from sklearn.feature_extraction.text import TfidfVectorizer
-from wordcloud import WordCloud
-import numpy as np
-class TextVisualization:
-    @staticmethod
-    def extract_from_corpus(texts, max_features=100):
         """
         Extract word frequencies from a corpus using TF-IDF vectorization
         and generate word cloud frequencies.
@@ -30,9 +31,8 @@ class TextVisualization:
         """
         # Initialize TF-IDF vectorizer
         tfidf = TfidfVectorizer(
-            max_features=max_features,
-            stop_words='english',
-            lowercase=True
         )
         # Fit and transform the texts

 from typing import Dict, Any, Iterable
+from sklearn.feature_extraction.text import TfidfVectorizer
 import wordcloud
+from pydantic import BaseModel, Field
+import numpy as np
+import PIL
+class WordCloudExtractor(BaseModel):
+    max_words: int = 50
+    wordcloud_params: Dict[str, Any] = Field(default_factory=dict)
+    tfidf_params: Dict[str, Any] = Field(default_factory=lambda: {"stop_words": "english"})
+    def extract_wordcloud_image(self, texts) -> PIL.Image.Image:
+        """
+        Render a word cloud image for `texts`.
+        Term frequencies come from `_extract_frequencies`, called with this
+        instance's `max_words` and `tfidf_params`; the cloud itself is built
+        with `wordcloud_params` passed as keyword arguments to
+        `wordcloud.WordCloud`.
+        """
+        term_frequencies = self._extract_frequencies(
+            texts,
+            self.max_words,
+            tfidf_params=self.tfidf_params,
+        )
+        cloud = wordcloud.WordCloud(**self.wordcloud_params)
+        # to_image() is called on whatever generate_from_frequencies returns
+        return cloud.generate_from_frequencies(term_frequencies).to_image()
+    @classmethod
+    def _extract_frequencies(cls, texts, max_words=100, tfidf_params: dict={}) -> Dict[str, float]:
         """
         Extract word frequencies from a corpus using TF-IDF vectorization
         and generate word cloud frequencies.
         """
         # Initialize TF-IDF vectorizer
         tfidf = TfidfVectorizer(
+            max_features=max_words,
+            **tfidf_params
         )
         # Fit and transform the texts