Spaces:

GroNLP
/

agalma

Running

Mark7549 committed on May 22, 2024

Commit

6640785

1 Parent(s): 4dd8921

added lemma occurrences to nearest neighbours function

Files changed (2) hide show

app.py CHANGED Viewed

@@ -136,7 +136,12 @@ if active_tab == "Nearest neighbours":
                     nearest_neighbours[model],
                     columns = ['Word', 'Cosine Similarity']
                 )
                 all_dfs.append((model, df))
                 st.table(df)

                     nearest_neighbours[model],
                     columns = ['Word', 'Cosine Similarity']
                 )
+                # Add word occurrences to dataframe
+                df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
                 all_dfs.append((model, df))
                 st.table(df)

word2vec.py CHANGED Viewed

@@ -464,13 +464,18 @@ def count_lemmas(directory):
     """
     lemma_count_dict = {}
     for file in os.listdir(directory):
         if file.endswith(".txt"):
             with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
                 text = f.read()
                 words = text.split()
-                lemma_count_dict[file] = Counter(words)
     return lemma_count_dict
@@ -497,7 +502,7 @@ def main():
     # Iterate over all words and print their vectors
     # iterate_over_words(model)
-    count_lemmas('lemma_list_raw')
 if __name__ == "__main__":

     """
     lemma_count_dict = {}
     for file in os.listdir(directory):
+        model_name = file.split('.')[0].replace('_', ' ').capitalize()
+        if len(model_name.split()) == 2:
+            # Also capitalize second part of model name
+            model_name = ' '.join([word.capitalize() for word in model_name.split()])
         if file.endswith(".txt"):
             with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
                 text = f.read()
                 words = text.split()
+                lemma_count_dict[model_name] = Counter(words)
     return lemma_count_dict
     # Iterate over all words and print their vectors
     # iterate_over_words(model)
+    print(count_lemmas('lemma_list_raw'))
 if __name__ == "__main__":