Added lemma occurrences to nearest neighbours function
Browse files
- app.py +5 -0
- word2vec.py +7 -2
app.py
CHANGED
|
@@ -136,7 +136,12 @@ if active_tab == "Nearest neighbours":
|
|
| 136 |
nearest_neighbours[model],
|
| 137 |
columns = ['Word', 'Cosine Similarity']
|
| 138 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
|
|
|
| 140 |
all_dfs.append((model, df))
|
| 141 |
st.table(df)
|
| 142 |
|
|
|
|
| 136 |
nearest_neighbours[model],
|
| 137 |
columns = ['Word', 'Cosine Similarity']
|
| 138 |
)
|
| 139 |
+
|
| 140 |
+
# Add word occurrences to dataframe
|
| 141 |
+
df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
|
| 142 |
+
|
| 143 |
|
| 144 |
+
|
| 145 |
all_dfs.append((model, df))
|
| 146 |
st.table(df)
|
| 147 |
|
word2vec.py
CHANGED
|
@@ -464,13 +464,18 @@ def count_lemmas(directory):
|
|
| 464 |
"""
|
| 465 |
lemma_count_dict = {}
|
| 466 |
for file in os.listdir(directory):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
if file.endswith(".txt"):
|
| 468 |
with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
|
| 469 |
text = f.read()
|
| 470 |
words = text.split()
|
| 471 |
-
lemma_count_dict[
|
| 472 |
|
| 473 |
return lemma_count_dict
|
|
|
|
| 474 |
|
| 475 |
|
| 476 |
|
|
@@ -497,7 +502,7 @@ def main():
|
|
| 497 |
# Iterate over all words and print their vectors
|
| 498 |
# iterate_over_words(model)
|
| 499 |
|
| 500 |
-
count_lemmas('lemma_list_raw')
|
| 501 |
|
| 502 |
|
| 503 |
if __name__ == "__main__":
|
|
|
|
| 464 |
"""
|
| 465 |
lemma_count_dict = {}
|
| 466 |
for file in os.listdir(directory):
|
| 467 |
+
model_name = file.split('.')[0].replace('_', ' ').capitalize()
|
| 468 |
+
if len(model_name.split()) == 2:
|
| 469 |
+
# Also capitalize second part of model name
|
| 470 |
+
model_name = ' '.join([word.capitalize() for word in model_name.split()])
|
| 471 |
if file.endswith(".txt"):
|
| 472 |
with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
|
| 473 |
text = f.read()
|
| 474 |
words = text.split()
|
| 475 |
+
lemma_count_dict[model_name] = Counter(words)
|
| 476 |
|
| 477 |
return lemma_count_dict
|
| 478 |
+
|
| 479 |
|
| 480 |
|
| 481 |
|
|
|
|
| 502 |
# Iterate over all words and print their vectors
|
| 503 |
# iterate_over_words(model)
|
| 504 |
|
| 505 |
+
print(count_lemmas('lemma_list_raw'))
|
| 506 |
|
| 507 |
|
| 508 |
if __name__ == "__main__":
|