Blessmore committed on
Commit
a36f90c
·
verified ·
1 Parent(s): 1bfa3f4

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  Fast_text_50_dim/shona_fasttext_vectors_50d.kv filter=lfs diff=lfs merge=lfs -text
37
  Fast_text_100_dim/shona_corpus_E.txt filter=lfs diff=lfs merge=lfs -text
38
  Fast_text_100_dim/shona_fasttext_vectors_100d.kv filter=lfs diff=lfs merge=lfs -text
 
 
 
36
  Fast_text_50_dim/shona_fasttext_vectors_50d.kv filter=lfs diff=lfs merge=lfs -text
37
  Fast_text_100_dim/shona_corpus_E.txt filter=lfs diff=lfs merge=lfs -text
38
  Fast_text_100_dim/shona_fasttext_vectors_100d.kv filter=lfs diff=lfs merge=lfs -text
39
+ Fast_text_300_dim/shona_corpus_E.txt filter=lfs diff=lfs merge=lfs -text
40
+ Fast_text_300_dim/shona_fasttext_vectors_300d.kv filter=lfs diff=lfs merge=lfs -text
Fast_text_300_dim/.ipynb_checkpoints/FAST_TEXT -300-checkpoint.ipynb ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from gensim.models import FastText\n",
10
+ "import regex as re\n",
11
+ "import time\n",
12
+ "import os\n",
13
+ "from gensim.utils import simple_preprocess\n",
14
+ "from gensim.models import FastText\n",
15
+ "import re"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "\n",
25
+ "def preprocess_text(text):\n",
26
+ " text = text.lower() # Lowercase\n",
27
+ " text = re.sub(r'[^\\w\\s]', '', text) # Remove punctuation\n",
28
+ " return simple_preprocess(text)\n",
29
+ "\n",
30
+ "def read_corpus(file_path):\n",
31
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
32
+ " for line in file:\n",
33
+ " yield preprocess_text(line)"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "corpus_file_path = 'shona_corpus_E.txt'\n",
43
+ "# Read and preprocess the corpus\n",
44
+ "sentences = list(read_corpus(corpus_file_path))\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 4,
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "data": {
54
+ "text/plain": [
55
+ "[['mavambo',\n",
56
+ " 'kusikwa',\n",
57
+ " 'kwezvinhu',\n",
58
+ " 'zvose',\n",
59
+ " 'pakutanga',\n",
60
+ " 'mwari',\n",
61
+ " 'akasika',\n",
62
+ " 'denga',\n",
63
+ " 'nepasi'],\n",
64
+ " ['zvino',\n",
65
+ " 'rakanga',\n",
66
+ " 'risina',\n",
67
+ " 'chiumbo',\n",
68
+ " 'risina',\n",
69
+ " 'uye',\n",
70
+ " 'rakanga',\n",
71
+ " 'riri',\n",
72
+ " 'pamusoro',\n",
73
+ " 'pehwenje'],\n",
74
+ " ['mweya', 'wamwari', 'wakanga', 'uchidzengerera', 'pamusoro', 'pemvura']]"
75
+ ]
76
+ },
77
+ "execution_count": 4,
78
+ "metadata": {},
79
+ "output_type": "execute_result"
80
+ }
81
+ ],
82
+ "source": [
83
+ "sentences[:3]"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "start_time = time.time()\n",
93
+ "\n",
94
+ "# Train FastText model\n",
95
+ "model = FastText(\n",
96
+ " sentences, \n",
97
+ " vector_size=300, # Higher dimension for better performance\n",
98
+ " window=7, \n",
99
+ " min_count=5, \n",
100
+ " workers=4, \n",
101
+ " sg=1, # Skip-gram model\n",
102
+ " epochs=100, # More epochs for thorough training\n",
103
+ " bucket=2000000, # Large bucket size for handling subwords\n",
104
+ " min_n=3, # Minimum length of char n-grams\n",
105
+ " max_n=6 # Maximum length of char n-grams\n",
106
+ ")\n",
107
+ "end_time = time.time()\n",
108
+ "# Calculate the elapsed time\n",
109
+ "elapsed_time = end_time - start_time\n",
110
+ "print(\"Time taken:\", elapsed_time, \"seconds\")\n"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "# Save the model\n",
120
+ "model.save(\"shona_fasttext_300d.model\")\n",
121
+ "model.wv.save(\"shona_fasttext_vectors_300d.kv\")"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": null,
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "print(model)"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": []
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": []
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": []
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "def evaluate_similarity(model, word_pairs):\n",
161
+ " similarity_scores = []\n",
162
+ " for word1, word2, score in word_pairs:\n",
163
+ " similarity_score = model.wv.similarity(word1, word2)\n",
164
+ " similarity_scores.append((word1, word2, score, similarity_score))\n",
165
+ " print(\"Similarity task evaluation:\")\n",
166
+ " for word1, word2, human_score, model_score in similarity_scores:\n",
167
+ " print(f\"{word1}-{word2}: Human score = {human_score}, Model score = {model_score}\")\n",
168
+ "\n",
169
+ "# Example similarity word pairs\n",
170
+ "similarity_word_pairs = [(\"murume\", \"mukadzi\", 0.8), (\"mwana\", \"mukomana\", 0.6)]\n",
171
+ "evaluate_similarity(model, similarity_word_pairs)\n"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
181
+ " d = model.wv[b] - model.wv[a] + model.wv[c]\n",
182
+ " closest_words = model.wv.similar_by_vector(d, topn=topn + 3) # Add extra to ensure we get at least topn unique words\n",
183
+ " result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
184
+ " return result_words[:topn]\n",
185
+ "\n",
186
+ "# Example usage\n",
187
+ "a = \"mukomana\" # boy\n",
188
+ "b = \"amai\" # mother\n",
189
+ "c = \"musikana\" # girl\n",
190
+ "\n",
191
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
192
+ "if predicted_words:\n",
193
+ " print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
194
+ "else:\n",
195
+ " print(\"No suitable words found.\")\n"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "# Perform Analogical Reasoning\n",
205
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
206
+ " # Calculate the vector d as b - a + c\n",
207
+ " d = model.wv[b] - model.wv[a] + model.wv[c]\n",
208
+ " \n",
209
+ " # Find the words that best complete the analogy\n",
210
+ " closest_words = model.wv.similar_by_vector(d, topn=topn + 3) # Add extra to ensure we get at least topn unique words\n",
211
+ " result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
212
+ " \n",
213
+ " # Ensure we return exactly 'topn' words\n",
214
+ " return result_words[:topn]\n",
215
+ "\n",
216
+ "# Example usage\n",
217
+ "a = \"murume\" # man\n",
218
+ "b = \"sekuru\" # grandfather\n",
219
+ "c = \"mukadzi\" # woman\n",
220
+ "\n",
221
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
222
+ "if predicted_words:\n",
223
+ " print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
224
+ "else:\n",
225
+ " print(\"No suitable words found.\")"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": []
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": null,
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "# Test similarity\n",
242
+ "similar_words = model.wv.most_similar(\"kudzidza\", topn=10)\n",
243
+ "print(similar_words)"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": []
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "metadata": {},
257
+ "outputs": [],
258
+ "source": []
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "metadata": {},
264
+ "outputs": [],
265
+ "source": []
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": []
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": []
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": []
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": []
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": null,
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": []
301
+ }
302
+ ],
303
+ "metadata": {
304
+ "kernelspec": {
305
+ "display_name": "Python 3 (ipykernel)",
306
+ "language": "python",
307
+ "name": "python3"
308
+ },
309
+ "language_info": {
310
+ "codemirror_mode": {
311
+ "name": "ipython",
312
+ "version": 3
313
+ },
314
+ "file_extension": ".py",
315
+ "mimetype": "text/x-python",
316
+ "name": "python",
317
+ "nbconvert_exporter": "python",
318
+ "pygments_lexer": "ipython3",
319
+ "version": "3.9.12"
320
+ }
321
+ },
322
+ "nbformat": 4,
323
+ "nbformat_minor": 4
324
+ }
Fast_text_300_dim/FAST_TEXT -300.ipynb ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from gensim.models import FastText\n",
10
+ "import regex as re\n",
11
+ "import time\n",
12
+ "import os\n",
13
+ "from gensim.utils import simple_preprocess\n",
14
+ "from gensim.models import FastText\n",
15
+ "import re"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "\n",
25
+ "def preprocess_text(text):\n",
26
+ " text = text.lower() # Lowercase\n",
27
+ " text = re.sub(r'[^\\w\\s]', '', text) # Remove punctuation\n",
28
+ " return simple_preprocess(text)\n",
29
+ "\n",
30
+ "def read_corpus(file_path):\n",
31
+ " with open(file_path, 'r', encoding='utf-8') as file:\n",
32
+ " for line in file:\n",
33
+ " yield preprocess_text(line)"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "corpus_file_path = 'shona_corpus_E.txt'\n",
43
+ "# Read and preprocess the corpus\n",
44
+ "sentences = list(read_corpus(corpus_file_path))\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 4,
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "data": {
54
+ "text/plain": [
55
+ "[['mavambo',\n",
56
+ " 'kusikwa',\n",
57
+ " 'kwezvinhu',\n",
58
+ " 'zvose',\n",
59
+ " 'pakutanga',\n",
60
+ " 'mwari',\n",
61
+ " 'akasika',\n",
62
+ " 'denga',\n",
63
+ " 'nepasi'],\n",
64
+ " ['zvino',\n",
65
+ " 'rakanga',\n",
66
+ " 'risina',\n",
67
+ " 'chiumbo',\n",
68
+ " 'risina',\n",
69
+ " 'uye',\n",
70
+ " 'rakanga',\n",
71
+ " 'riri',\n",
72
+ " 'pamusoro',\n",
73
+ " 'pehwenje'],\n",
74
+ " ['mweya', 'wamwari', 'wakanga', 'uchidzengerera', 'pamusoro', 'pemvura']]"
75
+ ]
76
+ },
77
+ "execution_count": 4,
78
+ "metadata": {},
79
+ "output_type": "execute_result"
80
+ }
81
+ ],
82
+ "source": [
83
+ "sentences[:3]"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 5,
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "name": "stdout",
93
+ "output_type": "stream",
94
+ "text": [
95
+ "Time taken: 6643.198479413986 minutes\n"
96
+ ]
97
+ }
98
+ ],
99
+ "source": [
100
+ "start_time = time.time()\n",
101
+ "\n",
102
+ "# Train FastText model\n",
103
+ "model = FastText(\n",
104
+ " sentences, \n",
105
+ " vector_size=300, # Higher dimension for better performance\n",
106
+ " window=7, \n",
107
+ " min_count=5, \n",
108
+ " workers=4, \n",
109
+ " sg=1, # Skip-gram model\n",
110
+ " epochs=100, # More epochs for thorough training\n",
111
+ " bucket=2000000, # Large bucket size for handling subwords\n",
112
+ " min_n=3, # Minimum length of char n-grams\n",
113
+ " max_n=6 # Maximum length of char n-grams\n",
114
+ ")\n",
115
+ "end_time = time.time()\n",
116
+ "# Calculate the elapsed time\n",
117
+ "elapsed_time = end_time - start_time\n",
118
+ "print(\"Time taken:\", elapsed_time, \"seconds\")\n"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 6,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "# Save the model\n",
128
+ "model.save(\"shona_fasttext_300d.model\")\n",
129
+ "model.wv.save(\"shona_fasttext_vectors_300d.kv\")"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 7,
135
+ "metadata": {},
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "FastText(vocab=107228, vector_size=300, alpha=0.025)\n"
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "print(model)"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": []
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": []
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": []
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 8,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "Similarity task evaluation:\n",
180
+ "murume-mukadzi: Human score = 0.8, Model score = 0.6584572196006775\n",
181
+ "mwana-mukomana: Human score = 0.6, Model score = 0.49458423256874084\n"
182
+ ]
183
+ }
184
+ ],
185
+ "source": [
186
+ "def evaluate_similarity(model, word_pairs):\n",
187
+ " similarity_scores = []\n",
188
+ " for word1, word2, score in word_pairs:\n",
189
+ " similarity_score = model.wv.similarity(word1, word2)\n",
190
+ " similarity_scores.append((word1, word2, score, similarity_score))\n",
191
+ " print(\"Similarity task evaluation:\")\n",
192
+ " for word1, word2, human_score, model_score in similarity_scores:\n",
193
+ " print(f\"{word1}-{word2}: Human score = {human_score}, Model score = {model_score}\")\n",
194
+ "\n",
195
+ "# Example similarity word pairs\n",
196
+ "similarity_word_pairs = [(\"murume\", \"mukadzi\", 0.8), (\"mwana\", \"mukomana\", 0.6)]\n",
197
+ "evaluate_similarity(model, similarity_word_pairs)\n"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 13,
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "name": "stdout",
207
+ "output_type": "stream",
208
+ "text": [
209
+ "mukomana is to baba as musikana is to: bab, babavo, babayo, babangu, babawee\n"
210
+ ]
211
+ }
212
+ ],
213
+ "source": [
214
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
215
+ " d = model.wv[b] - model.wv[a] + model.wv[c]\n",
216
+ " closest_words = model.wv.similar_by_vector(d, topn=topn + 3) # Add extra to ensure we get at least topn unique words\n",
217
+ " result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
218
+ " return result_words[:topn]\n",
219
+ "\n",
220
+ "# Example usage\n",
221
+ "a = \"mukomana\" # boy\n",
222
+ "b = \"baba\" # father\n",
223
+ "c = \"musikana\" # girl\n",
224
+ "\n",
225
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
226
+ "if predicted_words:\n",
227
+ " print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
228
+ "else:\n",
229
+ " print(\"No suitable words found.\")\n"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 10,
235
+ "metadata": {},
236
+ "outputs": [
237
+ {
238
+ "name": "stdout",
239
+ "output_type": "stream",
240
+ "text": [
241
+ "murume is to sekuru as mukadzi is to: sasekuru, ambuya, kwavasekuru, sekuruwo, raambuya\n"
242
+ ]
243
+ }
244
+ ],
245
+ "source": [
246
+ "# Perform Analogical Reasoning\n",
247
+ "def perform_analogical_reasoning(model, a, b, c, topn=5):\n",
248
+ " # Calculate the vector d as b - a + c\n",
249
+ " d = model.wv[b] - model.wv[a] + model.wv[c]\n",
250
+ " \n",
251
+ " # Find the words that best complete the analogy\n",
252
+ " closest_words = model.wv.similar_by_vector(d, topn=topn + 3) # Add extra to ensure we get at least topn unique words\n",
253
+ " result_words = [word for word, _ in closest_words if word not in [a, b, c]]\n",
254
+ " \n",
255
+ " # Ensure we return exactly 'topn' words\n",
256
+ " return result_words[:topn]\n",
257
+ "\n",
258
+ "# Example usage\n",
259
+ "a = \"murume\" # man\n",
260
+ "b = \"sekuru\" # grandfather\n",
261
+ "c = \"mukadzi\" # woman\n",
262
+ "\n",
263
+ "predicted_words = perform_analogical_reasoning(model, a, b, c)\n",
264
+ "if predicted_words:\n",
265
+ " print(f\"{a} is to {b} as {c} is to: {', '.join(predicted_words)}\")\n",
266
+ "else:\n",
267
+ " print(\"No suitable words found.\")"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": []
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 12,
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "name": "stdout",
284
+ "output_type": "stream",
285
+ "text": [
286
+ "[('nezvemakuhwa', 0.4936619699001312), ('nesu', 0.48648443818092346), ('taurirana', 0.4829455614089966), ('tauraka', 0.46879878640174866), ('ndapota', 0.4648963510990143), ('utaure', 0.45030686259269714), ('chitaura', 0.44970065355300903), ('taurira', 0.4396206736564636), ('tapota', 0.42348620295524597), ('itstechschool', 0.4228824973106384)]\n"
287
+ ]
288
+ }
289
+ ],
290
+ "source": [
291
+ "# Test similarity\n",
292
+ "similar_words = model.wv.most_similar(\"taura\", topn=10)\n",
293
+ "print(similar_words)"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": null,
299
+ "metadata": {},
300
+ "outputs": [],
301
+ "source": []
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": null,
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": []
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": null,
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": []
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": null,
320
+ "metadata": {},
321
+ "outputs": [],
322
+ "source": []
323
+ }
324
+ ],
325
+ "metadata": {
326
+ "kernelspec": {
327
+ "display_name": "Python 3 (ipykernel)",
328
+ "language": "python",
329
+ "name": "python3"
330
+ },
331
+ "language_info": {
332
+ "codemirror_mode": {
333
+ "name": "ipython",
334
+ "version": 3
335
+ },
336
+ "file_extension": ".py",
337
+ "mimetype": "text/x-python",
338
+ "name": "python",
339
+ "nbconvert_exporter": "python",
340
+ "pygments_lexer": "ipython3",
341
+ "version": "3.9.12"
342
+ }
343
+ },
344
+ "nbformat": 4,
345
+ "nbformat_minor": 4
346
+ }
Fast_text_300_dim/shona_corpus_E.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8a3674c729ea64dc6cdf21ad9567b12cfc396f53f19111abb94f022cb4c619
3
+ size 98750355
Fast_text_300_dim/shona_fasttext_300d.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec2d9641ef97d862a30d2fee58dace395c1615208d03b214711fdfcef04b5a7d
3
+ size 3506557
Fast_text_300_dim/shona_fasttext_300d.model.syn1neg.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5d8e3b07cb97ac53e08b80e3b2079e6301906dcb48e8e16b5117a1d4cae8987
3
+ size 128673728
Fast_text_300_dim/shona_fasttext_300d.model.wv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71b6718b417033cc31b9711e93be71d222683a6250bb50d55f84d4364f024f67
3
+ size 2400000128
Fast_text_300_dim/shona_fasttext_300d.model.wv.vectors_vocab.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:787a8a1d2d0b98cf017f268c455909c477a8a64c5b74c328521656dd9d0ec6e1
3
+ size 128673728
Fast_text_300_dim/shona_fasttext_vectors_300d.kv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:332dd5bcb075a04d8d080c6b9e1629710bc5e6567b04c994f6c618fad27508d0
3
+ size 3501803
Fast_text_300_dim/shona_fasttext_vectors_300d.kv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71b6718b417033cc31b9711e93be71d222683a6250bb50d55f84d4364f024f67
3
+ size 2400000128
Fast_text_300_dim/shona_fasttext_vectors_300d.kv.vectors_vocab.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:787a8a1d2d0b98cf017f268c455909c477a8a64c5b74c328521656dd9d0ec6e1
3
+ size 128673728