ArabovMK committed on
Commit
deb7578
·
verified ·
1 Parent(s): f3c53e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -95
app.py CHANGED
@@ -12,10 +12,13 @@ from plotly.subplots import make_subplots
12
  import os
13
  from typing import List, Dict, Tuple, Optional
14
  import gc
 
 
15
 
16
  # Import for model loading from Hugging Face Hub
17
- from huggingface_hub import snapshot_download
18
- from gensim.models import FastText, Word2Vec
 
19
 
20
  # Page configuration
21
  st.set_page_config(
@@ -61,61 +64,104 @@ class Tatar2VecExplorer:
61
  # Model names and their paths in Hugging Face repo
62
  self.available_models = {
63
  "Word2Vec": {
64
- "best": "w2v_cbow_100", # Best overall for analogies
65
- "alternative": "w2v_sg_100" # Skip-gram alternative
 
66
  },
67
  "FastText": {
68
- "best": "ft_cbow_100", # Best FastText
69
- "alternative": "ft_cbow_200" # Larger FastText
70
  }
71
  }
72
 
 
 
 
 
 
 
 
73
  # Model paths in the Hugging Face repository
74
  self.model_configs = {
75
  "w2v_cbow_100": {
76
  "subdir": "word2vec/cbow100",
77
- "file": "w2v_cbow_100.model",
 
 
 
 
 
 
78
  "type": "word2vec",
79
  "dim": 100,
80
- "description": "Word2Vec CBOW 100-dim - Best for analogies",
81
  "analogy_accuracy": 0.60,
82
- "semantic_similarity": 0.568
 
83
  },
84
  "w2v_cbow_200": {
85
  "subdir": "word2vec/cbow200",
86
- "file": "w2v_cbow_200.model",
 
 
 
 
 
 
87
  "type": "word2vec",
88
  "dim": 200,
89
- "description": "Word2Vec CBOW 200-dim - Higher dimensionality",
90
  "analogy_accuracy": None,
91
- "semantic_similarity": None
 
92
  },
93
  "w2v_sg_100": {
94
  "subdir": "word2vec/sg100",
95
- "file": "w2v_sg_100.model",
 
 
 
 
 
96
  "type": "word2vec",
97
  "dim": 100,
98
- "description": "Word2Vec Skip-gram 100-dim - Better for rare words",
99
  "analogy_accuracy": None,
100
- "semantic_similarity": None
 
 
101
  },
102
  "ft_cbow_100": {
103
  "subdir": "fasttext/cbow100",
104
- "file": "ft_cbow_100.model",
 
 
 
 
 
 
105
  "type": "fasttext",
106
  "dim": 100,
107
- "description": "FastText CBOW 100-dim - Handles morphology",
108
  "analogy_accuracy": 0.0,
109
- "semantic_similarity": 0.582
 
110
  },
111
  "ft_cbow_200": {
112
  "subdir": "fasttext/cbow200",
113
- "file": "ft_cbow_200.model",
 
 
 
 
 
 
114
  "type": "fasttext",
115
  "dim": 200,
116
  "description": "FastText CBOW 200-dim - Larger FastText model",
117
  "analogy_accuracy": 0.0,
118
- "semantic_similarity": None
 
119
  }
120
  }
121
 
@@ -134,44 +180,101 @@ class Tatar2VecExplorer:
134
  progress_bar = st.progress(0)
135
  status_text = st.empty()
136
 
137
- status_text.text(f"Downloading {model_key} from Hugging Face...")
138
- progress_bar.progress(20)
139
-
140
- # Download only the specific model files
141
- model_dir = snapshot_download(
142
- repo_id=repo_id,
143
- allow_patterns=[f"{config['subdir']}/*"],
144
- ignore_patterns=["*.git*", "README.md", "*.txt"],
145
- local_files_only=False
146
- )
147
-
148
- progress_bar.progress(60)
149
- status_text.text(f"Files downloaded, loading model...")
150
 
151
- # Full path to the model file
152
- model_path = os.path.join(model_dir, config['subdir'], config['file'])
153
 
154
- # Check if model file exists
155
- if not os.path.exists(model_path):
156
- # Try to find any .model file in the directory
157
- model_dir_path = os.path.join(model_dir, config['subdir'])
158
- if os.path.exists(model_dir_path):
159
- model_files = [f for f in os.listdir(model_dir_path) if f.endswith('.model')]
160
- if model_files:
161
- model_path = os.path.join(model_dir_path, model_files[0])
162
- status_text.text(f"Found model file: {model_files[0]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  progress_bar.progress(80)
 
165
 
166
- # Load the model
167
  try:
168
- if config['type'] == "fasttext":
169
- model = FastText.load(model_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  else:
171
- model = Word2Vec.load(model_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  progress_bar.progress(100)
174
- status_text.text(f"✅ Successfully loaded {model_key}!")
 
 
 
175
 
176
  # Clean up progress indicators after 2 seconds
177
  import time
@@ -182,7 +285,9 @@ class Tatar2VecExplorer:
182
  return model
183
 
184
  except Exception as e:
185
- st.error(f"Error loading model from {model_path}: {str(e)}")
 
 
186
  return None
187
 
188
  except Exception as e:
@@ -194,7 +299,7 @@ class Tatar2VecExplorer:
194
  names = {
195
  "w2v_cbow_100": "🥇 Word2Vec CBOW (100-dim)",
196
  "w2v_cbow_200": "📈 Word2Vec CBOW (200-dim)",
197
- "w2v_sg_100": "🎯 Word2Vec Skip-gram (100-dim)",
198
  "ft_cbow_100": "⚡ FastText CBOW (100-dim)",
199
  "ft_cbow_200": "🚀 FastText CBOW (200-dim)"
200
  }
@@ -204,13 +309,20 @@ class Tatar2VecExplorer:
204
  """Get model information"""
205
  return self.model_configs.get(model_key, {})
206
 
 
 
 
 
207
  def find_similar_words(self, model, word: str, topn: int = 10):
208
  """Find semantically similar words"""
209
  try:
 
210
  if hasattr(model, 'wv'):
211
  return model.wv.most_similar(word, topn=topn)
212
- else:
213
  return model.most_similar(word, topn=topn)
 
 
214
  except KeyError:
215
  return []
216
  except Exception as e:
@@ -222,8 +334,10 @@ class Tatar2VecExplorer:
222
  try:
223
  if hasattr(model, 'wv'):
224
  return model.wv.most_similar(positive=positive, negative=negative, topn=topn)
225
- else:
226
  return model.most_similar(positive=positive, negative=negative, topn=topn)
 
 
227
  except Exception as e:
228
  st.error(f"Error performing analogy: {e}")
229
  return []
@@ -233,8 +347,12 @@ class Tatar2VecExplorer:
233
  try:
234
  if hasattr(model, 'wv'):
235
  return model.wv[word]
236
- else:
 
 
237
  return model[word]
 
 
238
  except KeyError:
239
  return None
240
 
@@ -247,6 +365,10 @@ class Tatar2VecExplorer:
247
  in_vocab = False
248
  if hasattr(model, 'wv'):
249
  in_vocab = word in model.wv.key_to_index
 
 
 
 
250
 
251
  similar = self.find_similar_words(model, word, 3) if in_vocab else []
252
  results.append({
@@ -261,12 +383,6 @@ class Tatar2VecExplorer:
261
  'similar_words': []
262
  })
263
  return results
264
-
265
- def unload_model(self, model_key: str):
266
- """Unload model to free memory"""
267
- if model_key in self.loaded_models:
268
- del self.loaded_models[model_key]
269
- gc.collect()
270
 
271
  def create_performance_comparison():
272
  """Create model performance comparison charts"""
@@ -292,7 +408,7 @@ def create_performance_comparison():
292
  x=['Word2Vec CBOW 100', 'FastText CBOW 100'],
293
  y=analogy_scores,
294
  marker_color=['#1f77b4', '#d62728'],
295
- text=[f"{score*100:.1f}%" if score > 0 else "0%" for score in analogy_scores],
296
  textposition='auto',
297
  ),
298
  row=1, col=1
@@ -312,7 +428,7 @@ def create_performance_comparison():
312
  )
313
 
314
  fig.update_layout(
315
- title_text="Model Performance Comparison",
316
  showlegend=False,
317
  height=400,
318
  width=800
@@ -341,14 +457,38 @@ def main():
341
  index=0
342
  )
343
 
344
- # Model variant selection
345
- model_variant = st.radio(
346
- "Model Variant:",
347
- ["best", "alternative"],
348
- format_func=lambda x: "🥇 Best Model (CBOW 100)" if x == "best" else "🥈 Alternative Model"
349
- )
350
 
351
- model_key = explorer.available_models[model_type][model_variant]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  # Model information section
354
  st.markdown("---")
@@ -359,11 +499,14 @@ def main():
359
  st.markdown(f"**{explorer.get_model_display_name(model_key)}**")
360
  st.caption(model_info.get('description', ''))
361
 
 
 
 
362
  col1, col2 = st.columns(2)
363
  with col1:
364
  if model_info.get('analogy_accuracy') is not None:
365
  acc = model_info['analogy_accuracy']
366
- st.metric("Analogy Accuracy", f"{acc*100:.1f}%" if acc > 0 else "N/A")
367
 
368
  with col2:
369
  if model_info.get('semantic_similarity') is not None:
@@ -371,6 +514,10 @@ def main():
371
  st.metric("Semantic Similarity", f"{sim:.3f}" if sim else "N/A")
372
 
373
  st.metric("Vector Dimension", model_info.get('dim', 'N/A'))
 
 
 
 
374
 
375
  # Quick search examples
376
  st.markdown("---")
@@ -378,9 +525,8 @@ def main():
378
  quick_words = ["татар", "Казан", "тел", "мәктәп", "китап", "уку", "язу", "бәйрәм"]
379
  selected_quick = st.selectbox("Example words:", quick_words)
380
 
381
- if st.button("Quick Similarity Search"):
382
  st.session_state.quick_search = selected_quick
383
- st.session_state.active_tab = "Word Search"
384
 
385
  # Main content area with tabs
386
  tab1, tab2, tab3, tab4 = st.tabs(["🔍 Word Search", "🧠 Analogies", "📊 Analysis", "ℹ️ About"])
@@ -403,7 +549,7 @@ def main():
403
  with col2:
404
  top_n = st.slider("Number of similar words:", 5, 20, 10)
405
 
406
- if st.button("Find Similar Words", type="primary") or (search_word and search_word != default_word):
407
  if search_word.strip():
408
  with st.spinner(f"Finding words similar to '{search_word}'..."):
409
  model = explorer.load_model(model_key)
@@ -469,6 +615,10 @@ def main():
469
  with tab2:
470
  st.header("Word Analogies")
471
 
 
 
 
 
472
  st.info("""
473
  **Example:** Париж - Франция + Татарстан = Казан?
474
  (Paris - France + Tatarstan = Kazan?)
@@ -562,20 +712,25 @@ def main():
562
  "Model": explorer.get_model_display_name(key),
563
  "Type": "Word2Vec" if "w2v" in key else "FastText",
564
  "Dimensions": config['dim'],
565
- "Analogy Accuracy": f"{config['analogy_accuracy']*100:.1f}%" if config['analogy_accuracy'] else "N/A",
566
- "Semantic Similarity": f"{config['semantic_similarity']:.3f}" if config['semantic_similarity'] else "N/A"
 
567
  })
568
 
569
  df_specs = pd.DataFrame(specs_data)
570
  st.dataframe(df_specs, use_container_width=True)
571
 
572
- # OOV words testing
573
  st.subheader("🔤 OOV (Out-of-Vocabulary) Testing")
574
 
575
- st.info("""
576
- **FastText models** can handle words not seen during training thanks to subword information.
577
- Word2Vec models cannot generate vectors for OOV words.
578
- """)
 
 
 
 
579
 
580
  oov_words = st.text_area(
581
  "Enter words for OOV testing (one per line):",
@@ -637,10 +792,8 @@ def main():
637
 
638
  ### 📁 Model Files Structure:
639
 
640
- Each model consists of three files:
641
- - `*.model` - Main model file
642
- - `*.model.syn1neg.npy` - Weights file
643
- - `*.model.wv.vectors.npy` - Word vectors file
644
 
645
  ### 📜 Certificate:
646
 
@@ -653,21 +806,22 @@ def main():
653
  ### 🚀 Usage Example:
654
 
655
  ```python
656
- from huggingface_hub import snapshot_download
657
- from gensim.models import Word2Vec
658
 
659
- # Download the model
660
- model_path = snapshot_download(
661
  repo_id="TatarNLPWorld/Tatar2Vec",
662
- allow_patterns="word2vec/cbow100/*"
663
  )
 
664
 
665
- # Load the model
666
- model = Word2Vec.load("word2vec/cbow100/w2v_cbow_100.model")
667
-
668
- # Find similar words
669
- similar = model.wv.most_similar("татар")
670
- print(similar)
671
  ```
672
 
673
  ### 📝 License:
 
12
  import os
13
  from typing import List, Dict, Tuple, Optional
14
  import gc
15
+ import tempfile
16
+ import shutil
17
 
18
  # Import for model loading from Hugging Face Hub
19
+ from huggingface_hub import snapshot_download, hf_hub_download
20
+ from gensim.models import FastText, Word2Vec, KeyedVectors
21
+ import gensim
22
 
23
  # Page configuration
24
  st.set_page_config(
 
64
  # Model names and their paths in Hugging Face repo
65
  self.available_models = {
66
  "Word2Vec": {
67
+ "cbow_100": "w2v_cbow_100", # CBOW 100-dim
68
+ "sg_100": "w2v_sg_100", # Skip-gram 100-dim
69
+ "cbow_200": "w2v_cbow_200" # CBOW 200-dim
70
  },
71
  "FastText": {
72
+ "cbow_100": "ft_cbow_100", # FastText CBOW 100-dim
73
+ "cbow_200": "ft_cbow_200" # FastText CBOW 200-dim
74
  }
75
  }
76
 
77
+ # Human-readable names for variants
78
+ self.variant_names = {
79
+ "cbow_100": "🥇 CBOW (100-dim) - Best for analogies",
80
+ "sg_100": "🎯 Skip-gram (100-dim) - Better for rare words",
81
+ "cbow_200": "📈 CBOW (200-dim) - Higher dimensionality"
82
+ }
83
+
84
  # Model paths in the Hugging Face repository
85
  self.model_configs = {
86
  "w2v_cbow_100": {
87
  "subdir": "word2vec/cbow100",
88
+ "has_main_file": True,
89
+ "main_file": "w2v_cbow_100.model",
90
+ "files": [
91
+ "w2v_cbow_100.model",
92
+ "w2v_cbow_100.model.syn1neg.npy",
93
+ "w2v_cbow_100.model.wv.vectors.npy"
94
+ ],
95
  "type": "word2vec",
96
  "dim": 100,
97
+ "description": "Word2Vec CBOW 100-dim - Best for analogies (60% accuracy)",
98
  "analogy_accuracy": 0.60,
99
+ "semantic_similarity": 0.568,
100
+ "variant": "cbow_100"
101
  },
102
  "w2v_cbow_200": {
103
  "subdir": "word2vec/cbow200",
104
+ "has_main_file": True,
105
+ "main_file": "w2v_cbow_200.model",
106
+ "files": [
107
+ "w2v_cbow_200.model",
108
+ "w2v_cbow_200.model.syn1neg.npy",
109
+ "w2v_cbow_200.model.wv.vectors.npy"
110
+ ],
111
  "type": "word2vec",
112
  "dim": 200,
113
+ "description": "Word2Vec CBOW 200-dim - Higher dimensionality, more expressive",
114
  "analogy_accuracy": None,
115
+ "semantic_similarity": None,
116
+ "variant": "cbow_200"
117
  },
118
  "w2v_sg_100": {
119
  "subdir": "word2vec/sg100",
120
+ "has_main_file": False, # No main .model file
121
+ "main_file": None,
122
+ "files": [
123
+ "w2v_sg_100.model.syn1neg.npy",
124
+ "w2v_sg_100.model.wv.vectors.npy"
125
+ ],
126
  "type": "word2vec",
127
  "dim": 100,
128
+ "description": "Word2Vec Skip-gram 100-dim - Better for rare words (only vectors available)",
129
  "analogy_accuracy": None,
130
+ "semantic_similarity": None,
131
+ "variant": "sg_100",
132
+ "note": "Only word vectors available, full model with training weights not included"
133
  },
134
  "ft_cbow_100": {
135
  "subdir": "fasttext/cbow100",
136
+ "has_main_file": True,
137
+ "main_file": "ft_cbow_100.model",
138
+ "files": [
139
+ "ft_cbow_100.model",
140
+ "ft_cbow_100.model.syn1neg.npy",
141
+ "ft_cbow_100.model.wv.vectors.npy"
142
+ ],
143
  "type": "fasttext",
144
  "dim": 100,
145
+ "description": "FastText CBOW 100-dim - Handles morphology, good for OOV words",
146
  "analogy_accuracy": 0.0,
147
+ "semantic_similarity": 0.582,
148
+ "variant": "cbow_100"
149
  },
150
  "ft_cbow_200": {
151
  "subdir": "fasttext/cbow200",
152
+ "has_main_file": True,
153
+ "main_file": "ft_cbow_200.model",
154
+ "files": [
155
+ "ft_cbow_200.model",
156
+ "ft_cbow_200.model.syn1neg.npy",
157
+ "ft_cbow_200.model.wv.vectors.npy"
158
+ ],
159
  "type": "fasttext",
160
  "dim": 200,
161
  "description": "FastText CBOW 200-dim - Larger FastText model",
162
  "analogy_accuracy": 0.0,
163
+ "semantic_similarity": None,
164
+ "variant": "cbow_200"
165
  }
166
  }
167
 
 
180
  progress_bar = st.progress(0)
181
  status_text = st.empty()
182
 
183
+ # Create a temporary directory for this model
184
+ temp_dir = tempfile.mkdtemp()
185
+ model_dir = os.path.join(temp_dir, config['subdir'])
186
+ os.makedirs(model_dir, exist_ok=True)
 
 
 
 
 
 
 
 
 
187
 
188
+ status_text.text(f"Downloading {_self.get_model_display_name(model_key)} from Hugging Face...")
189
+ progress_bar.progress(10)
190
 
191
+ # Download all required files for the model
192
+ total_files = len(config['files'])
193
+ for i, filename in enumerate(config['files']):
194
+ file_path = os.path.join(config['subdir'], filename)
195
+ status_text.text(f"Downloading {filename}... ({i+1}/{total_files})")
196
+
197
+ try:
198
+ # Download the file
199
+ downloaded_path = hf_hub_download(
200
+ repo_id=repo_id,
201
+ filename=file_path,
202
+ repo_type="model",
203
+ local_dir=temp_dir,
204
+ local_dir_use_symlinks=False
205
+ )
206
+
207
+ # Update progress
208
+ progress = 10 + (i + 1) * 60 // total_files
209
+ progress_bar.progress(progress)
210
+
211
+ except Exception as e:
212
+ st.warning(f"Note: {filename} may be downloaded differently: {e}")
213
+ continue
214
 
215
  progress_bar.progress(80)
216
+ status_text.text("Files downloaded, loading model...")
217
 
218
+ # Load the model based on available files
219
  try:
220
+ if config['has_main_file'] and config['main_file']:
221
+ # Full model with main file
222
+ model_path = os.path.join(temp_dir, config['subdir'], config['main_file'])
223
+ if os.path.exists(model_path):
224
+ if config['type'] == "fasttext":
225
+ model = FastText.load(model_path)
226
+ else:
227
+ model = Word2Vec.load(model_path)
228
+ else:
229
+ # Try to find any .model file
230
+ model_files = [f for f in os.listdir(os.path.join(temp_dir, config['subdir']))
231
+ if f.endswith('.model')]
232
+ if model_files:
233
+ model_path = os.path.join(temp_dir, config['subdir'], model_files[0])
234
+ if config['type'] == "fasttext":
235
+ model = FastText.load(model_path)
236
+ else:
237
+ model = Word2Vec.load(model_path)
238
+ else:
239
+ # If no model file, try to load just the vectors
240
+ status_text.text("Loading word vectors only...")
241
+ vectors_file = None
242
+ for file in config['files']:
243
+ if 'vectors' in file:
244
+ vectors_file = os.path.join(temp_dir, config['subdir'], file)
245
+ break
246
+
247
+ if vectors_file and os.path.exists(vectors_file):
248
+ # Create a KeyedVectors instance
249
+ model = KeyedVectors.load(vectors_file)
250
+ # Add a dummy train method to maintain compatibility
251
+ model.train = lambda *args, **kwargs: None
252
+ else:
253
+ raise Exception("No model or vectors file found")
254
  else:
255
+ # Model with only vectors (like sg100)
256
+ status_text.text("Loading word vectors only (Skip-gram model)...")
257
+ vectors_file = None
258
+ for file in config['files']:
259
+ if 'vectors' in file:
260
+ vectors_file = os.path.join(temp_dir, config['subdir'], file)
261
+ break
262
+
263
+ if vectors_file and os.path.exists(vectors_file):
264
+ # Create a KeyedVectors instance
265
+ model = KeyedVectors.load(vectors_file)
266
+ # Add a dummy train method to maintain compatibility
267
+ model.train = lambda *args, **kwargs: None
268
+ # Add warning about limited functionality
269
+ st.info("⚠️ Skip-gram model loaded in vectors-only mode. Some training features are not available.")
270
+ else:
271
+ raise Exception("No vectors file found for Skip-gram model")
272
 
273
  progress_bar.progress(100)
274
+ status_text.text(f"✅ Successfully loaded {_self.get_model_display_name(model_key)}!")
275
+
276
+ # Store temp dir to clean up later if needed
277
+ model._temp_dir = temp_dir
278
 
279
  # Clean up progress indicators after 2 seconds
280
  import time
 
285
  return model
286
 
287
  except Exception as e:
288
+ st.error(f"Error loading model: {str(e)}")
289
+ # Clean up temp dir
290
+ shutil.rmtree(temp_dir, ignore_errors=True)
291
  return None
292
 
293
  except Exception as e:
 
299
  names = {
300
  "w2v_cbow_100": "🥇 Word2Vec CBOW (100-dim)",
301
  "w2v_cbow_200": "📈 Word2Vec CBOW (200-dim)",
302
+ "w2v_sg_100": "🎯 Word2Vec Skip-gram (100-dim) [Vectors Only]",
303
  "ft_cbow_100": "⚡ FastText CBOW (100-dim)",
304
  "ft_cbow_200": "🚀 FastText CBOW (200-dim)"
305
  }
 
309
  """Get model information"""
310
  return self.model_configs.get(model_key, {})
311
 
312
+ def get_variant_name(self, variant_key: str) -> str:
313
+ """Get human-readable variant name"""
314
+ return self.variant_names.get(variant_key, variant_key)
315
+
316
  def find_similar_words(self, model, word: str, topn: int = 10):
317
  """Find semantically similar words"""
318
  try:
319
+ # Handle both Word2Vec/FastText models and KeyedVectors
320
  if hasattr(model, 'wv'):
321
  return model.wv.most_similar(word, topn=topn)
322
+ elif hasattr(model, 'most_similar'):
323
  return model.most_similar(word, topn=topn)
324
+ else:
325
+ return []
326
  except KeyError:
327
  return []
328
  except Exception as e:
 
334
  try:
335
  if hasattr(model, 'wv'):
336
  return model.wv.most_similar(positive=positive, negative=negative, topn=topn)
337
+ elif hasattr(model, 'most_similar'):
338
  return model.most_similar(positive=positive, negative=negative, topn=topn)
339
+ else:
340
+ return []
341
  except Exception as e:
342
  st.error(f"Error performing analogy: {e}")
343
  return []
 
347
  try:
348
  if hasattr(model, 'wv'):
349
  return model.wv[word]
350
+ elif hasattr(model, 'get_vector'):
351
+ return model.get_vector(word)
352
+ elif hasattr(model, '__getitem__'):
353
  return model[word]
354
+ else:
355
+ return None
356
  except KeyError:
357
  return None
358
 
 
365
  in_vocab = False
366
  if hasattr(model, 'wv'):
367
  in_vocab = word in model.wv.key_to_index
368
+ elif hasattr(model, 'key_to_index'):
369
+ in_vocab = word in model.key_to_index
370
+ elif hasattr(model, 'vocab'):
371
+ in_vocab = word in model.vocab
372
 
373
  similar = self.find_similar_words(model, word, 3) if in_vocab else []
374
  results.append({
 
383
  'similar_words': []
384
  })
385
  return results
 
 
 
 
 
 
386
 
387
  def create_performance_comparison():
388
  """Create model performance comparison charts"""
 
408
  x=['Word2Vec CBOW 100', 'FastText CBOW 100'],
409
  y=analogy_scores,
410
  marker_color=['#1f77b4', '#d62728'],
411
+ text=[f"{score*100:.1f}%" if score and score > 0 else "0%" for score in analogy_scores],
412
  textposition='auto',
413
  ),
414
  row=1, col=1
 
428
  )
429
 
430
  fig.update_layout(
431
+ title_text="Model Performance Comparison (Best Models)",
432
  showlegend=False,
433
  height=400,
434
  width=800
 
457
  index=0
458
  )
459
 
460
+ st.markdown("---")
461
+ st.subheader("Model Variant:")
 
 
 
 
462
 
463
+ # Model variant selection based on type
464
+ if model_type == "Word2Vec":
465
+ # Three variants for Word2Vec
466
+ variant_options = ["cbow_100", "sg_100", "cbow_200"]
467
+
468
+ selected_variant = st.radio(
469
+ "Select Word2Vec variant:",
470
+ options=variant_options,
471
+ format_func=lambda x: explorer.get_variant_name(x),
472
+ index=0 # Default to CBOW 100
473
+ )
474
+
475
+ # Show note for Skip-gram
476
+ if selected_variant == "sg_100":
477
+ st.info("ℹ️ Skip-gram model is available in vectors-only mode")
478
+
479
+ else: # FastText
480
+ # Two variants for FastText
481
+ variant_options = ["cbow_100", "cbow_200"]
482
+
483
+ selected_variant = st.radio(
484
+ "Select FastText variant:",
485
+ options=variant_options,
486
+ format_func=lambda x: "⚡ CBOW (100-dim)" if x == "cbow_100" else "🚀 CBOW (200-dim)",
487
+ index=0
488
+ )
489
+
490
+ # Get model key based on type and variant
491
+ model_key = explorer.available_models[model_type][selected_variant]
492
 
493
  # Model information section
494
  st.markdown("---")
 
499
  st.markdown(f"**{explorer.get_model_display_name(model_key)}**")
500
  st.caption(model_info.get('description', ''))
501
 
502
+ if 'note' in model_info:
503
+ st.caption(f"*Note: {model_info['note']}*")
504
+
505
  col1, col2 = st.columns(2)
506
  with col1:
507
  if model_info.get('analogy_accuracy') is not None:
508
  acc = model_info['analogy_accuracy']
509
+ st.metric("Analogy Accuracy", f"{acc*100:.1f}%" if acc and acc > 0 else "N/A")
510
 
511
  with col2:
512
  if model_info.get('semantic_similarity') is not None:
 
514
  st.metric("Semantic Similarity", f"{sim:.3f}" if sim else "N/A")
515
 
516
  st.metric("Vector Dimension", model_info.get('dim', 'N/A'))
517
+
518
+ # Show file info
519
+ file_count = len(model_info.get('files', []))
520
+ st.caption(f"📁 {file_count} file(s) in model")
521
 
522
  # Quick search examples
523
  st.markdown("---")
 
525
  quick_words = ["татар", "Казан", "тел", "мәктәп", "китап", "уку", "язу", "бәйрәм"]
526
  selected_quick = st.selectbox("Example words:", quick_words)
527
 
528
+ if st.button("Quick Similarity Search", use_container_width=True):
529
  st.session_state.quick_search = selected_quick
 
530
 
531
  # Main content area with tabs
532
  tab1, tab2, tab3, tab4 = st.tabs(["🔍 Word Search", "🧠 Analogies", "📊 Analysis", "ℹ️ About"])
 
549
  with col2:
550
  top_n = st.slider("Number of similar words:", 5, 20, 10)
551
 
552
+ if st.button("Find Similar Words", type="primary", use_container_width=True):
553
  if search_word.strip():
554
  with st.spinner(f"Finding words similar to '{search_word}'..."):
555
  model = explorer.load_model(model_key)
 
615
  with tab2:
616
  st.header("Word Analogies")
617
 
618
+ # Check if model supports analogies (Skip-gram in vectors mode might have limitations)
619
+ if model_key == "w2v_sg_100":
620
+ st.warning("⚠️ Skip-gram model is in vectors-only mode. Analogies might not work perfectly.")
621
+
622
  st.info("""
623
  **Example:** Париж - Франция + Татарстан = Казан?
624
  (Paris - France + Tatarstan = Kazan?)
 
712
  "Model": explorer.get_model_display_name(key),
713
  "Type": "Word2Vec" if "w2v" in key else "FastText",
714
  "Dimensions": config['dim'],
715
+ "Files": len(config['files']),
716
+ "Analogy Accuracy": f"{config['analogy_accuracy']*100:.1f}%" if config.get('analogy_accuracy') else "N/A",
717
+ "Semantic Similarity": f"{config['semantic_similarity']:.3f}" if config.get('semantic_similarity') else "N/A"
718
  })
719
 
720
  df_specs = pd.DataFrame(specs_data)
721
  st.dataframe(df_specs, use_container_width=True)
722
 
723
+ # OOV words testing (only for FastText)
724
  st.subheader("🔤 OOV (Out-of-Vocabulary) Testing")
725
 
726
+ if model_type == "FastText":
727
+ st.info("""
728
+ **FastText models** can handle words not seen during training thanks to subword information.
729
+ """)
730
+ else:
731
+ st.info("""
732
+ **Word2Vec models** cannot generate vectors for OOV words. Only words in vocabulary will show results.
733
+ """)
734
 
735
  oov_words = st.text_area(
736
  "Enter words for OOV testing (one per line):",
 
792
 
793
  ### 📁 Model Files Structure:
794
 
795
+ - **CBOW models**: 3 files (`.model`, `.syn1neg.npy`, `.wv.vectors.npy`)
796
+ - **Skip-gram model**: 2 files (`.syn1neg.npy`, `.wv.vectors.npy`) - vectors only
 
 
797
 
798
  ### 📜 Certificate:
799
 
 
806
  ### 🚀 Usage Example:
807
 
808
  ```python
809
+ from huggingface_hub import hf_hub_download
810
+ from gensim.models import Word2Vec, KeyedVectors
811
 
812
+ # For CBOW models with full model
813
+ model_path = hf_hub_download(
814
  repo_id="TatarNLPWorld/Tatar2Vec",
815
+ filename="word2vec/cbow100/w2v_cbow_100.model"
816
  )
817
+ model = Word2Vec.load(model_path)
818
 
819
+ # For Skip-gram with vectors only
820
+ vectors_path = hf_hub_download(
821
+ repo_id="TatarNLPWorld/Tatar2Vec",
822
+ filename="word2vec/sg100/w2v_sg_100.model.wv.vectors.npy"
823
+ )
824
+ vectors = KeyedVectors.load(vectors_path)
825
  ```
826
 
827
  ### 📝 License: