# Hugging Face Spaces page residue (Space runtime status) — not part of the app code.
"""
Tatar2Vec Demo - Interactive Word Embeddings Explorer
Run: streamlit run app.py
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import tempfile
import os
import sys
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import requests
import json
# Import for model loading from Hugging Face Hub
from huggingface_hub import snapshot_download
from gensim.models import FastText, Word2Vec
import gensim.downloader as api
# Page configuration — must be the first Streamlit command executed in the script.
st.set_page_config(
    page_title="Tatar2Vec Demo",
    page_icon="🏆",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for improved styling. The classes defined here (.main-header,
# .model-card, .metric-card, .word-cloud) are referenced by HTML snippets
# rendered later with unsafe_allow_html=True.
st.markdown("""
<style>
.main-header {
    font-size: 2.5rem;
    color: #1f77b4;
    text-align: center;
    margin-bottom: 2rem;
}
.model-card {
    background-color: #f0f2f6;
    padding: 1.5rem;
    border-radius: 10px;
    border-left: 4px solid #1f77b4;
    margin-bottom: 1rem;
}
.metric-card {
    background-color: white;
    padding: 1rem;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    text-align: center;
}
.word-cloud {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 0.5rem 1rem;
    border-radius: 20px;
    display: inline-block;
    margin: 0.2rem;
    font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
class Tatar2VecExplorer:
    """Load Tatar2Vec embedding models from the Hugging Face Hub and expose
    similarity search, word analogies, vector lookup and OOV handling
    for the Streamlit UI.
    """

    def __init__(self):
        # In-memory cache of loaded gensim models:
        # (model_type, model_name) -> model. Avoids reloading on every rerun.
        self.loaded_models = {}
        # Hub repo layout: <model_type>/<model_name>/<model_name>.model
        self.available_models = {
            "FastText": {
                "best": "ft_dim100_win5_min5_ngram3-6_sg.epoch1",
                "alternative": "ft_dim100_win5_min5_ngram3-6_sg.epoch3"
            },
            "Word2Vec": {
                "best": "w2v_dim200_win5_min5_sg.epoch4",
                "alternative": "w2v_dim100_win5_min5_sg"
            }
        }

    def load_model(self, model_name: str, model_type: str = "fasttext"):
        """Download (if needed) and load a model, caching it in memory.

        The original signature used `_self` — a leftover from a removed
        `st.cache_resource` decorator — and never consulted
        `self.loaded_models`, so every call re-loaded the model from disk.

        Returns the gensim model, or None on failure (error shown via
        st.error).
        """
        cache_key = (model_type, model_name)
        if cache_key in self.loaded_models:
            return self.loaded_models[cache_key]
        try:
            # Fetch only this model's files; snapshot_download reuses the
            # local HF cache on subsequent calls.
            model_dir = snapshot_download(
                repo_id="arabovs-ai-lab/Tatar2Vec",
                allow_patterns=f"{model_type}/{model_name}/*"
            )
            model_path = os.path.join(model_dir, model_type, model_name, f"{model_name}.model")
            # Load the appropriate gensim model class.
            if model_type == "fasttext":
                model = FastText.load(model_path)
            else:
                model = Word2Vec.load(model_path)
            self.loaded_models[cache_key] = model
            return model
        except Exception as e:
            st.error(f"Error loading model: {e}")
            return None

    def get_model_display_name(self, model_key: str) -> str:
        """Return a human-readable label for a model key (key itself if unknown)."""
        names = {
            "ft_dim100_win5_min5_ngram3-6_sg.epoch1": "🥇 Best FastText",
            "ft_dim100_win5_min5_ngram3-6_sg.epoch3": "🥈 Alternative FastText",
            "w2v_dim200_win5_min5_sg.epoch4": "🥇 Best Word2Vec",
            "w2v_dim100_win5_min5_sg": "🥈 Compact Word2Vec"
        }
        return names.get(model_key, model_key)

    def get_model_performance(self, model_key: str) -> dict:
        """Return precomputed evaluation metrics for a model key ({} if unknown)."""
        performance = {
            "ft_dim100_win5_min5_ngram3-6_sg.epoch1": {
                "composite": 0.7019, "semantic": 0.7368, "analogy": 0.0476,
                "oov": 1.0000, "coherence": 0.9588
            },
            "ft_dim100_win5_min5_ngram3-6_sg.epoch3": {
                "composite": 0.6675, "semantic": 0.6894, "analogy": 0.0476,
                "oov": 1.0000, "coherence": 0.9388
            },
            "w2v_dim200_win5_min5_sg.epoch4": {
                "composite": 0.5685, "semantic": 0.4445, "analogy": 0.3214,
                "oov": 0.3854, "coherence": 0.7307
            },
            "w2v_dim100_win5_min5_sg": {
                "composite": 0.5566, "semantic": 0.5187, "analogy": 0.2500,
                "oov": 0.3854, "coherence": 0.8051
            }
        }
        return performance.get(model_key, {})

    def find_similar_words(self, model, word: str, topn: int = 10):
        """Return up to `topn` (word, similarity) pairs; [] if the word is unknown."""
        try:
            # Full models (FastText/Word2Vec) expose vectors via .wv;
            # bare KeyedVectors expose most_similar directly.
            if hasattr(model, 'wv'):
                return model.wv.most_similar(word, topn=topn)
            else:
                return model.most_similar(word, topn=topn)
        except KeyError:
            # Word not in vocabulary (Word2Vec); FastText rarely raises here.
            return []
        except Exception as e:
            st.error(f"Error finding similar words: {e}")
            return []

    def word_analogy(self, model, positive: List[str], negative: List[str], topn: int = 5):
        """Perform a word analogy (king - man + woman = queen); [] on error."""
        try:
            if hasattr(model, 'wv'):
                return model.wv.most_similar(positive=positive, negative=negative, topn=topn)
            else:
                return model.most_similar(positive=positive, negative=negative, topn=topn)
        except Exception as e:
            st.error(f"Error performing analogy: {e}")
            return []

    def get_word_vector(self, model, word: str):
        """Return the word's embedding vector, or None if it is unknown.

        Note: FastText composes vectors for OOV words from subwords, so this
        returns a vector even for words outside the training vocabulary.
        """
        try:
            if hasattr(model, 'wv'):
                return model.wv[word]
            else:
                return model[word]
        except KeyError:
            return None

    def handle_oov_words(self, model, words: List[str]):
        """Report vocabulary membership and nearest neighbours for each word.

        Membership is checked against the model's key_to_index mapping.
        (The original inferred `in_vocab` from a successful vector lookup,
        but FastText synthesizes vectors for OOV words, so that check
        reported every word as in-vocabulary.)
        """
        # gensim 4.x: vocabulary mapping lives on the KeyedVectors object.
        vocab = model.wv.key_to_index if hasattr(model, 'wv') else model.key_to_index
        results = []
        for word in words:
            try:
                similar = self.find_similar_words(model, word, 3)
                results.append({
                    'word': word,
                    'in_vocab': word in vocab,
                    'similar_words': similar
                })
            except Exception:
                results.append({
                    'word': word,
                    'in_vocab': False,
                    'similar_words': []
                })
        return results
def create_performance_comparison():
    """Build a two-panel bar chart comparing composite scores and semantic
    similarity across all evaluated models (including Meta's cc.tt.300
    baseline, which has no semantic-similarity score)."""
    # (model label, composite score, semantic score or None when unavailable)
    evaluations = [
        ("ft_dim100_win5_min5_ngram3-6_sg.epoch1", 0.7019, 0.7368),
        ("ft_dim100_win5_min5_ngram3-6_sg.epoch3", 0.6675, 0.6894),
        ("w2v_dim200_win5_min5_sg.epoch4", 0.5685, 0.4445),
        ("w2v_dim100_win5_min5_sg", 0.5566, 0.5187),
        ("cc.tt.300 (Meta)", 0.2000, None),
    ]
    labels = [row[0] for row in evaluations]
    # Blue for FastText, orange for Word2Vec, red for the Meta baseline.
    our_model_colors = ['#1f77b4', '#1f77b4', '#ff7f0e', '#ff7f0e']

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Composite Score', 'Semantic Similarity'),
        specs=[[{"type": "bar"}, {"type": "bar"}]]
    )

    # Left panel: composite score for every model.
    fig.add_trace(
        go.Bar(name='Composite Score', x=labels,
               y=[row[1] for row in evaluations],
               marker_color=our_model_colors + ['#d62728']),
        row=1, col=1
    )

    # Right panel: only models that have a semantic-similarity score.
    scored = [(label, semantic) for label, _, semantic in evaluations if semantic is not None]
    fig.add_trace(
        go.Bar(name='Semantic Similarity',
               x=[label for label, _ in scored],
               y=[semantic for _, semantic in scored],
               marker_color=our_model_colors),
        row=1, col=2
    )

    fig.update_layout(
        title_text="Model Performance Comparison",
        showlegend=False,
        height=400
    )
    return fig
def create_word_cloud(similar_words, title):
    """Render (word, score) pairs as a pseudo word cloud: each word becomes a
    randomly placed annotation whose font size grows with its similarity.

    Returns a plotly Figure, or None when there are no words to draw.
    """
    if not similar_words:
        return None

    fig = go.Figure()
    for idx, (word, score) in enumerate(similar_words):
        fig.add_annotation(
            text=word,
            # Random placement inside the central 80% of the canvas.
            x=np.random.uniform(0.1, 0.9),
            y=np.random.uniform(0.1, 0.9),
            showarrow=False,
            # Similarity in [0, 1] maps to a font size in [30, 100];
            # hue rotates per word for visual variety.
            font=dict(size=30 + (score * 70), color=f"hsl({idx*40}, 70%, 50%)"),
            bgcolor="rgba(255,255,255,0.7)",
            bordercolor="rgba(0,0,0,0.1)",
            borderwidth=1,
            borderpad=2,
        )

    # Hide axes and background so only the floating words are visible.
    fig.update_layout(
        title=title,
        xaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        yaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        plot_bgcolor='rgba(0,0,0,0)',
        height=300,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    return fig
def main():
    """Render the Streamlit app: a sidebar model picker plus four tabs
    (word search, analogies, analysis, about)."""
    # Application header
    st.markdown('<h1 class="main-header">🏆 Tatar2Vec Demo - Tatar Word Embeddings</h1>', unsafe_allow_html=True)

    # Initialize explorer (model registry + loading/search helpers)
    explorer = Tatar2VecExplorer()

    # Sidebar configuration
    with st.sidebar:
        st.header("⚙️ Model Settings")

        # Model type selection
        model_type = st.selectbox(
            "Model Type:",
            ["FastText", "Word2Vec"],
            index=0
        )

        # Model variant selection
        model_variant = st.radio(
            "Model Variant:",
            ["best", "alternative"],
            format_func=lambda x: "🥇 Best Model" if x == "best" else "🥈 Alternative Model"
        )

        # model_key is used by all tabs below (Python `with` does not scope names).
        model_key = explorer.available_models[model_type][model_variant]

        # Model information section
        st.markdown("---")
        st.subheader("📊 Model Information")
        performance = explorer.get_model_performance(model_key)
        if performance:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Composite Score", f"{performance['composite']:.4f}")
                st.metric("Semantic Similarity", f"{performance['semantic']:.4f}")
            with col2:
                st.metric("Analogy Accuracy", f"{performance['analogy']:.4f}")
                st.metric("OOV Handling", f"{performance['oov']:.4f}")

        # Quick search examples
        st.markdown("---")
        st.subheader("🔍 Quick Search")
        quick_words = ["мәктәп", "китап", "тел", "фән", "табигать"]
        selected_quick = st.selectbox("Example words:", quick_words)
        if st.button("Quick Similarity Search"):
            # Persist the chosen example; tab1's text input reads it as its default.
            st.session_state.quick_search = selected_quick

    # Main content area with tabs
    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Word Search", "🧠 Analogies", "📊 Analysis", "ℹ️ About"])

    with tab1:
        st.header("Similar Word Search")
        col1, col2 = st.columns([2, 1])
        with col1:
            search_word = st.text_input(
                "Enter Tatar word:",
                value=getattr(st.session_state, 'quick_search', 'мәктәп'),
                placeholder="e.g., мәктәп, китап, тел..."
            )
        with col2:
            top_n = st.slider("Number of similar words:", 5, 20, 10)

        # `or search_word` makes the search run on every rerun while the
        # input is non-empty, not only when the button is clicked.
        if st.button("Find Similar Words") or search_word:
            with st.spinner(f"Loading model and finding words similar to '{search_word}'..."):
                model = explorer.load_model(model_key, model_type.lower())
                if model and search_word.strip():
                    similar_words = explorer.find_similar_words(model, search_word.strip(), top_n)
                    if similar_words:
                        # Display results in two columns
                        col1, col2 = st.columns([1, 1])
                        with col1:
                            st.subheader("📈 Similar Words")
                            df = pd.DataFrame(similar_words, columns=["Word", "Similarity"])
                            st.dataframe(df, use_container_width=True)
                        with col2:
                            fig = create_word_cloud(similar_words, f"Words similar to '{search_word}'")
                            if fig:
                                st.plotly_chart(fig, use_container_width=True)

                        # Additional information
                        st.subheader("📋 Details")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            try:
                                vector = explorer.get_word_vector(model, search_word.strip())
                                if vector is not None:
                                    st.metric("Vector Dimension", len(vector))
                            # NOTE(review): bare except silently hides any error
                            # here; narrowing to Exception would be safer.
                            except:
                                pass
                        with col2:
                            st.metric("Similar Words Found", len(similar_words))
                        with col3:
                            if similar_words:
                                st.metric("Max Similarity", f"{similar_words[0][1]:.4f}")
                    else:
                        st.warning(f"Word '{search_word}' not found in model vocabulary.")

    with tab2:
        st.header("Word Analogies")
        st.markdown("""
        **Example:** табиб - ир + хатын = ? (doctor - man + woman = female doctor)
        """)
        col1, col2, col3 = st.columns(3)
        with col1:
            positive1 = st.text_input("Positive word 1:", "табиб", placeholder="doctor")
            positive2 = st.text_input("Positive word 2:", "хатын", placeholder="woman")
        with col2:
            negative = st.text_input("Negative word:", "ир", placeholder="man")
        with col3:
            analogy_topn = st.slider("Number of results:", 3, 10, 5)

        if st.button("Perform Analogy"):
            if positive1 and positive2 and negative:
                with st.spinner("Performing analogy..."):
                    model = explorer.load_model(model_key, model_type.lower())
                    if model:
                        analogy_results = explorer.word_analogy(
                            model,
                            positive=[positive1, positive2],
                            negative=[negative],
                            topn=analogy_topn
                        )
                        if analogy_results:
                            st.subheader("🎯 Analogy Results")
                            df = pd.DataFrame(analogy_results, columns=["Word", "Similarity"])
                            st.dataframe(df, use_container_width=True)
                            # Visualization: horizontal bars sorted by similarity
                            fig = px.bar(
                                df,
                                x='Similarity',
                                y='Word',
                                orientation='h',
                                title=f"Analogy: {positive1} - {negative} + {positive2}",
                                color='Similarity',
                                color_continuous_scale='viridis'
                            )
                            fig.update_layout(yaxis={'categoryorder':'total ascending'})
                            st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.error("Could not perform analogy. Please check the input words.")

        # Predefined analogy examples
        st.subheader("🎪 Example Analogies")
        presets = {
            "Education": ("укытучы", "мәктәп", "өй", "teacher - home + school"),
            "Family": ("ата", "кыз", "ул", "father - son + daughter"),
            "Professions": ("шеф", "аш", "ресторан", "chef - restaurant + food")
        }
        cols = st.columns(len(presets))
        # NOTE(review): these buttons store analogy_p1/p2/n in session_state,
        # but the analogy text inputs above do not read those keys, so the
        # presets currently have no visible effect; `desc` is also unused.
        for idx, (name, (p1, p2, n, desc)) in enumerate(presets.items()):
            with cols[idx]:
                if st.button(f"🧩 {name}", key=f"preset_{idx}"):
                    st.session_state.analogy_p1 = p1
                    st.session_state.analogy_p2 = p2
                    st.session_state.analogy_n = n
                    st.rerun()

    with tab3:
        st.header("Model Analysis")

        # Performance comparison
        st.subheader("📊 Model Performance Comparison")
        perf_fig = create_performance_comparison()
        st.plotly_chart(perf_fig, use_container_width=True)

        # OOV words testing (only meaningful for FastText, which builds
        # vectors for unseen words from character n-grams)
        st.subheader("🔤 OOV (Out-of-Vocabulary) Testing")
        st.markdown("""
        **FastText models** can handle words not seen during training
        thanks to subword information.
        """)
        oov_words = st.text_area(
            "Enter words for OOV testing (one per line):",
            "технологияләштерү\nцифрлаштыру\nвиртуальлаштыру\nмәктәпчә"
        )
        if st.button("Test OOV") and model_type == "FastText":
            test_words = [word.strip() for word in oov_words.split('\n') if word.strip()]
            with st.spinner("Testing OOV words..."):
                model = explorer.load_model(model_key, "fasttext")
                if model:
                    results = explorer.handle_oov_words(model, test_words)
                    st.subheader("OOV Testing Results")
                    for result in results:
                        col1, col2 = st.columns([1, 3])
                        with col1:
                            status = "✅ In Vocabulary" if result['in_vocab'] else "🆕 OOV Word"
                            st.write(f"**{result['word']}** - {status}")
                        with col2:
                            if result['similar_words']:
                                similar_str = ", ".join([f"{word}({score:.3f})" for word, score in result['similar_words']])
                                st.write(f"Similar: {similar_str}")
                            else:
                                st.write("No similar words found")

        # Model comparison: nearest neighbours of the same words across
        # all four registered models, flattened into one table
        st.subheader("🔄 Model Comparison")
        compare_words = st.text_input("Words to compare across models (comma-separated):", "мәктәп, китап, тел, фән")
        if st.button("Compare Models"):
            words_to_compare = [word.strip() for word in compare_words.split(',')]
            comparison_data = []
            for model_type_comp in ["FastText", "Word2Vec"]:
                for variant in ["best", "alternative"]:
                    model_key_comp = explorer.available_models[model_type_comp][variant]
                    with st.spinner(f"Testing {model_key_comp}..."):
                        model = explorer.load_model(model_key_comp, model_type_comp.lower())
                        if model:
                            for word in words_to_compare:
                                similar = explorer.find_similar_words(model, word, 3)
                                if similar:
                                    for sim_word, score in similar:
                                        comparison_data.append({
                                            'Model': explorer.get_model_display_name(model_key_comp),
                                            'Type': model_type_comp,
                                            'Source Word': word,
                                            'Similar Word': sim_word,
                                            'Similarity': score
                                        })
            if comparison_data:
                df_compare = pd.DataFrame(comparison_data)
                st.dataframe(df_compare, use_container_width=True)

    with tab4:
        st.header("ℹ️ About Tatar2Vec")
        st.markdown("""
        ## 🏆 Tatar2Vec - High-quality Tatar Word Embeddings
        This repository contains the best performing FastText and Word2Vec models for Tatar,
        selected through comprehensive evaluation of 57 different model configurations.
        ### 🎯 Key Features:
        - **High Quality**: Our models significantly outperform pre-trained Meta models
        - **Large Vocabulary**: 637.7K words
        - **Multiple Architectures**: FastText and Word2Vec
        - **OOV Support**: FastText models handle out-of-vocabulary words
        ### 📊 Key Results:
        - **Best Model**: FastText with composite score 0.7019 (vs 0.2000 for Meta)
        - **Best Architecture**: Skip-gram outperforms CBOW
        - **Optimal Dimension**: 100-dimensional models perform better than 200/300-dimensional
        ### 🎪 Use Cases:
        - Semantic similarity search
        - Word analogies
        - Text classification
        - Machine translation
        - And much more!
        ### 📚 Training Corpus:
        - **Total Tokens**: 207.02M
        - **Unique Words**: 2.1M
        - **Vocabulary**: 637.7K
        - **Sources**: Wikipedia, news, books, social media
        ### 📜 Citation:
        ```bibtex
        @misc{Tatar2Vec_20251109,
        title = {Tatar2Vec: Tatar Word Embeddings},
        author = {Arabovs AI Lab},
        year = 2025,
        publisher = {Hugging Face},
        url = {https://huggingface.co/arabovs-ai-lab/Tatar2Vec}
        }
        ```
        ### 📄 License: MIT License
        """)
# Script entry point: `streamlit run app.py` executes this module top-to-bottom.
if __name__ == "__main__":
    main()