"""
Tatar2Vec Demo - Interactive Word Embeddings Explorer

Run: streamlit run app.py
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import tempfile
import os
import sys
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import requests
import json

# Import for model loading from Hugging Face Hub
from huggingface_hub import snapshot_download
from gensim.models import FastText, Word2Vec
import gensim.downloader as api

# Page configuration
st.set_page_config(
    page_title="Tatar2Vec Demo",
    page_icon="🏆",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for improved styling
# NOTE(review): the stylesheet body is empty in this revision of the file.
st.markdown(""" """, unsafe_allow_html=True)

# Hugging Face repository that hosts the trained models.
HF_REPO_ID = "arabovs-ai-lab/Tatar2Vec"

# Model identifiers (also the directory / file stem inside the repository).
FT_BEST = "ft_dim100_win5_min5_ngram3-6_sg.epoch1"
FT_ALTERNATIVE = "ft_dim100_win5_min5_ngram3-6_sg.epoch3"
W2V_BEST = "w2v_dim200_win5_min5_sg.epoch4"
W2V_ALTERNATIVE = "w2v_dim100_win5_min5_sg"

# Human-readable names shown in comparison tables.
MODEL_DISPLAY_NAMES: Dict[str, str] = {
    FT_BEST: "🥇 Best FastText",
    FT_ALTERNATIVE: "🥈 Alternative FastText",
    W2V_BEST: "🥇 Best Word2Vec",
    W2V_ALTERNATIVE: "🥈 Compact Word2Vec",
}

# Evaluation metrics per model; single source of truth for the sidebar
# metrics and the comparison chart.
MODEL_PERFORMANCE: Dict[str, Dict[str, float]] = {
    FT_BEST: {
        "composite": 0.7019,
        "semantic": 0.7368,
        "analogy": 0.0476,
        "oov": 1.0000,
        "coherence": 0.9588,
    },
    FT_ALTERNATIVE: {
        "composite": 0.6675,
        "semantic": 0.6894,
        "analogy": 0.0476,
        "oov": 1.0000,
        "coherence": 0.9388,
    },
    W2V_BEST: {
        "composite": 0.5685,
        "semantic": 0.4445,
        "analogy": 0.3214,
        "oov": 0.3854,
        "coherence": 0.7307,
    },
    W2V_ALTERNATIVE: {
        "composite": 0.5566,
        "semantic": 0.5187,
        "analogy": 0.2500,
        "oov": 0.3854,
        "coherence": 0.8051,
    },
}

# Bar colours used in the comparison chart, in display order.
_CHART_MODELS: List[Tuple[str, str]] = [
    (FT_BEST, "#1f77b4"),
    (FT_ALTERNATIVE, "#1f77b4"),
    (W2V_BEST, "#ff7f0e"),
    (W2V_ALTERNATIVE, "#ff7f0e"),
]

# Session-state keys (and defaults) backing the text inputs, so that the
# quick-search and preset buttons can change what the inputs show.
SEARCH_WORD_KEY = "search_word"
DEFAULT_SEARCH_WORD = "мәктәп"
ANALOGY_DEFAULTS: Dict[str, str] = {
    "analogy_p1": "табиб",
    "analogy_p2": "хатын",
    "analogy_n": "ир",
}

ABOUT_MARKDOWN = """
## 🏆 Tatar2Vec - High-quality Tatar Word Embeddings

This repository contains the best performing FastText and Word2Vec models for Tatar, selected through comprehensive evaluation of 57 different model configurations.

### 🎯 Key Features:
- **High Quality**: Our models significantly outperform pre-trained Meta models
- **Large Vocabulary**: 637.7K words
- **Multiple Architectures**: FastText and Word2Vec
- **OOV Support**: FastText models handle out-of-vocabulary words

### 📊 Key Results:
- **Best Model**: FastText with composite score 0.7019 (vs 0.2000 for Meta)
- **Best Architecture**: Skip-gram outperforms CBOW
- **Optimal Dimension**: 100-dimensional models perform better than 200/300-dimensional

### 🎪 Use Cases:
- Semantic similarity search
- Word analogies
- Text classification
- Machine translation
- And much more!

### 📚 Training Corpus:
- **Total Tokens**: 207.02M
- **Unique Words**: 2.1M
- **Vocabulary**: 637.7K
- **Sources**: Wikipedia, news, books, social media

### 📜 Citation:
```bibtex
@misc{Tatar2Vec_20251109,
    title = {Tatar2Vec: Tatar Word Embeddings},
    author = {Arabovs AI Lab},
    year = 2025,
    publisher = {Hugging Face},
    url = {https://huggingface.co/arabovs-ai-lab/Tatar2Vec}
}
```

### 📄 License:
MIT License
"""


@st.cache_resource(show_spinner="Loading Tatar2Vec model...")
def _load_model_cached(model_name: str, model_type: str):
    """Download and load a model, raising on failure.

    Kept separate from ``Tatar2VecExplorer.load_model`` so that a failed
    download propagates as an exception instead of being stored in the
    resource cache as a ``None`` result.
    """
    model_dir = snapshot_download(
        repo_id=HF_REPO_ID,
        allow_patterns=f"{model_type}/{model_name}/*"
    )
    model_path = os.path.join(model_dir, model_type, model_name, f"{model_name}.model")
    loader = FastText if model_type == "fasttext" else Word2Vec
    return loader.load(model_path)


def _keyed_vectors(model):
    """Return ``model.wv`` when the model has one, otherwise the model itself."""
    return model.wv if hasattr(model, 'wv') else model


def _in_vocabulary(model, word: str) -> Optional[bool]:
    """Tell whether ``word`` is a stored vocabulary entry of ``model``.

    Looks at the ``key_to_index`` mapping (gensim 4) and falls back to
    ``vocab`` (older gensim). Returns ``None`` when neither is available,
    so the caller can pick its own fallback.
    """
    vectors = _keyed_vectors(model)
    for attr in ("key_to_index", "vocab"):
        vocab = getattr(vectors, attr, None)
        if vocab is not None:
            return word in vocab
    return None


class Tatar2VecExplorer:
    """Loads Tatar2Vec models and runs similarity / analogy queries on them."""

    def __init__(self):
        self.loaded_models = {}
        # Model type -> variant -> model identifier.
        self.available_models = {
            "FastText": {
                "best": FT_BEST,
                "alternative": FT_ALTERNATIVE
            },
            "Word2Vec": {
                "best": W2V_BEST,
                "alternative": W2V_ALTERNATIVE
            }
        }

    def load_model(self, model_name: str, model_type: str = "fasttext"):
        """Load a model from the Hugging Face Hub, with caching.

        Args:
            model_name: Model identifier (directory and file stem in the repo).
            model_type: ``"fasttext"`` loads with ``FastText``; any other
                value loads with ``Word2Vec``.

        Returns:
            The loaded model, or ``None`` if downloading or loading failed
            (an error is shown in the UI). Failures are not cached, so the
            next call tries again.
        """
        try:
            return _load_model_cached(model_name, model_type)
        except Exception as e:
            st.error(f"Error loading model: {e}")
            return None

    def get_model_display_name(self, model_key: str) -> str:
        """Get human-readable model name (falls back to the key itself)."""
        return MODEL_DISPLAY_NAMES.get(model_key, model_key)

    def get_model_performance(self, model_key: str) -> dict:
        """Get model performance metrics, or an empty dict for unknown keys."""
        # Return a copy so callers cannot mutate the shared table.
        return dict(MODEL_PERFORMANCE.get(model_key, {}))

    def find_similar_words(self, model, word: str, topn: int = 10):
        """Find semantically similar words.

        Returns a list of ``(word, similarity)`` pairs; an empty list when
        the word is unknown to the model or the lookup fails.
        """
        try:
            return _keyed_vectors(model).most_similar(word, topn=topn)
        except KeyError:
            return []
        except Exception as e:
            st.error(f"Error finding similar words: {e}")
            return []

    def word_analogy(self, model, positive: List[str], negative: List[str], topn: int = 5):
        """Perform word analogy operation (king - man + woman = queen).

        Returns a list of ``(word, similarity)`` pairs, or an empty list
        (after showing an error) when the operation fails.
        """
        try:
            return _keyed_vectors(model).most_similar(
                positive=positive, negative=negative, topn=topn
            )
        except Exception as e:
            st.error(f"Error performing analogy: {e}")
            return []

    def get_word_vector(self, model, word: str):
        """Get word vector representation, or ``None`` on ``KeyError``."""
        try:
            return _keyed_vectors(model)[word]
        except KeyError:
            return None

    def handle_oov_words(self, model, words: List[str]):
        """Handle Out-of-Vocabulary words (FastText only).

        Returns one dict per input word with the keys ``word``,
        ``in_vocab`` and ``similar_words`` (top 3 neighbours).
        """
        results = []
        for word in words:
            try:
                vector = self.get_word_vector(model, word)
                similar = self.find_similar_words(model, word, 3)
                # A FastText model can build a vector for an unseen word from
                # its character n-grams, so "a vector exists" does not mean
                # "in vocabulary"; check the vocabulary mapping instead.
                known = _in_vocabulary(model, word)
                in_vocab = (vector is not None) if known is None else known
                results.append({
                    'word': word,
                    'in_vocab': in_vocab,
                    'similar_words': similar
                })
            except Exception:
                results.append({
                    'word': word,
                    'in_vocab': False,
                    'similar_words': []
                })
        return results


def create_performance_comparison():
    """Create model performance comparison charts.

    Returns a two-panel bar figure: composite score for every model and
    semantic similarity for the models that have that metric.
    """
    # (label, composite, semantic, colour) per bar; Meta's baseline has no
    # semantic-similarity score.
    rows = [
        (key, MODEL_PERFORMANCE[key]["composite"], MODEL_PERFORMANCE[key]["semantic"], color)
        for key, color in _CHART_MODELS
    ]
    rows.append(("cc.tt.300 (Meta)", 0.2000, None, "#d62728"))

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Composite Score', 'Semantic Similarity'),
        specs=[[{"type": "bar"}, {"type": "bar"}]]
    )

    # Composite scores
    fig.add_trace(
        go.Bar(
            name='Composite Score',
            x=[label for label, _, _, _ in rows],
            y=[composite for _, composite, _, _ in rows],
            marker_color=[color for _, _, _, color in rows],
        ),
        row=1, col=1
    )

    # Semantic similarity scores: drop models without a value, keeping
    # labels, values and colours aligned.
    semantic_rows = [row for row in rows if row[2] is not None]
    fig.add_trace(
        go.Bar(
            name='Semantic Similarity',
            x=[label for label, _, _, _ in semantic_rows],
            y=[semantic for _, _, semantic, _ in semantic_rows],
            marker_color=[color for _, _, _, color in semantic_rows],
        ),
        row=1, col=2
    )

    fig.update_layout(
        title_text="Model Performance Comparison",
        showlegend=False,
        height=400
    )
    return fig


def create_word_cloud(similar_words, title):
    """Create word cloud visualization for similar words.

    Args:
        similar_words: Sequence of ``(word, similarity)`` pairs.
        title: Figure title.

    Returns:
        A Plotly figure with one randomly placed annotation per word, or
        ``None`` when ``similar_words`` is empty.
    """
    if not similar_words:
        return None

    fig = go.Figure()

    # Add each word as annotation with random position
    for i, (word, score) in enumerate(similar_words):
        # Font size grows with similarity; clamp so that a strongly negative
        # similarity cannot produce a size below Plotly's minimum.
        size = max(1.0, float(30 + (score * 70)))
        fig.add_annotation(
            text=word,
            x=np.random.uniform(0.1, 0.9),
            y=np.random.uniform(0.1, 0.9),
            showarrow=False,
            font=dict(size=size, color=f"hsl({i*40}, 70%, 50%)"),
            bgcolor="rgba(255,255,255,0.7)",
            bordercolor="rgba(0,0,0,0.1)",
            borderwidth=1,
            borderpad=2,
        )

    fig.update_layout(
        title=title,
        xaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        yaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        plot_bgcolor='rgba(0,0,0,0)',
        height=300,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    return fig


def _set_search_word(word: str) -> None:
    """Button callback: put ``word`` into the search text input."""
    st.session_state[SEARCH_WORD_KEY] = word


def _apply_analogy_preset(positive1: str, positive2: str, negative: str) -> None:
    """Button callback: fill the three analogy text inputs from a preset."""
    st.session_state["analogy_p1"] = positive1
    st.session_state["analogy_p2"] = positive2
    st.session_state["analogy_n"] = negative


def _render_sidebar(explorer: Tatar2VecExplorer) -> Tuple[str, str]:
    """Draw the sidebar and return the selected ``(model_type, model_key)``."""
    with st.sidebar:
        st.header("⚙️ Model Settings")

        # Model type selection
        model_type = st.selectbox(
            "Model Type:",
            ["FastText", "Word2Vec"],
            index=0
        )

        # Model variant selection
        model_variant = st.radio(
            "Model Variant:",
            ["best", "alternative"],
            format_func=lambda x: "🥇 Best Model" if x == "best" else "🥈 Alternative Model"
        )

        model_key = explorer.available_models[model_type][model_variant]

        # Model information section
        st.markdown("---")
        st.subheader("📊 Model Information")

        performance = explorer.get_model_performance(model_key)
        if performance:
            left, right = st.columns(2)
            with left:
                st.metric("Composite Score", f"{performance['composite']:.4f}")
                st.metric("Semantic Similarity", f"{performance['semantic']:.4f}")
            with right:
                st.metric("Analogy Accuracy", f"{performance['analogy']:.4f}")
                st.metric("OOV Handling", f"{performance['oov']:.4f}")

        # Quick search examples
        st.markdown("---")
        st.subheader("🔍 Quick Search")
        quick_words = ["мәктәп", "китап", "тел", "фән", "табигать"]
        selected_quick = st.selectbox("Example words:", quick_words)

        # A callback runs before the next script run, which is the point at
        # which the value of a keyed widget may still be changed.
        st.button(
            "Quick Similarity Search",
            on_click=_set_search_word,
            args=(selected_quick,),
        )

    return model_type, model_key


def _render_search_tab(explorer: Tatar2VecExplorer, model_key: str, model_type: str) -> None:
    """Draw the "Word Search" tab: similar-word table, word cloud, details."""
    st.header("Similar Word Search")

    if SEARCH_WORD_KEY not in st.session_state:
        st.session_state[SEARCH_WORD_KEY] = DEFAULT_SEARCH_WORD

    input_col, slider_col = st.columns([2, 1])
    with input_col:
        search_word = st.text_input(
            "Enter Tatar word:",
            key=SEARCH_WORD_KEY,
            placeholder="e.g., мәктәп, китап, тел..."
        )
    with slider_col:
        top_n = st.slider("Number of similar words:", 5, 20, 10)

    clicked = st.button("Find Similar Words")
    query = search_word.strip()
    if not query:
        if clicked:
            st.warning("Please enter a word to search for.")
        return

    with st.spinner(f"Loading model and finding words similar to '{search_word}'..."):
        model = explorer.load_model(model_key, model_type.lower())
        if model is None:
            return

        similar_words = explorer.find_similar_words(model, query, top_n)
        if not similar_words:
            st.warning(f"Word '{search_word}' not found in model vocabulary.")
            return

        # Display results in two columns
        table_col, cloud_col = st.columns([1, 1])
        with table_col:
            st.subheader("📈 Similar Words")
            df = pd.DataFrame(similar_words, columns=["Word", "Similarity"])
            st.dataframe(df, use_container_width=True)
        with cloud_col:
            fig = create_word_cloud(similar_words, f"Words similar to '{search_word}'")
            if fig:
                st.plotly_chart(fig, use_container_width=True)

        # Additional information
        st.subheader("📋 Details")
        dim_col, count_col, max_col = st.columns(3)
        with dim_col:
            # Best effort: the dimension metric is optional, but say why it
            # is missing instead of swallowing the error silently.
            try:
                vector = explorer.get_word_vector(model, query)
            except Exception as e:
                vector = None
                st.caption(f"Vector unavailable: {e}")
            if vector is not None:
                st.metric("Vector Dimension", len(vector))
        with count_col:
            st.metric("Similar Words Found", len(similar_words))
        with max_col:
            st.metric("Max Similarity", f"{similar_words[0][1]:.4f}")


def _render_analogy_tab(explorer: Tatar2VecExplorer, model_key: str, model_type: str) -> None:
    """Draw the "Analogies" tab: free-form analogy plus preset examples."""
    st.header("Word Analogies")
    st.markdown(
        "**Example:** табиб - ир + хатын = ? \n"
        "(doctor - man + woman = female doctor)"
    )

    # Seed the keyed inputs once; afterwards their values live in session
    # state, where the preset buttons can overwrite them.
    for state_key, default in ANALOGY_DEFAULTS.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = default

    positive_col, negative_col, options_col = st.columns(3)
    with positive_col:
        positive1 = st.text_input("Positive word 1:", key="analogy_p1", placeholder="doctor")
        positive2 = st.text_input("Positive word 2:", key="analogy_p2", placeholder="woman")
    with negative_col:
        negative = st.text_input("Negative word:", key="analogy_n", placeholder="man")
    with options_col:
        analogy_topn = st.slider("Number of results:", 3, 10, 5)

    if st.button("Perform Analogy"):
        if positive1 and positive2 and negative:
            with st.spinner("Performing analogy..."):
                model = explorer.load_model(model_key, model_type.lower())
                if model is not None:
                    analogy_results = explorer.word_analogy(
                        model,
                        positive=[positive1, positive2],
                        negative=[negative],
                        topn=analogy_topn
                    )
                    if analogy_results:
                        st.subheader("🎯 Analogy Results")
                        df = pd.DataFrame(analogy_results, columns=["Word", "Similarity"])
                        st.dataframe(df, use_container_width=True)

                        # Visualization
                        fig = px.bar(
                            df,
                            x='Similarity',
                            y='Word',
                            orientation='h',
                            title=f"Analogy: {positive1} - {negative} + {positive2}",
                            color='Similarity',
                            color_continuous_scale='viridis'
                        )
                        fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                        st.plotly_chart(fig, use_container_width=True)
                    else:
                        st.error("Could not perform analogy. Please check the input words.")
        else:
            st.warning("Please fill in both positive words and the negative word.")

    # Predefined analogy examples: (positive 1, positive 2, negative, description)
    st.subheader("🎪 Example Analogies")
    presets = {
        "Education": ("укытучы", "мәктәп", "өй", "teacher - home + school"),
        "Family": ("ата", "кыз", "ул", "father - son + daughter"),
        "Professions": ("шеф", "аш", "ресторан", "chef - restaurant + food")
    }

    cols = st.columns(len(presets))
    for idx, (name, (p1, p2, n, desc)) in enumerate(presets.items()):
        with cols[idx]:
            # The inputs above are already instantiated in this run, so their
            # state can only be changed from a callback.
            st.button(
                f"🧩 {name}",
                key=f"preset_{idx}",
                help=desc,
                on_click=_apply_analogy_preset,
                args=(p1, p2, n),
            )


def _render_analysis_tab(explorer: Tatar2VecExplorer, model_key: str, model_type: str) -> None:
    """Draw the "Analysis" tab: metrics chart, OOV test, cross-model comparison."""
    st.header("Model Analysis")

    # Performance comparison
    st.subheader("📊 Model Performance Comparison")
    perf_fig = create_performance_comparison()
    st.plotly_chart(perf_fig, use_container_width=True)

    # OOV words testing
    st.subheader("🔤 OOV (Out-of-Vocabulary) Testing")
    st.markdown(
        "**FastText models** can handle words not seen during training "
        "thanks to subword information."
    )

    oov_words = st.text_area(
        "Enter words for OOV testing (one per line):",
        "технологияләштерү\nцифрлаштыру\nвиртуальлаштыру\nмәктәпчә"
    )

    if st.button("Test OOV"):
        if model_type != "FastText":
            st.info("OOV testing needs a FastText model. Select FastText in the sidebar.")
        else:
            test_words = [word.strip() for word in oov_words.split('\n') if word.strip()]
            with st.spinner("Testing OOV words..."):
                model = explorer.load_model(model_key, "fasttext")
                if model is not None:
                    results = explorer.handle_oov_words(model, test_words)

                    st.subheader("OOV Testing Results")
                    for result in results:
                        status_col, similar_col = st.columns([1, 3])
                        with status_col:
                            status = "✅ In Vocabulary" if result['in_vocab'] else "🆕 OOV Word"
                            st.write(f"**{result['word']}** - {status}")
                        with similar_col:
                            if result['similar_words']:
                                similar_str = ", ".join(
                                    f"{word}({score:.3f})"
                                    for word, score in result['similar_words']
                                )
                                st.write(f"Similar: {similar_str}")
                            else:
                                st.write("No similar words found")

    # Model comparison
    st.subheader("🔄 Model Comparison")
    compare_words = st.text_input(
        "Words to compare across models (comma-separated):",
        "мәктәп, китап, тел, фән"
    )

    if st.button("Compare Models"):
        # Skip empty entries such as the one produced by a trailing comma.
        words_to_compare = [word.strip() for word in compare_words.split(',') if word.strip()]

        comparison_data = []
        for model_type_comp in ["FastText", "Word2Vec"]:
            for variant in ["best", "alternative"]:
                model_key_comp = explorer.available_models[model_type_comp][variant]

                with st.spinner(f"Testing {model_key_comp}..."):
                    model = explorer.load_model(model_key_comp, model_type_comp.lower())
                    if model is None:
                        continue
                    display_name = explorer.get_model_display_name(model_key_comp)
                    for word in words_to_compare:
                        for sim_word, score in explorer.find_similar_words(model, word, 3):
                            comparison_data.append({
                                'Model': display_name,
                                'Type': model_type_comp,
                                'Source Word': word,
                                'Similar Word': sim_word,
                                'Similarity': score
                            })

        if comparison_data:
            df_compare = pd.DataFrame(comparison_data)
            st.dataframe(df_compare, use_container_width=True)
        else:
            st.info("No similar words found for the given input.")


def _render_about_tab() -> None:
    """Draw the static "About" tab."""
    st.header("ℹ️ About Tatar2Vec")
    st.markdown(ABOUT_MARKDOWN)


def main():
    """Build the Streamlit page: header, sidebar and the four content tabs."""
    # Application header
    # NOTE(review): the original markup around this text is not recoverable
    # from the source; a plain <h1> is used.
    st.markdown(
        "<h1>🏆 Tatar2Vec Demo - Tatar Word Embeddings</h1>",
        unsafe_allow_html=True
    )

    # Initialize explorer
    explorer = Tatar2VecExplorer()

    # Sidebar configuration
    model_type, model_key = _render_sidebar(explorer)

    # Main content area with tabs
    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Word Search", "🧠 Analogies", "📊 Analysis", "ℹ️ About"])

    with tab1:
        _render_search_tab(explorer, model_key, model_type)

    with tab2:
        _render_analogy_tab(explorer, model_key, model_type)

    with tab3:
        _render_analysis_tab(explorer, model_key, model_type)

    with tab4:
        _render_about_tab()


if __name__ == "__main__":
    main()