# tatar2vec-demo / app.py
# ArabovMK's picture
# Update app.py
# 6dc8172 verified
"""
Tatar2Vec Demo - Interactive Word Embeddings Explorer
Run: streamlit run app.py
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import tempfile
import os
import sys
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import requests
import json
# Import for model loading from Hugging Face Hub
from huggingface_hub import snapshot_download
from gensim.models import FastText, Word2Vec
import gensim.downloader as api
# Page configuration — must be the first Streamlit call executed by the script.
st.set_page_config(
    page_title="Tatar2Vec Demo",
    page_icon="🏆",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Inject custom CSS (header, model/metric cards, word-cloud chips).
# unsafe_allow_html is required for raw <style> markup to reach the page.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.model-card {
background-color: #f0f2f6;
padding: 1.5rem;
border-radius: 10px;
border-left: 4px solid #1f77b4;
margin-bottom: 1rem;
}
.metric-card {
background-color: white;
padding: 1rem;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
text-align: center;
}
.word-cloud {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 0.5rem 1rem;
border-radius: 20px;
display: inline-block;
margin: 0.2rem;
font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
class Tatar2VecExplorer:
    """Facade over the Tatar2Vec gensim models hosted on the Hugging Face Hub.

    Knows which checkpoints exist, downloads/loads them on demand (cached by
    Streamlit), and wraps the common embedding operations: nearest neighbours,
    analogies, raw vectors, and out-of-vocabulary probing.
    """

    def __init__(self):
        # Cache dict reserved for loaded models (loading itself is memoized
        # by st.cache_resource on load_model).
        self.loaded_models = {}
        # Published checkpoints, keyed by architecture then variant.
        self.available_models = {
            "FastText": {
                "best": "ft_dim100_win5_min5_ngram3-6_sg.epoch1",
                "alternative": "ft_dim100_win5_min5_ngram3-6_sg.epoch3"
            },
            "Word2Vec": {
                "best": "w2v_dim200_win5_min5_sg.epoch4",
                "alternative": "w2v_dim100_win5_min5_sg"
            }
        }

    @st.cache_resource(show_spinner="Loading Tatar2Vec model...")
    def load_model(_self, model_name: str, model_type: str = "fasttext"):
        """Download (if needed) and load one model checkpoint from the Hub.

        The parameter is named ``_self`` so Streamlit's cache skips hashing
        the instance. Returns the loaded gensim model, or None after showing
        a Streamlit error on failure.
        """
        try:
            # Only pull the files belonging to this one checkpoint.
            model_dir = snapshot_download(
                repo_id="arabovs-ai-lab/Tatar2Vec",
                allow_patterns=f"{model_type}/{model_name}/*"
            )
            model_path = os.path.join(model_dir, model_type, model_name, f"{model_name}.model")
            if model_type == "fasttext":
                model = FastText.load(model_path)
            else:
                model = Word2Vec.load(model_path)
            return model
        except Exception as e:
            st.error(f"Error loading model: {e}")
            return None

    def get_model_display_name(self, model_key: str) -> str:
        """Map a checkpoint key to a human-readable label (key itself if unknown)."""
        names = {
            "ft_dim100_win5_min5_ngram3-6_sg.epoch1": "🥇 Best FastText",
            "ft_dim100_win5_min5_ngram3-6_sg.epoch3": "🥈 Alternative FastText",
            "w2v_dim200_win5_min5_sg.epoch4": "🥇 Best Word2Vec",
            "w2v_dim100_win5_min5_sg": "🥈 Compact Word2Vec"
        }
        return names.get(model_key, model_key)

    def get_model_performance(self, model_key: str) -> dict:
        """Return the precomputed benchmark metrics for a checkpoint ({} if unknown)."""
        performance = {
            "ft_dim100_win5_min5_ngram3-6_sg.epoch1": {
                "composite": 0.7019, "semantic": 0.7368, "analogy": 0.0476,
                "oov": 1.0000, "coherence": 0.9588
            },
            "ft_dim100_win5_min5_ngram3-6_sg.epoch3": {
                "composite": 0.6675, "semantic": 0.6894, "analogy": 0.0476,
                "oov": 1.0000, "coherence": 0.9388
            },
            "w2v_dim200_win5_min5_sg.epoch4": {
                "composite": 0.5685, "semantic": 0.4445, "analogy": 0.3214,
                "oov": 0.3854, "coherence": 0.7307
            },
            "w2v_dim100_win5_min5_sg": {
                "composite": 0.5566, "semantic": 0.5187, "analogy": 0.2500,
                "oov": 0.3854, "coherence": 0.8051
            }
        }
        return performance.get(model_key, {})

    def find_similar_words(self, model, word: str, topn: int = 10):
        """Return up to ``topn`` (word, similarity) pairs; [] if the word is unknown."""
        try:
            # Full models expose vectors via .wv; bare KeyedVectors are queried directly.
            if hasattr(model, 'wv'):
                return model.wv.most_similar(word, topn=topn)
            else:
                return model.most_similar(word, topn=topn)
        except KeyError:
            # Word2Vec raises KeyError for out-of-vocabulary queries.
            return []
        except Exception as e:
            st.error(f"Error finding similar words: {e}")
            return []

    def word_analogy(self, model, positive: List[str], negative: List[str], topn: int = 5):
        """Vector-arithmetic analogy (king - man + woman ≈ queen); [] on any failure."""
        try:
            if hasattr(model, 'wv'):
                return model.wv.most_similar(positive=positive, negative=negative, topn=topn)
            else:
                return model.most_similar(positive=positive, negative=negative, topn=topn)
        except Exception as e:
            st.error(f"Error performing analogy: {e}")
            return []

    def get_word_vector(self, model, word: str):
        """Return the embedding vector for ``word``, or None if not resolvable."""
        try:
            if hasattr(model, 'wv'):
                return model.wv[word]
            else:
                return model[word]
        except KeyError:
            return None

    def handle_oov_words(self, model, words: List[str]):
        """Probe each word for vocabulary membership and nearest neighbours.

        FastText synthesizes a vector for *any* string from subword n-grams,
        so "has a vector" does not mean "seen in training". BUGFIX: the old
        check (``vector is not None``) therefore reported every word as
        in-vocabulary; membership is now tested against ``key_to_index``.
        """
        results = []
        for word in words:
            try:
                kv = model.wv if hasattr(model, 'wv') else model
                in_vocab = word in kv.key_to_index
                similar = self.find_similar_words(model, word, 3)
                results.append({
                    'word': word,
                    'in_vocab': in_vocab,
                    'similar_words': similar
                })
            except Exception:
                # Best-effort: an unprocessable word is reported as OOV.
                results.append({
                    'word': word,
                    'in_vocab': False,
                    'similar_words': []
                })
        return results
def create_performance_comparison():
    """Build a two-panel bar chart comparing the checkpoints' benchmark scores.

    Left panel: composite score for all models (including Meta's baseline).
    Right panel: semantic similarity, omitting models without a score.
    """
    model_names = [
        "ft_dim100_win5_min5_ngram3-6_sg.epoch1",
        "ft_dim100_win5_min5_ngram3-6_sg.epoch3",
        "w2v_dim200_win5_min5_sg.epoch4",
        "w2v_dim100_win5_min5_sg",
        "cc.tt.300 (Meta)",
    ]
    composite = [0.7019, 0.6675, 0.5685, 0.5566, 0.2000]
    semantic = [0.7368, 0.6894, 0.4445, 0.5187, None]

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Composite Score', 'Semantic Similarity'),
        specs=[[{"type": "bar"}, {"type": "bar"}]],
    )
    # Blue = FastText, orange = Word2Vec, red = Meta baseline.
    fig.add_trace(
        go.Bar(name='Composite Score', x=model_names, y=composite,
               marker_color=['#1f77b4', '#1f77b4', '#ff7f0e', '#ff7f0e', '#d62728']),
        row=1, col=1,
    )
    # The Meta baseline has no semantic score; drop None entries in lockstep.
    scored = [(name, value) for name, value in zip(model_names, semantic) if value is not None]
    fig.add_trace(
        go.Bar(name='Semantic Similarity',
               x=[name for name, _ in scored],
               y=[value for _, value in scored],
               marker_color=['#1f77b4', '#1f77b4', '#ff7f0e', '#ff7f0e']),
        row=1, col=2,
    )
    fig.update_layout(
        title_text="Model Performance Comparison",
        showlegend=False,
        height=400,
    )
    return fig
def create_word_cloud(similar_words, title):
    """Render (word, score) pairs as a pseudo word cloud of plotly annotations.

    Each word is placed at a random position; font size grows linearly with
    similarity (30–100 px) and hue rotates per word. Returns None when there
    is nothing to draw.
    """
    if not similar_words:
        return None

    fig = go.Figure()
    for idx, (word, score) in enumerate(similar_words):
        fig.add_annotation(
            text=word,
            x=np.random.uniform(0.1, 0.9),
            y=np.random.uniform(0.1, 0.9),
            showarrow=False,
            font=dict(size=30 + (score * 70), color=f"hsl({idx*40}, 70%, 50%)"),
            bgcolor="rgba(255,255,255,0.7)",
            bordercolor="rgba(0,0,0,0.1)",
            borderwidth=1,
            borderpad=2,
        )

    # Hide the axes entirely — the plot area is just a canvas for the labels.
    fig.update_layout(
        title=title,
        xaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        yaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        plot_bgcolor='rgba(0,0,0,0)',
        height=300,
        margin=dict(l=20, r=20, t=40, b=20),
    )
    return fig
def main():
    """Streamlit entry point: sidebar model picker plus four feature tabs.

    BUGFIX: the bare ``except:`` around the vector-dimension metric now
    catches ``Exception`` only, so SystemExit/KeyboardInterrupt propagate.
    """
    # Application header
    st.markdown('<h1 class="main-header">🏆 Tatar2Vec Demo - Tatar Word Embeddings</h1>', unsafe_allow_html=True)
    explorer = Tatar2VecExplorer()

    # ---------------- Sidebar: model selection + quick info ----------------
    with st.sidebar:
        st.header("⚙️ Model Settings")
        model_type = st.selectbox(
            "Model Type:",
            ["FastText", "Word2Vec"],
            index=0
        )
        model_variant = st.radio(
            "Model Variant:",
            ["best", "alternative"],
            format_func=lambda x: "🥇 Best Model" if x == "best" else "🥈 Alternative Model"
        )
        model_key = explorer.available_models[model_type][model_variant]

        st.markdown("---")
        st.subheader("📊 Model Information")
        performance = explorer.get_model_performance(model_key)
        if performance:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Composite Score", f"{performance['composite']:.4f}")
                st.metric("Semantic Similarity", f"{performance['semantic']:.4f}")
            with col2:
                st.metric("Analogy Accuracy", f"{performance['analogy']:.4f}")
                st.metric("OOV Handling", f"{performance['oov']:.4f}")

        st.markdown("---")
        st.subheader("🔍 Quick Search")
        quick_words = ["мәктәп", "китап", "тел", "фән", "табигать"]
        selected_quick = st.selectbox("Example words:", quick_words)
        if st.button("Quick Similarity Search"):
            # Picked up by tab 1's text_input default on the next rerun.
            st.session_state.quick_search = selected_quick

    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Word Search", "🧠 Analogies", "📊 Analysis", "ℹ️ About"])

    # ---------------- Tab 1: similar-word search ----------------
    with tab1:
        st.header("Similar Word Search")
        col1, col2 = st.columns([2, 1])
        with col1:
            search_word = st.text_input(
                "Enter Tatar word:",
                value=getattr(st.session_state, 'quick_search', 'мәктәп'),
                placeholder="e.g., мәктәп, китап, тел..."
            )
        with col2:
            top_n = st.slider("Number of similar words:", 5, 20, 10)
        # Runs on button press OR whenever the text field is non-empty
        # (so the default word produces results on first load).
        if st.button("Find Similar Words") or search_word:
            with st.spinner(f"Loading model and finding words similar to '{search_word}'..."):
                model = explorer.load_model(model_key, model_type.lower())
                if model and search_word.strip():
                    similar_words = explorer.find_similar_words(model, search_word.strip(), top_n)
                    if similar_words:
                        # Results table on the left, word cloud on the right.
                        col1, col2 = st.columns([1, 1])
                        with col1:
                            st.subheader("📈 Similar Words")
                            df = pd.DataFrame(similar_words, columns=["Word", "Similarity"])
                            st.dataframe(df, use_container_width=True)
                        with col2:
                            fig = create_word_cloud(similar_words, f"Words similar to '{search_word}'")
                            if fig:
                                st.plotly_chart(fig, use_container_width=True)
                        st.subheader("📋 Details")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            try:
                                vector = explorer.get_word_vector(model, search_word.strip())
                                if vector is not None:
                                    st.metric("Vector Dimension", len(vector))
                            except Exception:
                                # Metric is cosmetic; skip it rather than crash the tab.
                                pass
                        with col2:
                            st.metric("Similar Words Found", len(similar_words))
                        with col3:
                            if similar_words:
                                st.metric("Max Similarity", f"{similar_words[0][1]:.4f}")
                    else:
                        st.warning(f"Word '{search_word}' not found in model vocabulary.")

    # ---------------- Tab 2: word analogies ----------------
    with tab2:
        st.header("Word Analogies")
        st.markdown("""
**Example:** табиб - ир + хатын = ? (doctor - man + woman = female doctor)
""")
        col1, col2, col3 = st.columns(3)
        with col1:
            positive1 = st.text_input("Positive word 1:", "табиб", placeholder="doctor")
            positive2 = st.text_input("Positive word 2:", "хатын", placeholder="woman")
        with col2:
            negative = st.text_input("Negative word:", "ир", placeholder="man")
        with col3:
            analogy_topn = st.slider("Number of results:", 3, 10, 5)
        if st.button("Perform Analogy"):
            if positive1 and positive2 and negative:
                with st.spinner("Performing analogy..."):
                    model = explorer.load_model(model_key, model_type.lower())
                    if model:
                        analogy_results = explorer.word_analogy(
                            model,
                            positive=[positive1, positive2],
                            negative=[negative],
                            topn=analogy_topn
                        )
                        if analogy_results:
                            st.subheader("🎯 Analogy Results")
                            df = pd.DataFrame(analogy_results, columns=["Word", "Similarity"])
                            st.dataframe(df, use_container_width=True)
                            fig = px.bar(
                                df,
                                x='Similarity',
                                y='Word',
                                orientation='h',
                                title=f"Analogy: {positive1} - {negative} + {positive2}",
                                color='Similarity',
                                color_continuous_scale='viridis'
                            )
                            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                            st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.error("Could not perform analogy. Please check the input words.")

        st.subheader("🎪 Example Analogies")
        presets = {
            "Education": ("укытучы", "мәктәп", "өй", "teacher - home + school"),
            "Family": ("ата", "кыз", "ул", "father - son + daughter"),
            "Professions": ("шеф", "аш", "ресторан", "chef - restaurant + food")
        }
        cols = st.columns(len(presets))
        for idx, (name, (p1, p2, n, desc)) in enumerate(presets.items()):
            with cols[idx]:
                if st.button(f"🧩 {name}", key=f"preset_{idx}"):
                    # NOTE(review): these session keys are never read back —
                    # the analogy text_inputs above don't use them. Wiring the
                    # inputs to these keys would make the presets functional.
                    st.session_state.analogy_p1 = p1
                    st.session_state.analogy_p2 = p2
                    st.session_state.analogy_n = n
                    st.rerun()

    # ---------------- Tab 3: analysis, OOV testing, model comparison ----------------
    with tab3:
        st.header("Model Analysis")
        st.subheader("📊 Model Performance Comparison")
        perf_fig = create_performance_comparison()
        st.plotly_chart(perf_fig, use_container_width=True)

        st.subheader("🔤 OOV (Out-of-Vocabulary) Testing")
        st.markdown("""
**FastText models** can handle words not seen during training
thanks to subword information.
""")
        oov_words = st.text_area(
            "Enter words for OOV testing (one per line):",
            "технологияләштерү\nцифрлаштыру\nвиртуальлаштыру\nмәктәпчә"
        )
        if st.button("Test OOV") and model_type == "FastText":
            test_words = [word.strip() for word in oov_words.split('\n') if word.strip()]
            with st.spinner("Testing OOV words..."):
                model = explorer.load_model(model_key, "fasttext")
                if model:
                    results = explorer.handle_oov_words(model, test_words)
                    st.subheader("OOV Testing Results")
                    for result in results:
                        col1, col2 = st.columns([1, 3])
                        with col1:
                            status = "✅ In Vocabulary" if result['in_vocab'] else "🆕 OOV Word"
                            st.write(f"**{result['word']}** - {status}")
                        with col2:
                            if result['similar_words']:
                                similar_str = ", ".join([f"{word}({score:.3f})" for word, score in result['similar_words']])
                                st.write(f"Similar: {similar_str}")
                            else:
                                st.write("No similar words found")

        st.subheader("🔄 Model Comparison")
        compare_words = st.text_input("Words to compare across models (comma-separated):", "мәктәп, китап, тел, фән")
        if st.button("Compare Models"):
            words_to_compare = [word.strip() for word in compare_words.split(',')]
            comparison_data = []
            # Query every published checkpoint with every word.
            for model_type_comp in ["FastText", "Word2Vec"]:
                for variant in ["best", "alternative"]:
                    model_key_comp = explorer.available_models[model_type_comp][variant]
                    with st.spinner(f"Testing {model_key_comp}..."):
                        model = explorer.load_model(model_key_comp, model_type_comp.lower())
                        if model:
                            for word in words_to_compare:
                                similar = explorer.find_similar_words(model, word, 3)
                                if similar:
                                    for sim_word, score in similar:
                                        comparison_data.append({
                                            'Model': explorer.get_model_display_name(model_key_comp),
                                            'Type': model_type_comp,
                                            'Source Word': word,
                                            'Similar Word': sim_word,
                                            'Similarity': score
                                        })
            if comparison_data:
                df_compare = pd.DataFrame(comparison_data)
                st.dataframe(df_compare, use_container_width=True)

    # ---------------- Tab 4: static project information ----------------
    with tab4:
        st.header("ℹ️ About Tatar2Vec")
        st.markdown("""
## 🏆 Tatar2Vec - High-quality Tatar Word Embeddings
This repository contains the best performing FastText and Word2Vec models for Tatar,
selected through comprehensive evaluation of 57 different model configurations.
### 🎯 Key Features:
- **High Quality**: Our models significantly outperform pre-trained Meta models
- **Large Vocabulary**: 637.7K words
- **Multiple Architectures**: FastText and Word2Vec
- **OOV Support**: FastText models handle out-of-vocabulary words
### 📊 Key Results:
- **Best Model**: FastText with composite score 0.7019 (vs 0.2000 for Meta)
- **Best Architecture**: Skip-gram outperforms CBOW
- **Optimal Dimension**: 100-dimensional models perform better than 200/300-dimensional
### 🎪 Use Cases:
- Semantic similarity search
- Word analogies
- Text classification
- Machine translation
- And much more!
### 📚 Training Corpus:
- **Total Tokens**: 207.02M
- **Unique Words**: 2.1M
- **Vocabulary**: 637.7K
- **Sources**: Wikipedia, news, books, social media
### 📜 Citation:
```bibtex
@misc{Tatar2Vec_20251109,
title = {Tatar2Vec: Tatar Word Embeddings},
author = {Arabovs AI Lab},
year = 2025,
publisher = {Hugging Face},
url = {https://huggingface.co/arabovs-ai-lab/Tatar2Vec}
}
```
### 📄 License: MIT License
""")
# Script entry point: run the Streamlit app when executed directly
# (fix: the call must be indented inside the guard's suite).
if __name__ == "__main__":
    main()