# tatar2vec-demo / app.py
# ArabovMK's picture
# Update app.py
# 6dc8172 verified
"""
Tatar2Vec Demo - Interactive Word Embeddings Explorer
Run: streamlit run app.py
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import tempfile
import os
import sys
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import requests
import json
# Import for model loading from Hugging Face Hub
from huggingface_hub import snapshot_download
from gensim.models import FastText, Word2Vec
import gensim.downloader as api
# Page configuration — must be the first Streamlit call executed by the script.
st.set_page_config(
    page_title="Tatar2Vec Demo",
    page_icon="🏆",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Inject custom CSS (header, model/metric cards, word-cloud chips).
# unsafe_allow_html is required for raw <style> markup to reach the page.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.model-card {
background-color: #f0f2f6;
padding: 1.5rem;
border-radius: 10px;
border-left: 4px solid #1f77b4;
margin-bottom: 1rem;
}
.metric-card {
background-color: white;
padding: 1rem;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
text-align: center;
}
.word-cloud {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 0.5rem 1rem;
border-radius: 20px;
display: inline-block;
margin: 0.2rem;
font-weight: 500;
}
</style>
""", unsafe_allow_html=True)
class Tatar2VecExplorer:
    """Facade over the Tatar2Vec gensim models hosted on the Hugging Face Hub.

    Knows which checkpoints exist, downloads/loads them on demand (cached by
    Streamlit), and wraps the common embedding operations: nearest neighbours,
    analogies, raw vectors, and out-of-vocabulary probing.
    """

    def __init__(self):
        # Cache dict reserved for loaded models (loading itself is memoized
        # by st.cache_resource on load_model).
        self.loaded_models = {}
        # Published checkpoints, keyed by architecture then variant.
        self.available_models = {
            "FastText": {
                "best": "ft_dim100_win5_min5_ngram3-6_sg.epoch1",
                "alternative": "ft_dim100_win5_min5_ngram3-6_sg.epoch3"
            },
            "Word2Vec": {
                "best": "w2v_dim200_win5_min5_sg.epoch4",
                "alternative": "w2v_dim100_win5_min5_sg"
            }
        }

    @st.cache_resource(show_spinner="Loading Tatar2Vec model...")
    def load_model(_self, model_name: str, model_type: str = "fasttext"):
        """Download (if needed) and load one model checkpoint from the Hub.

        The parameter is named ``_self`` so Streamlit's cache skips hashing
        the instance. Returns the loaded gensim model, or None after showing
        a Streamlit error on failure.
        """
        try:
            # Only pull the files belonging to this one checkpoint.
            model_dir = snapshot_download(
                repo_id="arabovs-ai-lab/Tatar2Vec",
                allow_patterns=f"{model_type}/{model_name}/*"
            )
            model_path = os.path.join(model_dir, model_type, model_name, f"{model_name}.model")
            if model_type == "fasttext":
                model = FastText.load(model_path)
            else:
                model = Word2Vec.load(model_path)
            return model
        except Exception as e:
            st.error(f"Error loading model: {e}")
            return None

    def get_model_display_name(self, model_key: str) -> str:
        """Map a checkpoint key to a human-readable label (key itself if unknown)."""
        names = {
            "ft_dim100_win5_min5_ngram3-6_sg.epoch1": "🥇 Best FastText",
            "ft_dim100_win5_min5_ngram3-6_sg.epoch3": "🥈 Alternative FastText",
            "w2v_dim200_win5_min5_sg.epoch4": "🥇 Best Word2Vec",
            "w2v_dim100_win5_min5_sg": "🥈 Compact Word2Vec"
        }
        return names.get(model_key, model_key)

    def get_model_performance(self, model_key: str) -> dict:
        """Return the precomputed benchmark metrics for a checkpoint ({} if unknown)."""
        performance = {
            "ft_dim100_win5_min5_ngram3-6_sg.epoch1": {
                "composite": 0.7019, "semantic": 0.7368, "analogy": 0.0476,
                "oov": 1.0000, "coherence": 0.9588
            },
            "ft_dim100_win5_min5_ngram3-6_sg.epoch3": {
                "composite": 0.6675, "semantic": 0.6894, "analogy": 0.0476,
                "oov": 1.0000, "coherence": 0.9388
            },
            "w2v_dim200_win5_min5_sg.epoch4": {
                "composite": 0.5685, "semantic": 0.4445, "analogy": 0.3214,
                "oov": 0.3854, "coherence": 0.7307
            },
            "w2v_dim100_win5_min5_sg": {
                "composite": 0.5566, "semantic": 0.5187, "analogy": 0.2500,
                "oov": 0.3854, "coherence": 0.8051
            }
        }
        return performance.get(model_key, {})

    def find_similar_words(self, model, word: str, topn: int = 10):
        """Return up to ``topn`` (word, similarity) pairs; [] if the word is unknown."""
        try:
            # Full models expose vectors via .wv; bare KeyedVectors are queried directly.
            if hasattr(model, 'wv'):
                return model.wv.most_similar(word, topn=topn)
            else:
                return model.most_similar(word, topn=topn)
        except KeyError:
            # Word2Vec raises KeyError for out-of-vocabulary queries.
            return []
        except Exception as e:
            st.error(f"Error finding similar words: {e}")
            return []

    def word_analogy(self, model, positive: List[str], negative: List[str], topn: int = 5):
        """Vector-arithmetic analogy (king - man + woman ≈ queen); [] on any failure."""
        try:
            if hasattr(model, 'wv'):
                return model.wv.most_similar(positive=positive, negative=negative, topn=topn)
            else:
                return model.most_similar(positive=positive, negative=negative, topn=topn)
        except Exception as e:
            st.error(f"Error performing analogy: {e}")
            return []

    def get_word_vector(self, model, word: str):
        """Return the embedding vector for ``word``, or None if not resolvable."""
        try:
            if hasattr(model, 'wv'):
                return model.wv[word]
            else:
                return model[word]
        except KeyError:
            return None

    def handle_oov_words(self, model, words: List[str]):
        """Probe each word for vocabulary membership and nearest neighbours.

        FastText synthesizes a vector for *any* string from subword n-grams,
        so "has a vector" does not mean "seen in training". BUGFIX: the old
        check (``vector is not None``) therefore reported every word as
        in-vocabulary; membership is now tested against ``key_to_index``.
        """
        results = []
        for word in words:
            try:
                kv = model.wv if hasattr(model, 'wv') else model
                in_vocab = word in kv.key_to_index
                similar = self.find_similar_words(model, word, 3)
                results.append({
                    'word': word,
                    'in_vocab': in_vocab,
                    'similar_words': similar
                })
            except Exception:
                # Best-effort: an unprocessable word is reported as OOV.
                results.append({
                    'word': word,
                    'in_vocab': False,
                    'similar_words': []
                })
        return results
def create_performance_comparison():
    """Build a two-panel bar chart comparing the checkpoints' benchmark scores.

    Left panel: composite score for all models (including Meta's baseline).
    Right panel: semantic similarity, omitting models without a score.
    """
    model_names = [
        "ft_dim100_win5_min5_ngram3-6_sg.epoch1",
        "ft_dim100_win5_min5_ngram3-6_sg.epoch3",
        "w2v_dim200_win5_min5_sg.epoch4",
        "w2v_dim100_win5_min5_sg",
        "cc.tt.300 (Meta)",
    ]
    composite = [0.7019, 0.6675, 0.5685, 0.5566, 0.2000]
    semantic = [0.7368, 0.6894, 0.4445, 0.5187, None]

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Composite Score', 'Semantic Similarity'),
        specs=[[{"type": "bar"}, {"type": "bar"}]],
    )
    # Blue = FastText, orange = Word2Vec, red = Meta baseline.
    fig.add_trace(
        go.Bar(name='Composite Score', x=model_names, y=composite,
               marker_color=['#1f77b4', '#1f77b4', '#ff7f0e', '#ff7f0e', '#d62728']),
        row=1, col=1,
    )
    # The Meta baseline has no semantic score; drop None entries in lockstep.
    scored = [(name, value) for name, value in zip(model_names, semantic) if value is not None]
    fig.add_trace(
        go.Bar(name='Semantic Similarity',
               x=[name for name, _ in scored],
               y=[value for _, value in scored],
               marker_color=['#1f77b4', '#1f77b4', '#ff7f0e', '#ff7f0e']),
        row=1, col=2,
    )
    fig.update_layout(
        title_text="Model Performance Comparison",
        showlegend=False,
        height=400,
    )
    return fig
def create_word_cloud(similar_words, title):
    """Render (word, score) pairs as a pseudo word cloud of plotly annotations.

    Each word is placed at a random position; font size grows linearly with
    similarity (30–100 px) and hue rotates per word. Returns None when there
    is nothing to draw.
    """
    if not similar_words:
        return None

    fig = go.Figure()
    for idx, (word, score) in enumerate(similar_words):
        fig.add_annotation(
            text=word,
            x=np.random.uniform(0.1, 0.9),
            y=np.random.uniform(0.1, 0.9),
            showarrow=False,
            font=dict(size=30 + (score * 70), color=f"hsl({idx*40}, 70%, 50%)"),
            bgcolor="rgba(255,255,255,0.7)",
            bordercolor="rgba(0,0,0,0.1)",
            borderwidth=1,
            borderpad=2,
        )

    # Hide the axes entirely — the plot area is just a canvas for the labels.
    fig.update_layout(
        title=title,
        xaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        yaxis=dict(showticklabels=False, showgrid=False, zeroline=False),
        plot_bgcolor='rgba(0,0,0,0)',
        height=300,
        margin=dict(l=20, r=20, t=40, b=20),
    )
    return fig
def main():
    """Streamlit entry point: sidebar model picker plus four feature tabs.

    BUGFIX: the bare ``except:`` around the vector-dimension metric now
    catches ``Exception`` only, so SystemExit/KeyboardInterrupt propagate.
    """
    # Application header
    st.markdown('<h1 class="main-header">🏆 Tatar2Vec Demo - Tatar Word Embeddings</h1>', unsafe_allow_html=True)
    explorer = Tatar2VecExplorer()

    # ---------------- Sidebar: model selection + quick info ----------------
    with st.sidebar:
        st.header("⚙️ Model Settings")
        model_type = st.selectbox(
            "Model Type:",
            ["FastText", "Word2Vec"],
            index=0
        )
        model_variant = st.radio(
            "Model Variant:",
            ["best", "alternative"],
            format_func=lambda x: "🥇 Best Model" if x == "best" else "🥈 Alternative Model"
        )
        model_key = explorer.available_models[model_type][model_variant]

        st.markdown("---")
        st.subheader("📊 Model Information")
        performance = explorer.get_model_performance(model_key)
        if performance:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Composite Score", f"{performance['composite']:.4f}")
                st.metric("Semantic Similarity", f"{performance['semantic']:.4f}")
            with col2:
                st.metric("Analogy Accuracy", f"{performance['analogy']:.4f}")
                st.metric("OOV Handling", f"{performance['oov']:.4f}")

        st.markdown("---")
        st.subheader("🔍 Quick Search")
        quick_words = ["мәктәп", "китап", "тел", "фән", "табигать"]
        selected_quick = st.selectbox("Example words:", quick_words)
        if st.button("Quick Similarity Search"):
            # Picked up by tab 1's text_input default on the next rerun.
            st.session_state.quick_search = selected_quick

    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Word Search", "🧠 Analogies", "📊 Analysis", "ℹ️ About"])

    # ---------------- Tab 1: similar-word search ----------------
    with tab1:
        st.header("Similar Word Search")
        col1, col2 = st.columns([2, 1])
        with col1:
            search_word = st.text_input(
                "Enter Tatar word:",
                value=getattr(st.session_state, 'quick_search', 'мәктәп'),
                placeholder="e.g., мәктәп, китап, тел..."
            )
        with col2:
            top_n = st.slider("Number of similar words:", 5, 20, 10)
        # Runs on button press OR whenever the text field is non-empty
        # (so the default word produces results on first load).
        if st.button("Find Similar Words") or search_word:
            with st.spinner(f"Loading model and finding words similar to '{search_word}'..."):
                model = explorer.load_model(model_key, model_type.lower())
                if model and search_word.strip():
                    similar_words = explorer.find_similar_words(model, search_word.strip(), top_n)
                    if similar_words:
                        # Results table on the left, word cloud on the right.
                        col1, col2 = st.columns([1, 1])
                        with col1:
                            st.subheader("📈 Similar Words")
                            df = pd.DataFrame(similar_words, columns=["Word", "Similarity"])
                            st.dataframe(df, use_container_width=True)
                        with col2:
                            fig = create_word_cloud(similar_words, f"Words similar to '{search_word}'")
                            if fig:
                                st.plotly_chart(fig, use_container_width=True)
                        st.subheader("📋 Details")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            try:
                                vector = explorer.get_word_vector(model, search_word.strip())
                                if vector is not None:
                                    st.metric("Vector Dimension", len(vector))
                            except Exception:
                                # Metric is cosmetic; skip it rather than crash the tab.
                                pass
                        with col2:
                            st.metric("Similar Words Found", len(similar_words))
                        with col3:
                            if similar_words:
                                st.metric("Max Similarity", f"{similar_words[0][1]:.4f}")
                    else:
                        st.warning(f"Word '{search_word}' not found in model vocabulary.")

    # ---------------- Tab 2: word analogies ----------------
    with tab2:
        st.header("Word Analogies")
        st.markdown("""
**Example:** табиб - ир + хатын = ? (doctor - man + woman = female doctor)
""")
        col1, col2, col3 = st.columns(3)
        with col1:
            positive1 = st.text_input("Positive word 1:", "табиб", placeholder="doctor")
            positive2 = st.text_input("Positive word 2:", "хатын", placeholder="woman")
        with col2:
            negative = st.text_input("Negative word:", "ир", placeholder="man")
        with col3:
            analogy_topn = st.slider("Number of results:", 3, 10, 5)
        if st.button("Perform Analogy"):
            if positive1 and positive2 and negative:
                with st.spinner("Performing analogy..."):
                    model = explorer.load_model(model_key, model_type.lower())
                    if model:
                        analogy_results = explorer.word_analogy(
                            model,
                            positive=[positive1, positive2],
                            negative=[negative],
                            topn=analogy_topn
                        )
                        if analogy_results:
                            st.subheader("🎯 Analogy Results")
                            df = pd.DataFrame(analogy_results, columns=["Word", "Similarity"])
                            st.dataframe(df, use_container_width=True)
                            fig = px.bar(
                                df,
                                x='Similarity',
                                y='Word',
                                orientation='h',
                                title=f"Analogy: {positive1} - {negative} + {positive2}",
                                color='Similarity',
                                color_continuous_scale='viridis'
                            )
                            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
                            st.plotly_chart(fig, use_container_width=True)
                        else:
                            st.error("Could not perform analogy. Please check the input words.")

        st.subheader("🎪 Example Analogies")
        presets = {
            "Education": ("укытучы", "мәктәп", "өй", "teacher - home + school"),
            "Family": ("ата", "кыз", "ул", "father - son + daughter"),
            "Professions": ("шеф", "аш", "ресторан", "chef - restaurant + food")
        }
        cols = st.columns(len(presets))
        for idx, (name, (p1, p2, n, desc)) in enumerate(presets.items()):
            with cols[idx]:
                if st.button(f"🧩 {name}", key=f"preset_{idx}"):
                    # NOTE(review): these session keys are never read back —
                    # the analogy text_inputs above don't use them. Wiring the
                    # inputs to these keys would make the presets functional.
                    st.session_state.analogy_p1 = p1
                    st.session_state.analogy_p2 = p2
                    st.session_state.analogy_n = n
                    st.rerun()

    # ---------------- Tab 3: analysis, OOV testing, model comparison ----------------
    with tab3:
        st.header("Model Analysis")
        st.subheader("📊 Model Performance Comparison")
        perf_fig = create_performance_comparison()
        st.plotly_chart(perf_fig, use_container_width=True)

        st.subheader("🔤 OOV (Out-of-Vocabulary) Testing")
        st.markdown("""
**FastText models** can handle words not seen during training
thanks to subword information.
""")
        oov_words = st.text_area(
            "Enter words for OOV testing (one per line):",
            "технологияләштерү\nцифрлаштыру\nвиртуальлаштыру\nмәктәпчә"
        )
        if st.button("Test OOV") and model_type == "FastText":
            test_words = [word.strip() for word in oov_words.split('\n') if word.strip()]
            with st.spinner("Testing OOV words..."):
                model = explorer.load_model(model_key, "fasttext")
                if model:
                    results = explorer.handle_oov_words(model, test_words)
                    st.subheader("OOV Testing Results")
                    for result in results:
                        col1, col2 = st.columns([1, 3])
                        with col1:
                            status = "✅ In Vocabulary" if result['in_vocab'] else "🆕 OOV Word"
                            st.write(f"**{result['word']}** - {status}")
                        with col2:
                            if result['similar_words']:
                                similar_str = ", ".join([f"{word}({score:.3f})" for word, score in result['similar_words']])
                                st.write(f"Similar: {similar_str}")
                            else:
                                st.write("No similar words found")

        st.subheader("🔄 Model Comparison")
        compare_words = st.text_input("Words to compare across models (comma-separated):", "мәктәп, китап, тел, фән")
        if st.button("Compare Models"):
            words_to_compare = [word.strip() for word in compare_words.split(',')]
            comparison_data = []
            # Query every published checkpoint with every word.
            for model_type_comp in ["FastText", "Word2Vec"]:
                for variant in ["best", "alternative"]:
                    model_key_comp = explorer.available_models[model_type_comp][variant]
                    with st.spinner(f"Testing {model_key_comp}..."):
                        model = explorer.load_model(model_key_comp, model_type_comp.lower())
                        if model:
                            for word in words_to_compare:
                                similar = explorer.find_similar_words(model, word, 3)
                                if similar:
                                    for sim_word, score in similar:
                                        comparison_data.append({
                                            'Model': explorer.get_model_display_name(model_key_comp),
                                            'Type': model_type_comp,
                                            'Source Word': word,
                                            'Similar Word': sim_word,
                                            'Similarity': score
                                        })
            if comparison_data:
                df_compare = pd.DataFrame(comparison_data)
                st.dataframe(df_compare, use_container_width=True)

    # ---------------- Tab 4: static project information ----------------
    with tab4:
        st.header("ℹ️ About Tatar2Vec")
        st.markdown("""
## 🏆 Tatar2Vec - High-quality Tatar Word Embeddings
This repository contains the best performing FastText and Word2Vec models for Tatar,
selected through comprehensive evaluation of 57 different model configurations.
### 🎯 Key Features:
- **High Quality**: Our models significantly outperform pre-trained Meta models
- **Large Vocabulary**: 637.7K words
- **Multiple Architectures**: FastText and Word2Vec
- **OOV Support**: FastText models handle out-of-vocabulary words
### 📊 Key Results:
- **Best Model**: FastText with composite score 0.7019 (vs 0.2000 for Meta)
- **Best Architecture**: Skip-gram outperforms CBOW
- **Optimal Dimension**: 100-dimensional models perform better than 200/300-dimensional
### 🎪 Use Cases:
- Semantic similarity search
- Word analogies
- Text classification
- Machine translation
- And much more!
### 📚 Training Corpus:
- **Total Tokens**: 207.02M
- **Unique Words**: 2.1M
- **Vocabulary**: 637.7K
- **Sources**: Wikipedia, news, books, social media
### 📜 Citation:
```bibtex
@misc{Tatar2Vec_20251109,
title = {Tatar2Vec: Tatar Word Embeddings},
author = {Arabovs AI Lab},
year = 2025,
publisher = {Hugging Face},
url = {https://huggingface.co/arabovs-ai-lab/Tatar2Vec}
}
```
### 📄 License: MIT License
""")
# Script entry point: run the Streamlit app when executed directly
# (fix: the call must be indented inside the guard's suite).
if __name__ == "__main__":
    main()