| | import gradio as gr |
| | import torch |
| | import torch.nn.functional as F |
| | import numpy as np |
| | import plotly.express as px |
| | import pandas as pd |
| | import spaces |
| | from typing import List, Tuple, Dict |
| | from torch import Tensor |
| | from transformers import AutoTokenizer, AutoModel |
| | from sentence_transformers import SentenceTransformer |
| | import json |
| |
|
| | |
| | embedder = None |
| |
|
| | AVAILABLE_MODELS = { |
| | "Qwen3-Embedding-0.6B": "Qwen/Qwen3-Embedding-0.6B", |
| | "Semantic-Ar-Qwen-Embed-0.6B": "Omartificial-Intelligence-Space/Semantic-Ar-Qwen-Embed-0.6B", |
| | "AraGemma-Embedding-300m" : "Omartificial-Intelligence-Space/AraGemma-Embedding-300m" |
| | } |
| |
|
class QwenEmbedder:
    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        # Native output size of the loaded model (e.g. 1024 for Qwen3-Embedding-0.6B).
        self.native_dim = self.model.get_sentence_embedding_dimension() or embedding_dim
        # Requested sizes above the native dimension are clamped; smaller sizes are applied
        # by truncating and re-normalising the embeddings, so no untrained projection layer
        # is involved.
        self.embedding_dim = min(embedding_dim, self.native_dim) if embedding_dim else self.native_dim

    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> torch.Tensor:
        if with_instruction:
            texts = [f"Represent this Arabic text for retrieval: {text}" for text in texts]

        embeddings = self.model.encode(texts, convert_to_tensor=True)

        if self.embedding_dim < self.native_dim:
            embeddings = embeddings[:, :self.embedding_dim]

        embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings
| |
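# Hedged usage sketch of the class above; defined purely for illustration and never called
# by the app. The example texts and the 256-dim setting are arbitrary.
def _example_embedder_usage() -> float:
    example = QwenEmbedder("Qwen/Qwen3-Embedding-0.6B", embedding_dim=256)
    vecs = example.get_embeddings(["أحب القراءة كثيراً", "القراءة من أحب هواياتي"])
    # get_embeddings() returns unit-norm rows, so a dot product is a cosine similarity.
    return float(vecs[0] @ vecs[1])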
|
@spaces.GPU(duration=120)
def initialize_embedder(model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Initializing embedder on device: {device}")

    return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim)
| |
|
@spaces.GPU(duration=120)
def process_with_embedder(fn_name, *args):
    """Generic handler for embedder operations.

    Every dispatched function takes (embedder, ..., model_choice, embedding_dim), so the
    last two positional arguments tell us which model and dimension the UI has selected.
    """
    global embedder

    model_choice, embedding_dim = args[-2], args[-1]
    model_name = AVAILABLE_MODELS.get(model_choice, AVAILABLE_MODELS["Qwen3-Embedding-0.6B"])
    requested_dim = int(embedding_dim) if embedding_dim else 768

    # (Re)build the embedder if it is missing or no longer matches the UI settings.
    if (
        embedder is None
        or embedder.model_name != model_name
        or embedder.embedding_dim != min(requested_dim, embedder.native_dim)
    ):
        embedder = initialize_embedder(model_name=model_name, embedding_dim=requested_dim)

    fn_map = {
        'compute_similarity': compute_similarity,
        'rerank_documents': rerank_documents,
        'process_batch_embeddings': process_batch_embeddings,
        'process_retrieval': process_retrieval,
        'process_cross_lingual': process_cross_lingual,
        'classify_text': classify_text,
        'cluster_documents': cluster_documents,
        'analyze_sentiment': analyze_sentiment,
        'extract_concepts': extract_concepts
    }

    return fn_map[fn_name](embedder, *args)
| |
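# Example dispatch (arguments are illustrative): the UI lambdas below invoke the handler as
#   process_with_embedder('compute_similarity', "أحب القراءة كثيراً", "القراءة من أحب هواياتي",
#                         "Qwen3-Embedding-0.6B", 768)
# i.e. the target function name first, followed by exactly the positional arguments that the
# mapped function expects after its `embedder` parameter.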
|
| | |
| | device = "cuda" if torch.cuda.is_available() else "cpu" |
| | zero = torch.Tensor([0]).to(device) |
| | print(f"Device being used: {zero.device}") |
| |
|
| | def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: |
| | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) |
| | if left_padding: |
| | return last_hidden_states[:, -1] |
| | else: |
| | sequence_lengths = attention_mask.sum(dim=1) - 1 |
| | batch_size = last_hidden_states.shape[0] |
| | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] |
| |
|
| | def get_detailed_instruct(task_description: str, query: str) -> str: |
| | return f'Instruct: {task_description}\nQuery: {query}' |
| |
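# For example (illustrative query):
#   get_detailed_instruct('Given a web search query, retrieve relevant passages that answer the query',
#                         'ما هي عواصم الدول العربية؟')
# returns:
#   'Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: ما هي عواصم الدول العربية؟'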
|
| | def tokenize(tokenizer, input_texts, eod_id, max_length): |
| | batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2) |
| | for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]): |
| | seq.append(eod_id) |
| | att.append(1) |
| | batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt") |
| | return batch_dict |
| |
|
| | def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str, model_choice: str = None, embedding_dim: int = None) -> float: |
| | embeddings = embedder.get_embeddings([text1, text2]) |
| | similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item() |
| | return round(similarity, 3) |
| |
|
| | def rerank_documents(embedder: QwenEmbedder, query: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: |
| | docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
| | |
| | |
| | task = 'Given a search query, retrieve relevant passages that answer the query' |
| | query_with_instruct = get_detailed_instruct(task, query) |
| | |
| | |
| | query_embedding = embedder.get_embeddings([query_with_instruct]) |
| | doc_embeddings = embedder.get_embeddings(docs_list) |
| | |
| | |
| | scores = (query_embedding @ doc_embeddings.T).squeeze(0) |
| | results = [(doc, float(score)) for doc, score in zip(docs_list, scores)] |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | |
| | return [(doc, round(score, 3)) for doc, score in results] |
| |
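# Because get_embeddings() returns unit-norm vectors, `query_embedding @ doc_embeddings.T`
# above is a row of cosine similarities. A call such as (scores are hypothetical):
#   rerank_documents(embedder, "ما هي عواصم الدول العربية؟",
#                    "القاهرة هي عاصمة مصر.\nبغداد عاصمة العراق.")
# returns a list like [("القاهرة هي عاصمة مصر.", 0.71), ("بغداد عاصمة العراق.", 0.66)].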
|
| | def process_batch_embeddings(embedder: QwenEmbedder, texts: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: |
| | text_list = [text.strip() for text in texts.split('\n') if text.strip()] |
| | if len(text_list) < 1: |
| | return pd.DataFrame() |
| | |
| | embeddings = embedder.get_embeddings(text_list) |
| | scores = (embeddings @ embeddings.T).cpu().numpy() |
| | |
| | |
| | df_similarities = pd.DataFrame( |
| | scores, |
| | index=text_list, |
| | columns=text_list |
| | ) |
| | |
| | return df_similarities.round(3) |
| |
|
| | def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: |
| | |
| | query_list = [q.strip() for q in queries.split('\n') if q.strip()] |
| | doc_list = [d.strip() for d in documents.split('\n') if d.strip()] |
| | |
| | if not query_list or not doc_list: |
| | return pd.DataFrame() |
| | |
| | |
| | instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list] |
| | |
| | |
| | query_embeddings = embedder.get_embeddings(instructed_queries) |
| | doc_embeddings = embedder.get_embeddings(doc_list) |
| | |
| | |
| | scores = (query_embeddings @ doc_embeddings.T).cpu().numpy() |
| | |
| | |
| | df = pd.DataFrame(scores, index=query_list, columns=doc_list) |
| | return df.round(3) |
| |
|
def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str, model_choice: str = None, embedding_dim: int = None) -> float:
    # The UI binds this output to a gr.Number component, so return the score itself rather than a dict.
    embeddings = embedder.get_embeddings([arabic_text, english_text])
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    return round(similarity, 3)
| |
|
| | def classify_text(embedder: QwenEmbedder, text: str, categories: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: |
| | cat_list = [c.strip() for c in categories.split('\n') if c.strip()] |
| | text_embedding = embedder.get_embeddings([text]) |
| | cat_embeddings = embedder.get_embeddings(cat_list) |
| | scores = (text_embedding @ cat_embeddings.T).squeeze(0) |
| | results = [(cat, float(score)) for cat, score in zip(cat_list, scores)] |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | return [(cat, round(score, 3)) for cat, score in results] |
| |
|
| | def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: |
| | from sklearn.cluster import KMeans |
| | doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
    num_clusters = int(num_clusters)  # gr.Slider may deliver the value as a float
    if len(doc_list) < num_clusters:
        return pd.DataFrame()
| | |
| | embeddings = embedder.get_embeddings(doc_list) |
| | |
| | |
| | kmeans = KMeans(n_clusters=num_clusters, random_state=42) |
| | clusters = kmeans.fit_predict(embeddings.cpu().numpy()) |
| | |
| | |
| | cluster_centers = kmeans.cluster_centers_ |
| | cluster_center_docs = [] |
| | |
| | for i in range(num_clusters): |
| | cluster_docs = [doc for doc, cluster in zip(doc_list, clusters) if cluster == i] |
| | cluster_embeddings = embedder.get_embeddings(cluster_docs) |
        # Match the device/dtype of the embeddings so cosine_similarity also works on GPU.
        center_embedding = torch.tensor(cluster_centers[i], dtype=cluster_embeddings.dtype, device=cluster_embeddings.device).unsqueeze(0)
| | similarities = F.cosine_similarity(cluster_embeddings, center_embedding) |
| | center_doc = cluster_docs[similarities.argmax().item()] |
| | cluster_center_docs.append(center_doc) |
| | |
| | |
| | df = pd.DataFrame({ |
| | 'Document': doc_list, |
| | 'Cluster': clusters, |
| | 'Cluster Center Document': [cluster_center_docs[c] for c in clusters] |
| | }) |
| | return df.sort_values('Cluster') |
| |
|
| | def analyze_sentiment(embedder: QwenEmbedder, text: str, model_choice: str = None, embedding_dim: int = None) -> Tuple[str, dict]: |
| | |
| | anchors = { |
| | "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية", |
| | "positive": "هذا جيد وممتع", |
| | "neutral": "هذا عادي ومقبول", |
| | "negative": "هذا سيء ومزعج", |
| | "very_negative": "هذا فظيع جداً ومحبط للغاية" |
| | } |
| | |
| | |
| | text_embedding = embedder.get_embeddings([text]) |
| | anchor_embeddings = embedder.get_embeddings(list(anchors.values())) |
| | |
| | |
| | scores = (text_embedding @ anchor_embeddings.T).squeeze(0) |
| | results = list(zip(anchors.keys(), scores.tolist())) |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | |
| | |
| | return ( |
| | results[0][0], |
| | {k: round(float(v), 3) for k, v in results} |
| | ) |
| |
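# Example return value (label plus per-anchor scores; the numbers are hypothetical):
#   ("very_positive", {"very_positive": 0.74, "positive": 0.69, "neutral": 0.41,
#                      "negative": 0.18, "very_negative": 0.12})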
|
| | def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: |
| | |
| | concept_anchors = { |
| | "emotions": [ |
| | "الفرح والسعادة", |
| | "الحزن والأسى", |
| | "الغضب والإحباط", |
| | "الخوف والقلق", |
| | "الحب والعاطفة", |
| | "الأمل والتفاؤل" |
| | ], |
| | "topics": [ |
| | "السياسة والحكم", |
| | "الاقتصاد والمال", |
| | "العلوم والتكنولوجيا", |
| | "الفن والثقافة", |
| | "الرياضة والترفيه", |
| | "التعليم والمعرفة" |
| | ], |
| | "themes": [ |
| | "العدالة والمساواة", |
| | "التقدم والتطور", |
| | "التقاليد والتراث", |
| | "الحرية والاستقلال", |
| | "التعاون والوحدة", |
| | "الإبداع والابتكار" |
| | ] |
| | } |
| | |
| | anchors = concept_anchors.get(concept_type, concept_anchors["topics"]) |
| | |
| | |
| | text_embedding = embedder.get_embeddings([text]) |
| | anchor_embeddings = embedder.get_embeddings(anchors) |
| | |
| | |
| | scores = (text_embedding @ anchor_embeddings.T).squeeze(0) |
| | results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)] |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | |
| | return [(concept, round(score, 3)) for concept, score in results] |
| |
|
| | def create_embedder(model_choice: str, embedding_dim: int = 768) -> QwenEmbedder: |
| | model_name = AVAILABLE_MODELS[model_choice] |
| | return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim) |
| |
|
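# The helpers below build a fresh embedder on each call via create_embedder(); they are kept
# for programmatic use and are not wired into the Gradio UI, which dispatches through
# process_with_embedder instead. A hedged usage sketch (illustrative values):
#   ar_embedder = create_embedder("AraGemma-Embedding-300m", embedding_dim=256)
#   score = process_similarity("النص الأول", "النص الثاني", "AraGemma-Embedding-300m", 256)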
| | def process_similarity(text1: str, text2: str, model_choice: str, embedding_dim: int) -> float: |
| | embedder = create_embedder(model_choice, embedding_dim) |
| | embeddings = embedder.get_embeddings([text1, text2]) |
| | similarity = torch.nn.functional.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)) |
| | return float(similarity) |
| |
|
| | def process_reranking(query: str, documents: str, model_choice: str, embedding_dim: int) -> Dict: |
| | embedder = create_embedder(model_choice, embedding_dim) |
| | documents = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
| | |
| | query_embedding = embedder.get_embeddings([query], with_instruction=True) |
| | doc_embeddings = embedder.get_embeddings(documents) |
| | |
| | similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings) |
| | |
| | |
| | sorted_indices = torch.argsort(similarities, descending=True) |
| | results = [] |
| | for idx in sorted_indices: |
| | results.append({ |
| | 'document': documents[idx], |
| | 'score': float(similarities[idx]) |
| | }) |
| | |
| | return {'results': results} |
| |
|
| | def process_batch(texts: str, model_choice: str, embedding_dim: int) -> Dict: |
| | embedder = create_embedder(model_choice, embedding_dim) |
| | texts = [text.strip() for text in texts.split('\n') if text.strip()] |
| | |
| | embeddings = embedder.get_embeddings(texts) |
| | similarity_matrix = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2) |
| | |
| | df = pd.DataFrame(similarity_matrix.cpu().numpy(), index=texts, columns=texts) |
| | return {'similarity_matrix': df.to_dict()} |
| |
|
def process_retrieval_standalone(prompt: str, queries: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
    # Renamed so it does not shadow the embedder-based process_retrieval defined above,
    # which process_with_embedder dispatches to for the Multi-Query Retrieval tab.
    embedder = create_embedder(model_choice, embedding_dim)
| | |
| | |
| | queries = [q.strip() for q in queries.split('\n') if q.strip()] |
| | documents = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
| | |
| | |
| | prompt_embedding = embedder.get_embeddings([prompt], with_instruction=True) |
| | query_embeddings = embedder.get_embeddings(queries, with_instruction=True) |
| | doc_embeddings = embedder.get_embeddings(documents) |
| | |
| | |
| | query_similarities = torch.nn.functional.cosine_similarity(prompt_embedding, query_embeddings) |
| | doc_similarities = torch.nn.functional.cosine_similarity(prompt_embedding.repeat(len(documents), 1), doc_embeddings) |
| | |
| | |
| | results = { |
| | 'relevant_queries': [], |
| | 'relevant_documents': [] |
| | } |
| | |
| | |
| | query_indices = torch.argsort(query_similarities, descending=True) |
| | for idx in query_indices: |
| | results['relevant_queries'].append({ |
| | 'query': queries[idx], |
| | 'similarity': float(query_similarities[idx]) |
| | }) |
| | |
| | |
| | doc_indices = torch.argsort(doc_similarities, descending=True) |
| | for idx in doc_indices: |
| | results['relevant_documents'].append({ |
| | 'document': documents[idx], |
| | 'similarity': float(doc_similarities[idx]) |
| | }) |
| | |
| | return results |
| |
|
| | |
| | custom_css = """ |
| | :root { |
| | --primary-color: #2196F3; |
| | --secondary-color: #1976D2; |
| | --background-color: #f8f9fa; |
| | --sidebar-bg: #ffffff; |
| | --text-color: #333333; |
| | --border-color: #e0e0e0; |
| | } |
| | |
| | .container { |
| | max-width: 1200px; |
| | margin: auto; |
| | padding: 20px; |
| | } |
| | |
| | .sidebar { |
| | background-color: var(--sidebar-bg); |
| | border-right: 1px solid var(--border-color); |
| | padding: 20px; |
| | margin-right: 20px; |
| | position: sticky; |
| | top: 0; |
| | height: 100vh; |
| | overflow-y: auto; |
| | } |
| | |
| | .main-content { |
| | background-color: var(--background-color); |
| | padding: 20px; |
| | border-radius: 10px; |
| | } |
| | |
| | .features-grid { |
| | display: grid; |
| | grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); |
| | gap: 15px; |
| | margin: 15px 0; |
| | } |
| | |
| | .feature-card { |
| | background: white; |
| | padding: 15px; |
| | border-radius: 6px; |
| | box-shadow: 0 1px 3px rgba(0,0,0,0.1); |
| | transition: all 0.3s ease; |
| | border: 1px solid var(--border-color); |
| | text-align: center; |
| | } |
| | |
| | .feature-card:hover { |
| | transform: translateY(-3px); |
| | box-shadow: 0 3px 6px rgba(0,0,0,0.15); |
| | border-color: var(--primary-color); |
| | } |
| | |
| | .feature-icon { |
| | font-size: 24px; |
| | margin-bottom: 10px; |
| | color: var(--primary-color); |
| | } |
| | |
| | .feature-card h3 { |
| | color: var(--text-color); |
| | margin: 8px 0; |
| | font-size: 0.95em; |
| | font-weight: 600; |
| | } |
| | |
| | .feature-card p { |
| | color: #666; |
| | font-size: 0.8em; |
| | line-height: 1.3; |
| | margin: 5px 0; |
| | } |
| | |
| | .features-summary { |
| | margin: 40px 0; |
| | padding: 30px; |
| | background: white; |
| | border-radius: 12px; |
| | box-shadow: 0 2px 8px rgba(0,0,0,0.1); |
| | } |
| | |
| | .features-summary h2 { |
| | color: var(--text-color); |
| | margin-bottom: 25px; |
| | text-align: center; |
| | font-size: 1.5em; |
| | } |
| | |
| | .feature-list { |
| | display: grid; |
| | grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); |
| | gap: 30px; |
| | } |
| | |
| | .feature-group { |
| | padding: 20px; |
| | background: var(--background-color); |
| | border-radius: 8px; |
| | border: 1px solid var(--border-color); |
| | } |
| | |
| | .feature-group h3 { |
| | color: var(--primary-color); |
| | margin-bottom: 15px; |
| | font-size: 1.2em; |
| | } |
| | |
| | .feature-group ul { |
| | list-style: none; |
| | padding: 0; |
| | margin: 0; |
| | } |
| | |
| | .feature-group li { |
| | padding: 8px 0; |
| | color: var(--text-color); |
| | position: relative; |
| | padding-left: 20px; |
| | } |
| | |
| | .feature-group li:before { |
| | content: "•"; |
| | color: var(--primary-color); |
| | position: absolute; |
| | left: 0; |
| | } |
| | |
| | .description { |
| | margin: 20px 0; |
| | padding: 15px; |
| | border-radius: 8px; |
| | background-color: #ffffff; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | } |
| | |
| | .example { |
| | margin: 10px 0; |
| | padding: 15px; |
| | border-left: 4px solid var(--primary-color); |
| | background-color: #ffffff; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | } |
| | |
| | .warning { |
| | color: #721c24; |
| | background-color: #f8d7da; |
| | border: 1px solid #f5c6cb; |
| | padding: 15px; |
| | border-radius: 8px; |
| | margin: 10px 0; |
| | } |
| | |
| | .settings { |
| | background-color: #ffffff; |
| | padding: 20px; |
| | border-radius: 8px; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | margin: 20px 0; |
| | } |
| | |
| | .tab-content { |
| | padding: 20px; |
| | background-color: #ffffff; |
| | border-radius: 8px; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | } |
| | |
| | .heading { |
| | color: var(--text-color); |
| | margin-bottom: 20px; |
| | padding-bottom: 10px; |
| | border-bottom: 2px solid var(--primary-color); |
| | } |
| | |
| | button.primary { |
| | background-color: var(--primary-color) !important; |
| | } |
| | |
| | button.secondary { |
| | background-color: var(--secondary-color) !important; |
| | } |
| | """ |
| |
|
| | |
| | def create_demo(): |
| | demo = gr.Blocks(title="Advanced Text Processing with Arabic Semantic Embeddings", css=custom_css, theme=gr.themes.Soft()) |
| | |
| | with demo: |
| | with gr.Row(): |
| | |
| | with gr.Column(scale=1, elem_classes="sidebar"): |
| | gr.Markdown(""" |
| | # Arabic Semantic Embeddings |
| | |
| | ### Navigation |
| | - [Configuration](#configuration) |
| | - [Features](#features) |
| | - [Documentation](#documentation) |
| | """) |
| | |
| | with gr.Accordion("Configuration", open=True): |
| | gr.Markdown(""" |
| | ### Model Settings |
| | Configure the embedding model parameters below. |
| | """) |
| | |
| | model_choice = gr.Dropdown( |
| | choices=list(AVAILABLE_MODELS.keys()), |
| | value=list(AVAILABLE_MODELS.keys())[0], |
| | label="Select Model" |
| | ) |
| | embedding_dim = gr.Slider( |
| | minimum=32, |
| | maximum=1024, |
| | value=768, |
| | step=32, |
| | label="Embedding Dimension", |
| | elem_classes="settings" |
| | ) |
| | update_dim_btn = gr.Button("Update Dimension", variant="secondary") |
| | dim_status = gr.Textbox(label="Status", interactive=False) |
| | |
| | with gr.Accordion("Documentation", open=False): |
| | gr.Markdown(""" |
| | ### Usage Guide |
| | |
| | 1. **Embedding Dimension** |
| | - 32-128: Fast, simple tasks |
| | - 256-512: Balanced performance |
| | - 768: Default, full model |
| | - 1024: Maximum detail |
| | |
| | 2. **Best Practices** |
| | - Use appropriate dimensions for your task |
| | - Consider batch size for multiple documents |
| | - Test different settings for optimal results |
| | """) |
| | |
| | |
| | with gr.Column(scale=4): |
| | gr.Markdown(""" |
| | # Advanced Text Processing Suite |
| | |
| | Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings. |
| | This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages. |
| | """) |
| | |
| | |
| | gr.HTML(""" |
| | <div class="features-grid"> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🔄</div> |
| | <h3>Text Similarity</h3> |
| | <p>Compare text meanings</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🔍</div> |
| | <h3>Semantic Search</h3> |
| | <p>Find relevant docs</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">📊</div> |
| | <h3>Batch Analysis</h3> |
| | <p>Process multiple texts</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🎯</div> |
| | <h3>Multi-Query</h3> |
| | <p>Advanced retrieval</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🌐</div> |
| | <h3>Cross-Lingual</h3> |
| | <p>Cross-language match</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🏷️</div> |
| | <h3>Classification</h3> |
| | <p>Categorize texts</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🔮</div> |
| | <h3>Clustering</h3> |
| | <p>Group documents</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">😊</div> |
| | <h3>Sentiment</h3> |
| | <p>Analyze emotions</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🎨</div> |
| | <h3>Concepts</h3> |
| | <p>Extract themes</p> |
| | </div> |
| | </div> |
| | """) |
| | |
| | with gr.Tabs() as tabs: |
| | |
| | with gr.Tab("Text Similarity Analysis"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Text Similarity Analysis |
| | Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning). |
| | |
| | <div class="example"> |
| | <strong>Try these Arabic examples:</strong><br> |
| | • "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br> |
| | • "السماء صافية اليوم" and "الطقس حار جداً" |
| | </div> |
| | """) |
| | |
| | with gr.Row(): |
| | text1 = gr.Textbox( |
| | label="First Text", |
| | lines=3, |
| | placeholder="Enter first text here...", |
| | value="أحب القراءة كثيراً" |
| | ) |
| | text2 = gr.Textbox( |
| | label="Second Text", |
| | lines=3, |
| | placeholder="Enter second text here...", |
| | value="القراءة من أحب هواياتي" |
| | ) |
| | similarity_btn = gr.Button("Calculate Similarity", variant="primary") |
| | similarity_score = gr.Number(label="Similarity Score") |
| | |
| | similarity_btn.click( |
| | fn=lambda t1, t2, m, d: process_with_embedder('compute_similarity', t1, t2, m, d), |
| | inputs=[text1, text2, model_choice, embedding_dim], |
| | outputs=similarity_score |
| | ) |
| | |
| | |
| | with gr.Tab("Semantic Search & Reranking"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Semantic Search & Document Reranking |
| | Search through a collection of documents and rank them by semantic relevance to your query. |
| | |
| | <div class="example"> |
| | <strong>Try these Arabic queries:</strong><br> |
| | • "ما هي عواصم الدول العربية؟"<br> |
| | • "أين تقع أكبر المدن العربية؟"<br> |
| | • "ما هي المراكز الثقافية العربية؟" |
| | </div> |
| | """) |
| | |
| | query_text = gr.Textbox( |
| | label="Search Query", |
| | placeholder="Enter your search query...", |
| | value="ما هي عواصم الدول العربية؟" |
| | ) |
| | documents_text = gr.Textbox( |
| | label="Documents Collection (one per line)", |
| | lines=10, |
| | placeholder="Enter documents here, one per line...", |
| | value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها. |
| | الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي. |
| | دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا. |
| | بغداد عاصمة العراق وتقع على نهر دجلة. |
| | الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية. |
| | تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي.""" |
| | ) |
| | rerank_btn = gr.Button("Search & Rank", variant="primary") |
| | rerank_results = gr.Dataframe( |
| | headers=["Document", "Relevance Score"], |
| | label="Search Results" |
| | ) |
| | |
| | rerank_btn.click( |
| | fn=lambda q, d, m, e: process_with_embedder('rerank_documents', q, d, m, e), |
| | inputs=[query_text, documents_text, model_choice, embedding_dim], |
| | outputs=rerank_results |
| | ) |
| | |
| | |
| | with gr.Tab("Batch Similarity Analysis"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Batch Similarity Analysis |
| | Analyze semantic relationships between multiple texts simultaneously. |
| | |
| | <div class="example"> |
| | <strong>The example shows Arabic proverbs about friendship:</strong><br> |
| | See how the model captures the semantic relationships between similar themes. |
| | </div> |
| | """) |
| | |
| | batch_texts = gr.Textbox( |
| | label="Input Texts (one per line)", |
| | lines=10, |
| | placeholder="Enter texts here, one per line...", |
| | value="""الصديق وقت الضيق. |
| | الصديق الحقيقي يظهر عند الشدائد. |
| | عند المحن تعرف إخوانك. |
| | وقت الشدة بتعرف صحابك. |
| | الصاحب ساحب.""" |
| | ) |
| | process_btn = gr.Button("Analyze Relationships", variant="primary") |
| | similarity_matrix = gr.Dataframe( |
| | label="Similarity Matrix", |
| | wrap=True |
| | ) |
| | |
| | process_btn.click( |
| | fn=lambda t, m, e: process_with_embedder('process_batch_embeddings', t, m, e), |
| | inputs=[batch_texts, model_choice, embedding_dim], |
| | outputs=[similarity_matrix] |
| | ) |
| |
|
| | |
| | with gr.Tab("Multi-Query Retrieval"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Multi-Query Document Retrieval |
| | Match multiple queries against multiple documents simultaneously using semantic search. |
| | |
| | <div class="description"> |
| | This tab implements the exact retrieval logic from the Qwen example, allowing you to: |
| | - Define a custom task prompt |
| | - Input multiple queries |
| | - Input multiple documents |
| | - See all query-document match scores in a matrix |
| | </div> |
| | |
| | <div class="example"> |
| | <strong>Try these examples:</strong><br> |
| | <strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br> |
| | <strong>Queries:</strong> |
| | • "ما هي أكبر المدن العربية؟" |
| | • "أين تقع أهم المراكز الثقافية؟"<br> |
| | <strong>Documents:</strong> Use the example documents or add your own |
| | </div> |
| | """) |
| | |
| | task_prompt = gr.Textbox( |
| | label="Task Prompt", |
| | placeholder="Enter the task description here...", |
| | value="Given a web search query, retrieve relevant passages that answer the query", |
| | lines=2 |
| | ) |
| | |
| | with gr.Row(): |
| | queries_text = gr.Textbox( |
| | label="Queries (one per line)", |
| | placeholder="Enter your queries here, one per line...", |
| | value="""ما هي أكبر المدن العربية؟ |
| | أين تقع أهم المراكز الثقافية؟""", |
| | lines=5 |
| | ) |
| | documents_text = gr.Textbox( |
| | label="Documents (one per line)", |
| | placeholder="Enter your documents here, one per line...", |
| | value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية. |
| | الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم. |
| | دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي. |
| | بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""", |
| | lines=5 |
| | ) |
| | |
| | retrieve_btn = gr.Button("Process Retrieval", variant="primary") |
| | retrieval_matrix = gr.Dataframe( |
| | label="Query-Document Relevance Matrix", |
| | wrap=True |
| | ) |
| | |
| | gr.Markdown(""" |
| | <div class="description"> |
| | <strong>How to read the results:</strong> |
| | - Each row represents a query |
| | - Each column represents a document |
| | - Values show the relevance score (0-1) between each query-document pair |
| | - Higher scores indicate better matches |
| | </div> |
| | """) |
| | |
| | retrieve_btn.click( |
| | fn=lambda p, q, d, m, e: process_with_embedder('process_retrieval', p, q, d, m, e), |
| | inputs=[task_prompt, queries_text, documents_text, model_choice, embedding_dim], |
| | outputs=[retrieval_matrix] |
| | ) |
| |
|
| | |
| | with gr.Tab("Cross-Lingual Matching"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Cross-Lingual Semantic Matching |
| | Compare the meaning of texts across Arabic and English languages. |
| | |
| | <div class="description"> |
| | This feature demonstrates the model's ability to understand semantic similarity across different languages. |
| | Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning. |
| | </div> |
| | |
| | <div class="example"> |
| | <strong>Try these examples:</strong><br> |
| | <strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br> |
| | <strong>English:</strong> "Reading nourishes the mind and soul"<br> |
| | Or try your own pairs of semantically similar texts in both languages. |
| | </div> |
| | """) |
| | |
| | with gr.Row(): |
| | arabic_text = gr.Textbox( |
| | label="Arabic Text", |
| | placeholder="Enter Arabic text here...", |
| | value="القراءة غذاء العقل والروح", |
| | lines=3 |
| | ) |
| | english_text = gr.Textbox( |
| | label="English Text", |
| | placeholder="Enter English text here...", |
| | value="Reading nourishes the mind and soul", |
| | lines=3 |
| | ) |
| | |
| | match_btn = gr.Button("Compare Texts", variant="primary") |
| | with gr.Row(): |
| | cross_lingual_score = gr.Number( |
| | label="Cross-Lingual Similarity Score", |
| | value=None |
| | ) |
| | |
| | gr.Markdown(""" |
| | <div class="description"> |
| | <strong>Understanding the score:</strong> |
| | - Score ranges from 0 (completely different meaning) to 1 (same meaning) |
| | - Scores above 0.7 usually indicate strong semantic similarity |
| | - The model considers the meaning, not just word-for-word translation |
| | </div> |
| | """) |
| | |
| | match_btn.click( |
| | fn=lambda a, e, m, d: process_with_embedder('process_cross_lingual', a, e, m, d), |
| | inputs=[arabic_text, english_text, model_choice, embedding_dim], |
| | outputs=[cross_lingual_score] |
| | ) |
| |
|
| | |
| | with gr.Tab("Text Classification"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Text Classification |
| | Classify text into predefined categories using semantic similarity. |
| | |
| | <div class="description"> |
| | The model will compare your text against each category and rank them by relevance. |
| | You can define your own categories or use the provided examples. |
| | </div> |
| | """) |
| | |
| | input_text = gr.Textbox( |
| | label="Input Text", |
| | placeholder="Enter the text to classify...", |
| | value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل", |
| | lines=3 |
| | ) |
| | |
| | categories_text = gr.Textbox( |
| | label="Categories (one per line)", |
| | placeholder="Enter categories here...", |
| | value="""التكنولوجيا والابتكار |
| | الاقتصاد والأعمال |
| | التعليم والتدريب |
| | الثقافة والفنون |
| | الصحة والطب""", |
| | lines=5 |
| | ) |
| | |
| | classify_btn = gr.Button("Classify Text", variant="primary") |
| | classification_results = gr.Dataframe( |
| | headers=["Category", "Relevance Score"], |
| | label="Classification Results" |
| | ) |
| | |
| | classify_btn.click( |
| | fn=lambda t, c, m, e: process_with_embedder('classify_text', t, c, m, e), |
| | inputs=[input_text, categories_text, model_choice, embedding_dim], |
| | outputs=classification_results |
| | ) |
| |
|
| | |
| | with gr.Tab("Document Clustering"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Document Clustering |
| | Group similar documents together using semantic clustering. |
| | |
| | <div class="description"> |
| | This feature will: |
| | - Group similar documents into clusters |
| | - Identify the most representative document for each cluster |
| | - Help discover themes and patterns in your document collection |
| | </div> |
| | """) |
| | |
| | cluster_docs = gr.Textbox( |
| | label="Documents (one per line)", |
| | placeholder="Enter documents to cluster...", |
| | value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب. |
| | الروبوتات تساعد الأطباء في إجراء العمليات الجراحية. |
| | التعلم الآلي يحسن من دقة التشخيص الطبي. |
| | الفن يعبر عن مشاعر الإنسان وأحاسيسه. |
| | الموسيقى لغة عالمية تتخطى حدود الثقافات. |
| | الرسم والنحت من أقدم أشكال التعبير الفني. |
| | التجارة الإلكترونية تغير نمط التسوق التقليدي. |
| | التسوق عبر الإنترنت يوفر الوقت والجهد. |
| | المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""", |
| | lines=10 |
| | ) |
| | |
| | num_clusters = gr.Slider( |
| | minimum=2, |
| | maximum=10, |
| | value=3, |
| | step=1, |
| | label="Number of Clusters" |
| | ) |
| | |
| | cluster_btn = gr.Button("Cluster Documents", variant="primary") |
| | clustering_results = gr.Dataframe( |
| | label="Clustering Results" |
| | ) |
| | |
| | cluster_btn.click( |
| | fn=lambda d, n, m, e: process_with_embedder('cluster_documents', d, n, m, e), |
| | inputs=[cluster_docs, num_clusters, model_choice, embedding_dim], |
| | outputs=clustering_results |
| | ) |
| |
|
| | |
| | with gr.Tab("Sentiment Analysis"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Arabic Sentiment Analysis |
| | Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors. |
| | |
| | <div class="description"> |
| | The model will compare your text against predefined sentiment anchors and determine: |
| | - The overall sentiment |
| | - Confidence scores for each sentiment level |
| | </div> |
| | """) |
| | |
| | sentiment_text = gr.Textbox( |
| | label="Text to Analyze", |
| | placeholder="Enter text to analyze sentiment...", |
| | value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين", |
| | lines=3 |
| | ) |
| | |
| | analyze_btn = gr.Button("Analyze Sentiment", variant="primary") |
| | |
| | with gr.Row(): |
| | sentiment_label = gr.Label(label="Overall Sentiment") |
| | sentiment_scores = gr.Json(label="Detailed Scores") |
| | |
| | analyze_btn.click( |
| | fn=lambda t, m, e: process_with_embedder('analyze_sentiment', t, m, e), |
| | inputs=[sentiment_text, model_choice, embedding_dim], |
| | outputs=[sentiment_label, sentiment_scores] |
| | ) |
| |
|
| | |
| | with gr.Tab("Concept Extraction"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Concept Extraction |
| | Extract key concepts and themes from Arabic text. |
| | |
| | <div class="description"> |
| | Analyze text to identify: |
| | - Emotional content |
| | - Main topics |
| | - Underlying themes |
| | </div> |
| | """) |
| | |
| | concept_text = gr.Textbox( |
| | label="Text to Analyze", |
| | placeholder="Enter text to analyze...", |
| | value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة", |
| | lines=3 |
| | ) |
| | |
| | concept_type = gr.Radio( |
| | choices=["emotions", "topics", "themes"], |
| | value="themes", |
| | label="Concept Type" |
| | ) |
| | |
| | extract_btn = gr.Button("Extract Concepts", variant="primary") |
| | concept_results = gr.Dataframe( |
| | headers=["Concept", "Relevance Score"], |
| | label="Extracted Concepts" |
| | ) |
| | |
| | extract_btn.click( |
| | fn=lambda t, c, m, e: process_with_embedder('extract_concepts', t, c, m, e), |
| | inputs=[concept_text, concept_type, model_choice, embedding_dim], |
| | outputs=concept_results |
| | ) |
| |
|
| | |
| | @spaces.GPU(duration=120) |
| | def update_embedder_dim(dim): |
| | global embedder |
| | try: |
| | embedder = initialize_embedder(embedding_dim=dim) |
| | return f"Successfully updated embedding dimension to {dim}" |
| | except Exception as e: |
| | return f"Error updating dimension: {str(e)}" |
| | |
| | update_dim_btn.click( |
| | fn=update_embedder_dim, |
| | inputs=[embedding_dim], |
| | outputs=dim_status |
| | ) |
| | |
| | return demo |
| |
|
| | if __name__ == "__main__": |
| | demo = create_demo() |
| | demo.queue() |
| | demo.launch() |