| | import gradio as gr |
| | import torch |
| | import torch.nn.functional as F |
| | import numpy as np |
| | import plotly.express as px |
| | import pandas as pd |
| | import spaces |
| | from typing import List, Tuple, Dict |
| | from torch import Tensor |
| | from transformers import AutoTokenizer, AutoModel |
| | from sentence_transformers import SentenceTransformer |
| | import json |
| |
|
| | |
| | embedder = None |
| |
|
| | AVAILABLE_MODELS = { |
| | "Qwen3-Embedding-0.6B": "Qwen/Qwen3-Embedding-0.6B", |
| | "Semantic-Ar-Qwen-Embed-0.6B": "Omartificial-Intelligence-Space/Semantic-Ar-Qwen-Embed-0.6B", |
| | "AraGemma-Embedding-300m" : "Omartificial-Intelligence-Space/AraGemma-Embedding-300m" |
| | } |
| |
|
class QwenEmbedder:
    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        # Native output size of the loaded model (e.g. 1024 for Qwen3-Embedding-0.6B).
        self.native_dim = self.model.get_sentence_embedding_dimension() or embedding_dim
        # Requested sizes above the native dimension are clamped; smaller sizes are applied
        # by truncating and re-normalising the embeddings, so no untrained projection layer
        # is involved.
        self.embedding_dim = min(embedding_dim, self.native_dim) if embedding_dim else self.native_dim

    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> torch.Tensor:
        if with_instruction:
            texts = [f"Represent this Arabic text for retrieval: {text}" for text in texts]

        embeddings = self.model.encode(texts, convert_to_tensor=True)

        if self.embedding_dim < self.native_dim:
            embeddings = embeddings[:, :self.embedding_dim]

        embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings
| |
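# Hedged usage sketch of the class above; defined purely for illustration and never called
# by the app. The example texts and the 256-dim setting are arbitrary.
def _example_embedder_usage() -> float:
    example = QwenEmbedder("Qwen/Qwen3-Embedding-0.6B", embedding_dim=256)
    vecs = example.get_embeddings(["أحب القراءة كثيراً", "القراءة من أحب هواياتي"])
    # get_embeddings() returns unit-norm rows, so a dot product is a cosine similarity.
    return float(vecs[0] @ vecs[1])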
|
@spaces.GPU(duration=120)
def initialize_embedder(model_name: str = "Qwen/Qwen3-Embedding-0.6B", embedding_dim: int = 768):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Initializing embedder on device: {device}")

    return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim)
| |
|
@spaces.GPU(duration=120)
def process_with_embedder(fn_name, *args):
    """Generic handler for embedder operations.

    Every dispatched function takes (embedder, ..., model_choice, embedding_dim), so the
    last two positional arguments tell us which model and dimension the UI has selected.
    """
    global embedder

    model_choice, embedding_dim = args[-2], args[-1]
    model_name = AVAILABLE_MODELS.get(model_choice, AVAILABLE_MODELS["Qwen3-Embedding-0.6B"])
    requested_dim = int(embedding_dim) if embedding_dim else 768

    # (Re)build the embedder if it is missing or no longer matches the UI settings.
    if (
        embedder is None
        or embedder.model_name != model_name
        or embedder.embedding_dim != min(requested_dim, embedder.native_dim)
    ):
        embedder = initialize_embedder(model_name=model_name, embedding_dim=requested_dim)

    fn_map = {
        'compute_similarity': compute_similarity,
        'rerank_documents': rerank_documents,
        'process_batch_embeddings': process_batch_embeddings,
        'process_retrieval': process_retrieval,
        'process_cross_lingual': process_cross_lingual,
        'classify_text': classify_text,
        'cluster_documents': cluster_documents,
        'analyze_sentiment': analyze_sentiment,
        'extract_concepts': extract_concepts
    }

    return fn_map[fn_name](embedder, *args)
| |
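# Example dispatch (arguments are illustrative): the UI lambdas below invoke the handler as
#   process_with_embedder('compute_similarity', "أحب القراءة كثيراً", "القراءة من أحب هواياتي",
#                         "Qwen3-Embedding-0.6B", 768)
# i.e. the target function name first, followed by exactly the positional arguments that the
# mapped function expects after its `embedder` parameter.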
|
| | |
| | device = "cuda" if torch.cuda.is_available() else "cpu" |
| | zero = torch.Tensor([0]).to(device) |
| | print(f"Device being used: {zero.device}") |
| |
|
| | def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: |
| | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) |
| | if left_padding: |
| | return last_hidden_states[:, -1] |
| | else: |
| | sequence_lengths = attention_mask.sum(dim=1) - 1 |
| | batch_size = last_hidden_states.shape[0] |
| | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] |
| |
|
| | def get_detailed_instruct(task_description: str, query: str) -> str: |
| | return f'Instruct: {task_description}\nQuery: {query}' |
| |
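# For example (illustrative query):
#   get_detailed_instruct('Given a web search query, retrieve relevant passages that answer the query',
#                         'ما هي عواصم الدول العربية؟')
# returns:
#   'Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: ما هي عواصم الدول العربية؟'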
|
| | def tokenize(tokenizer, input_texts, eod_id, max_length): |
| | batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2) |
| | for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]): |
| | seq.append(eod_id) |
| | att.append(1) |
| | batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt") |
| | return batch_dict |
| |
|
| | def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str, model_choice: str = None, embedding_dim: int = None) -> float: |
| | embeddings = embedder.get_embeddings([text1, text2]) |
| | similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item() |
| | return round(similarity, 3) |
| |
|
| | def rerank_documents(embedder: QwenEmbedder, query: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: |
| | docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
| | |
| | |
| | task = 'Given a search query, retrieve relevant passages that answer the query' |
| | query_with_instruct = get_detailed_instruct(task, query) |
| | |
| | |
| | query_embedding = embedder.get_embeddings([query_with_instruct]) |
| | doc_embeddings = embedder.get_embeddings(docs_list) |
| | |
| | |
| | scores = (query_embedding @ doc_embeddings.T).squeeze(0) |
| | results = [(doc, float(score)) for doc, score in zip(docs_list, scores)] |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | |
| | return [(doc, round(score, 3)) for doc, score in results] |
| |
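# Because get_embeddings() returns unit-norm vectors, `query_embedding @ doc_embeddings.T`
# above is a row of cosine similarities. A call such as (scores are hypothetical):
#   rerank_documents(embedder, "ما هي عواصم الدول العربية؟",
#                    "القاهرة هي عاصمة مصر.\nبغداد عاصمة العراق.")
# returns a list like [("القاهرة هي عاصمة مصر.", 0.71), ("بغداد عاصمة العراق.", 0.66)].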
|
| | def process_batch_embeddings(embedder: QwenEmbedder, texts: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: |
| | text_list = [text.strip() for text in texts.split('\n') if text.strip()] |
| | if len(text_list) < 1: |
| | return pd.DataFrame() |
| | |
| | embeddings = embedder.get_embeddings(text_list) |
| | scores = (embeddings @ embeddings.T).cpu().numpy() |
| | |
| | |
| | df_similarities = pd.DataFrame( |
| | scores, |
| | index=text_list, |
| | columns=text_list |
| | ) |
| | |
| | return df_similarities.round(3) |
| |
|
| | def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: |
| | |
| | query_list = [q.strip() for q in queries.split('\n') if q.strip()] |
| | doc_list = [d.strip() for d in documents.split('\n') if d.strip()] |
| | |
| | if not query_list or not doc_list: |
| | return pd.DataFrame() |
| | |
| | |
| | instructed_queries = [get_detailed_instruct(task_prompt, q) for q in query_list] |
| | |
| | |
| | query_embeddings = embedder.get_embeddings(instructed_queries) |
| | doc_embeddings = embedder.get_embeddings(doc_list) |
| | |
| | |
| | scores = (query_embeddings @ doc_embeddings.T).cpu().numpy() |
| | |
| | |
| | df = pd.DataFrame(scores, index=query_list, columns=doc_list) |
| | return df.round(3) |
| |
|
def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str, model_choice: str = None, embedding_dim: int = None) -> float:
    # The UI binds this output to a gr.Number component, so return the score itself rather than a dict.
    embeddings = embedder.get_embeddings([arabic_text, english_text])
    similarity = torch.cosine_similarity(embeddings[0:1], embeddings[1:2]).item()
    return round(similarity, 3)
| |
|
| | def classify_text(embedder: QwenEmbedder, text: str, categories: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: |
| | cat_list = [c.strip() for c in categories.split('\n') if c.strip()] |
| | text_embedding = embedder.get_embeddings([text]) |
| | cat_embeddings = embedder.get_embeddings(cat_list) |
| | scores = (text_embedding @ cat_embeddings.T).squeeze(0) |
| | results = [(cat, float(score)) for cat, score in zip(cat_list, scores)] |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | return [(cat, round(score, 3)) for cat, score in results] |
| |
|
| | def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int, model_choice: str = None, embedding_dim: int = None) -> pd.DataFrame: |
| | from sklearn.cluster import KMeans |
| | doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
    num_clusters = int(num_clusters)  # gr.Slider may deliver the value as a float
    if len(doc_list) < num_clusters:
        return pd.DataFrame()
| | |
| | embeddings = embedder.get_embeddings(doc_list) |
| | |
| | |
| | kmeans = KMeans(n_clusters=num_clusters, random_state=42) |
| | clusters = kmeans.fit_predict(embeddings.cpu().numpy()) |
| | |
| | |
| | cluster_centers = kmeans.cluster_centers_ |
| | cluster_center_docs = [] |
| | |
| | for i in range(num_clusters): |
| | cluster_docs = [doc for doc, cluster in zip(doc_list, clusters) if cluster == i] |
| | cluster_embeddings = embedder.get_embeddings(cluster_docs) |
        # Match the device/dtype of the embeddings so cosine_similarity also works on GPU.
        center_embedding = torch.tensor(cluster_centers[i], dtype=cluster_embeddings.dtype, device=cluster_embeddings.device).unsqueeze(0)
| | similarities = F.cosine_similarity(cluster_embeddings, center_embedding) |
| | center_doc = cluster_docs[similarities.argmax().item()] |
| | cluster_center_docs.append(center_doc) |
| | |
| | |
| | df = pd.DataFrame({ |
| | 'Document': doc_list, |
| | 'Cluster': clusters, |
| | 'Cluster Center Document': [cluster_center_docs[c] for c in clusters] |
| | }) |
| | return df.sort_values('Cluster') |
| |
|
| | def analyze_sentiment(embedder: QwenEmbedder, text: str, model_choice: str = None, embedding_dim: int = None) -> Tuple[str, dict]: |
| | |
| | anchors = { |
| | "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية", |
| | "positive": "هذا جيد وممتع", |
| | "neutral": "هذا عادي ومقبول", |
| | "negative": "هذا سيء ومزعج", |
| | "very_negative": "هذا فظيع جداً ومحبط للغاية" |
| | } |
| | |
| | |
| | text_embedding = embedder.get_embeddings([text]) |
| | anchor_embeddings = embedder.get_embeddings(list(anchors.values())) |
| | |
| | |
| | scores = (text_embedding @ anchor_embeddings.T).squeeze(0) |
| | results = list(zip(anchors.keys(), scores.tolist())) |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | |
| | |
| | return ( |
| | results[0][0], |
| | {k: round(float(v), 3) for k, v in results} |
| | ) |
| |
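# Example return value (label plus per-anchor scores; the numbers are hypothetical):
#   ("very_positive", {"very_positive": 0.74, "positive": 0.69, "neutral": 0.41,
#                      "negative": 0.18, "very_negative": 0.12})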
|
| | def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str, model_choice: str = None, embedding_dim: int = None) -> List[Tuple[str, float]]: |
| | |
| | concept_anchors = { |
| | "emotions": [ |
| | "الفرح والسعادة", |
| | "الحزن والأسى", |
| | "الغضب والإحباط", |
| | "الخوف والقلق", |
| | "الحب والعاطفة", |
| | "الأمل والتفاؤل" |
| | ], |
| | "topics": [ |
| | "السياسة والحكم", |
| | "الاقتصاد والمال", |
| | "العلوم والتكنولوجيا", |
| | "الفن والثقافة", |
| | "الرياضة والترفيه", |
| | "التعليم والمعرفة" |
| | ], |
| | "themes": [ |
| | "العدالة والمساواة", |
| | "التقدم والتطور", |
| | "التقاليد والتراث", |
| | "الحرية والاستقلال", |
| | "التعاون والوحدة", |
| | "الإبداع والابتكار" |
| | ] |
| | } |
| | |
| | anchors = concept_anchors.get(concept_type, concept_anchors["topics"]) |
| | |
| | |
| | text_embedding = embedder.get_embeddings([text]) |
| | anchor_embeddings = embedder.get_embeddings(anchors) |
| | |
| | |
| | scores = (text_embedding @ anchor_embeddings.T).squeeze(0) |
| | results = [(anchor, float(score)) for anchor, score in zip(anchors, scores)] |
| | results.sort(key=lambda x: x[1], reverse=True) |
| | |
| | return [(concept, round(score, 3)) for concept, score in results] |
| |
|
| | def create_embedder(model_choice: str, embedding_dim: int = 768) -> QwenEmbedder: |
| | model_name = AVAILABLE_MODELS[model_choice] |
| | return QwenEmbedder(model_name=model_name, embedding_dim=embedding_dim) |
| |
|
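# The helpers below build a fresh embedder on each call via create_embedder(); they are kept
# for programmatic use and are not wired into the Gradio UI, which dispatches through
# process_with_embedder instead. A hedged usage sketch (illustrative values):
#   ar_embedder = create_embedder("AraGemma-Embedding-300m", embedding_dim=256)
#   score = process_similarity("النص الأول", "النص الثاني", "AraGemma-Embedding-300m", 256)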
| | def process_similarity(text1: str, text2: str, model_choice: str, embedding_dim: int) -> float: |
| | embedder = create_embedder(model_choice, embedding_dim) |
| | embeddings = embedder.get_embeddings([text1, text2]) |
| | similarity = torch.nn.functional.cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0)) |
| | return float(similarity) |
| |
|
| | def process_reranking(query: str, documents: str, model_choice: str, embedding_dim: int) -> Dict: |
| | embedder = create_embedder(model_choice, embedding_dim) |
| | documents = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
| | |
| | query_embedding = embedder.get_embeddings([query], with_instruction=True) |
| | doc_embeddings = embedder.get_embeddings(documents) |
| | |
| | similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings) |
| | |
| | |
| | sorted_indices = torch.argsort(similarities, descending=True) |
| | results = [] |
| | for idx in sorted_indices: |
| | results.append({ |
| | 'document': documents[idx], |
| | 'score': float(similarities[idx]) |
| | }) |
| | |
| | return {'results': results} |
| |
|
| | def process_batch(texts: str, model_choice: str, embedding_dim: int) -> Dict: |
| | embedder = create_embedder(model_choice, embedding_dim) |
| | texts = [text.strip() for text in texts.split('\n') if text.strip()] |
| | |
| | embeddings = embedder.get_embeddings(texts) |
| | similarity_matrix = torch.nn.functional.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2) |
| | |
| | df = pd.DataFrame(similarity_matrix.cpu().numpy(), index=texts, columns=texts) |
| | return {'similarity_matrix': df.to_dict()} |
| |
|
def process_retrieval_standalone(prompt: str, queries: str, documents: str, model_choice: str, embedding_dim: int) -> Dict:
    # Renamed so it does not shadow the embedder-based process_retrieval defined above,
    # which process_with_embedder dispatches to for the Multi-Query Retrieval tab.
    embedder = create_embedder(model_choice, embedding_dim)
| | |
| | |
| | queries = [q.strip() for q in queries.split('\n') if q.strip()] |
| | documents = [doc.strip() for doc in documents.split('\n') if doc.strip()] |
| | |
| | |
| | prompt_embedding = embedder.get_embeddings([prompt], with_instruction=True) |
| | query_embeddings = embedder.get_embeddings(queries, with_instruction=True) |
| | doc_embeddings = embedder.get_embeddings(documents) |
| | |
| | |
| | query_similarities = torch.nn.functional.cosine_similarity(prompt_embedding, query_embeddings) |
| | doc_similarities = torch.nn.functional.cosine_similarity(prompt_embedding.repeat(len(documents), 1), doc_embeddings) |
| | |
| | |
| | results = { |
| | 'relevant_queries': [], |
| | 'relevant_documents': [] |
| | } |
| | |
| | |
| | query_indices = torch.argsort(query_similarities, descending=True) |
| | for idx in query_indices: |
| | results['relevant_queries'].append({ |
| | 'query': queries[idx], |
| | 'similarity': float(query_similarities[idx]) |
| | }) |
| | |
| | |
| | doc_indices = torch.argsort(doc_similarities, descending=True) |
| | for idx in doc_indices: |
| | results['relevant_documents'].append({ |
| | 'document': documents[idx], |
| | 'similarity': float(doc_similarities[idx]) |
| | }) |
| | |
| | return results |
| |
|
| | |
| | custom_css = """ |
| | :root { |
| | --primary-color: #2196F3; |
| | --secondary-color: #1976D2; |
| | --background-color: #f8f9fa; |
| | --sidebar-bg: #ffffff; |
| | --text-color: #333333; |
| | --border-color: #e0e0e0; |
| | } |
| | |
| | .container { |
| | max-width: 1200px; |
| | margin: auto; |
| | padding: 20px; |
| | } |
| | |
| | .sidebar { |
| | background-color: var(--sidebar-bg); |
| | border-right: 1px solid var(--border-color); |
| | padding: 20px; |
| | margin-right: 20px; |
| | position: sticky; |
| | top: 0; |
| | height: 100vh; |
| | overflow-y: auto; |
| | } |
| | |
| | .main-content { |
| | background-color: var(--background-color); |
| | padding: 20px; |
| | border-radius: 10px; |
| | } |
| | |
| | .features-grid { |
| | display: grid; |
| | grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); |
| | gap: 15px; |
| | margin: 15px 0; |
| | } |
| | |
| | .feature-card { |
| | background: white; |
| | padding: 15px; |
| | border-radius: 6px; |
| | box-shadow: 0 1px 3px rgba(0,0,0,0.1); |
| | transition: all 0.3s ease; |
| | border: 1px solid var(--border-color); |
| | text-align: center; |
| | } |
| | |
| | .feature-card:hover { |
| | transform: translateY(-3px); |
| | box-shadow: 0 3px 6px rgba(0,0,0,0.15); |
| | border-color: var(--primary-color); |
| | } |
| | |
| | .feature-icon { |
| | font-size: 24px; |
| | margin-bottom: 10px; |
| | color: var(--primary-color); |
| | } |
| | |
| | .feature-card h3 { |
| | color: var(--text-color); |
| | margin: 8px 0; |
| | font-size: 0.95em; |
| | font-weight: 600; |
| | } |
| | |
| | .feature-card p { |
| | color: #666; |
| | font-size: 0.8em; |
| | line-height: 1.3; |
| | margin: 5px 0; |
| | } |
| | |
| | .features-summary { |
| | margin: 40px 0; |
| | padding: 30px; |
| | background: white; |
| | border-radius: 12px; |
| | box-shadow: 0 2px 8px rgba(0,0,0,0.1); |
| | } |
| | |
| | .features-summary h2 { |
| | color: var(--text-color); |
| | margin-bottom: 25px; |
| | text-align: center; |
| | font-size: 1.5em; |
| | } |
| | |
| | .feature-list { |
| | display: grid; |
| | grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); |
| | gap: 30px; |
| | } |
| | |
| | .feature-group { |
| | padding: 20px; |
| | background: var(--background-color); |
| | border-radius: 8px; |
| | border: 1px solid var(--border-color); |
| | } |
| | |
| | .feature-group h3 { |
| | color: var(--primary-color); |
| | margin-bottom: 15px; |
| | font-size: 1.2em; |
| | } |
| | |
| | .feature-group ul { |
| | list-style: none; |
| | padding: 0; |
| | margin: 0; |
| | } |
| | |
| | .feature-group li { |
| | padding: 8px 0; |
| | color: var(--text-color); |
| | position: relative; |
| | padding-left: 20px; |
| | } |
| | |
| | .feature-group li:before { |
| | content: "•"; |
| | color: var(--primary-color); |
| | position: absolute; |
| | left: 0; |
| | } |
| | |
| | .description { |
| | margin: 20px 0; |
| | padding: 15px; |
| | border-radius: 8px; |
| | background-color: #ffffff; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | } |
| | |
| | .example { |
| | margin: 10px 0; |
| | padding: 15px; |
| | border-left: 4px solid var(--primary-color); |
| | background-color: #ffffff; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | } |
| | |
| | .warning { |
| | color: #721c24; |
| | background-color: #f8d7da; |
| | border: 1px solid #f5c6cb; |
| | padding: 15px; |
| | border-radius: 8px; |
| | margin: 10px 0; |
| | } |
| | |
| | .settings { |
| | background-color: #ffffff; |
| | padding: 20px; |
| | border-radius: 8px; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | margin: 20px 0; |
| | } |
| | |
| | .tab-content { |
| | padding: 20px; |
| | background-color: #ffffff; |
| | border-radius: 8px; |
| | box-shadow: 0 2px 4px rgba(0,0,0,0.1); |
| | } |
| | |
| | .heading { |
| | color: var(--text-color); |
| | margin-bottom: 20px; |
| | padding-bottom: 10px; |
| | border-bottom: 2px solid var(--primary-color); |
| | } |
| | |
| | button.primary { |
| | background-color: var(--primary-color) !important; |
| | } |
| | |
| | button.secondary { |
| | background-color: var(--secondary-color) !important; |
| | } |
| | """ |
| |
|
| | |
| | def create_demo(): |
| | demo = gr.Blocks(title="Advanced Text Processing with Arabic Semantic Embeddings", css=custom_css, theme=gr.themes.Soft()) |
| | |
| | with demo: |
| | with gr.Row(): |
| | |
| | with gr.Column(scale=1, elem_classes="sidebar"): |
| | gr.Markdown(""" |
| | # Arabic Semantic Embeddings |
| | |
| | ### Navigation |
| | - [Configuration](#configuration) |
| | - [Features](#features) |
| | - [Documentation](#documentation) |
| | """) |
| | |
| | with gr.Accordion("Configuration", open=True): |
| | gr.Markdown(""" |
| | ### Model Settings |
| | Configure the embedding model parameters below. |
| | """) |
| | |
| | model_choice = gr.Dropdown( |
| | choices=list(AVAILABLE_MODELS.keys()), |
| | value=list(AVAILABLE_MODELS.keys())[0], |
| | label="Select Model" |
| | ) |
| | embedding_dim = gr.Slider( |
| | minimum=32, |
| | maximum=1024, |
| | value=768, |
| | step=32, |
| | label="Embedding Dimension", |
| | elem_classes="settings" |
| | ) |
| | update_dim_btn = gr.Button("Update Dimension", variant="secondary") |
| | dim_status = gr.Textbox(label="Status", interactive=False) |
| | |
| | with gr.Accordion("Documentation", open=False): |
| | gr.Markdown(""" |
| | ### Usage Guide |
| | |
| | 1. **Embedding Dimension** |
| | - 32-128: Fast, simple tasks |
| | - 256-512: Balanced performance |
| | - 768: Default, full model |
| | - 1024: Maximum detail |
| | |
| | 2. **Best Practices** |
| | - Use appropriate dimensions for your task |
| | - Consider batch size for multiple documents |
| | - Test different settings for optimal results |
| | """) |
| | |
| | |
| | with gr.Column(scale=4): |
| | gr.Markdown(""" |
| | # Advanced Text Processing Suite |
| | |
| | Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings. |
| | This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages. |
| | """) |
| | |
| | |
| | gr.HTML(""" |
| | <div class="features-grid"> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🔄</div> |
| | <h3>Text Similarity</h3> |
| | <p>Compare text meanings</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🔍</div> |
| | <h3>Semantic Search</h3> |
| | <p>Find relevant docs</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">📊</div> |
| | <h3>Batch Analysis</h3> |
| | <p>Process multiple texts</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🎯</div> |
| | <h3>Multi-Query</h3> |
| | <p>Advanced retrieval</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🌐</div> |
| | <h3>Cross-Lingual</h3> |
| | <p>Cross-language match</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🏷️</div> |
| | <h3>Classification</h3> |
| | <p>Categorize texts</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🔮</div> |
| | <h3>Clustering</h3> |
| | <p>Group documents</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">😊</div> |
| | <h3>Sentiment</h3> |
| | <p>Analyze emotions</p> |
| | </div> |
| | <div class="feature-card"> |
| | <div class="feature-icon">🎨</div> |
| | <h3>Concepts</h3> |
| | <p>Extract themes</p> |
| | </div> |
| | </div> |
| | """) |
| | |
| | with gr.Tabs() as tabs: |
| | |
| | with gr.Tab("Text Similarity Analysis"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Text Similarity Analysis |
| | Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning). |
| | |
| | <div class="example"> |
| | <strong>Try these Arabic examples:</strong><br> |
| | • "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br> |
| | • "السماء صافية اليوم" and "الطقس حار جداً" |
| | </div> |
| | """) |
| | |
| | with gr.Row(): |
| | text1 = gr.Textbox( |
| | label="First Text", |
| | lines=3, |
| | placeholder="Enter first text here...", |
| | value="أحب القراءة كثيراً" |
| | ) |
| | text2 = gr.Textbox( |
| | label="Second Text", |
| | lines=3, |
| | placeholder="Enter second text here...", |
| | value="القراءة من أحب هواياتي" |
| | ) |
| | similarity_btn = gr.Button("Calculate Similarity", variant="primary") |
| | similarity_score = gr.Number(label="Similarity Score") |
| | |
| | similarity_btn.click( |
| | fn=lambda t1, t2, m, d: process_with_embedder('compute_similarity', t1, t2, m, d), |
| | inputs=[text1, text2, model_choice, embedding_dim], |
| | outputs=similarity_score |
| | ) |
| | |
| | |
| | with gr.Tab("Semantic Search & Reranking"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Semantic Search & Document Reranking |
| | Search through a collection of documents and rank them by semantic relevance to your query. |
| | |
| | <div class="example"> |
| | <strong>Try these Arabic queries:</strong><br> |
| | • "ما هي عواصم الدول العربية؟"<br> |
| | • "أين تقع أكبر المدن العربية؟"<br> |
| | • "ما هي المراكز الثقافية العربية؟" |
| | </div> |
| | """) |
| | |
| | query_text = gr.Textbox( |
| | label="Search Query", |
| | placeholder="Enter your search query...", |
| | value="ما هي عواصم الدول العربية؟" |
| | ) |
| | documents_text = gr.Textbox( |
| | label="Documents Collection (one per line)", |
| | lines=10, |
| | placeholder="Enter documents here, one per line...", |
| | value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها. |
| | الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي. |
| | دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا. |
| | بغداد عاصمة العراق وتقع على نهر دجلة. |
| | الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية. |
| | تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي.""" |
| | ) |
| | rerank_btn = gr.Button("Search & Rank", variant="primary") |
| | rerank_results = gr.Dataframe( |
| | headers=["Document", "Relevance Score"], |
| | label="Search Results" |
| | ) |
| | |
| | rerank_btn.click( |
| | fn=lambda q, d, m, e: process_with_embedder('rerank_documents', q, d, m, e), |
| | inputs=[query_text, documents_text, model_choice, embedding_dim], |
| | outputs=rerank_results |
| | ) |
| | |
| | |
| | with gr.Tab("Batch Similarity Analysis"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Batch Similarity Analysis |
| | Analyze semantic relationships between multiple texts simultaneously. |
| | |
| | <div class="example"> |
| | <strong>The example shows Arabic proverbs about friendship:</strong><br> |
| | See how the model captures the semantic relationships between similar themes. |
| | </div> |
| | """) |
| | |
| | batch_texts = gr.Textbox( |
| | label="Input Texts (one per line)", |
| | lines=10, |
| | placeholder="Enter texts here, one per line...", |
| | value="""الصديق وقت الضيق. |
| | الصديق الحقيقي يظهر عند الشدائد. |
| | عند المحن تعرف إخوانك. |
| | وقت الشدة بتعرف صحابك. |
| | الصاحب ساحب.""" |
| | ) |
| | process_btn = gr.Button("Analyze Relationships", variant="primary") |
| | similarity_matrix = gr.Dataframe( |
| | label="Similarity Matrix", |
| | wrap=True |
| | ) |
| | |
| | process_btn.click( |
| | fn=lambda t, m, e: process_with_embedder('process_batch_embeddings', t, m, e), |
| | inputs=[batch_texts, model_choice, embedding_dim], |
| | outputs=[similarity_matrix] |
| | ) |
| |
|
| | |
| | with gr.Tab("Multi-Query Retrieval"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Multi-Query Document Retrieval |
| | Match multiple queries against multiple documents simultaneously using semantic search. |
| | |
| | <div class="description"> |
| | This tab implements the exact retrieval logic from the Qwen example, allowing you to: |
| | - Define a custom task prompt |
| | - Input multiple queries |
| | - Input multiple documents |
| | - See all query-document match scores in a matrix |
| | </div> |
| | |
| | <div class="example"> |
| | <strong>Try these examples:</strong><br> |
| | <strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br> |
| | <strong>Queries:</strong> |
| | • "ما هي أكبر المدن العربية؟" |
| | • "أين تقع أهم المراكز الثقافية؟"<br> |
| | <strong>Documents:</strong> Use the example documents or add your own |
| | </div> |
| | """) |
| | |
| | task_prompt = gr.Textbox( |
| | label="Task Prompt", |
| | placeholder="Enter the task description here...", |
| | value="Given a web search query, retrieve relevant passages that answer the query", |
| | lines=2 |
| | ) |
| | |
| | with gr.Row(): |
| | queries_text = gr.Textbox( |
| | label="Queries (one per line)", |
| | placeholder="Enter your queries here, one per line...", |
| | value="""ما هي أكبر المدن العربية؟ |
| | أين تقع أهم المراكز الثقافية؟""", |
| | lines=5 |
| | ) |
| | documents_text = gr.Textbox( |
| | label="Documents (one per line)", |
| | placeholder="Enter your documents here, one per line...", |
| | value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية. |
| | الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم. |
| | دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي. |
| | بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""", |
| | lines=5 |
| | ) |
| | |
| | retrieve_btn = gr.Button("Process Retrieval", variant="primary") |
| | retrieval_matrix = gr.Dataframe( |
| | label="Query-Document Relevance Matrix", |
| | wrap=True |
| | ) |
| | |
| | gr.Markdown(""" |
| | <div class="description"> |
| | <strong>How to read the results:</strong> |
| | - Each row represents a query |
| | - Each column represents a document |
| | - Values show the relevance score (0-1) between each query-document pair |
| | - Higher scores indicate better matches |
| | </div> |
| | """) |
| | |
| | retrieve_btn.click( |
| | fn=lambda p, q, d, m, e: process_with_embedder('process_retrieval', p, q, d, m, e), |
| | inputs=[task_prompt, queries_text, documents_text, model_choice, embedding_dim], |
| | outputs=[retrieval_matrix] |
| | ) |
| |
|
| | |
| | with gr.Tab("Cross-Lingual Matching"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Cross-Lingual Semantic Matching |
| | Compare the meaning of texts across Arabic and English languages. |
| | |
| | <div class="description"> |
| | This feature demonstrates the model's ability to understand semantic similarity across different languages. |
| | Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning. |
| | </div> |
| | |
| | <div class="example"> |
| | <strong>Try these examples:</strong><br> |
| | <strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br> |
| | <strong>English:</strong> "Reading nourishes the mind and soul"<br> |
| | Or try your own pairs of semantically similar texts in both languages. |
| | </div> |
| | """) |
| | |
| | with gr.Row(): |
| | arabic_text = gr.Textbox( |
| | label="Arabic Text", |
| | placeholder="Enter Arabic text here...", |
| | value="القراءة غذاء العقل والروح", |
| | lines=3 |
| | ) |
| | english_text = gr.Textbox( |
| | label="English Text", |
| | placeholder="Enter English text here...", |
| | value="Reading nourishes the mind and soul", |
| | lines=3 |
| | ) |
| | |
| | match_btn = gr.Button("Compare Texts", variant="primary") |
| | with gr.Row(): |
| | cross_lingual_score = gr.Number( |
| | label="Cross-Lingual Similarity Score", |
| | value=None |
| | ) |
| | |
| | gr.Markdown(""" |
| | <div class="description"> |
| | <strong>Understanding the score:</strong> |
| | - Score ranges from 0 (completely different meaning) to 1 (same meaning) |
| | - Scores above 0.7 usually indicate strong semantic similarity |
| | - The model considers the meaning, not just word-for-word translation |
| | </div> |
| | """) |
| | |
| | match_btn.click( |
| | fn=lambda a, e, m, d: process_with_embedder('process_cross_lingual', a, e, m, d), |
| | inputs=[arabic_text, english_text, model_choice, embedding_dim], |
| | outputs=[cross_lingual_score] |
| | ) |
| |
|
| | |
| | with gr.Tab("Text Classification"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Text Classification |
| | Classify text into predefined categories using semantic similarity. |
| | |
| | <div class="description"> |
| | The model will compare your text against each category and rank them by relevance. |
| | You can define your own categories or use the provided examples. |
| | </div> |
| | """) |
| | |
| | input_text = gr.Textbox( |
| | label="Input Text", |
| | placeholder="Enter the text to classify...", |
| | value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل", |
| | lines=3 |
| | ) |
| | |
| | categories_text = gr.Textbox( |
| | label="Categories (one per line)", |
| | placeholder="Enter categories here...", |
| | value="""التكنولوجيا والابتكار |
| | الاقتصاد والأعمال |
| | التعليم والتدريب |
| | الثقافة والفنون |
| | الصحة والطب""", |
| | lines=5 |
| | ) |
| | |
| | classify_btn = gr.Button("Classify Text", variant="primary") |
| | classification_results = gr.Dataframe( |
| | headers=["Category", "Relevance Score"], |
| | label="Classification Results" |
| | ) |
| | |
| | classify_btn.click( |
| | fn=lambda t, c, m, e: process_with_embedder('classify_text', t, c, m, e), |
| | inputs=[input_text, categories_text, model_choice, embedding_dim], |
| | outputs=classification_results |
| | ) |
| |
|
| | |
| | with gr.Tab("Document Clustering"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Document Clustering |
| | Group similar documents together using semantic clustering. |
| | |
| | <div class="description"> |
| | This feature will: |
| | - Group similar documents into clusters |
| | - Identify the most representative document for each cluster |
| | - Help discover themes and patterns in your document collection |
| | </div> |
| | """) |
| | |
| | cluster_docs = gr.Textbox( |
| | label="Documents (one per line)", |
| | placeholder="Enter documents to cluster...", |
| | value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب. |
| | الروبوتات تساعد الأطباء في إجراء العمليات الجراحية. |
| | التعلم الآلي يحسن من دقة التشخيص الطبي. |
| | الفن يعبر عن مشاعر الإنسان وأحاسيسه. |
| | الموسيقى لغة عالمية تتخطى حدود الثقافات. |
| | الرسم والنحت من أقدم أشكال التعبير الفني. |
| | التجارة الإلكترونية تغير نمط التسوق التقليدي. |
| | التسوق عبر الإنترنت يوفر الوقت والجهد. |
| | المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""", |
| | lines=10 |
| | ) |
| | |
| | num_clusters = gr.Slider( |
| | minimum=2, |
| | maximum=10, |
| | value=3, |
| | step=1, |
| | label="Number of Clusters" |
| | ) |
| | |
| | cluster_btn = gr.Button("Cluster Documents", variant="primary") |
| | clustering_results = gr.Dataframe( |
| | label="Clustering Results" |
| | ) |
| | |
| | cluster_btn.click( |
| | fn=lambda d, n, m, e: process_with_embedder('cluster_documents', d, n, m, e), |
| | inputs=[cluster_docs, num_clusters, model_choice, embedding_dim], |
| | outputs=clustering_results |
| | ) |
| |
|
| | |
| | with gr.Tab("Sentiment Analysis"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Arabic Sentiment Analysis |
| | Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors. |
| | |
| | <div class="description"> |
| | The model will compare your text against predefined sentiment anchors and determine: |
| | - The overall sentiment |
| | - Confidence scores for each sentiment level |
| | </div> |
| | """) |
| | |
| | sentiment_text = gr.Textbox( |
| | label="Text to Analyze", |
| | placeholder="Enter text to analyze sentiment...", |
| | value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين", |
| | lines=3 |
| | ) |
| | |
| | analyze_btn = gr.Button("Analyze Sentiment", variant="primary") |
| | |
| | with gr.Row(): |
| | sentiment_label = gr.Label(label="Overall Sentiment") |
| | sentiment_scores = gr.Json(label="Detailed Scores") |
| | |
| | analyze_btn.click( |
| | fn=lambda t, m, e: process_with_embedder('analyze_sentiment', t, m, e), |
| | inputs=[sentiment_text, model_choice, embedding_dim], |
| | outputs=[sentiment_label, sentiment_scores] |
| | ) |
| |
|
| | |
| | with gr.Tab("Concept Extraction"): |
| | with gr.Column(elem_classes="tab-content"): |
| | gr.Markdown(""" |
| | ### Concept Extraction |
| | Extract key concepts and themes from Arabic text. |
| | |
| | <div class="description"> |
| | Analyze text to identify: |
| | - Emotional content |
| | - Main topics |
| | - Underlying themes |
| | </div> |
| | """) |
| | |
| | concept_text = gr.Textbox( |
| | label="Text to Analyze", |
| | placeholder="Enter text to analyze...", |
| | value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة", |
| | lines=3 |
| | ) |
| | |
| | concept_type = gr.Radio( |
| | choices=["emotions", "topics", "themes"], |
| | value="themes", |
| | label="Concept Type" |
| | ) |
| | |
| | extract_btn = gr.Button("Extract Concepts", variant="primary") |
| | concept_results = gr.Dataframe( |
| | headers=["Concept", "Relevance Score"], |
| | label="Extracted Concepts" |
| | ) |
| | |
| | extract_btn.click( |
| | fn=lambda t, c, m, e: process_with_embedder('extract_concepts', t, c, m, e), |
| | inputs=[concept_text, concept_type, model_choice, embedding_dim], |
| | outputs=concept_results |
| | ) |
| |
|
| | |
| | @spaces.GPU(duration=120) |
| | def update_embedder_dim(dim): |
| | global embedder |
| | try: |
| | embedder = initialize_embedder(embedding_dim=dim) |
| | return f"Successfully updated embedding dimension to {dim}" |
| | except Exception as e: |
| | return f"Error updating dimension: {str(e)}" |
| | |
| | update_dim_btn.click( |
| | fn=update_embedder_dim, |
| | inputs=[embedding_dim], |
| | outputs=dim_status |
| | ) |
| | |
| | return demo |
| |
|
| | if __name__ == "__main__": |
| | demo = create_demo() |
| | demo.queue() |
| | demo.launch() |