#!/usr/bin/env python3
"""
Kabyle Semantic Toolkit
Hugging Face Space using boffire/kabyle-sentence-transformer-mpnet
"""

import warnings

# Installed before the heavy imports below so warnings raised while they load
# are suppressed as well.
warnings.filterwarnings("ignore")

import os
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

# Similarity cut-offs shared by the quality checker and the CSV validator.
EXCELLENT_THRESHOLD = 0.85
GOOD_THRESHOLD = 0.6

# Number of CSV rows embedded per model call; keeps peak memory bounded for
# large uploads while still batching the work.
_VALIDATE_CHUNK = 256

# Load model once
print("Loading model...")
MODEL = SentenceTransformer("boffire/kabyle-sentence-transformer-mpnet")
print("Model loaded")

# Pre-load and pre-compute search index at startup
print("Pre-computing search index...")
try:
    from datasets import load_dataset

    ds = load_dataset("Imsidag-community/english-kabyle-parallel", split="train")
    SEARCH_PAIRS = [
        (row["en"], row["kab"])
        for row in ds.select(range(min(500, len(ds))))
    ]
except Exception as e:
    # Deliberate best-effort: any failure (missing package, no network, schema
    # change) falls back to a tiny built-in index so the app still starts.
    print("Could not load dataset, using fallback: " + str(e))
    SEARCH_PAIRS = [
        ("Hello!", "Azul!"),
        ("How are you?", "Amek i telliḍ?"),
        ("Thank you", "Tanemmirt"),
        ("Good morning", "Tifawin"),
        ("Water is life", "Aman d tudert"),
    ]

# Pre-compute embeddings once at startup.
# Layout: the first len(SEARCH_PAIRS) rows are the English sides, the next
# len(SEARCH_PAIRS) rows are the Kabyle sides, in the same pair order.
_all_texts = [en for en, _ in SEARCH_PAIRS] + [kab for _, kab in SEARCH_PAIRS]
SEARCH_EMBEDDINGS = MODEL.encode(
    _all_texts, convert_to_tensor=True, show_progress_bar=False
)
print("Search index ready: " + str(len(SEARCH_PAIRS)) + " pairs")


def get_embeddings(texts):
    """Encode a list of strings with the shared model and return a tensor."""
    return MODEL.encode(texts, convert_to_tensor=True)


def check_quality(en_text, kab_text):
    """Tab 1: Translation Quality Checker.

    Args:
        en_text: English sentence.
        kab_text: Kabyle sentence.

    Returns:
        A ``(message, score)`` tuple. ``message`` reports the raw cosine
        similarity (rounded to 4 places) and a quality label; ``score`` is the
        similarity clamped to [0, 1] for the slider, or ``None`` when either
        input is empty.
    """
    if not (en_text or "").strip() or not (kab_text or "").strip():
        return "Please enter both sentences", None

    emb = get_embeddings([en_text, kab_text])
    sim = F.cosine_similarity(emb[0], emb[1], dim=0).item()

    if sim > EXCELLENT_THRESHOLD:
        quality = "Excellent match"
    elif sim > GOOD_THRESHOLD:
        quality = "Good match"
    else:
        quality = "Poor match"

    result = f"Similarity: {round(sim, 4)}\nQuality: {quality}"
    # Cosine similarity can be negative (or exceed 1 by float error), but the
    # slider that displays it is bounded to [0, 1].
    return result, min(max(sim, 0.0), 1.0)


def search_similar(query, top_k=5):
    """Tab 2: Semantic Search - fast because embeddings are pre-computed.

    Args:
        query: Text to search for (matched against both sides of each pair).
        top_k: Maximum number of distinct pairs to return. Coerced to ``int``
            because UI sliders may deliver floats.

    Returns:
        A display string with one entry per matching pair, best first, or a
        short message when the query is empty or nothing matches.
    """
    if not (query or "").strip():
        return "Please enter a query"

    limit = 5 if top_k is None else max(1, int(top_k))

    query_emb = get_embeddings([query])
    # Search both English and Kabyle sides
    scores = F.cosine_similarity(query_emb, SEARCH_EMBEDDINGS).cpu().numpy()

    pair_count = len(SEARCH_PAIRS)
    results = []
    seen = set()
    # Walk the full ranking rather than a pre-cut top_k slice: a pair can hit
    # on both its English and Kabyle row, and such duplicates must not eat
    # into the number of results requested.
    for idx in np.argsort(scores)[::-1]:
        idx = int(idx)
        pair = SEARCH_PAIRS[idx if idx < pair_count else idx - pair_count]
        if pair in seen:
            continue
        seen.add(pair)
        en, kab = pair
        score = round(float(scores[idx]), 4)
        results.append(f"{kab}\n (EN: {en}) -- Score: {score}")
        if len(results) == limit:
            break

    return "\n\n".join(results) if results else "No results found"


def validate_csv(file):
    """Tab 3: Parallel Data Validator.

    Scores every row of an uploaded CSV by the cosine similarity between its
    ``en`` and ``kab`` cells and writes an annotated copy for download.

    Args:
        file: The upload from the UI; either a filesystem path or an object
            exposing the path as ``.name``. ``None`` when nothing is uploaded.

    Returns:
        A ``(output_path, summary)`` tuple. ``output_path`` is ``None`` when
        the input is missing, unreadable, or lacks the required columns.
    """
    if file is None:
        return None, "Please upload a CSV file with 'en' and 'kab' columns"

    # Depending on the Gradio version/config the upload arrives as a path
    # string or as a tempfile-like wrapper.
    csv_path = file if isinstance(file, (str, os.PathLike)) else file.name
    try:
        df = pd.read_csv(csv_path)
    except (
        pd.errors.ParserError,
        pd.errors.EmptyDataError,
        UnicodeDecodeError,
        OSError,
    ) as exc:
        return None, "Could not read CSV: " + str(exc)

    if "en" not in df.columns or "kab" not in df.columns:
        return None, "CSV must have 'en' and 'kab' columns"

    en_texts = df["en"].fillna("").astype(str).tolist()
    kab_texts = df["kab"].fillna("").astype(str).tolist()

    # Rows with a missing/blank side keep a score of 0.0; embedding a
    # placeholder string would make two empty cells look like a perfect match.
    scores = [0.0] * len(df)
    scorable = [
        i
        for i, (en, kab) in enumerate(zip(en_texts, kab_texts))
        if en.strip() and kab.strip()
    ]
    for start in range(0, len(scorable), _VALIDATE_CHUNK):
        chunk = scorable[start:start + _VALIDATE_CHUNK]
        en_emb = get_embeddings([en_texts[i] for i in chunk])
        kab_emb = get_embeddings([kab_texts[i] for i in chunk])
        sims = F.cosine_similarity(en_emb, kab_emb, dim=1).cpu().tolist()
        for i, sim in zip(chunk, sims):
            scores[i] = sim

    df["similarity"] = scores
    df["quality"] = np.where(df["similarity"] > GOOD_THRESHOLD, "good", "poor")

    # Save result in a fresh directory per request so concurrent users do not
    # overwrite each other's file, while keeping a friendly download name.
    output_dir = tempfile.mkdtemp(prefix="kabyle_validator_")
    output_path = os.path.join(output_dir, "validated_pairs.csv")
    df.to_csv(output_path, index=False)

    good_count = int((df["quality"] == "good").sum())
    poor_count = len(df) - good_count
    summary = (
        f"Processed {len(df)} pairs\n"
        f"Good quality: {good_count}\n"
        f"Poor quality: {poor_count}"
    )
    return output_path, summary


# Build UI with Soft theme
with gr.Blocks(title="Kabyle Semantic Toolkit", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Kabyle Semantic Toolkit

    Powered by [**boffire/kabyle-sentence-transformer-mpnet**](https://huggingface.co/boffire/kabyle-sentence-transformer-mpnet)

    This tool understands meaning, not just words. Use it to check translations, find similar sentences, or validate your parallel data.
    """)

    with gr.Tabs():
        # Tab 1: Quality Checker
        with gr.TabItem("Translation Quality"):
            gr.Markdown("Check if an English-Kabyle pair has similar meaning.")
            with gr.Row():
                with gr.Column(scale=2):
                    en_input = gr.Textbox(
                        label="English",
                        placeholder="Enter English text...",
                        lines=3
                    )
                    kab_input = gr.Textbox(
                        label="Kabyle",
                        placeholder="Enter Kabyle text...",
                        lines=3
                    )
                    with gr.Row():
                        clear_btn_1 = gr.Button("Clear", variant="secondary")
                        check_btn = gr.Button("Check Quality", variant="primary")
                with gr.Column(scale=3):
                    result_text = gr.Textbox(
                        label="Result",
                        lines=3,
                        interactive=False
                    )
                    score_bar = gr.Slider(
                        0, 1,
                        label="Similarity Score",
                        interactive=False
                    )

            check_btn.click(
                fn=check_quality,
                inputs=[en_input, kab_input],
                outputs=[result_text, score_bar]
            )

            gr.Examples(
                examples=[
                    ["Hello!", "Azul!"],
                    ["The computer works.", "Aselkim iteddu."],
                    ["I love you.", "Hemmleɣ-kent."],
                    # Intentionally mismatched pair to show a low score.
                    ["Hello!", "Aselkim iteddu."],
                ],
                inputs=[en_input, kab_input],
                label="Try these examples"
            )

            clear_btn_1.click(
                fn=lambda: ("", "", "", None),
                outputs=[en_input, kab_input, result_text, score_bar]
            )

        # Tab 2: Similar Search
        with gr.TabItem("Similar Sentences"):
            gr.Markdown(
                "Find Kabyle sentences similar to your query. "
                "Search index is pre-loaded for instant results."
            )
            with gr.Row():
                with gr.Column(scale=2):
                    query_input = gr.Textbox(
                        label="Query (English or Kabyle)",
                        placeholder="Enter text to search...",
                        lines=3
                    )
                    top_k_slider = gr.Slider(
                        1, 10,
                        value=5,
                        step=1,
                        label="Number of results"
                    )
                    with gr.Row():
                        clear_btn_2 = gr.Button("Clear", variant="secondary")
                        search_btn = gr.Button("Search", variant="primary")
                with gr.Column(scale=3):
                    search_output = gr.Textbox(
                        label="Results",
                        lines=10,
                        interactive=False
                    )

            search_btn.click(
                fn=search_similar,
                inputs=[query_input, top_k_slider],
                outputs=search_output
            )

            gr.Examples(
                examples=["How are you?", "Thank you", "Water is life"],
                inputs=query_input,
                label="Example queries"
            )

            clear_btn_2.click(
                fn=lambda: ("", 5, ""),
                outputs=[query_input, top_k_slider, search_output]
            )

        # Tab 3: Data Validator
        with gr.TabItem("Data Validator"):
            gr.Markdown(
                "Upload a CSV with 'en' and 'kab' columns to validate alignment quality."
            )
            with gr.Row():
                with gr.Column(scale=2):
                    file_input = gr.File(
                        label="Upload CSV",
                        file_types=[".csv"]
                    )
                    validate_btn = gr.Button("Validate", variant="primary")
                with gr.Column(scale=3):
                    summary_output = gr.Textbox(
                        label="Summary",
                        lines=4,
                        interactive=False
                    )
                    download_output = gr.File(label="Download Results")

            validate_btn.click(
                fn=validate_csv,
                inputs=file_input,
                outputs=[download_output, summary_output]
            )

    gr.Markdown("""
    ---
    **Related tools**: [LibreTranslate](https://imsidag-community-libretranslate-kabyle.hf.space/) | [MarianMT](https://huggingface.co/boffire/marianmt-en-kab)
    """)

if __name__ == "__main__":
    demo.launch()