Update app.py
app.py
CHANGED
@@ -1,8 +1,35 @@
+# app.py
 import gradio as gr
 from huggingface_hub import hf_hub_download
 import fasttext
 import os
+import numpy as np
+from functools import lru_cache
+import math
+from PIL import Image as PILImage
+import io
+from typing import List, Tuple, Optional
 
+# visualization libs (attempt imports, fallbacks handled later)
+try:
+    import umap
+    _HAS_UMAP = True
+except Exception:
+    _HAS_UMAP = False
+
+try:
+    from sklearn.manifold import TSNE
+    _HAS_TSNE = True
+except Exception:
+    _HAS_TSNE = False
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+# -------------------------
+# your original styles + button id (kept)
+# -------------------------
 styles = """
 #button {
     background: linear-gradient(to right, #6A359C, #B589D6);
@@ -10,6 +37,9 @@ styles = """
 }
 """
 
+# -------------------------
+# HF token + model download (kept)
+# -------------------------
 HF_TOKEN = os.getenv("HF_TOKEN")
 if not HF_TOKEN:
     raise EnvironmentError("❌ HF_TOKEN is not set. Please add it in Space Settings > Secrets.")
@@ -27,6 +57,9 @@ try:
 except Exception as e:
     raise RuntimeError(f"❌ Failed to load model: {str(e)}")
 
+# -------------------------
+# original get_embedding (untouched)
+# -------------------------
 def get_embedding(word: str):
     if not word or not word.strip():
         return {"error": "⚠️ කරුණාකර සිංහල වචනයක් ඇතුළත් කරන්න."}
@@ -36,9 +69,278 @@ def get_embedding(word: str):
     except Exception as e:
         return {"error": f"💥 Something went wrong..: {str(e)}"}
 
-# -------------------------
-#
-# -------------------------
+# -------------------------
+# Utilities & precomputations
+# -------------------------
+def safe_strip(s: Optional[str]) -> str:
+    return "" if s is None else s.strip()
+
+@lru_cache(maxsize=1)
+def load_vocab_and_matrix(max_words: int = 100000):
+    try:
+        words = model.get_words()
+    except Exception:
+        # fastText should always expose get_words(); fail loudly if it doesn't
+        raise RuntimeError("Failed to get words from fastText model via model.get_words().")
+
+    if max_words and len(words) > max_words:
+        words = words[:max_words]
+
+    vectors = []
+    for w in words:
+        vec = model.get_word_vector(w)
+        vectors.append(vec)
+    mat = np.vstack(vectors).astype(np.float32)  # N x D
+
+    # compute normalized vectors for cosine similarity (avoid division by zero)
+    norms = np.linalg.norm(mat, axis=1, keepdims=True)
+    norms[norms == 0.0] = 1.0
+    mat_norm = mat / norms
+
+    return words, mat, mat_norm
+
+def cosine_similarity_vec(u: np.ndarray, mat_norm: np.ndarray) -> np.ndarray:
+    # normalize u
+    u_norm = np.linalg.norm(u)
+    if u_norm == 0:
+        return np.zeros(mat_norm.shape[0], dtype=np.float32)
+    u = (u / u_norm).astype(np.float32)
+    sims = np.dot(mat_norm, u)  # (N,)
+    return sims
+
+def top_k_words_for_vector(vec: np.ndarray, words: List[str], mat_norm: np.ndarray, k: int = 10, filter_self: Optional[str] = None) -> List[Tuple[str, float]]:
+    sims = cosine_similarity_vec(vec, mat_norm)
+    # argsort descending
+    idx = np.argsort(-sims)[: k + (1 if filter_self else 0)]
+    results = []
+    for i in idx:
+        w = words[i]
+        score = float(sims[i])
+        if filter_self and w == filter_self:
+            continue
+        results.append((w, round(score, 6)))
+        if len(results) >= k:
+            break
+    return results
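+
+# Illustrative usage of the helpers above (a sketch, not part of the app flow):
+#   words, mat, mat_norm = load_vocab_and_matrix()
+#   vec = model.get_word_vector("අම්මා")
+#   top_k_words_for_vector(vec, words, mat_norm, k=5, filter_self="අම්මා")
+# returns a list of (word, cosine_score) pairs sorted by decreasing similarity.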
+
+# -------------------------
+# Feature 1: Word similarity (two words)
+# -------------------------
+def word_similarity(a: str, b: str):
+    a = safe_strip(a)
+    b = safe_strip(b)
+    if not a or not b:
+        return {"error": "⚠️ Enter two valid Sinhala words"}
+    try:
+        va = model.get_word_vector(a)
+        vb = model.get_word_vector(b)
+        # cosine similarity
+        denom = (np.linalg.norm(va) * np.linalg.norm(vb))
+        if denom == 0:
+            sim = 0.0
+        else:
+            sim = float(np.dot(va, vb) / denom)
+        return {
+            "word_a": a,
+            "word_b": b,
+            "cosine_similarity": round(sim, 6),
+            "explanation": "1.0 = identical in vector space, -1.0 = opposite. Values near 0 mean unrelated."
+        }
+    except Exception as e:
+        return {"error": f"💥 Error computing similarity: {str(e)}"}
+
+# -------------------------
+# Feature 2: Nearest neighbors / semantic search
+# -------------------------
+def nearest_neighbors(word: str, top_k: int = 10):
+    word = safe_strip(word)
+    if not word:
+        return {"error": "⚠️ Please enter a Sinhala word"}
+    try:
+        words, mat, mat_norm = load_vocab_and_matrix()
+        vec = model.get_word_vector(word)
+        results = top_k_words_for_vector(vec, words, mat_norm, k=int(top_k), filter_self=word)
+        return {"query": word, "neighbors": [{"word": w, "score": s} for w, s in results]}
+    except Exception as e:
+        return {"error": f"💥 Error finding neighbors: {str(e)}"}
+
+# -------------------------
+# Feature 3: Sentence embeddings (average word vectors) + similarity
+# -------------------------
+def sentence_to_embedding(sentence: str):
+    s = safe_strip(sentence)
+    if not s:
+        return {"error": "⚠️ Please enter a Sinhala sentence"}
+    try:
+        # simple whitespace tokenization
+        tokens = [t for t in s.split() if t.strip()]
+        if len(tokens) == 0:
+            return {"error": "⚠️ Couldn't find words in the sentence"}
+        vecs = [model.get_word_vector(t) for t in tokens]
+        mat = np.vstack(vecs)
+        avg = mat.mean(axis=0)
+        return {"sentence": s, "tokens": tokens, "embedding": avg.tolist()}
+    except Exception as e:
+        return {"error": f"💥 Error computing sentence embedding: {str(e)}"}
+
+def sentence_similarity(s1: str, s2: str):
+    try:
+        r1 = sentence_to_embedding(s1)
+        r2 = sentence_to_embedding(s2)
+        if "error" in r1 or "error" in r2:
+            return {"error": r1.get("error") or r2.get("error")}
+        v1 = np.array(r1["embedding"], dtype=np.float32)
+        v2 = np.array(r2["embedding"], dtype=np.float32)
+        denom = (np.linalg.norm(v1) * np.linalg.norm(v2))
+        if denom == 0:
+            sim = 0.0
+        else:
+            sim = float(np.dot(v1, v2) / denom)
+        return {"sentence_a": s1, "sentence_b": s2, "cosine_similarity": round(sim, 6)}
+    except Exception as e:
+        return {"error": f"💥 Error computing sentence similarity: {str(e)}"}
+
+# -------------------------
+# Feature 4: Visualization
+# -------------------------
+def visualize_words(words_text: str, use_neighbors: bool = False, neighbors_k: int = 10, projection_method: str = "umap"):
+    words_raw = [w.strip() for w in words_text.replace(",", "\n").splitlines() if w.strip()]
+    if not words_raw:
+        return {"error": "⚠️ Please enter at least one word"}
+    try:
+        words, mat, mat_norm = load_vocab_and_matrix()
+        selected_words = []
+        for w in words_raw:
+            selected_words.append(w)
+            if use_neighbors:
+                vec = model.get_word_vector(w)
+                nn = top_k_words_for_vector(vec, words, mat_norm, k=neighbors_k, filter_self=w)
+                selected_words.extend([x for x, _ in nn])
+
+        # dedupe preserving order
+        seen = set()
+        final_words = []
+        for w in selected_words:
+            if w not in seen:
+                final_words.append(w)
+                seen.add(w)
+
+        # fetch vectors (for OOV words, fastText builds a vector from subwords)
+        vecs = np.vstack([model.get_word_vector(w) for w in final_words])
+
+        # projection
+        if projection_method == "umap" and _HAS_UMAP:
+            reducer = umap.UMAP(n_components=2, random_state=42)
+            coords = reducer.fit_transform(vecs)
+        elif _HAS_TSNE:
+            tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(final_words) - 1 or 2))
+            coords = tsne.fit_transform(vecs)
+        elif _HAS_UMAP:
+            # user requested tsne but only umap is available
+            reducer = umap.UMAP(n_components=2, random_state=42)
+            coords = reducer.fit_transform(vecs)
+        else:
+            return {"error": "⚠️ Neither UMAP nor t-SNE is available in this environment. Please install 'umap-learn' or 'scikit-learn'."}
+
+        # plot
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.scatter(coords[:, 0], coords[:, 1], s=40)
+        for i, w in enumerate(final_words):
+            ax.annotate(w, (coords[i, 0], coords[i, 1]), fontsize=9, alpha=0.9)
+        ax.set_title("2D Projection of Sinhala Words (embedding space)")
+        ax.set_xticks([])
+        ax.set_yticks([])
+        buf = io.BytesIO()
+        fig.tight_layout()
+        fig.savefig(buf, format="png", dpi=150)
+        plt.close(fig)
+        buf.seek(0)
+        return PILImage.open(buf)
+    except Exception as e:
+        return {"error": f"💥 Error creating visualization: {str(e)}"}
+
+# -------------------------
+# Feature 5: Practical demo - index uploaded documents and search
+# -------------------------
+def parse_uploaded_documents(file):
+    if file is None:
+        return {"error": "⚠️ Please upload a file (txt/csv)."}
+    try:
+        # gr.File may hand over a tempfile-like object or a plain path string
+        path = file.name if hasattr(file, "name") else file
+        with open(path, "rb") as fh:
+            raw = fh.read().decode("utf-8", errors="replace")
+    except Exception as e:
+        return {"error": f"💥 Something went wrong: {str(e)}"}
+
+    docs = []
+    # simple CSV detection: many commas vs newlines
+    if "," in raw and raw.count(",") > raw.count("\n"):
+        # parse as CSV rows
+        for line in raw.splitlines():
+            if not line.strip():
+                continue
+            # take the entire line (or split and take the last column). Keep it simple.
+            docs.append(line.strip())
+    else:
+        for line in raw.splitlines():
+            if line.strip():
+                docs.append(line.strip())
+    if not docs:
+        return {"error": "⚠️ Couldn't extract any documents from the file"}
+    return {"documents": docs}
+
+def index_documents_for_search(docs: List[str]):
+    if not docs:
+        return {"error": "⚠️ The file was empty"}
+    try:
+        vecs = []
+        for d in docs:
+            tokens = [t for t in d.split() if t.strip()]
+            if not tokens:
+                vecs.append(np.zeros((model.get_dimension(),), dtype=np.float32))
+                continue
+            mats = np.vstack([model.get_word_vector(t) for t in tokens])
+            vecs.append(mats.mean(axis=0))
+        M = np.vstack(vecs).astype(np.float32)  # num_docs x D
+        norms = np.linalg.norm(M, axis=1, keepdims=True)
+        norms[norms == 0] = 1.0
+        M_norm = M / norms
+        return {"matrix": M, "matrix_norm": M_norm, "docs": docs}
+    except Exception as e:
+        return {"error": f"💥 Error indexing documents: {str(e)}"}
+
+def search_documents(query: str, indexed):
+    """
+    indexed: dict returned by index_documents_for_search
+    returns the top matches sorted by cosine score (the caller truncates to Top N)
+    """
+    q = safe_strip(query)
+    if not q:
+        return {"error": "⚠️ Enter a query to search"}
+    try:
+        q_tokens = [t for t in q.split() if t.strip()]
+        if not q_tokens:
+            return {"error": "⚠️ Couldn't build a query vector from the input"}
+        q_vecs = np.vstack([model.get_word_vector(t) for t in q_tokens])
+        q_avg = q_vecs.mean(axis=0)
+        q_norm = np.linalg.norm(q_avg)
+        if q_norm == 0:
+            sims = np.zeros(indexed["matrix_norm"].shape[0], dtype=np.float32)
+        else:
+            q_avg = (q_avg / q_norm).astype(np.float32)
+            sims = np.dot(indexed["matrix_norm"], q_avg)
+        idx = np.argsort(-sims)[:10]
+        results = []
+        for i in idx:
+            results.append({"doc": indexed["docs"][i], "score": round(float(sims[i]), 6)})
+        return {"query": q, "results": results}
+    except Exception as e:
+        return {"error": f"💥 Error while searching: {str(e)}"}
+
+# -------------------------
+# Gradio UI - keep original section and add new blocks
+# -------------------------
 with gr.Blocks(title="Embedding_Siyabasa", css=styles) as demo:
     gr.Markdown("""
     # 🇱🇰 Sinhala Word Embeddings
@@ -46,6 +348,7 @@ with gr.Blocks(title="Embedding_Siyabasa", css=styles) as demo:
     ඔබේ සිංහල වචනය ඇතුළත් කර, එහි 300D embedding vector එක ලබා ගන්න.
     """)
 
+    # Original simple embedding (kept exactly as before)
     with gr.Row():
         inp = gr.Textbox(label="සිංහල වචනය", placeholder="උදා: අම්මා, සියබස, නූතන, ප්‍රජාතන්ත්‍රවාදය")
         out = gr.JSON(label="Embedding Vector (300D)")
@@ -67,9 +370,113 @@ with gr.Blocks(title="Embedding_Siyabasa", css=styles) as demo:
         cache_examples=True
     )
 
+    # -------------------------
+    # NEW: Word similarity
+    # -------------------------
+    gr.Markdown("## 🔎 Word Similarity (cosine similarity between two words)")
+    with gr.Row():
+        ws_a = gr.Textbox(label="Word A", placeholder="උදා: අම්මා")
+        ws_b = gr.Textbox(label="Word B", placeholder="උදා: තාත්තා")
+        ws_out = gr.JSON(label="Similarity Result")
+    ws_btn = gr.Button("🔁 Compare", elem_id="button")
+    ws_btn.click(fn=word_similarity, inputs=[ws_a, ws_b], outputs=ws_out)
+
+    # -------------------------
+    # NEW: Nearest neighbors (word -> top-N)
+    # -------------------------
+    gr.Markdown("## 🧭 Nearest Neighbors (semantic search)")
+    with gr.Row():
+        nn_word = gr.Textbox(label="Query Word (සිංහල)", placeholder="උදා: ගුරු")
+        nn_k = gr.Slider(minimum=1, maximum=50, step=1, value=10, label="Top K (neighbors)")
+        nn_out = gr.JSON(label="Top-K Neighbors")
+    nn_btn = gr.Button("🔎 Find Neighbors", elem_id="button")
+    nn_btn.click(fn=nearest_neighbors, inputs=[nn_word, nn_k], outputs=nn_out)
+
+    # -------------------------
+    # NEW: Sentence embeddings
+    # -------------------------
+    gr.Markdown("## 🧾 Sentence Embeddings")
+    with gr.Row():
+        sent_inp = gr.Textbox(label="සිංහල වාක්‍යය", placeholder="උදා: මම පාසලට යමි.")
+        sent_out = gr.JSON(label="Sentence Embedding (avg)")
+    sent_btn = gr.Button("🧠 Get Sentence Embedding", elem_id="button")
+    sent_btn.click(fn=sentence_to_embedding, inputs=sent_inp, outputs=sent_out)
+
+    # sentence similarity
+    with gr.Row():
+        sa = gr.Textbox(label="Sentence A")
+        sb = gr.Textbox(label="Sentence B")
+        ssim_out = gr.JSON(label="Sentence Similarity")
+    ssim_btn = gr.Button("🔁 Compare Sentences", elem_id="button")
+    ssim_btn.click(fn=sentence_similarity, inputs=[sa, sb], outputs=ssim_out)
+
+    # -------------------------
+    # NEW: Visualization (UMAP / t-SNE)
+    # -------------------------
+    gr.Markdown("## 📊 Visualization")
+    with gr.Row():
+        viz_words = gr.Textbox(label="Words (comma or newline separated)", placeholder="උදා: අම්මා, සියබස, පාසල")
+        viz_use_neighbors = gr.Checkbox(label="Expand with nearest neighbors", value=False)
+        viz_k = gr.Slider(minimum=1, maximum=40, step=1, value=10, label="Neighbors per word (if expanded)")
+        viz_method = gr.Radio(choices=["umap", "tsne"], value="umap", label="Projection method")
+    viz_img = gr.Image(type="pil", label="Projection (PNG)")
+    viz_btn = gr.Button("🎨 Create Visualization", elem_id="button")
+    def _viz_wrapper(words_text, use_neighbors, k, method):
+        res = visualize_words(words_text, use_neighbors, neighbors_k=int(k), projection_method=method)
+        if isinstance(res, dict) and "error" in res:
+            return gr.update(value=None), gr.update(value=f"Error: {res['error']}")
+        # res is a PIL image on success
+        return res, ""
+    viz_btn.click(fn=_viz_wrapper, inputs=[viz_words, viz_use_neighbors, viz_k, viz_method], outputs=[viz_img, gr.Textbox(visible=False)])
+
+    # -------------------------
+    # NEW: Practical demo - upload docs and semantic search
+    # -------------------------
+    gr.Markdown("## 📚 Practical demo — Upload Sinhala documents and semantic search")
+    with gr.Row():
+        upload = gr.File(label="Upload a .txt or .csv (each line is a doc)", file_count="single")
+        docs_list = gr.Dataframe(headers=["Document (first 200 chars)"], interactive=False)
+    idx_btn = gr.Button("📥 Index Documents", elem_id="button")
+    # store the indexed dataset in a state object
+    indexed_state = gr.State(value=None)
+
+    def _index_upload(file):
+        parsed = parse_uploaded_documents(file)
+        if "error" in parsed:
+            return None, gr.update(value=[]), {"error": parsed["error"]}
+        docs = parsed["documents"]
+        indexed = index_documents_for_search(docs)
+        if "error" in indexed:
+            return None, gr.update(value=[]), {"error": indexed["error"]}
+        # put the indexed object into state, and display a preview
+        preview = [[(d[:200] + "..." if len(d) > 200 else d)] for d in docs[:30]]
+        return indexed, gr.update(value=preview), {"success": f"Indexed {len(docs)} documents."}
+
+    idx_btn.click(fn=_index_upload, inputs=[upload], outputs=[indexed_state, docs_list, gr.Textbox(visible=False)])
+
+    with gr.Row():
+        q = gr.Textbox(label="Search query (Sinhala)")
+        topn = gr.Slider(1, 20, value=5, step=1, label="Top N results")
+        results_out = gr.JSON(label="Search Results")
+    def _search_wrapper(query, topn_, state):
+        if state is None:
+            return {"error": "⚠️ Please index documents first (upload + Index Documents)."}
+        # state is the dict returned by index_documents_for_search
+        indexed = state
+        # run search; results come back sorted by score
+        res = search_documents(query, indexed)
+        if "error" in res:
+            return res
+        # truncate to the requested Top N
+        if "results" in res:
+            res["results"] = res["results"][:int(topn_)]
+        return res
+    gr.Button("🔎 Search Documents", elem_id="button").click(fn=_search_wrapper, inputs=[q, topn, indexed_state], outputs=[results_out])
+
     gr.Markdown("""
     ---
     ✨ *Remeinium AI - Intelligence for a greater tomorrow*
     """)
 
-demo.queue(default_concurrency_limit=10).launch()
+# Keep queue and launch (same as before)
+demo.queue(default_concurrency_limit=10).launch()