# app.py
import gradio as gr
from huggingface_hub import hf_hub_download
import fasttext
import os
import numpy as np
from functools import lru_cache
import time
from typing import List, Tuple, Optional, Dict, Any
from collections import defaultdict, deque
# -------------------------
# Styles
# -------------------------
styles = """
body {
    background: #161616;
}
#button {
    background: linear-gradient(to right, #6A359C, #B589D6);
    color: #efefef;
    font-weight: 600;
    border: none;
    border-radius: 8px;
    margin: 8px auto;
    transition: all 0.3s ease;
}
#button_green {
    background: linear-gradient(to right, #18de78, #50eb9b);
    color: #1d1d1d;
    font-weight: 600;
    border: none;
    width: 50%;
    margin: 8px auto;
    border-radius: 8px;
    transition: all 0.3s ease;
}
#button:hover {
    background: linear-gradient(to right, #5A2D8C, #A579C6);
    transform: translateY(-2px);
    box-shadow: 0 4px 12px rgba(106, 53, 156, 0.3);
}
a {
    color: #1baaf2;
    text-decoration: none;
}
.normal-text {
    font-size: 25px;
}
"""
# -------------------------
# Website References
# -------------------------
website = 'https://ai.remeinium.com'
docs = 'https://esdocs.ai.remeinium.com'
js_docs = 'https://esdocs.ai.remeinium.com/api-reference/introduction#javascript'
cu_docs = 'https://esdocs.ai.remeinium.com/api-reference/introduction#curl'
status = 'https://stats.uptimerobot.com/HZFBOsSvBT'
model_url = 'https://huggingface.co/Remeinium/UgannA_SiyabasaV2'
# -------------------------
# Model Loading
# -------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise EnvironmentError("HF_TOKEN is not set. Please add it in Space Settings > Secrets.")

try:
    print("Downloading UgannA_SiyabasaV2 model...")
    model_path = hf_hub_download(
        repo_id="Remeinium/UgannA_SiyabasaV2",
        filename="UgannA_SiyabasaV2.bin",
        token=HF_TOKEN,
        repo_type="model"
    )
    model = fasttext.load_model(model_path)
    print("Model loaded successfully!")
    MODEL_INFO = {
        "name": "UgannA_SiyabasaV2",
        "version": "2.0",
        "dimensions": model.get_dimension(),
        "vocabulary_size": len(model.get_words()),
        "language": "Sinhala",
        "architecture": "FastText"
    }
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")
# -------------------------
# Rate Limiting
# -------------------------
class RateLimiter:
    def __init__(self):
        self.requests = defaultdict(deque)
        self.user_limits = defaultdict(deque)
        # limits
        self.limits = {
            "per_minute": 120,
            "per_hour": 2000,
            "per_day": 100000
        }

    def check_limit(self, client_id: str, user_id: str = None) -> Tuple[bool, Dict[str, Any]]:
        now = time.time()
        identifier = user_id if user_id else client_id
        client_requests = self.requests[identifier]
        # Clean old requests (24 hour window)
        while client_requests and client_requests[0] < now - 86400:
            client_requests.popleft()
        current_count = len(client_requests)
        # Check daily limit
        if current_count >= self.limits["per_day"]:
            return False, {
                "allowed": False,
                "limit": self.limits["per_day"],
                "current": current_count,
                "reset_in": 86400 - (now - client_requests[0]) if client_requests else 86400
            }
        # Check hourly limit
        hourly_requests = [req for req in client_requests if req > now - 3600]
        if len(hourly_requests) >= self.limits["per_hour"]:
            return False, {
                "allowed": False,
                "limit": self.limits["per_hour"],
                "current": len(hourly_requests),
                "reset_in": 3600 - (now - hourly_requests[0]) if hourly_requests else 3600
            }
        # Check minute-level limit
        minute_requests = [req for req in client_requests if req > now - 60]
        if len(minute_requests) >= self.limits["per_minute"]:
            return False, {
                "allowed": False,
                "limit": self.limits["per_minute"],
                "current": len(minute_requests),
                "reset_in": 60 - (now - minute_requests[0]) if minute_requests else 60
            }
        # Allow the request
        client_requests.append(now)
        return True, {
            "allowed": True,
            "limits": self.limits,
            "current_daily": current_count + 1,
            "remaining_daily": self.limits["per_day"] - current_count - 1
        }

rate_limiter = RateLimiter()
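# Illustrative wiring (a sketch only; the limiter is instantiated above but
# consulting it is left to the caller, and the client_id is a made-up example):
#
#   allowed, info = rate_limiter.check_limit(client_id="203.0.113.7")
#   if not allowed:
#       payload = {"error": "Rate limit exceeded", "retry_in_seconds": info["reset_in"]}
#   else:
#       payload = get_embedding("අම්මා")  # proceed with the real work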
# -------------------------
# Core Embedding Functions
# -------------------------
def enhanced_embedding_response(original_result, text, endpoint_type="word"):
    """Enhance a raw result with common model metadata and shape it per endpoint."""
    if "error" in original_result:
        return original_result
    # Common metadata
    original_result["model"] = "UgannA_SiyabasaV2"
    original_result["language"] = "Sinhala"
    original_result["dimensions"] = MODEL_INFO["dimensions"]
    # Format based on endpoint type
    if endpoint_type == "word":
        return {
            "text": text,
            "embedding": original_result.get("embedding", []),
            "dimensions": original_result["dimensions"],
            "model": original_result["model"],
            "language": original_result["language"]
        }
    elif endpoint_type == "sentence":
        return {
            "sentence": text,
            "embedding": original_result.get("embedding", []),
            "dimensions": original_result["dimensions"],
            "tokens": original_result.get("tokens", []),
            "token_count": original_result.get("token_count", 0),
            "model": original_result["model"],
            "language": original_result["language"]
        }
    else:
        # Similarity and neighbor results keep their original shape (plus metadata)
        return original_result
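# Illustrative reshaping for the "word" endpoint (embedding values are placeholders):
#   in:  {"word": "අම්මා", "embedding": [...], "dimensions": 300}
#   out: {"text": "අම්මා", "embedding": [...], "dimensions": 300,
#         "model": "UgannA_SiyabasaV2", "language": "Sinhala"}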
def safe_strip(s: Optional[str]) -> str:
    return "" if s is None else s.strip()

@lru_cache(maxsize=1)
def load_vocab_and_matrix(max_words: int = 500000):
    # Cached: building the 500k x 300 matrix is expensive, so do it once
    # rather than on every nearest-neighbor request.
    try:
        words = model.get_words()[:max_words]
        vectors = [model.get_word_vector(w) for w in words]
        mat = np.vstack(vectors).astype(np.float32)
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        mat_norm = mat / norms
        return words, mat, mat_norm
    except Exception as e:
        raise RuntimeError(f"Failed to load vocabulary matrix: {str(e)}")

def cosine_similarity_vec(u: np.ndarray, mat_norm: np.ndarray) -> np.ndarray:
    u_norm = np.linalg.norm(u)
    if u_norm == 0:
        return np.zeros(mat_norm.shape[0], dtype=np.float32)
    u = (u / u_norm).astype(np.float32)
    return np.dot(mat_norm, u)
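# Illustrative use of the two helpers above (a sketch; the small max_words
# value is only to keep the example cheap):
#
#   words, _, mat_norm = load_vocab_and_matrix(max_words=50000)
#   sims = cosine_similarity_vec(model.get_word_vector("අම්මා"), mat_norm)
#   best = words[int(np.argmax(sims))]  # highest-scoring vocabulary entry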
def get_embedding(word: str) -> Dict[str, Any]:
    word = safe_strip(word)
    if not word:
        return {"error": "Please provide a Sinhala word"}
    try:
        emb = model.get_word_vector(word)
        base_result = {
            "word": word,
            "embedding": emb.tolist(),
            "dimensions": len(emb)
        }
        return enhanced_embedding_response(base_result, word, "word")
    except Exception as e:
        return {"error": f"Failed to generate embedding: {str(e)}"}

def word_similarity(word1: str, word2: str) -> Dict[str, Any]:
    word1, word2 = safe_strip(word1), safe_strip(word2)
    if not word1 or not word2:
        return {"error": "Both words are required"}
    try:
        v1, v2 = model.get_word_vector(word1), model.get_word_vector(word2)
        # Cosine similarity: dot(v1, v2) / (||v1|| * ||v2||)
        denom = (np.linalg.norm(v1) * np.linalg.norm(v2))
        similarity = float(np.dot(v1, v2) / denom) if denom != 0 else 0.0
        base_result = {
            "word1": word1,
            "word2": word2,
            "similarity": round(similarity, 6)
        }
        return enhanced_embedding_response(base_result, f"{word1} vs {word2}", "similarity")
    except Exception as e:
        return {"error": f"Similarity computation failed: {str(e)}"}
def nearest_neighbors(word: str, top_k: int = 10) -> Dict[str, Any]:
    word = safe_strip(word)
    if not word:
        return {"error": "Word input required"}
    try:
        top_k = int(top_k)  # the UI slider delivers a float; slicing needs an int
        words, mat, mat_norm = load_vocab_and_matrix()
        vec = model.get_word_vector(word)
        sims = cosine_similarity_vec(vec, mat_norm)
        # Take one extra candidate so the query word itself can be skipped
        indices = np.argsort(-sims)[:top_k + 1]
        results = []
        for i in indices:
            neighbor = words[i]
            score = float(sims[i])
            if neighbor != word:
                results.append({"word": neighbor, "similarity": round(score, 6)})
            if len(results) >= top_k:
                break
        base_result = {
            "query": word,
            "neighbors": results
        }
        return enhanced_embedding_response(base_result, word, "neighbors")
    except Exception as e:
        return {"error": f"Neighbor search failed: {str(e)}"}
def sentence_embedding(sentence: str) -> Dict[str, Any]:
    sentence = safe_strip(sentence)
    if not sentence:
        return {"error": "Sentence input required"}
    try:
        tokens = [t for t in sentence.split() if t.strip()]
        if not tokens:
            return {"error": "No valid tokens found"}
        vectors = [model.get_word_vector(token) for token in tokens]
        avg_vector = np.mean(vectors, axis=0)
        base_result = {
            "sentence": sentence,
            "embedding": avg_vector.tolist(),
            "tokens": tokens,
            "token_count": len(tokens)
        }
        return enhanced_embedding_response(base_result, sentence, "sentence")
    except Exception as e:
        return {"error": f"Sentence embedding failed: {str(e)}"}
def sentence_similarity(sentence1: str, sentence2: str) -> Dict[str, Any]:
    try:
        emb1 = sentence_embedding(sentence1)
        emb2 = sentence_embedding(sentence2)
        if "error" in emb1 or "error" in emb2:
            return {"error": emb1.get("error", emb2.get("error"))}
        v1 = np.array(emb1["embedding"])
        v2 = np.array(emb2["embedding"])
        denom = (np.linalg.norm(v1) * np.linalg.norm(v2))
        similarity = float(np.dot(v1, v2) / denom) if denom != 0 else 0.0
        base_result = {
            "sentence1": sentence1,
            "sentence2": sentence2,
            "similarity": round(similarity, 6)
        }
        return enhanced_embedding_response(base_result, f"{sentence1} vs {sentence2}", "sentence_similarity")
    except Exception as e:
        return {"error": f"Sentence similarity failed: {str(e)}"}
# -------------------------
# Document Search
# -------------------------
def parse_uploaded_documents(file):
    if file is None:
        return {"error": "Please upload a file (txt/csv)."}
    file_path = file.name if hasattr(file, 'name') else str(file)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw = f.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                raw = f.read()
        except Exception as e:
            return {"error": f"Encoding error: {str(e)}"}
    except Exception as e:
        return {"error": f"File reading error: {str(e)}"}
    # Treat every non-empty line as one document (works for both .txt files
    # and single-column .csv files).
    docs = [line.strip() for line in raw.splitlines() if line.strip()]
    if not docs:
        return {"error": "No documents found in the file"}
    return {"documents": docs}
def index_documents_for_search(docs: List[str]):
    if not docs:
        return {"error": "The file was empty"}
    try:
        vecs = []
        for d in docs:
            tokens = [t for t in d.split() if t.strip()]
            if not tokens:
                vecs.append(np.zeros((model.get_dimension(),), dtype=np.float32))
                continue
            mats = np.vstack([model.get_word_vector(t) for t in tokens])
            vecs.append(mats.mean(axis=0))
        M = np.vstack(vecs).astype(np.float32)
        norms = np.linalg.norm(M, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        M_norm = M / norms
        return {"matrix": M, "matrix_norm": M_norm, "docs": docs}
    except Exception as e:
        return {"error": f"Error while indexing documents: {str(e)}"}
def search_documents(query: str, indexed):
    q = safe_strip(query)
    if not q:
        return {"error": "Enter a query to search"}
    try:
        q_tokens = [t for t in q.split() if t.strip()]
        if not q_tokens:
            return {"error": "Couldn't extract tokens from query"}
        q_vecs = np.vstack([model.get_word_vector(t) for t in q_tokens])
        q_avg = q_vecs.mean(axis=0)
        q_norm = np.linalg.norm(q_avg)
        if q_norm == 0:
            sims = np.zeros(indexed["matrix_norm"].shape[0], dtype=np.float32)
        else:
            q_avg = (q_avg / q_norm).astype(np.float32)
            sims = np.dot(indexed["matrix_norm"], q_avg)
        idx = np.argsort(-sims)[:10]
        results = []
        for i in idx:
            results.append({"document": indexed["docs"][i], "score": float(round(sims[i], 6))})
        return {"query": q, "results": results}
    except Exception as e:
        return {"error": f"Search failed: {str(e)}"}
# -------------------------
# API Platform
# -------------------------
def create_api_platform():
    with gr.Column():
        # Quick Start Section
        gr.Markdown("## Quick start")
        gr.Markdown("Get started with the `Embedding_Siyabasa API` in minutes.")
        with gr.Tabs():
            with gr.TabItem("🐍 Python"):
                gr.Markdown("""
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word="අම්මා",
    api_name="/get_embedding"
)
print(json.dumps(result, indent=4))
```
""")
                gr.Markdown("""
#### **Accepts 1 parameter:**
- `word` : `string` \*_<u>Required</u>_
  - The input value that is provided in the "Sinhala Word" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Embedding Vector" Json component.
""")
                # API Endpoints Section
                gr.Markdown("## API endpoints")

                # Word Embedding Endpoint
                with gr.Accordion("GET WORD EMBEDDING", open=True):
                    gr.Markdown("""
Get the embedding vector for a Sinhala word.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word="අම්මා",
    api_name="/get_embedding"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "text": "අම්මා",
    "embedding": [0.123, -0.456, 0.789, ...],
    "dimensions": 300,
    "model": "UgannA_SiyabasaV2",
    "language": "Sinhala"
}
```
""")
                    gr.Markdown("""
#### **Accepts 1 parameter:**
- `word` : `string` \*_<u>Required</u>_
  - The input value that is provided in the "Sinhala Word" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Embedding Vector" Json component.
""")
                # Word Similarity Endpoint
                with gr.Accordion("GET WORD SIMILARITY", open=False):
                    gr.Markdown("""
Compute the similarity between two Sinhala words.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word1="අම්මා",
    word2="තාත්තා",
    api_name="/word_similarity"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "word1": "අම්මා",
    "word2": "තාත්තා",
    "similarity": 0.856234,
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `word1` : `string` \*_<u>Required</u>_
   - The input value that is provided in the "Word 1" Textbox component.
2. `word2` : `string` \*_<u>Required</u>_
   - The input value that is provided in the "Word 2" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Similarity Result" Json component.
""")
                # Nearest Neighbors Endpoint
                with gr.Accordion("GET NEAREST NEIGHBORS", open=False):
                    gr.Markdown("""
Find semantically similar words for a given Sinhala word.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word="පෞරාණික",
    top_k=5,
    api_name="/nearest_neighbors"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "query": "පෞරාණික",
    "neighbors": [
        {"word": "ඉපැරණි", "similarity": 0.755...},
        {"word": "පුරාවිද්යාත්මක", "similarity": 0.749...},
        ...
    ],
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `word` : `str` \*_<u>Required</u>_
   - The input value that is provided in the "Query Word" Textbox component.
2. `top_k` : `float` _Default: 10_
   - The input value that is provided in the "Number of Results" Slider component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Similar Words" Json component.
""")
                # Sentence Embedding Endpoint
                with gr.Accordion("GET SENTENCE EMBEDDING", open=False):
                    gr.Markdown("""
Get the embedding vector for a Sinhala sentence.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    sentence="මම පාසලට යමි",
    api_name="/sentence_embedding"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "sentence": "මම පාසලට යමි",
    "embedding": [0.123, -0.456, 0.789, ...],
    "dimensions": 300,
    "tokens": ["මම", "පාසලට", "යමි"],
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 1 parameter:**
- `sentence` : `str` \*_<u>Required</u>_
  - The input value that is provided in the "Sinhala Sentence" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Sentence Embedding" Json component.
""")
                # Sentence Similarity Endpoint
                with gr.Accordion("GET SENTENCE SIMILARITY", open=False):
                    gr.Markdown("""
Compute the similarity between two Sinhala sentences.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    sentence1="මම පාසලට යමි",
    sentence2="ඔහු පාසලට යයි",
    api_name="/sentence_similarity"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "sentence1": "මම පාසලට යමි",
    "sentence2": "ඔහු පාසලට යයි",
    "similarity": 0.734567,
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `sentence1` : `str` \*_<u>Required</u>_
   - The input value that is provided in the "Sentence A" Textbox component.
2. `sentence2` : `str` \*_<u>Required</u>_
   - The input value that is provided in the "Sentence B" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Sentence Similarity" Json component.
""")
                # Document Search Endpoints
                with gr.Accordion("DOCUMENT SEARCH", open=False):
                    gr.Markdown("""
Upload documents and perform semantic search.

**Step 1: Index documents**
```python
from gradio_client import Client, handle_file
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    file=handle_file('path/to/documents.txt'),
    api_name="/_index_upload"
)
print(json.dumps(result, indent=4))
```
""")
                    gr.Markdown("""
#### **Accepts 1 parameter:**
1. `file` : `filepath` \*_<u>Required</u>_
   - The input value that is provided in the "Upload .txt or .csv File" File component.
   - The underlying `FileData` class (a subclass of `GradioModel`) represents an uploaded file and stores its data and metadata. Attributes: `path` (server file path), `url` (normalized server URL pointing to the file), `size` (file size in bytes), `orig_name` (original filename before upload), `mime_type` (MIME type of the file), `is_stream` (whether the file is a stream), and `meta` (internal metadata; should not be changed).

#### **Returns tuple of 2 elements**
1. `dict(headers: list[Any], data: list[list[Any]], metadata: dict(str, list[Any] | None) | None)`
   - The output value that appears in the `value_45` Dataframe component.
2. `str`
   - The output value that appears in the "Status" Textbox component.
""")
                    gr.Markdown("""
**Step 2: Search documents**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    query="සිංහල භාෂාව",
    topn_=5,
    api_name="/_search_wrapper"
)
print(json.dumps(result, indent=4))
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `query` : `string` \*_<u>Required</u>_
   - The input value that is provided in the "Search Query" Textbox component.
2. `topn_` : `float` _Default: 5_
   - The input value that is provided in the "Number of Results" Slider component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Search Results" Json component.
""")
| with gr.TabItem("</> JavaScript"): | |
| gr.Markdown(""" | |
| ```javascript | |
| import { Client } from "@gradio/client"; | |
| const client = await Client.connect("Remeinium/Embedding_Siyabasa"); | |
| const result = await client.predict("/get_embedding", { | |
| word: "අම්මා" | |
| }); | |
| console.log(result.data); | |
| ``` | |
| """) | |
| web_btn_js = gr.Button("Refer the Complete Javascript API Documentation", elem_id="button_green") | |
| js_code = f"() => window.open('{cu_docs}', '_blank')" | |
| web_btn_js.click(None, None, None, js=js_code) | |
| with gr.TabItem("␥ cURL"): | |
| gr.Markdown(""" | |
| ```bash | |
| curl -X POST https://remeinium-embedding-siyabasa.hf.space/gradio_api/call/get_embedding \\ | |
| -H "Content-Type: application/json" \\ | |
| -d '{"data": ["අම්මා"]}' | awk -F'"' '{ print $4}' | read EVENT_ID; \\ | |
| curl -N https://remeinium-embedding-siyabasa.hf.space/gradio_api/call/get_embedding/$EVENT_ID | |
| ``` | |
| """) | |
| web_btn_cu = gr.Button("Refer the Complete cURL API Documentation", elem_id="button_green") | |
| js_code = f"() => window.open('{cu_docs}', '_blank')" | |
| web_btn_cu.click(None, None, None, js=js_code) | |
        # Model Information
        gr.Markdown("## Model Details")
        gr.Markdown("""
| Property | Description |
|----------|-------------|
| **Model** | Embedding_Siyabasa API<br>`UgannA_SiyabasaV2` |
| **Supported data types**<br>Input<br>Output | <br>Text<br>Text embeddings |
| **Token limits**<br>Input token limit<br>Output dimension size | <br>1000<br>300 |
| **Version**<br>Model<br>API | <br>V_2.0<br>V_1.0 |
| **Latest update** | August 2025 |
| **Language** | `Sinhala` only |
""")

        # Usage and Limits
        gr.Markdown("## Usage and limits")
        gr.Markdown("""
- **Always Free**: Unlimited requests (subject to fair usage)
- **Rate limits**: Applied only during high traffic to ensure service stability
""")

        # Support
        gr.Markdown("## Support")
        gr.Markdown("""
- **Read the official <a href="https://esdocs.ai.remeinium.com" target="_blank">Documentation</a>.**
- **Technical support**: support@remeinium.com
- **Bug reports**: Create an issue in the Space discussions
- **Feature requests**: Contact support@remeinium.com

> **Note**: This API is designed specifically for **Sinhala** language processing and **may not work with other languages.**
""")
        web_btn_site = gr.Button("Visit Remeinium AI", elem_id="button_green")
        js_code = f"() => window.open('{website}', '_blank')"
        web_btn_site.click(None, None, None, js=js_code)
# -------------------------
# Main Application
# -------------------------
with gr.Blocks(title="Sinhala Embeddings API", css=styles) as demo:
    gr.Markdown("""
# 🇱🇰 Embedding_Siyabasa - Sinhala | An Advanced Embeddings API for the Sinhala Language

## Welcome to the official HuggingFace Space for _Embedding Siyabasa_

The `Embedding_Siyabasa API` provides high-quality text embedding models designed specifically for the `Sinhala` language. Generate embeddings for Sinhala words, phrases, and sentences using our latest model, `UgannA_SiyabasaV2`. These language-specific embeddings power advanced **NLP tasks such as semantic search, text classification, and document clustering**, delivering more accurate and context-aware results than traditional keyword-based approaches.

Get the Model (`UgannA_SiyabasaV2`): https://huggingface.co/Remeinium/UgannA_SiyabasaV2

**Key features:**
- **Language-specific**: Optimized exclusively for Sinhala text
- **300-dimensional embeddings**: Rich semantic representations
- **FastText architecture**: Proven performance for morphologically rich languages
""")
    with gr.Row():
        web_btn = gr.Button("Refer the Complete API Documentation", elem_id="button_green")
        js_code = f"() => window.open('{docs}', '_blank')"
        web_btn.click(None, None, None, js=js_code)
        web_btn_site = gr.Button("Visit Remeinium AI", elem_id="button")
        js_code = f"() => window.open('{website}', '_blank')"
        web_btn_site.click(None, None, None, js=js_code)
    with gr.Tabs():
        # Playground
        with gr.TabItem("🧩 Embedding Playground"):
            gr.Markdown("## Explore Model Capabilities")
            gr.Markdown("Test the model directly, without API access requirements.")

            # Word Embedding
            with gr.Row():
                inp = gr.Textbox(label="Sinhala Word", placeholder="අම්මා, සියබස, නූතන")
                out = gr.JSON(label="Embedding Vector")
            gr.Examples(
                examples=[["අම්මා"], ["සියබස"], ["නූතන"], ["ප්රජාතන්ත්රවාදය"]],
                inputs=inp, outputs=out, fn=get_embedding, cache_examples=True
            )
            btn = gr.Button("Get Embedding", elem_id="button")
            btn.click(fn=get_embedding, inputs=inp, outputs=out)

            # Word Similarity
            gr.Markdown("### Word Similarity")
            with gr.Row():
                ws_a = gr.Textbox(label="Word A", placeholder="අම්මා")
                ws_b = gr.Textbox(label="Word B", placeholder="තාත්තා")
                ws_out = gr.JSON(label="Similarity Result")
            ws_btn = gr.Button("Compare Words", elem_id="button")
            ws_btn.click(fn=word_similarity, inputs=[ws_a, ws_b], outputs=ws_out)

            # Nearest Neighbors
            gr.Markdown("### Semantic Search")
            with gr.Row():
                nn_word = gr.Textbox(label="Query Word", placeholder="පෞරාණික")
                nn_k = gr.Slider(1, 50, 10, label="Number of Results")
                nn_out = gr.JSON(label="Similar Words")
            gr.Examples(
                examples=[["අම්මා"], ["සියබස"], ["නූතන"], ["ප්රජාතන්ත්රවාදය"]],
                inputs=nn_word, outputs=nn_out, fn=nearest_neighbors, cache_examples=True
            )
            nn_btn = gr.Button("Find Similar Words", elem_id="button")
            nn_btn.click(fn=nearest_neighbors, inputs=[nn_word, nn_k], outputs=nn_out)

            # Sentence Operations
            gr.Markdown("### Sentence Operations")
            with gr.Row():
                sent_inp = gr.Textbox(label="Sinhala Sentence", placeholder="මම පාසලට යමි")
                sent_out = gr.JSON(label="Sentence Embedding")
            gr.Examples(
                examples=[["මම පාසලට යමි"], ["ආරෝග්යා පරමා ලාභා"], ["ඔබට බොහොම ස්තුතියි."]],
                inputs=sent_inp, outputs=sent_out, fn=sentence_embedding, cache_examples=True
            )
            sent_btn = gr.Button("Get Sentence Embedding", elem_id="button")
            sent_btn.click(fn=sentence_embedding, inputs=sent_inp, outputs=sent_out)

            with gr.Row():
                sa = gr.Textbox(label="Sentence A", placeholder="මම පාසලට යමි")
                sb = gr.Textbox(label="Sentence B", placeholder="ඔහු පාසලට යයි")
                ssim_out = gr.JSON(label="Sentence Similarity")
            ssim_btn = gr.Button("Compare Sentences", elem_id="button")
            ssim_btn.click(fn=sentence_similarity, inputs=[sa, sb], outputs=ssim_out)

            # Document Search
            gr.Markdown("### Document Semantic Search")
            gr.Markdown("Upload a text file (one document per line) for semantic search.")
            status_display = gr.Textbox(label="Status", value="Ready to upload documents", interactive=False)
            with gr.Row():
                upload = gr.File(label="Upload .txt or .csv File", file_count="single")
                docs_list = gr.Dataframe(headers=["Document Preview"], interactive=False)
            idx_btn = gr.Button("Index Documents", elem_id="button")
            indexed_state = gr.State(value=None)
            def _index_upload(file):
                if file is None:
                    return None, gr.update(value=[]), "Please upload a file first"
                parsed = parse_uploaded_documents(file)
                if "error" in parsed:
                    return None, gr.update(value=[]), parsed["error"]
                docs = parsed["documents"]
                indexed = index_documents_for_search(docs)
                if "error" in indexed:
                    return None, gr.update(value=[]), indexed["error"]
                preview = [[(d[:200] + "..." if len(d) > 200 else d)] for d in docs[:20]]
                return indexed, gr.update(value=preview), f"Indexed {len(docs)} documents"

            idx_btn.click(_index_upload, inputs=[upload], outputs=[indexed_state, docs_list, status_display])

            with gr.Row():
                q = gr.Textbox(label="Search Query")
                topn = gr.Slider(1, 20, 5, label="Number of Results")
                results_out = gr.JSON(label="Search Results")

            def _search_wrapper(query, topn_, state):
                if state is None:
                    return {"error": "Please index documents first"}
                res = search_documents(query, state)
                if "results" in res:
                    res["results"] = res["results"][:int(topn_)]
                return res

            search_btn = gr.Button("Search Documents", elem_id="button")
            search_btn.click(fn=_search_wrapper, inputs=[q, topn, indexed_state], outputs=[results_out])
        # API Platform Tab
        with gr.TabItem("⚡ API Platform"):
            create_api_platform()

        with gr.TabItem("💡 Status"):
            # gr.Markdown("Check at : https://stats.uptimerobot.com/HZFBOsSvBT")
            web_btn_status = gr.Button("Check Status", elem_id="button")
            js_code = f"() => window.open('{status}', '_blank')"
            web_btn_status.click(None, None, None, js=js_code)

    gr.Markdown("""
---
*✨ **<a href="https://ai.remeinium.com" target="_blank">Remeinium AI</a>** · _Intelligence for a greater tomorrow._*
""")

if __name__ == "__main__":
    # demo.queue(default_concurrency_limit=10, max_size=20).launch()
    demo.launch()