# app.py
import gradio as gr
from huggingface_hub import hf_hub_download
import fasttext
import os
import numpy as np
from functools import lru_cache
import time
from typing import List, Tuple, Optional, Dict, Any
from collections import defaultdict, deque
# -------------------------
# Styles
# -------------------------
styles = """
body {
    background: #161616;
}
#button {
    background: linear-gradient(to right, #6A359C, #B589D6);
    color: #efefef;
    font-weight: 600;
    border: none;
    border-radius: 8px;
    margin: 8px auto;
    transition: all 0.3s ease;
}
#button_green {
    background: linear-gradient(to right, #18de78, #50eb9b);
    color: #1d1d1d;
    font-weight: 600;
    border: none;
    width: 50%;
    margin: 8px auto;
    border-radius: 8px;
    transition: all 0.3s ease;
}
#button:hover {
    background: linear-gradient(to right, #5A2D8C, #A579C6);
    transform: translateY(-2px);
    box-shadow: 0 4px 12px rgba(106, 53, 156, 0.3);
}
a {
    color: #1baaf2;
    text-decoration: none;
}
.normal-text {
    font-size: 25px;
}
"""
# -------------------------
# Website References
# -------------------------
website = 'https://ai.remeinium.com'
docs = 'https://esdocs.ai.remeinium.com'
js_docs = 'https://esdocs.ai.remeinium.com/api-reference/introduction#javascript'
cu_docs = 'https://esdocs.ai.remeinium.com/api-reference/introduction#curl'
status = 'https://stats.uptimerobot.com/HZFBOsSvBT'
model_url = 'https://huggingface.co/Remeinium/UgannA_SiyabasaV2'
# -------------------------
# Model Loading
# -------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise EnvironmentError("HF_TOKEN is not set. Please add it in Space Settings > Secrets.")

try:
    print("Downloading UgannA_SiyabasaV2 model...")
    model_path = hf_hub_download(
        repo_id="Remeinium/UgannA_SiyabasaV2",
        filename="UgannA_SiyabasaV2.bin",
        token=HF_TOKEN,
        repo_type="model"
    )
    model = fasttext.load_model(model_path)
    print("Model loaded successfully!")
    MODEL_INFO = {
        "name": "UgannA_SiyabasaV2",
        "version": "2.0",
        "dimensions": model.get_dimension(),
        "vocabulary_size": len(model.get_words()),
        "language": "Sinhala",
        "architecture": "FastText"
    }
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")
# -------------------------
# Rate Limiting
# -------------------------
class RateLimiter:
    def __init__(self):
        self.requests = defaultdict(deque)
        self.user_limits = defaultdict(deque)
        # limits
        self.limits = {
            "per_minute": 120,
            "per_hour": 2000,
            "per_day": 100000
        }

    def check_limit(self, client_id: str, user_id: str = None) -> Tuple[bool, Dict[str, Any]]:
        now = time.time()
        identifier = user_id if user_id else client_id
        client_requests = self.requests[identifier]
        # Clean old requests (24 hour window)
        while client_requests and client_requests[0] < now - 86400:
            client_requests.popleft()
        current_count = len(client_requests)
        # Check daily limit
        if current_count >= self.limits["per_day"]:
            return False, {
                "allowed": False,
                "limit": self.limits["per_day"],
                "current": current_count,
                "reset_in": 86400 - (now - client_requests[0]) if client_requests else 86400
            }
        # Check hourly limit
        hourly_requests = [req for req in client_requests if req > now - 3600]
        if len(hourly_requests) >= self.limits["per_hour"]:
            return False, {
                "allowed": False,
                "limit": self.limits["per_hour"],
                "current": len(hourly_requests),
                "reset_in": 3600 - (now - hourly_requests[0]) if hourly_requests else 3600
            }
        # Check minute-level limit
        minute_requests = [req for req in client_requests if req > now - 60]
        if len(minute_requests) >= self.limits["per_minute"]:
            return False, {
                "allowed": False,
                "limit": self.limits["per_minute"],
                "current": len(minute_requests),
                "reset_in": 60 - (now - minute_requests[0]) if minute_requests else 60
            }
        # Allow the request
        client_requests.append(now)
        return True, {
            "allowed": True,
            "limits": self.limits,
            "current_daily": current_count + 1,
            "remaining_daily": self.limits["per_day"] - current_count - 1
        }

rate_limiter = RateLimiter()
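# Illustrative wiring (a sketch only; the limiter is instantiated above but
# consulting it is left to the caller, and the client_id is a made-up example):
#
#   allowed, info = rate_limiter.check_limit(client_id="203.0.113.7")
#   if not allowed:
#       payload = {"error": "Rate limit exceeded", "retry_in_seconds": info["reset_in"]}
#   else:
#       payload = get_embedding("අම්මා")  # proceed with the real work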
# -------------------------
# Core Embedding Functions
# -------------------------
def enhanced_embedding_response(original_result, text, endpoint_type="word"):
    """Enhance a raw result with common model metadata and shape it per endpoint."""
    if "error" in original_result:
        return original_result
    # Common metadata
    original_result["model"] = "UgannA_SiyabasaV2"
    original_result["language"] = "Sinhala"
    original_result["dimensions"] = MODEL_INFO["dimensions"]
    # Format based on endpoint type
    if endpoint_type == "word":
        return {
            "text": text,
            "embedding": original_result.get("embedding", []),
            "dimensions": original_result["dimensions"],
            "model": original_result["model"],
            "language": original_result["language"]
        }
    elif endpoint_type == "sentence":
        return {
            "sentence": text,
            "embedding": original_result.get("embedding", []),
            "dimensions": original_result["dimensions"],
            "tokens": original_result.get("tokens", []),
            "token_count": original_result.get("token_count", 0),
            "model": original_result["model"],
            "language": original_result["language"]
        }
    else:
        # Similarity and neighbor results keep their original shape (plus metadata)
        return original_result
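# Illustrative reshaping for the "word" endpoint (embedding values are placeholders):
#   in:  {"word": "අම්මා", "embedding": [...], "dimensions": 300}
#   out: {"text": "අම්මා", "embedding": [...], "dimensions": 300,
#         "model": "UgannA_SiyabasaV2", "language": "Sinhala"}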
def safe_strip(s: Optional[str]) -> str:
    return "" if s is None else s.strip()

@lru_cache(maxsize=1)
def load_vocab_and_matrix(max_words: int = 500000):
    # Cached: building the 500k x 300 matrix is expensive, so do it once
    # rather than on every nearest-neighbor request.
    try:
        words = model.get_words()[:max_words]
        vectors = [model.get_word_vector(w) for w in words]
        mat = np.vstack(vectors).astype(np.float32)
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        mat_norm = mat / norms
        return words, mat, mat_norm
    except Exception as e:
        raise RuntimeError(f"Failed to load vocabulary matrix: {str(e)}")

def cosine_similarity_vec(u: np.ndarray, mat_norm: np.ndarray) -> np.ndarray:
    u_norm = np.linalg.norm(u)
    if u_norm == 0:
        return np.zeros(mat_norm.shape[0], dtype=np.float32)
    u = (u / u_norm).astype(np.float32)
    return np.dot(mat_norm, u)
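# Illustrative use of the two helpers above (a sketch; the small max_words
# value is only to keep the example cheap):
#
#   words, _, mat_norm = load_vocab_and_matrix(max_words=50000)
#   sims = cosine_similarity_vec(model.get_word_vector("අම්මා"), mat_norm)
#   best = words[int(np.argmax(sims))]  # highest-scoring vocabulary entry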
def get_embedding(word: str) -> Dict[str, Any]:
    word = safe_strip(word)
    if not word:
        return {"error": "Please provide a Sinhala word"}
    try:
        emb = model.get_word_vector(word)
        base_result = {
            "word": word,
            "embedding": emb.tolist(),
            "dimensions": len(emb)
        }
        return enhanced_embedding_response(base_result, word, "word")
    except Exception as e:
        return {"error": f"Failed to generate embedding: {str(e)}"}

def word_similarity(word1: str, word2: str) -> Dict[str, Any]:
    word1, word2 = safe_strip(word1), safe_strip(word2)
    if not word1 or not word2:
        return {"error": "Both words are required"}
    try:
        v1, v2 = model.get_word_vector(word1), model.get_word_vector(word2)
        # Cosine similarity: dot(v1, v2) / (||v1|| * ||v2||)
        denom = (np.linalg.norm(v1) * np.linalg.norm(v2))
        similarity = float(np.dot(v1, v2) / denom) if denom != 0 else 0.0
        base_result = {
            "word1": word1,
            "word2": word2,
            "similarity": round(similarity, 6)
        }
        return enhanced_embedding_response(base_result, f"{word1} vs {word2}", "similarity")
    except Exception as e:
        return {"error": f"Similarity computation failed: {str(e)}"}
def nearest_neighbors(word: str, top_k: int = 10) -> Dict[str, Any]:
    word = safe_strip(word)
    if not word:
        return {"error": "Word input required"}
    try:
        top_k = int(top_k)  # the UI slider delivers a float; slicing needs an int
        words, mat, mat_norm = load_vocab_and_matrix()
        vec = model.get_word_vector(word)
        sims = cosine_similarity_vec(vec, mat_norm)
        # Take one extra candidate so the query word itself can be skipped
        indices = np.argsort(-sims)[:top_k + 1]
        results = []
        for i in indices:
            neighbor = words[i]
            score = float(sims[i])
            if neighbor != word:
                results.append({"word": neighbor, "similarity": round(score, 6)})
            if len(results) >= top_k:
                break
        base_result = {
            "query": word,
            "neighbors": results
        }
        return enhanced_embedding_response(base_result, word, "neighbors")
    except Exception as e:
        return {"error": f"Neighbor search failed: {str(e)}"}
def sentence_embedding(sentence: str) -> Dict[str, Any]:
    sentence = safe_strip(sentence)
    if not sentence:
        return {"error": "Sentence input required"}
    try:
        tokens = [t for t in sentence.split() if t.strip()]
        if not tokens:
            return {"error": "No valid tokens found"}
        vectors = [model.get_word_vector(token) for token in tokens]
        avg_vector = np.mean(vectors, axis=0)
        base_result = {
            "sentence": sentence,
            "embedding": avg_vector.tolist(),
            "tokens": tokens,
            "token_count": len(tokens)
        }
        return enhanced_embedding_response(base_result, sentence, "sentence")
    except Exception as e:
        return {"error": f"Sentence embedding failed: {str(e)}"}
def sentence_similarity(sentence1: str, sentence2: str) -> Dict[str, Any]:
    try:
        emb1 = sentence_embedding(sentence1)
        emb2 = sentence_embedding(sentence2)
        if "error" in emb1 or "error" in emb2:
            return {"error": emb1.get("error", emb2.get("error"))}
        v1 = np.array(emb1["embedding"])
        v2 = np.array(emb2["embedding"])
        denom = (np.linalg.norm(v1) * np.linalg.norm(v2))
        similarity = float(np.dot(v1, v2) / denom) if denom != 0 else 0.0
        base_result = {
            "sentence1": sentence1,
            "sentence2": sentence2,
            "similarity": round(similarity, 6)
        }
        return enhanced_embedding_response(base_result, f"{sentence1} vs {sentence2}", "sentence_similarity")
    except Exception as e:
        return {"error": f"Sentence similarity failed: {str(e)}"}
# -------------------------
# Document Search
# -------------------------
def parse_uploaded_documents(file):
    if file is None:
        return {"error": "Please upload a file (txt/csv)."}
    file_path = file.name if hasattr(file, 'name') else str(file)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw = f.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                raw = f.read()
        except Exception as e:
            return {"error": f"Encoding error: {str(e)}"}
    except Exception as e:
        return {"error": f"File reading error: {str(e)}"}
    # Treat every non-empty line as one document (works for both .txt files
    # and single-column .csv files).
    docs = [line.strip() for line in raw.splitlines() if line.strip()]
    if not docs:
        return {"error": "No documents found in the file"}
    return {"documents": docs}
def index_documents_for_search(docs: List[str]):
    if not docs:
        return {"error": "The file was empty"}
    try:
        vecs = []
        for d in docs:
            tokens = [t for t in d.split() if t.strip()]
            if not tokens:
                vecs.append(np.zeros((model.get_dimension(),), dtype=np.float32))
                continue
            mats = np.vstack([model.get_word_vector(t) for t in tokens])
            vecs.append(mats.mean(axis=0))
        M = np.vstack(vecs).astype(np.float32)
        norms = np.linalg.norm(M, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        M_norm = M / norms
        return {"matrix": M, "matrix_norm": M_norm, "docs": docs}
    except Exception as e:
        return {"error": f"Error while indexing documents: {str(e)}"}
def search_documents(query: str, indexed):
    q = safe_strip(query)
    if not q:
        return {"error": "Enter a query to search"}
    try:
        q_tokens = [t for t in q.split() if t.strip()]
        if not q_tokens:
            return {"error": "Couldn't extract tokens from query"}
        q_vecs = np.vstack([model.get_word_vector(t) for t in q_tokens])
        q_avg = q_vecs.mean(axis=0)
        q_norm = np.linalg.norm(q_avg)
        if q_norm == 0:
            sims = np.zeros(indexed["matrix_norm"].shape[0], dtype=np.float32)
        else:
            q_avg = (q_avg / q_norm).astype(np.float32)
            sims = np.dot(indexed["matrix_norm"], q_avg)
        idx = np.argsort(-sims)[:10]
        results = []
        for i in idx:
            results.append({"document": indexed["docs"][i], "score": float(round(sims[i], 6))})
        return {"query": q, "results": results}
    except Exception as e:
        return {"error": f"Search failed: {str(e)}"}
# -------------------------
# API Platform
# -------------------------
def create_api_platform():
    with gr.Column():
        # Quick Start Section
        gr.Markdown("## Quick start")
        gr.Markdown("Get started with the `Embedding_Siyabasa API` in minutes.")
        with gr.Tabs():
            with gr.TabItem("🐍 Python"):
                gr.Markdown("""
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word="අම්මා",
    api_name="/get_embedding"
)
print(json.dumps(result, indent=4))
```
""")
                gr.Markdown("""
#### **Accepts 1 parameter:**
- `word` : `string` \*_<u>Required</u>_
  - The input value that is provided in the "Sinhala Word" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Embedding Vector" Json component.
""")
                # API Endpoints Section
                gr.Markdown("## API endpoints")

                # Word Embedding Endpoint
                with gr.Accordion("GET WORD EMBEDDING", open=True):
                    gr.Markdown("""
Get the embedding vector for a Sinhala word.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word="අම්මා",
    api_name="/get_embedding"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "text": "අම්මා",
    "embedding": [0.123, -0.456, 0.789, ...],
    "dimensions": 300,
    "model": "UgannA_SiyabasaV2",
    "language": "Sinhala"
}
```
""")
                    gr.Markdown("""
#### **Accepts 1 parameter:**
- `word` : `string` \*_<u>Required</u>_
  - The input value that is provided in the "Sinhala Word" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Embedding Vector" Json component.
""")
                # Word Similarity Endpoint
                with gr.Accordion("GET WORD SIMILARITY", open=False):
                    gr.Markdown("""
Compute the similarity between two Sinhala words.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word1="අම්මා",
    word2="තාත්තා",
    api_name="/word_similarity"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "word1": "අම්මා",
    "word2": "තාත්තා",
    "similarity": 0.856234,
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `word1` : `string` \*_<u>Required</u>_
   - The input value that is provided in the "Word 1" Textbox component.
2. `word2` : `string` \*_<u>Required</u>_
   - The input value that is provided in the "Word 2" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Similarity Result" Json component.
""")
                # Nearest Neighbors Endpoint
                with gr.Accordion("GET NEAREST NEIGHBORS", open=False):
                    gr.Markdown("""
Find semantically similar words for a given Sinhala word.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    word="පෞරාණික",
    top_k=5,
    api_name="/nearest_neighbors"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "query": "පෞරාණික",
    "neighbors": [
        {"word": "ඉපැරණි", "similarity": 0.755...},
        {"word": "පුරාවිද්යාත්මක", "similarity": 0.749...},
        ...
    ],
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `word` : `str` \*_<u>Required</u>_
   - The input value that is provided in the "Query Word" Textbox component.
2. `top_k` : `float` _Default: 10_
   - The input value that is provided in the "Number of Results" Slider component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Similar Words" Json component.
""")
                # Sentence Embedding Endpoint
                with gr.Accordion("GET SENTENCE EMBEDDING", open=False):
                    gr.Markdown("""
Get the embedding vector for a Sinhala sentence.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    sentence="මම පාසලට යමි",
    api_name="/sentence_embedding"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "sentence": "මම පාසලට යමි",
    "embedding": [0.123, -0.456, 0.789, ...],
    "dimensions": 300,
    "tokens": ["මම", "පාසලට", "යමි"],
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 1 parameter:**
- `sentence` : `str` \*_<u>Required</u>_
  - The input value that is provided in the "Sinhala Sentence" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Sentence Embedding" Json component.
""")
                # Sentence Similarity Endpoint
                with gr.Accordion("GET SENTENCE SIMILARITY", open=False):
                    gr.Markdown("""
Compute the similarity between two Sinhala sentences.

**Python example:**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    sentence1="මම පාසලට යමි",
    sentence2="ඔහු පාසලට යයි",
    api_name="/sentence_similarity"
)
print(json.dumps(result, indent=4))
```

**Response format:**
```json
{
    "sentence1": "මම පාසලට යමි",
    "sentence2": "ඔහු පාසලට යයි",
    "similarity": 0.734567,
    "model": "UgannA_SiyabasaV2"
}
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `sentence1` : `str` \*_<u>Required</u>_
   - The input value that is provided in the "Sentence A" Textbox component.
2. `sentence2` : `str` \*_<u>Required</u>_
   - The input value that is provided in the "Sentence B" Textbox component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Sentence Similarity" Json component.
""")
                # Document Search Endpoints
                with gr.Accordion("DOCUMENT SEARCH", open=False):
                    gr.Markdown("""
Upload documents and perform semantic search.

**Step 1: Index documents**
```python
from gradio_client import Client, handle_file
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    file=handle_file('path/to/documents.txt'),
    api_name="/_index_upload"
)
print(json.dumps(result, indent=4))
```
""")
                    gr.Markdown("""
#### **Accepts 1 parameter:**
1. `file` : `filepath` \*_<u>Required</u>_
   - The input value that is provided in the "Upload .txt or .csv File" File component.
   - The underlying `FileData` class (a subclass of `GradioModel`) represents an uploaded file and stores its data and metadata. Attributes: `path` (server file path), `url` (normalized server URL pointing to the file), `size` (file size in bytes), `orig_name` (original filename before upload), `mime_type` (MIME type of the file), `is_stream` (whether the file is a stream), and `meta` (internal metadata; should not be changed).

#### **Returns tuple of 2 elements**
1. `dict(headers: list[Any], data: list[list[Any]], metadata: dict(str, list[Any] | None) | None)`
   - The output value that appears in the `value_45` Dataframe component.
2. `str`
   - The output value that appears in the "Status" Textbox component.
""")
                    gr.Markdown("""
**Step 2: Search documents**
```python
from gradio_client import Client
import json

client = Client("Remeinium/Embedding_Siyabasa")
result = client.predict(
    query="සිංහල භාෂාව",
    topn_=5,
    api_name="/_search_wrapper"
)
print(json.dumps(result, indent=4))
```
""")
                    gr.Markdown("""
#### **Accepts 2 parameters:**
1. `query` : `string` \*_<u>Required</u>_
   - The input value that is provided in the "Search Query" Textbox component.
2. `topn_` : `float` _Default: 5_
   - The input value that is provided in the "Number of Results" Slider component.

#### **Returns 1 element**
- `str | float | bool | list | dict`
  - The output value that appears in the "Search Results" Json component.
""")
| with gr.TabItem("</> JavaScript"): | |
| gr.Markdown(""" | |
| ```javascript | |
| import { Client } from "@gradio/client"; | |
| const client = await Client.connect("Remeinium/Embedding_Siyabasa"); | |
| const result = await client.predict("/get_embedding", { | |
| word: "අම්මා" | |
| }); | |
| console.log(result.data); | |
| ``` | |
| """) | |
| web_btn_js = gr.Button("Refer the Complete Javascript API Documentation", elem_id="button_green") | |
| js_code = f"() => window.open('{cu_docs}', '_blank')" | |
| web_btn_js.click(None, None, None, js=js_code) | |
| with gr.TabItem("␥ cURL"): | |
| gr.Markdown(""" | |
| ```bash | |
| curl -X POST https://remeinium-embedding-siyabasa.hf.space/gradio_api/call/get_embedding \\ | |
| -H "Content-Type: application/json" \\ | |
| -d '{"data": ["අම්මා"]}' | awk -F'"' '{ print $4}' | read EVENT_ID; \\ | |
| curl -N https://remeinium-embedding-siyabasa.hf.space/gradio_api/call/get_embedding/$EVENT_ID | |
| ``` | |
| """) | |
| web_btn_cu = gr.Button("Refer the Complete cURL API Documentation", elem_id="button_green") | |
| js_code = f"() => window.open('{cu_docs}', '_blank')" | |
| web_btn_cu.click(None, None, None, js=js_code) | |
        # Model Information
        gr.Markdown("## Model Details")
        gr.Markdown("""
| Property | Description |
|----------|-------------|
| **Model** | Embedding_Siyabasa API<br>`UgannA_SiyabasaV2` |
| **Supported data types**<br>Input<br>Output | <br>Text<br>Text embeddings |
| **Token limits**<br>Input token limit<br>Output dimension size | <br>1000<br>300 |
| **Version**<br>Model<br>API | <br>V_2.0<br>V_1.0 |
| **Latest update** | August 2025 |
| **Language** | `Sinhala` only |
""")

        # Usage and Limits
        gr.Markdown("## Usage and limits")
        gr.Markdown("""
- **Always Free**: Unlimited requests (subject to fair usage)
- **Rate limits**: Applied only during high traffic to ensure service stability
""")

        # Support
        gr.Markdown("## Support")
        gr.Markdown("""
- **Read the official <a href="https://esdocs.ai.remeinium.com" target="_blank">Documentation</a>.**
- **Technical support**: support@remeinium.com
- **Bug reports**: Create an issue in the Space discussions
- **Feature requests**: Contact support@remeinium.com

> **Note**: This API is designed specifically for **Sinhala** language processing and **may not work with other languages.**
""")
        web_btn_site = gr.Button("Visit Remeinium AI", elem_id="button_green")
        js_code = f"() => window.open('{website}', '_blank')"
        web_btn_site.click(None, None, None, js=js_code)
# -------------------------
# Main Application
# -------------------------
with gr.Blocks(title="Sinhala Embeddings API", css=styles) as demo:
    gr.Markdown("""
# 🇱🇰 Embedding_Siyabasa - Sinhala | An Advanced Embeddings API for the Sinhala Language

## Welcome to the official HuggingFace Space for _Embedding Siyabasa_

The `Embedding_Siyabasa API` provides high-quality text embedding models designed specifically for the `Sinhala` language. Generate embeddings for Sinhala words, phrases, and sentences using our latest model, `UgannA_SiyabasaV2`. These language-specific embeddings power advanced **NLP tasks such as semantic search, text classification, and document clustering**, delivering more accurate and context-aware results than traditional keyword-based approaches.

Get the Model (`UgannA_SiyabasaV2`): https://huggingface.co/Remeinium/UgannA_SiyabasaV2

**Key features:**
- **Language-specific**: Optimized exclusively for Sinhala text
- **300-dimensional embeddings**: Rich semantic representations
- **FastText architecture**: Proven performance for morphologically rich languages
""")
    with gr.Row():
        web_btn = gr.Button("Refer the Complete API Documentation", elem_id="button_green")
        js_code = f"() => window.open('{docs}', '_blank')"
        web_btn.click(None, None, None, js=js_code)
        web_btn_site = gr.Button("Visit Remeinium AI", elem_id="button")
        js_code = f"() => window.open('{website}', '_blank')"
        web_btn_site.click(None, None, None, js=js_code)
    with gr.Tabs():
        # Playground
        with gr.TabItem("🧩 Embedding Playground"):
            gr.Markdown("## Explore Model Capabilities")
            gr.Markdown("Test the model directly, without API access requirements.")

            # Word Embedding
            with gr.Row():
                inp = gr.Textbox(label="Sinhala Word", placeholder="අම්මා, සියබස, නූතන")
                out = gr.JSON(label="Embedding Vector")
            gr.Examples(
                examples=[["අම්මා"], ["සියබස"], ["නූතන"], ["ප්රජාතන්ත්රවාදය"]],
                inputs=inp, outputs=out, fn=get_embedding, cache_examples=True
            )
            btn = gr.Button("Get Embedding", elem_id="button")
            btn.click(fn=get_embedding, inputs=inp, outputs=out)

            # Word Similarity
            gr.Markdown("### Word Similarity")
            with gr.Row():
                ws_a = gr.Textbox(label="Word A", placeholder="අම්මා")
                ws_b = gr.Textbox(label="Word B", placeholder="තාත්තා")
                ws_out = gr.JSON(label="Similarity Result")
            ws_btn = gr.Button("Compare Words", elem_id="button")
            ws_btn.click(fn=word_similarity, inputs=[ws_a, ws_b], outputs=ws_out)

            # Nearest Neighbors
            gr.Markdown("### Semantic Search")
            with gr.Row():
                nn_word = gr.Textbox(label="Query Word", placeholder="පෞරාණික")
                nn_k = gr.Slider(1, 50, 10, label="Number of Results")
                nn_out = gr.JSON(label="Similar Words")
            gr.Examples(
                examples=[["අම්මා"], ["සියබස"], ["නූතන"], ["ප්රජාතන්ත්රවාදය"]],
                inputs=nn_word, outputs=nn_out, fn=nearest_neighbors, cache_examples=True
            )
            nn_btn = gr.Button("Find Similar Words", elem_id="button")
            nn_btn.click(fn=nearest_neighbors, inputs=[nn_word, nn_k], outputs=nn_out)

            # Sentence Operations
            gr.Markdown("### Sentence Operations")
            with gr.Row():
                sent_inp = gr.Textbox(label="Sinhala Sentence", placeholder="මම පාසලට යමි")
                sent_out = gr.JSON(label="Sentence Embedding")
            gr.Examples(
                examples=[["මම පාසලට යමි"], ["ආරෝග්යා පරමා ලාභා"], ["ඔබට බොහොම ස්තුතියි."]],
                inputs=sent_inp, outputs=sent_out, fn=sentence_embedding, cache_examples=True
            )
            sent_btn = gr.Button("Get Sentence Embedding", elem_id="button")
            sent_btn.click(fn=sentence_embedding, inputs=sent_inp, outputs=sent_out)

            with gr.Row():
                sa = gr.Textbox(label="Sentence A", placeholder="මම පාසලට යමි")
                sb = gr.Textbox(label="Sentence B", placeholder="ඔහු පාසලට යයි")
                ssim_out = gr.JSON(label="Sentence Similarity")
            ssim_btn = gr.Button("Compare Sentences", elem_id="button")
            ssim_btn.click(fn=sentence_similarity, inputs=[sa, sb], outputs=ssim_out)

            # Document Search
            gr.Markdown("### Document Semantic Search")
            gr.Markdown("Upload a text file (one document per line) for semantic search.")
            status_display = gr.Textbox(label="Status", value="Ready to upload documents", interactive=False)
            with gr.Row():
                upload = gr.File(label="Upload .txt or .csv File", file_count="single")
                docs_list = gr.Dataframe(headers=["Document Preview"], interactive=False)
            idx_btn = gr.Button("Index Documents", elem_id="button")
            indexed_state = gr.State(value=None)
            def _index_upload(file):
                if file is None:
                    return None, gr.update(value=[]), "Please upload a file first"
                parsed = parse_uploaded_documents(file)
                if "error" in parsed:
                    return None, gr.update(value=[]), parsed["error"]
                docs = parsed["documents"]
                indexed = index_documents_for_search(docs)
                if "error" in indexed:
                    return None, gr.update(value=[]), indexed["error"]
                preview = [[(d[:200] + "..." if len(d) > 200 else d)] for d in docs[:20]]
                return indexed, gr.update(value=preview), f"Indexed {len(docs)} documents"

            idx_btn.click(_index_upload, inputs=[upload], outputs=[indexed_state, docs_list, status_display])

            with gr.Row():
                q = gr.Textbox(label="Search Query")
                topn = gr.Slider(1, 20, 5, label="Number of Results")
                results_out = gr.JSON(label="Search Results")

            def _search_wrapper(query, topn_, state):
                if state is None:
                    return {"error": "Please index documents first"}
                res = search_documents(query, state)
                if "results" in res:
                    res["results"] = res["results"][:int(topn_)]
                return res

            search_btn = gr.Button("Search Documents", elem_id="button")
            search_btn.click(fn=_search_wrapper, inputs=[q, topn, indexed_state], outputs=[results_out])
        # API Platform Tab
        with gr.TabItem("⚡ API Platform"):
            create_api_platform()

        with gr.TabItem("💡 Status"):
            # gr.Markdown("Check at : https://stats.uptimerobot.com/HZFBOsSvBT")
            web_btn_status = gr.Button("Check Status", elem_id="button")
            js_code = f"() => window.open('{status}', '_blank')"
            web_btn_status.click(None, None, None, js=js_code)

    gr.Markdown("""
---
*✨ **<a href="https://ai.remeinium.com" target="_blank">Remeinium AI</a>** · _Intelligence for a greater tomorrow._*
""")

if __name__ == "__main__":
    # demo.queue(default_concurrency_limit=10, max_size=20).launch()
    demo.launch()