Spaces:

Mazenbs
/

emmd

Running

App Files Files Community

Mazenbs committed on Dec 23, 2025

Commit

ff7efde

verified ·

1 Parent(s): 913fabc

Create cop.py

Browse files

Files changed (1) hide show

cop.py +142 -0

cop.py ADDED Viewed

	@@ -0,0 +1,142 @@

+# main.py
+# FastAPI app for ultra-low-latency Arabic query embeddings using ONNX (INT8) on CPU.
+# Single-file, production-ready for Hugging Face Spaces (CPU, single worker).
+import re
+import time
+import numpy as np
+import multiprocessing
+import onnxruntime as ort
+from functools import lru_cache
+from fastapi import FastAPI, Query, Response
+from transformers import AutoTokenizer
+# ==============================
+# Config
+# ==============================
+MODEL_PATH = "lib/intfloat_multilingual-e5-small_merged_int8.onnx"
+TOKENIZER_PATH = "lib"  # directory containing tokenizer files
+MAX_LENGTH = 64         # tuned for short queries (≤ ~15 words)
+# ==============================
+# ONNX Runtime session (max CPU acceleration)
+# ==============================
+session_options = ort.SessionOptions()
+session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+session_options.enable_cpu_mem_arena = True
+session_options.intra_op_num_threads = multiprocessing.cpu_count()  # use all available cores
+session_options.inter_op_num_threads = 1
+# Optional: write optimized graph once; harmless if it can't be written
+session_options.optimized_model_filepath = "optimized_model.onnx"
+session = ort.InferenceSession(
+    MODEL_PATH,
+    providers=[('CPUExecutionProvider', {})],
+    sess_options=session_options
+)
+# ==============================
+# Tokenizer: load once
+# ==============================
+tokenizer = AutoTokenizer.from_pretrained(
+    TOKENIZER_PATH,
+    local_files_only=True,
+    use_fast=True
+)
+# ==============================
+# Arabic normalization (cached)
+# ==============================
+@lru_cache(maxsize=4096)
+def normalize_arabic(text: str) -> str:
+    # Remove diacritics
+    text = re.sub(r'[ًٌٍَُِّْـ]', '', text)
+    # Normalize hamza/aleph variants
+    text = re.sub(r'[إأآ]', 'ا', text)
+    text = re.sub(r'ى', 'ي', text)
+    text = re.sub(r'ؤ', 'و', text)
+    text = re.sub(r'ئ', 'ي', text)
+    # Ta marbuta at word end -> ha (common retrieval normalization)
+    text = re.sub(r'ة\b', 'ه', text)
+    # Strip non-word chars, collapse spaces
+    text = re.sub(r'[^\w\s]', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+# ==============================
+# Embedding function (cached, L2 normalized)
+# ==============================
+@lru_cache(maxsize=4096)
+def embed_query_cached(query: str, do_normalize: bool) -> np.ndarray:
+    if do_normalize:
+        query = normalize_arabic(query)
+    # Fixed-length tokenization for stable shapes and faster CPU execution
+    inputs = tokenizer(
+        "query: " + query,
+        return_tensors="np",
+        truncation=True,
+        padding="max_length",
+        max_length=MAX_LENGTH,
+        return_attention_mask=True,
+        return_token_type_ids=False
+    )
+    # ONNX inference (INT8 model)
+    ort_outs = session.run(None, dict(inputs))
+    # E5-style pooled embedding (second output); adjust if your model differs
+    vector = ort_outs[1][0].astype(np.float32)
+    # L2 normalization
+    norm = np.linalg.norm(vector)
+    if norm > 0.0:
+        vector /= norm
+    return vector
+def query_to_embedding(query: str, normalize_text: bool = True) -> np.ndarray:
+    # Route through cached function to maximize single-query latency performance
+    return embed_query_cached(query.strip(), normalize_text)
+# ==============================
+# FastAPI app
+# ==============================
+app = FastAPI()
+# Warm-up on startup: builds caches, JIT paths, memory arenas
+@app.on_event("startup")
+def warmup():
+    try:
+        _ = query_to_embedding("مرحبا بالعالم", normalize_text=True)
+    except Exception:
+        # Avoid any heavy logging; fail silently to keep startup lightweight
+        pass
+# Ultra-low-latency GET endpoint (no extra middlewares/gzip/logging)
+@app.get("/query")
+def query_endpoint(
+    q: str = Query(..., min_length=1),
+    normalize: bool = Query(True)
+):
+    # Minimal validation and fast path
+    s = q.strip()
+    if not s:
+        return Response(status_code=400)
+    start = time.perf_counter()
+    vec = query_to_embedding(s, normalize_text=normalize)
+    latency_ms = (time.perf_counter() - start) * 1000.0
+    # Return only essentials (embedding as list); omit heavy metadata
+    return {
+        "embedding": vec.tolist(),
+        "length": len(vec),
+        "normalized": True,
+        "latency_ms": round(latency_ms, 3)
+    }
+# Optional root for quick health checks without noise
+@app.get("/")
+def root():
+    return {"status": "ok", "model": "onnx-int8", "max_length": MAX_LENGTH}