Spaces:

ccocks-deca
/

embed-this

Runtime error

App Files Files Community

ccocks-deca committed on Apr 4

Commit

13a3776

verified ·

1 Parent(s): cb198a7

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -28

app.py CHANGED Viewed

@@ -2,59 +2,66 @@ import gradio as gr
 import os
 from model2vec import StaticModel
-# Suppress warnings
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Distilled Qwen3-Embedding-0.6B → static & ultra-fast on CPU
 model = StaticModel.from_pretrained("futur/Qwen3-Embedding-0.6B-model2vec-onnx")
-def generate_embeddings(texts: str, use_query_mode: bool = True):
-    """One text per line. Handles 500–1000+ tokens instantly on CPU."""
-    lines = [line.strip() for line in texts.split("\n") if line.strip()]
-    if not lines:
-        return {"embeddings": [], "texts": [], "dimension": 1024}
-    # Query mode adds instruction (keeps the original Qwen3 behavior)
     prompt_name = "query" if use_query_mode else None
-    embeddings = model.encode(
-        lines,
         prompt_name=prompt_name,
         convert_to_numpy=True,
-        normalize_embeddings=True,
-    ).tolist()
     return {
-        "embeddings": embeddings,    # list of 1024-dim vectors (or reduced if you want)
-        "texts": lines,
-        "dimension": len(embeddings[0]) if embeddings else 1024,
         "mode": "query (with instruction)" if use_query_mode else "document (raw)",
-        "note": "Distilled Qwen3-Embedding-0.6B (Model2Vec) • 500× faster on CPU • <1s for 500–1000 tokens",
     }
-# Gradio 5+ Interface + full REST API
 demo = gr.Interface(
-    fn=generate_embeddings,
     inputs=[
         gr.Textbox(
-            lines=12,
-            placeholder="Paste one text/document per line...\n\n(500–1000+ tokens per line works instantly now)",
-            label="Input Texts (one per line)",
         ),
         gr.Checkbox(
-            label="Use Query Mode (recommended for search/RAG)",
             value=True,
-            info="Adds Qwen3-style instruction automatically",
         ),
     ],
     outputs=gr.JSON(label="Embedding Response"),
-    title="🚀 Qwen3-Embedding-0.6B (Distilled Model2Vec) CPU API",
-    description="""Exact Qwen3-Embedding-0.6B you wanted — now distilled to static Model2Vec.
-    Same quality + 500× faster on free CPU • Handles long sequences instantly.""",
     examples=[
-        ["What is the capital of France?\n" + "Long document text repeated to simulate 500–1000 tokens... " * 30, True],
-        ["Short sentence.\nAnother longer paragraph with many tokens for testing speed...", False],
     ],
 )

 import os
 from model2vec import StaticModel
+# Set before the model is loaded. Presumably this silences the tokenizers
+# fork/parallelism warning — TODO confirm that is the only intent.
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Distilled Qwen3-Embedding-0.6B (static + ultra-fast on CPU)
+# Loaded once at import time; the handler below reuses this single object.
 model = StaticModel.from_pretrained("futur/Qwen3-Embedding-0.6B-model2vec-onnx")
+def generate_embedding(text: str, use_query_mode: bool):
+    """Single text input → returns one embedding vector."""
+    if not text or not text.strip():
+        return {
+            "embedding": [],
+            "text": "",
+            "dimension": 1024,
+            "mode": "query (with instruction)" if use_query_mode else "document",
+            "note": "Empty input"
+        }
+    cleaned_text = text.strip()
+    # Query mode adds instruction (keeps Qwen3 behavior for better retrieval)
     prompt_name = "query" if use_query_mode else None
+    embedding = model.encode(
+        [cleaned_text],                    # single text wrapped as list
         prompt_name=prompt_name,
         convert_to_numpy=True,
+        normalize_embeddings=True,         # cosine similarity ready
+    )[0].tolist()                          # take first (and only) vector
     return {
+        "embedding": embedding,            # single list of floats (1024-dim)
+        "text": cleaned_text,
+        "dimension": len(embedding),
         "mode": "query (with instruction)" if use_query_mode else "document (raw)",
+        "note": "Distilled Qwen3-Embedding-0.6B (Model2Vec) • <1s even for 500–1000+ tokens on free CPU"
     }
+# Simple Gradio 5+ interface + clean REST API
+# Wires generate_embedding to a two-input form (text box + query-mode
+# checkbox) and renders the returned dict in a JSON component.
+# NOTE(review): no demo.launch() call is visible in this view — confirm the
+# app is started further down or by the hosting runtime.
 demo = gr.Interface(
+    fn=generate_embedding,
     inputs=[
         gr.Textbox(
+            lines=10,
+            placeholder="Paste your text here (500–1000+ tokens is fine – it will be fast)...",
+            label="Input Text",
         ),
         gr.Checkbox(
+            label="Use Query Mode (recommended for search / RAG)",
             value=True,
+            info="Adds instruction automatically for much better retrieval performance",
         ),
     ],
     outputs=gr.JSON(label="Embedding Response"),
+    title="🚀 Qwen3-Embedding-0.6B (Distilled) – Single Text API",
+    description="""Distilled version of Qwen/Qwen3-Embedding-0.6B using Model2Vec.
+    Same family & quality • 500× faster on CPU • Handles long texts instantly.
+    Returns **one** embedding vector per call.""",
     examples=[
+        # Each row is [text, use_query_mode], in the same order as `inputs`.
+        ["What is the capital of France? Explain it in detail with historical context.", True],
+        ["A very long document text repeated many times to simulate 800 tokens... " * 40, False],
     ],
 )