ccocks-deca committed on
Commit
13a3776
·
verified ·
1 Parent(s): cb198a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -28
app.py CHANGED
@@ -2,59 +2,66 @@ import gradio as gr
2
  import os
3
  from model2vec import StaticModel
4
 
5
- # Suppress warnings
6
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
7
 
8
- # Distilled Qwen3-Embedding-0.6B → static & ultra-fast on CPU
9
  model = StaticModel.from_pretrained("futur/Qwen3-Embedding-0.6B-model2vec-onnx")
10
 
11
- def generate_embeddings(texts: str, use_query_mode: bool = True):
12
- """One text per line. Handles 500–1000+ tokens instantly on CPU."""
13
- lines = [line.strip() for line in texts.split("\n") if line.strip()]
14
- if not lines:
15
- return {"embeddings": [], "texts": [], "dimension": 1024}
 
 
 
 
 
16
 
17
- # Query mode adds instruction (keeps the original Qwen3 behavior)
 
 
18
  prompt_name = "query" if use_query_mode else None
19
 
20
- embeddings = model.encode(
21
- lines,
22
  prompt_name=prompt_name,
23
  convert_to_numpy=True,
24
- normalize_embeddings=True,
25
- ).tolist()
26
 
27
  return {
28
- "embeddings": embeddings, # list of 1024-dim vectors (or reduced if you want)
29
- "texts": lines,
30
- "dimension": len(embeddings[0]) if embeddings else 1024,
31
  "mode": "query (with instruction)" if use_query_mode else "document (raw)",
32
- "note": "Distilled Qwen3-Embedding-0.6B (Model2Vec) • 500× faster on CPU • <1s for 500–1000 tokens",
33
  }
34
 
35
 
36
- # Gradio 5+ Interface + full REST API
37
  demo = gr.Interface(
38
- fn=generate_embeddings,
39
  inputs=[
40
  gr.Textbox(
41
- lines=12,
42
- placeholder="Paste one text/document per line...\n\n(500–1000+ tokens per line works instantly now)",
43
- label="Input Texts (one per line)",
44
  ),
45
  gr.Checkbox(
46
- label="Use Query Mode (recommended for search/RAG)",
47
  value=True,
48
- info="Adds Qwen3-style instruction automatically",
49
  ),
50
  ],
51
  outputs=gr.JSON(label="Embedding Response"),
52
- title="🚀 Qwen3-Embedding-0.6B (Distilled Model2Vec) CPU API",
53
- description="""Exact Qwen3-Embedding-0.6B you wanted — now distilled to static Model2Vec.
54
- Same quality + 500× faster on free CPU • Handles long sequences instantly.""",
 
55
  examples=[
56
- ["What is the capital of France?\n" + "Long document text repeated to simulate 500–1000 tokens... " * 30, True],
57
- ["Short sentence.\nAnother longer paragraph with many tokens for testing speed...", False],
58
  ],
59
  )
60
 
 
2
  import os
3
  from model2vec import StaticModel
4
 
 
5
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
6
 
7
+ # Distilled Qwen3-Embedding-0.6B (static + ultra-fast on CPU)
8
  model = StaticModel.from_pretrained("futur/Qwen3-Embedding-0.6B-model2vec-onnx")
9
 
10
+ def generate_embedding(text: str, use_query_mode: bool):
11
+ """Single text input → returns one embedding vector."""
12
+ if not text or not text.strip():
13
+ return {
14
+ "embedding": [],
15
+ "text": "",
16
+ "dimension": 1024,
17
+ "mode": "query (with instruction)" if use_query_mode else "document",
18
+ "note": "Empty input"
19
+ }
20
 
21
+ cleaned_text = text.strip()
22
+
23
+ # Query mode adds instruction (keeps Qwen3 behavior for better retrieval)
24
  prompt_name = "query" if use_query_mode else None
25
 
26
+ embedding = model.encode(
27
+ [cleaned_text], # single text wrapped as list
28
  prompt_name=prompt_name,
29
  convert_to_numpy=True,
30
+ normalize_embeddings=True, # cosine similarity ready
31
+ )[0].tolist() # take first (and only) vector
32
 
33
  return {
34
+ "embedding": embedding, # single list of floats (1024-dim)
35
+ "text": cleaned_text,
36
+ "dimension": len(embedding),
37
  "mode": "query (with instruction)" if use_query_mode else "document (raw)",
38
+ "note": "Distilled Qwen3-Embedding-0.6B (Model2Vec) • <1s even for 500–1000+ tokens on free CPU"
39
  }
40
 
41
 
42
+ # Simple Gradio 5+ interface + clean REST API
43
  demo = gr.Interface(
44
+ fn=generate_embedding,
45
  inputs=[
46
  gr.Textbox(
47
+ lines=10,
48
+ placeholder="Paste your text here (500–1000+ tokens is fine – it will be fast)...",
49
+ label="Input Text",
50
  ),
51
  gr.Checkbox(
52
+ label="Use Query Mode (recommended for search / RAG)",
53
  value=True,
54
+ info="Adds instruction automatically for much better retrieval performance",
55
  ),
56
  ],
57
  outputs=gr.JSON(label="Embedding Response"),
58
+ title="🚀 Qwen3-Embedding-0.6B (Distilled) – Single Text API",
59
+ description="""Distilled version of Qwen/Qwen3-Embedding-0.6B using Model2Vec.
60
+ Same family & quality • 500× faster on CPU • Handles long texts instantly.
61
+ Returns **one** embedding vector per call.""",
62
  examples=[
63
+ ["What is the capital of France? Explain it in detail with historical context.", True],
64
+ ["A very long document text repeated many times to simulate 800 tokens... " * 40, False],
65
  ],
66
  )
67