ccocks-deca committed on
Commit
a6b7976
·
verified ·
1 Parent(s): 13a3776

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -35
app.py CHANGED
@@ -2,66 +2,55 @@ import gradio as gr
2
  import os
3
  from model2vec import StaticModel
4
 
 
5
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
6
 
7
- # Distilled Qwen3-Embedding-0.6B (static + ultra-fast on CPU)
8
- model = StaticModel.from_pretrained("futur/Qwen3-Embedding-0.6B-model2vec-onnx")
9
 
10
- def generate_embedding(text: str, use_query_mode: bool):
11
- """Single text input → returns one embedding vector."""
12
  if not text or not text.strip():
13
  return {
14
  "embedding": [],
15
  "text": "",
16
- "dimension": 1024,
17
- "mode": "query (with instruction)" if use_query_mode else "document",
18
  "note": "Empty input"
19
  }
20
 
21
  cleaned_text = text.strip()
22
 
23
- # Query mode adds instruction (keeps Qwen3 behavior for better retrieval)
24
- prompt_name = "query" if use_query_mode else None
25
-
26
  embedding = model.encode(
27
- [cleaned_text], # single text wrapped as list
28
- prompt_name=prompt_name,
29
  convert_to_numpy=True,
30
- normalize_embeddings=True, # cosine similarity ready
31
- )[0].tolist() # take first (and only) vector
32
 
33
  return {
34
- "embedding": embedding, # single list of floats (1024-dim)
35
  "text": cleaned_text,
36
  "dimension": len(embedding),
37
- "mode": "query (with instruction)" if use_query_mode else "document (raw)",
38
- "note": "Distilled Qwen3-Embedding-0.6B (Model2Vec) • <1s even for 500–1000+ tokens on free CPU"
39
  }
40
 
41
 
42
- # Simple Gradio 5+ interface + clean REST API
43
  demo = gr.Interface(
44
  fn=generate_embedding,
45
- inputs=[
46
- gr.Textbox(
47
- lines=10,
48
- placeholder="Paste your text here (500–1000+ tokens is fine – it will be fast)...",
49
- label="Input Text",
50
- ),
51
- gr.Checkbox(
52
- label="Use Query Mode (recommended for search / RAG)",
53
- value=True,
54
- info="Adds instruction automatically for much better retrieval performance",
55
- ),
56
- ],
57
  outputs=gr.JSON(label="Embedding Response"),
58
- title="🚀 Qwen3-Embedding-0.6B (Distilled) – Single Text API",
59
- description="""Distilled version of Qwen/Qwen3-Embedding-0.6B using Model2Vec.
60
- Same family & quality • 500× faster on CPU • Handles long texts instantly.
61
- Returns **one** embedding vector per call.""",
62
  examples=[
63
- ["What is the capital of France? Explain it in detail with historical context.", True],
64
- ["A very long document text repeated many times to simulate 800 tokens... " * 40, False],
65
  ],
66
  )
67
 
 
2
  import os
3
  from model2vec import StaticModel
4
 
5
+ # Suppress tokenizer warnings
6
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
7
 
8
+ # Best working static model – ultra-fast on CPU + long texts
9
+ model = StaticModel.from_pretrained("minishlab/potion-base-32M")
10
 
11
+ def generate_embedding(text: str):
12
+ """Single text input → one embedding vector (fast even for 500–1000+ tokens)."""
13
  if not text or not text.strip():
14
  return {
15
  "embedding": [],
16
  "text": "",
17
+ "dimension": 256,
 
18
  "note": "Empty input"
19
  }
20
 
21
  cleaned_text = text.strip()
22
 
23
+ # Static Model2Vec – no query/document prompt needed (always high-quality)
 
 
24
  embedding = model.encode(
25
+ [cleaned_text],
 
26
  convert_to_numpy=True,
27
+ normalize_embeddings=True, # ready for cosine similarity
28
+ )[0].tolist()
29
 
30
  return {
31
+ "embedding": embedding, # single list of 256 floats
32
  "text": cleaned_text,
33
  "dimension": len(embedding),
34
+ "note": "minishlab/potion-base-32M (static) • <0.5s even for 500–1000+ tokens on free CPU"
 
35
  }
36
 
37
 
38
+ # Clean single-text Gradio interface + full REST API
39
  demo = gr.Interface(
40
  fn=generate_embedding,
41
+ inputs=gr.Textbox(
42
+ lines=12,
43
+ placeholder="Paste your text here (500–1000+ tokens works instantly)...",
44
+ label="Input Text",
45
+ ),
 
 
 
 
 
 
 
46
  outputs=gr.JSON(label="Embedding Response"),
47
+ title="⚑ Qwen3-Style Fast Embedding API (Single Text)",
48
+ description="""Ultra-fast static embedding model (potion-base-32M).
49
+ Best reliable CPU option • 500× faster than transformers • Handles long texts instantly.
50
+ Returns **one** 256-dim embedding vector per call.""",
51
  examples=[
52
+ ["What is the capital of France? Explain it in detail with historical context and why it matters today."],
53
+ ["A very long document with many tokens to test speed... " * 50],
54
  ],
55
  )
56