Spaces:

tusarway
/

rag-backend

Running

imtrt004 committed on Mar 6

Commit

ab16882

1 Parent(s): 6780118

fix: update model

Files changed (1) hide show

model/loader.py CHANGED Viewed

@@ -5,8 +5,9 @@ All speeds measured on 2 vCPU / 16 GB RAM (HF Free Tier).
 Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
   #1  TinyLlama/TinyLlama-1.1B-Chat-v1.0           ~1 GB    40-60 tok/s  Apache 2.0   demos, prototypes
   #2  Qwen/Qwen3-0.6B                               ~0.5 GB  45-55 tok/s  Apache 2.0   speed-critical, Think mode
-  #3  meta-llama/Llama-3.2-1B-Instruct  [DEF]       ~1.5 GB  35-50 tok/s  Community    128K ctx, long-context
   #4  HuggingFaceTB/SmolLM2-1.7B-Instruct          ~2 GB    25-35 tok/s  Apache 2.0   good quality/size ratio
   #5  Qwen/Qwen2.5-1.5B-Instruct                    ~2 GB    25-40 tok/s  Apache 2.0   multilingual, 32K ctx
   #6  stabilityai/stablelm-2-zephyr-1_6b            ~2 GB    25-40 tok/s  MIT          DPO-tuned chat feel
   #7  Qwen/Qwen2.5-Coder-1.5B-Instruct              ~2 GB    25-40 tok/s  Apache 2.0   code completion/review
@@ -34,7 +35,7 @@ warnings.filterwarnings(
     category=FutureWarning,
 )
-MODEL_ID = os.environ.get("LLM_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
 # Models that need trust_remote_code=True (custom architectures)
 _TRUST_REMOTE_CODE_MODELS = (

 Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
   #1  TinyLlama/TinyLlama-1.1B-Chat-v1.0           ~1 GB    40-60 tok/s  Apache 2.0   demos, prototypes
   #2  Qwen/Qwen3-0.6B                               ~0.5 GB  45-55 tok/s  Apache 2.0   speed-critical, Think mode
+  #3  meta-llama/Llama-3.2-1B-Instruct              ~1.5 GB  35-50 tok/s  Community    128K ctx, long-context (needs HF_TOKEN)
   #4  HuggingFaceTB/SmolLM2-1.7B-Instruct          ~2 GB    25-35 tok/s  Apache 2.0   good quality/size ratio
+  #5  HuggingFaceTB/SmolLM2-360M-Instruct  [DEF]   ~0.4 GB  60-80 tok/s  Apache 2.0   fastest, no token needed
   #5  Qwen/Qwen2.5-1.5B-Instruct                    ~2 GB    25-40 tok/s  Apache 2.0   multilingual, 32K ctx
   #6  stabilityai/stablelm-2-zephyr-1_6b            ~2 GB    25-40 tok/s  MIT          DPO-tuned chat feel
   #7  Qwen/Qwen2.5-Coder-1.5B-Instruct              ~2 GB    25-40 tok/s  Apache 2.0   code completion/review
     category=FutureWarning,
 )
+MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
 # Models that need trust_remote_code=True (custom architectures)
 _TRUST_REMOTE_CODE_MODELS = (