Update app.py
app.py
CHANGED
@@ -50,14 +50,35 @@ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 # )
 from huggingface_hub import snapshot_download
 
-snapshot_download(
-    repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
-    repo_type="model",
-    local_dir="./models/stepfun",
-    # allow_patterns=["UD-TQ1_0/*"],  # 👈 folder inside repo
-    token=huggingface_token  # only if gated/private
-)
-
+# snapshot_download(
+#     repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
+#     repo_type="model",
+#     local_dir="./models/stepfun",
+#     # allow_patterns=["UD-TQ1_0/*"],  # 👈 folder inside repo
+#     token=huggingface_token  # only if gated/private
+# )
+
+
+llm = Llama.from_pretrained(
+    repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
+
+    # ALWAYS first shard only here
+    filename="UD-TQ1_0/step3p5_flash_Q4_K_S-00001-of-00012.gguf",
+
+    # Download all shards
+    additional_files=[
+        f"UD-TQ1_0/step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf"
+        for i in range(2, 13)
+    ],
+
+    local_dir="./models",
+
+    # Performance settings
+    flash_attn=True,
+    n_gpu_layers=-1,  # use full GPU (if you have enough VRAM)
+    n_batch=2048,
+    n_ctx=4096,  # 8000 is heavy unless needed
+)
 
 
 # llm = Llama.from_pretrained(
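Note on the hunk above: Llama.from_pretrained takes only the first shard in filename and pulls the remaining shards via additional_files (assuming a llama-cpp-python version recent enough to support that parameter). A quick sanity check of the names the f-string in this hunk expands to (hypothetical snippet, not part of app.py):

    shards = [
        f"UD-TQ1_0/step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf"
        for i in range(2, 13)
    ]
    print(len(shards))   # 11 extra shards; shard 00001 is passed via filename
    print(shards[0])     # UD-TQ1_0/step3p5_flash_Q4_K_S-00002-of-00012.gguf
    print(shards[-1])    # UD-TQ1_0/step3p5_flash_Q4_K_S-00012-of-00012.gguf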
@@ -92,18 +113,18 @@
     global llm
     global llm_model
 
-    if llm is None or llm_model != model:
-        llm = Llama(
-            model_path=f"models/{model}",
-            flash_attn=True,
-            n_gpu_layers=-1,
-            n_batch=2048,  # increase
-            n_ctx=4096,  # reduce if you don't need 8k
-            n_threads=16,  # set to your CPU cores
-            use_mlock=True,
-            verbose=False
-        )
-        llm_model = model
+    # if llm is None or llm_model != model:
+    #     llm = Llama(
+    #         model_path=f"models/{model}",
+    #         flash_attn=True,
+    #         n_gpu_layers=-1,
+    #         n_batch=2048,  # increase
+    #         n_ctx=4096,  # reduce if you don't need 8k
+    #         n_threads=16,  # set to your CPU cores
+    #         use_mlock=True,
+    #         verbose=False
+    #     )
+    #     llm_model = model
 
     provider = LlamaCppPythonProvider(llm)
 
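The second hunk disables the per-request reload in favor of the single module-level Llama.from_pretrained call from the first hunk, so the handler only wraps the already-loaded llm in LlamaCppPythonProvider and no model loading happens per request. If switching models at runtime is needed again, the commented-out guard could be restored as a small helper; a minimal sketch based on that block (the helper name and signature are hypothetical, not from app.py):

    from llama_cpp import Llama

    llm = None
    llm_model = None

    def get_llm(model: str) -> Llama:
        """Reload the GGUF only when the requested model differs from the cached one."""
        global llm, llm_model
        if llm is None or llm_model != model:
            llm = Llama(
                model_path=f"models/{model}",
                flash_attn=True,
                n_gpu_layers=-1,
                n_batch=2048,
                n_ctx=4096,
                n_threads=16,  # set to your CPU cores
                use_mlock=True,
                verbose=False,
            )
            llm_model = model
        return llm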