Update app.py
app.py
CHANGED
@@ -50,14 +50,35 @@ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 # )
 from huggingface_hub import snapshot_download
 
-snapshot_download(
-    repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
-    repo_type="model",
-    local_dir="./models/stepfun",
-    # allow_patterns=["UD-TQ1_0/*"],  # 👈 folder inside repo
-    token=huggingface_token  # only if gated/private
-)
-
+# snapshot_download(
+#     repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
+#     repo_type="model",
+#     local_dir="./models/stepfun",
+#     # allow_patterns=["UD-TQ1_0/*"],  # 👈 folder inside repo
+#     token=huggingface_token  # only if gated/private
+# )
+
+
+llm = Llama.from_pretrained(
+    repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
+
+    # ALWAYS first shard only here
+    filename="UD-TQ1_0/step3p5_flash_Q4_K_S-00001-of-00012.gguf",
+
+    # Download all shards
+    additional_files=[
+        f"UD-TQ1_0/step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf"
+        for i in range(2, 13)
+    ],
+
+    local_dir="./models",
+
+    # Performance settings
+    flash_attn=True,
+    n_gpu_layers=-1,  # use full GPU (if you have enough VRAM)
+    n_batch=2048,
+    n_ctx=4096,  # 8000 is heavy unless needed
+)
 
 
 # llm = Llama.from_pretrained(
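Note on the hunk above: Llama.from_pretrained takes only the first shard in filename and pulls the remaining shards via additional_files (assuming a llama-cpp-python version recent enough to support that parameter). A quick sanity check of the names the f-string in this hunk expands to (hypothetical snippet, not part of app.py):

    shards = [
        f"UD-TQ1_0/step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf"
        for i in range(2, 13)
    ]
    print(len(shards))   # 11 extra shards; shard 00001 is passed via filename
    print(shards[0])     # UD-TQ1_0/step3p5_flash_Q4_K_S-00002-of-00012.gguf
    print(shards[-1])    # UD-TQ1_0/step3p5_flash_Q4_K_S-00012-of-00012.gguf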
@@ -92,18 +113,18 @@
     global llm
     global llm_model
 
-    if llm is None or llm_model != model:
-        llm = Llama(
-            model_path=f"models/{model}",
-            flash_attn=True,
-            n_gpu_layers=-1,
-            n_batch=2048,  # increase
-            n_ctx=4096,  # reduce if you don't need 8k
-            n_threads=16,  # set to your CPU cores
-            use_mlock=True,
-            verbose=False
-        )
-        llm_model = model
+    # if llm is None or llm_model != model:
+    #     llm = Llama(
+    #         model_path=f"models/{model}",
+    #         flash_attn=True,
+    #         n_gpu_layers=-1,
+    #         n_batch=2048,  # increase
+    #         n_ctx=4096,  # reduce if you don't need 8k
+    #         n_threads=16,  # set to your CPU cores
+    #         use_mlock=True,
+    #         verbose=False
+    #     )
+    #     llm_model = model
 
     provider = LlamaCppPythonProvider(llm)
 
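The second hunk disables the per-request reload in favor of the single module-level Llama.from_pretrained call from the first hunk, so the handler only wraps the already-loaded llm in LlamaCppPythonProvider and no model loading happens per request. If switching models at runtime is needed again, the commented-out guard could be restored as a small helper; a minimal sketch based on that block (the helper name and signature are hypothetical, not from app.py):

    from llama_cpp import Llama

    llm = None
    llm_model = None

    def get_llm(model: str) -> Llama:
        """Reload the GGUF only when the requested model differs from the cached one."""
        global llm, llm_model
        if llm is None or llm_model != model:
            llm = Llama(
                model_path=f"models/{model}",
                flash_attn=True,
                n_gpu_layers=-1,
                n_batch=2048,
                n_ctx=4096,
                n_threads=16,  # set to your CPU cores
                use_mlock=True,
                verbose=False,
            )
            llm_model = model
        return llm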