rphrp1985 committed on
Commit 55a6ce7 · verified · 1 Parent(s): 217b0c1

Update app.py

Files changed (1)
  1. app.py +39 -18
app.py CHANGED
@@ -50,14 +50,35 @@ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 # )
 from huggingface_hub import snapshot_download
 
-snapshot_download(
+# snapshot_download(
+#     repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
+#     repo_type="model",
+#     local_dir="./models/stepfun",
+#     # allow_patterns=["UD-TQ1_0/*"],   # 👈 folder inside repo
+#     token=huggingface_token            # only if gated/private
+# )
+
+
+llm = Llama.from_pretrained(
     repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S",
-    repo_type="model",
-    local_dir="./models/stepfun",
-    # allow_patterns=["UD-TQ1_0/*"],   # 👈 folder inside repo
-    token=huggingface_token            # only if gated/private
-)
 
+    # ALWAYS first shard only here
+    filename="UD-TQ1_0/step3p5_flash_Q4_K_S-00001-of-00012.gguf",
+
+    # Download all shards
+    additional_files=[
+        f"UD-TQ1_0/step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf"
+        for i in range(2, 13)
+    ],
+
+    local_dir="./models",
+
+    # Performance settings
+    flash_attn=True,
+    n_gpu_layers=-1,   # use full GPU (if you have enough VRAM)
+    n_batch=2048,
+    n_ctx=4096,        # 8000 is heavy unless needed
+)
 
 
 # llm = Llama.from_pretrained(
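Note on the hunk above: `Llama.from_pretrained` both downloads from the Hub and loads the model — `filename` must point at the first shard, and `additional_files` fetches the remaining eleven so llama.cpp can follow the `-NNNNN-of-00012` split naming. Below is a minimal sketch of what the comprehension expands to, with an on-disk check; the `./models/UD-TQ1_0/...` layout is an assumption inferred from `local_dir` and the in-repo shard paths, not something the commit states:

from pathlib import Path

# Same shard names the diff passes via `additional_files`:
# step3p5_flash_Q4_K_S-00002-of-00012.gguf ... -00012-of-00012.gguf
additional = [
    f"UD-TQ1_0/step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf"
    for i in range(2, 13)
]
assert len(additional) == 11  # shards 2..12; shard 1 goes in `filename`

# Assumed layout after download: local_dir joined with the in-repo path.
models_dir = Path("./models")
first = "UD-TQ1_0/step3p5_flash_Q4_K_S-00001-of-00012.gguf"
for rel in [first, *additional]:
    present = (models_dir / rel).exists()
    print(f"{'ok' if present else 'missing':>8}  {rel}")

If any shard is reported missing, llama.cpp will fail at load time, so a check like this can save a long re-download cycle.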
@@ -92,18 +113,18 @@ def respond(
     global llm
     global llm_model
 
-    if llm is None or llm_model != model:
-        llm = Llama(
-            model_path=f"models/{model}",
-            flash_attn=True,
-            n_gpu_layers=-1,
-            n_batch=2048,   # increase
-            n_ctx=4096,     # reduce if you don’t need 8k
-            n_threads=16,   # set to your CPU cores
-            use_mlock=True,
-            verbose=False
-        )
-        llm_model = model
+    # if llm is None or llm_model != model:
+    #     llm = Llama(
+    #         model_path=f"models/{model}",
+    #         flash_attn=True,
+    #         n_gpu_layers=-1,
+    #         n_batch=2048,   # increase
+    #         n_ctx=4096,     # reduce if you don’t need 8k
+    #         n_threads=16,   # set to your CPU cores
+    #         use_mlock=True,
+    #         verbose=False
+    #     )
+    #     llm_model = model
 
     provider = LlamaCppPythonProvider(llm)
 
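On the second hunk: the removed block lazy-loaded the model on the first request (and reloaded it whenever `model` changed), whereas after this commit `respond` relies entirely on the module-level `llm` built at import time — slower startup, but the heavy load no longer happens inside a request handler. A minimal defensive sketch under that assumption; `get_llm` is a hypothetical helper, not part of the commit:

def get_llm():
    # Hypothetical guard (not in this commit): respond() no longer
    # lazy-loads, so fail loudly if the import-time load did not run.
    if llm is None:
        raise RuntimeError(
            "llm was never initialised; check the Llama.from_pretrained() "
            "call at module import time."
        )
    return llm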
 
 