Spaces:
Running
on
Zero
Running
on
Zero
jedick
committed on
Commit
·
b4a8032
1
Parent(s):
c3715f2
Use transformers==4.54.1
Browse files
- main.py +4 -1
- mods/tool_calling_llm.py +3 -3
- requirements.txt +1 -1
main.py
CHANGED
|
@@ -156,9 +156,12 @@ def GetChatModel(compute_mode, ckpt_dir=None):
|
|
| 156 |
torch_dtype=torch.bfloat16,
|
| 157 |
# Enable FlashAttention (requires pip install flash-attn)
|
| 158 |
# https://huggingface.co/docs/transformers/en/attention_interface
|
| 159 |
-
# https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
|
| 160 |
attn_implementation="flash_attention_2",
|
|
|
|
| 161 |
)
|
|
|
|
|
|
|
| 162 |
|
| 163 |
# Use MyTextGenerationPipeline with custom preprocess() method
|
| 164 |
pipe = MyTextGenerationPipeline(
|
|
|
|
| 156 |
torch_dtype=torch.bfloat16,
|
| 157 |
# Enable FlashAttention (requires pip install flash-attn)
|
| 158 |
# https://huggingface.co/docs/transformers/en/attention_interface
|
| 159 |
+
# https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention
|
| 160 |
attn_implementation="flash_attention_2",
|
| 161 |
+
device_map="auto",
|
| 162 |
)
|
| 163 |
+
# For Flash Attention version of Qwen3
|
| 164 |
+
tokenizer.padding_side = "left"
|
| 165 |
|
| 166 |
# Use MyTextGenerationPipeline with custom preprocess() method
|
| 167 |
pipe = MyTextGenerationPipeline(
|
mods/tool_calling_llm.py
CHANGED
|
@@ -177,9 +177,9 @@ class ToolCallingLLM(BaseChatModel, ABC):
|
|
| 177 |
# Extract <think>...</think> content and text after </think> for further processing 20250726 jmd
|
| 178 |
think_text, post_think = extract_think(response_message.content)
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
|
| 184 |
# Remove backticks around code blocks
|
| 185 |
post_think = re.sub(r"^```json", "", post_think)
|
|
|
|
| 177 |
# Extract <think>...</think> content and text after </think> for further processing 20250726 jmd
|
| 178 |
think_text, post_think = extract_think(response_message.content)
|
| 179 |
|
| 180 |
+
# For debugging
|
| 181 |
+
print("post_think")
|
| 182 |
+
print(post_think)
|
| 183 |
|
| 184 |
# Remove backticks around code blocks
|
| 185 |
post_think = re.sub(r"^```json", "", post_think)
|
requirements.txt
CHANGED
|
@@ -13,7 +13,7 @@ flash-attn==2.8.2
|
|
| 13 |
# SmolLM3: transformers>=4.53
|
| 14 |
# NOTE: Gemma 3 with transformers==4.54.0 gives:
|
| 15 |
# ValueError: Max cache length is not consistent across layers
|
| 16 |
-
transformers==4.
|
| 17 |
tokenizers==0.21.2
|
| 18 |
# Required by langchain-huggingface
|
| 19 |
sentence-transformers==5.0.0
|
|
|
|
| 13 |
# SmolLM3: transformers>=4.53
|
| 14 |
# NOTE: Gemma 3 with transformers==4.54.0 gives:
|
| 15 |
# ValueError: Max cache length is not consistent across layers
|
| 16 |
+
transformers==4.54.1
|
| 17 |
tokenizers==0.21.2
|
| 18 |
# Required by langchain-huggingface
|
| 19 |
sentence-transformers==5.0.0
|