Upload folder using huggingface_hub
Browse files- src/dispatchai/core.py +26 -5
src/dispatchai/core.py
CHANGED
|
@@ -71,6 +71,17 @@ class DispatchModel:
|
|
| 71 |
self._tokenizer = None
|
| 72 |
self._loaded = False
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def _load(self):
|
| 75 |
"""Lazily load the model on first use."""
|
| 76 |
if self._loaded:
|
|
@@ -102,7 +113,13 @@ class DispatchModel:
|
|
| 102 |
|
| 103 |
from huggingface_hub import hf_hub_download
|
| 104 |
gguf_path = hf_hub_download(self.repo_id, "model.gguf")
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
self._loaded = True
|
| 108 |
|
|
@@ -150,13 +167,17 @@ class DispatchModel:
|
|
| 150 |
return response.strip()
|
| 151 |
|
| 152 |
elif self.backend == "gguf":
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
max_tokens=max_tokens,
|
| 156 |
temperature=temperature,
|
| 157 |
-
echo=False,
|
| 158 |
)
|
| 159 |
-
return response["choices"][0]["
|
| 160 |
|
| 161 |
return ""
|
| 162 |
|
|
|
|
| 71 |
self._tokenizer = None
|
| 72 |
self._loaded = False
|
| 73 |
|
| 74 |
+
def _detect_chat_format(self, model_name: str) -> Optional[str]:
|
| 75 |
+
"""Auto-detect the correct chat format from model name."""
|
| 76 |
+
lower = model_name.lower()
|
| 77 |
+
if "smollm" in lower or "llama-3" in lower:
|
| 78 |
+
return "llama-3"
|
| 79 |
+
elif "gemma" in lower:
|
| 80 |
+
return "gemma"
|
| 81 |
+
elif "qwen" in lower or "phi" in lower or "tinyllama" in lower or "minicpm" in lower or "moondream" in lower:
|
| 82 |
+
return "chatml"
|
| 83 |
+
return "chatml" # Safe default
|
| 84 |
+
|
| 85 |
def _load(self):
|
| 86 |
"""Lazily load the model on first use."""
|
| 87 |
if self._loaded:
|
|
|
|
| 113 |
|
| 114 |
from huggingface_hub import hf_hub_download
|
| 115 |
gguf_path = hf_hub_download(self.repo_id, "model.gguf")
|
| 116 |
+
|
| 117 |
+
# Determine chat format from model name
|
| 118 |
+
chat_format = self._detect_chat_format(self.model_name)
|
| 119 |
+
kwargs = dict(model_path=gguf_path, n_ctx=512, n_threads=4, verbose=False)
|
| 120 |
+
if chat_format:
|
| 121 |
+
kwargs["chat_format"] = chat_format
|
| 122 |
+
self._model = Llama(**kwargs)
|
| 123 |
|
| 124 |
self._loaded = True
|
| 125 |
|
|
|
|
| 167 |
return response.strip()
|
| 168 |
|
| 169 |
elif self.backend == "gguf":
|
| 170 |
+
messages = []
|
| 171 |
+
if system:
|
| 172 |
+
messages.append({"role": "system", "content": system})
|
| 173 |
+
messages.append({"role": "user", "content": message})
|
| 174 |
+
|
| 175 |
+
response = self._model.create_chat_completion(
|
| 176 |
+
messages=messages,
|
| 177 |
max_tokens=max_tokens,
|
| 178 |
temperature=temperature,
|
|
|
|
| 179 |
)
|
| 180 |
+
return response["choices"][0]["message"]["content"].strip()
|
| 181 |
|
| 182 |
return ""
|
| 183 |
|