Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -220,29 +220,46 @@ class OnnxBgeEmbeddings(Embeddings):
|
|
| 220 |
def embed_query(self, text):
|
| 221 |
return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
|
| 222 |
|
|
|
|
|
|
|
|
|
|
| 223 |
# ---------------------------------------------------------
|
| 224 |
# 2. LLM Evaluator Class (Llama-3.2-1B ONNX)
|
| 225 |
# ---------------------------------------------------------
|
| 226 |
class LLMEvaluator:
|
| 227 |
def __init__(self):
|
| 228 |
self.repo_id = "onnx-community/Llama-3.2-1B-Instruct"
|
|
|
|
|
|
|
| 229 |
print(f"π Preparing LLM: {self.repo_id}...")
|
| 230 |
|
| 231 |
# [CRITICAL FIX]
|
| 232 |
-
#
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
| 235 |
repo_id=self.repo_id,
|
| 236 |
-
local_dir=
|
| 237 |
-
local_dir_use_symlinks=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
)
|
| 239 |
-
print("✅ Download complete.")
|
| 240 |
|
| 241 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 242 |
|
| 243 |
-
# Load
|
|
|
|
| 244 |
self.model = ORTModelForCausalLM.from_pretrained(
|
| 245 |
-
|
|
|
|
| 246 |
use_cache=True,
|
| 247 |
use_io_binding=False
|
| 248 |
)
|
|
@@ -293,7 +310,6 @@ class LLMEvaluator:
|
|
| 293 |
skip_special_tokens=True
|
| 294 |
)
|
| 295 |
return response
|
| 296 |
-
|
| 297 |
# ---------------------------------------------------------
|
| 298 |
# 3. Main Application Logic
|
| 299 |
# ---------------------------------------------------------
|
|
|
|
| 220 |
def embed_query(self, text):
|
| 221 |
return self._process_batch(["Represent this sentence for searching relevant passages: " + text])[0]
|
| 222 |
|
| 223 |
+
# ---------------------------------------------------------
|
| 224 |
+
# 2. LLM Evaluator Class (Llama-3.2-1B ONNX)
|
| 225 |
+
# ---------------------------------------------------------
|
| 226 |
# ---------------------------------------------------------
|
| 227 |
# 2. LLM Evaluator Class (Llama-3.2-1B ONNX)
|
| 228 |
# ---------------------------------------------------------
|
| 229 |
class LLMEvaluator:
|
| 230 |
def __init__(self):
|
| 231 |
self.repo_id = "onnx-community/Llama-3.2-1B-Instruct"
|
| 232 |
+
self.local_dir = "onnx_llama_local"
|
| 233 |
+
|
| 234 |
print(f"π Preparing LLM: {self.repo_id}...")
|
| 235 |
|
| 236 |
# [CRITICAL FIX]
|
| 237 |
+
# We use 'allow_patterns' to download ONLY the specific FP16 model (~2GB)
|
| 238 |
+
# and ignore the huge standard and quantized files.
|
| 239 |
+
print(f"📥 Downloading ONLY the FP16 version to {self.local_dir}...")
|
| 240 |
+
|
| 241 |
+
snapshot_download(
|
| 242 |
repo_id=self.repo_id,
|
| 243 |
+
local_dir=self.local_dir,
|
| 244 |
+
local_dir_use_symlinks=False,
|
| 245 |
+
allow_patterns=[
|
| 246 |
+
"config.json",
|
| 247 |
+
"generation_config.json",
|
| 248 |
+
"tokenizer*",
|
| 249 |
+
"special_tokens_map.json",
|
| 250 |
+
"*.jinja",
|
| 251 |
+
"onnx/model_fp16.onnx" # <--- CHANGED to FP16 model
|
| 252 |
+
]
|
| 253 |
)
|
| 254 |
+
print("✅ Download complete (Filtered to ~2GB).")
|
| 255 |
|
| 256 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
|
| 257 |
|
| 258 |
+
# Load the specific FP16 ONNX file
|
| 259 |
+
# We point strictly to the file we downloaded
|
| 260 |
self.model = ORTModelForCausalLM.from_pretrained(
|
| 261 |
+
self.local_dir,
|
| 262 |
+
file_name="onnx/model_fp16.onnx", # <--- CHANGED to load FP16 file
|
| 263 |
use_cache=True,
|
| 264 |
use_io_binding=False
|
| 265 |
)
|
|
|
|
| 310 |
skip_special_tokens=True
|
| 311 |
)
|
| 312 |
return response
|
|
|
|
| 313 |
# ---------------------------------------------------------
|
| 314 |
# 3. Main Application Logic
|
| 315 |
# ---------------------------------------------------------
|