Update models/loader.py

models/loader.py  CHANGED  (+38 -16)
@@ -1,12 +1,9 @@
 # models/loader.py
 import torch
-import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from backend.agents import ROLE_PROMPTS
-
-
-torch.set_num_threads(num_threads)
-# The following configs are no longer used for CPU, but kept for future GPU use.
+
+# This configuration is not used for CPU, but is kept for future GPU use.
 QUANTIZATION_CONFIG = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -27,24 +24,48 @@ MODEL_REGISTRY = {
 }
 _MODEL_CACHE = {}
 
-def get_model_and_tokenizer(model_name
+def get_model_and_tokenizer(model_name):
+    """
+    Loads a model and its tokenizer from the Hugging Face Hub.
+    Implements caching to avoid reloading the model for each call.
+    """
     if model_name not in _MODEL_CACHE:
         print(f"Loading model: {model_name}...")
+
+        # Load the tokenizer first
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        # FIX: Explicitly set the pad token if it's the same as the eos token.
+        # This prevents the model from getting stuck in a generation loop.
+        if tokenizer.pad_token is None:
+            if tokenizer.eos_token is not None:
+                tokenizer.pad_token = tokenizer.eos_token
+            else:
+                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+                print("Added a new [PAD] token to the tokenizer.")
+
+        # Load the model with no device map or quantization for CPU inference
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map=None,
+            quantization_config=None,
+            trust_remote_code=True,
+        )
+
+        # Explicitly move the model to the CPU after loading
+        model.to("cpu")
+
         _MODEL_CACHE[model_name] = {
-            "model": AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map=None,
-                quantization_config=None,
-                trust_remote_code=True,
-            ),
-            "tokenizer": AutoTokenizer.from_pretrained(model_name)
+            "model": model,
+            "tokenizer": tokenizer
         }
-        # Explicitly move the model to the CPU after loading
-        _MODEL_CACHE[model_name]["model"].to("cpu")
 
     return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
 
 def generate_with_model(agent_role, prompt):
+    """
+    Generates a response using the specified agent's model.
+    """
     model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
     model, tokenizer = get_model_and_tokenizer(model_name)
 
@@ -59,7 +80,8 @@ def generate_with_model(agent_role, prompt):
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
-        repetition_penalty=1.1
+        repetition_penalty=1.1,
+        pad_token_id=tokenizer.pad_token_id
     )
 
     decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
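Why the pad-token handling matters: many causal-LM tokenizers ship without a pad token, and transformers then falls back to the EOS token at generation time, emitting a warning on every call. The second hunk makes that choice explicit at load time. A minimal standalone sketch of the same fallback, assuming only transformers and using the registry's Qwen/Qwen3-0.6B default as a stand-in checkpoint:

# Minimal sketch of the pad-token fallback; "Qwen/Qwen3-0.6B" is just the
# registry default from the diff, any causal-LM tokenizer would do.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)

if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token  # reuse EOS for padding
    else:
        # A brand-new token grows the vocabulary; the model would then need
        # model.resize_token_embeddings(len(tokenizer)) before generation.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

print(tokenizer.pad_token, tokenizer.pad_token_id)

Passing pad_token_id=tokenizer.pad_token_id into model.generate (third hunk) makes the same choice explicit on the generation side and silences the recurring "Setting pad_token_id to eos_token_id" warning.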
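A usage sketch, for reference. The role key below is hypothetical, since the MODEL_REGISTRY entries fall outside the hunks shown; per the code above, unknown roles fall back to Qwen/Qwen3-0.6B, and repeated lookups hit _MODEL_CACHE instead of reloading weights:

# Hypothetical usage; "researcher" stands in for whatever role keys
# MODEL_REGISTRY actually defines (its entries are elided from the hunks).
from models.loader import generate_with_model, get_model_and_tokenizer

# Assumes generate_with_model returns the decoded text; its return
# statement is outside the shown hunks.
reply = generate_with_model("researcher", "Summarize the findings in two sentences.")
print(reply)

# Caching: a second request for the same checkpoint reuses the loaded objects.
m1, t1 = get_model_and_tokenizer("Qwen/Qwen3-0.6B")
m2, t2 = get_model_and_tokenizer("Qwen/Qwen3-0.6B")
assert m1 is m2 and t1 is t2  # no reload on the second call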