anaspro committed
Commit 154d3ef · 1 Parent(s): d7e9b4a
upadte
Files changed:
- app.py +11 -2
- test_model.py +12 -3
app.py
CHANGED
@@ -24,12 +24,21 @@ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
 hf_token = os.getenv("HF_TOKEN")
 
 # Initialize pipeline for chat
+# For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
 pipeline_model = pipeline(
     "text-generation",
     model=model_path,
-    device_map="auto",
+    device=0,  # Use GPU device directly
+    torch_dtype=torch.bfloat16,
     token=hf_token,
-    trust_remote_code=True
+    trust_remote_code=True,
+    model_kwargs={
+        "torch_dtype": torch.bfloat16,
+        "load_in_4bit": True,
+        "bnb_4bit_compute_dtype": torch.bfloat16,
+        "bnb_4bit_use_double_quant": False,
+        "bnb_4bit_quant_type": "nf4",
+    }
 )
 
 def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
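The app.py hunk ends at the signature of generate_with_pipeline; its body is unchanged by this commit and not shown. For orientation, here is a minimal sketch of how such a wrapper would typically forward its sampling parameters to the pipeline configured above; the body is an assumption, not the file's actual code.

# Sketch only: the commit shows just the signature; this body is assumed.
def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7,
                           top_p=0.9, top_k=50, repetition_penalty=1.0):
    # text-generation pipelines accept chat-style messages
    # ([{"role": ..., "content": ...}]) and apply the model's chat template.
    outputs = pipeline_model(
        messages,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # sampling must be on for temperature/top_p/top_k to take effect
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    # For chat input, generated_text holds the conversation with the new
    # assistant turn appended; return just that reply.
    return outputs[0]["generated_text"][-1]["content"]

reply = generate_with_pipeline([{"role": "user", "content": "Hello!"}])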
test_model.py
CHANGED
@@ -6,7 +6,7 @@ import torch
 import transformers
 from transformers import pipeline
 
-model_path = "unsloth/
+model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
 
 # If HF_TOKEN is present in the environment
 hf_token = os.getenv("HF_TOKEN")
@@ -14,12 +14,21 @@ hf_token = os.getenv("HF_TOKEN")
 print("Loading model...")
 try:
     # Initialize pipeline for chat
+    # For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
     pipeline_model = pipeline(
         "text-generation",
         model=model_path,
-        device_map="auto",
+        device=0,  # Use GPU device directly
+        torch_dtype=torch.bfloat16,
         token=hf_token,
-        trust_remote_code=True
+        trust_remote_code=True,
+        model_kwargs={
+            "torch_dtype": torch.bfloat16,
+            "load_in_4bit": True,
+            "bnb_4bit_compute_dtype": torch.bfloat16,
+            "bnb_4bit_use_double_quant": False,
+            "bnb_4bit_quant_type": "nf4",
+        }
     )
 
     print("Model loaded successfully!")
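A note on the model_kwargs in both hunks: the flat load_in_4bit / bnb_4bit_* keys are the legacy spelling of bitsandbytes options, and recent transformers releases prefer an explicit BitsAndBytesConfig passed as quantization_config. Below is a sketch of the equivalent explicit form, using the same settings as the commit; the variable names are illustrative.

import os
import torch
from transformers import BitsAndBytesConfig, pipeline

# Same quantization settings as the commit, spelled as an explicit config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
)

pipeline_model = pipeline(
    "text-generation",
    model="unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    device=0,  # pin to one GPU rather than device_map="auto", as in the commit
    torch_dtype=torch.bfloat16,
    token=os.getenv("HF_TOKEN"),
    trust_remote_code=True,
    model_kwargs={"quantization_config": bnb_config},
)

Since the checkpoint name ends in bnb-4bit, it likely ships pre-quantized, so the explicit config may be redundant; it is shown here only to make the commit's quantization intent legible.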