import os

import torch
from transformers import BitsAndBytesConfig, pipeline
# Pre-quantized (bitsandbytes 4-bit) Gemma 3 4B instruct checkpoint from Unsloth.
model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"

# Hugging Face access token from the environment (only needed for gated checkpoints).
hf_token = os.getenv("HF_TOKEN")

print("Loading model...")
try:
    # The checkpoint is already stored in bitsandbytes 4-bit format; passing an
    # explicit BitsAndBytesConfig keeps the intended quantization settings visible.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
    )

    pipeline_model = pipeline(
        "text-generation",
        model=model_path,
        # Quantized weights are placed by accelerate, so use device_map
        # rather than pinning the pipeline to a single GPU with device=0.
        device_map="auto",
        torch_dtype=torch.bfloat16,
        token=hf_token,
        trust_remote_code=True,
        model_kwargs={"quantization_config": bnb_config},
    )

    print("Model loaded successfully!")
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]

    print("Testing generation...")
    # Render the conversation into a single prompt string via the model's chat template.
    prompt = pipeline_model.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    outputs = pipeline_model(
        prompt,
        max_new_tokens=50,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        return_full_text=False,  # return only the newly generated text, not the prompt
    )
    response = outputs[0]["generated_text"]
    print(f"Test response: {response}")
    print("✅ Model test successful!")

except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()