#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import traceback

import torch
from transformers import pipeline

model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"

# Use HF_TOKEN from the environment if it is set
hf_token = os.getenv("HF_TOKEN")

print("Loading model...")
try:
    # Initialize pipeline for chat.
    # For quantized models, use device=0 instead of device_map="auto"
    # to avoid meta tensor issues.
    # This checkpoint is already stored in bitsandbytes 4-bit (nf4), so the
    # quantization settings are read from the model's own config; passing raw
    # load_in_4bit / bnb_4bit_* kwargs on top of that is deprecated and gets
    # ignored with a warning, so they are omitted here.
    pipeline_model = pipeline(
        "text-generation",
        model=model_path,
        device=0,  # use GPU 0 directly
        torch_dtype=torch.bfloat16,
        token=hf_token,
        trust_remote_code=True,
    )
    print("Model loaded successfully!")

    # Test with a simple message
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]

    print("Testing generation...")

    # Apply the model's chat template so the prompt matches the
    # instruction-tuned format the checkpoint expects
    prompt = pipeline_model.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    outputs = pipeline_model(
        prompt,
        max_new_tokens=50,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        return_full_text=False,  # return only the newly generated text
    )

    response = outputs[0]["generated_text"]
    print(f"Test response: {response}")
    print("✅ Model test successful!")

except Exception as e:
    print(f"❌ Error: {e}")
    traceback.print_exc()
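

# Optional sketch: the same settings applied to a full-precision checkpoint.
# The pre-quantized unsloth repo above ships its own bitsandbytes config; if
# you instead start from an unquantized model, the supported way to reproduce
# the settings the original script set by hand (nf4, bf16 compute, no double
# quantization) is a BitsAndBytesConfig. The repo id below is an assumption,
# any fp16/bf16 causal LM works. Defined but never called, so the test above
# is unaffected.
def load_quantized_on_the_fly(repo_id="google/gemma-3-4b-it"):
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
    )
    return pipeline(
        "text-generation",
        model=repo_id,
        token=hf_token,
        model_kwargs={
            "quantization_config": bnb_config,
            "device_map": {"": 0},  # pin all weights to GPU 0
        },
    )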