import os
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

# Run CUDA kernel launches synchronously to ease debugging
# (e.g. when chasing a VRAM-related error).
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

# Read the Hugging Face access token from the environment; a missing or
# empty value aborts the script immediately.
HF_API_KEY = os.environ.get("HF_API_KEY")
if HF_API_KEY in (None, ""):
    raise ValueError("❌ ERRO: Token Hugging Face não encontrado no ambiente.")

# Authenticate against the Hugging Face Hub with that token.
print("🔄 Conectando ao Hugging Face...")
login(token=HF_API_KEY)
print("✅ Conectado ao Hugging Face!")

# Hub repository id of the fine-tuned model.
MODEL_NAME = "rwayz/tributario-llama-8b-v1"

# Load the tokenizer and the model, timing the whole operation.
print(f"🔄 Carregando modelo: {MODEL_NAME}...")
start_time = time.time()

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_KEY)

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        # Half precision when a CUDA GPU is available, full precision otherwise.
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        # Let the loader place the weights on the available device(s).
        device_map="auto",
        # Keep the key/value cache enabled: this script only runs inference,
        # and without the cache every decoding step recomputes the whole
        # prefix. Disabling it only matters while *training* with gradient
        # checkpointing.
        use_cache=True,
        token=HF_API_KEY,
    )
except Exception as e:
    # Top-level boundary of the script: report the failure and stop with a
    # non-zero status so callers (shell, CI) can tell the load failed.
    print(f"❌ ERRO AO CARREGAR O MODELO: {e}")
    raise SystemExit(1) from e
else:
    print(f"✅ Modelo carregado! Tempo: {time.time() - start_time:.2f}s")

# Use the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Default question used to smoke-test the model.
pergunta = "Quais são os principais tributos aplicáveis a empresas no Brasil?"
print("📝 Pergunta de teste:", pergunta)

# Tokenize the prompt and move it to the device that actually holds the
# model. The model is loaded with `device_map="auto"`, so its placement is
# decided by the loader; asking the model avoids a device mismatch when that
# placement differs from a plain "cuda available?" check.
device = model.device
inputs = tokenizer(f"Pergunta: {pergunta}", return_tensors="pt").to(device)

# Release cached GPU memory before inference (only meaningful with CUDA).
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Generate the answer; when the failure looks GPU- or precision-related,
# retry once on the CPU with more conservative sampling settings.


def _sample_tokens(model, inputs, pad_token_id, *, top_k, top_p, temperature):
    """Sample a completion from ``model`` with gradient tracking disabled.

    ``inputs`` is the tokenized prompt (unpacked into ``model.generate``).
    ``max_length=256`` bounds the total sequence, prompt included.
    Returns whatever ``model.generate`` returns (the generated token ids).
    """
    with torch.no_grad():
        return model.generate(
            **inputs,
            max_length=256,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            pad_token_id=pad_token_id,
        )


def _should_retry_on_cpu(error):
    """Return True when ``error`` looks like a GPU or inf/nan sampling failure.

    CUDA failures are recognised by their message prefix (out-of-memory
    included). ``inf``/``nan`` are matched as whole words so that unrelated
    messages containing e.g. "inference" do not trigger the fallback.
    """
    message = str(error)
    if "CUDA error" in message or "CUDA out of memory" in message:
        return True
    words = {word.strip("`'\",.:;()[]") for word in message.lower().split()}
    return not words.isdisjoint({"inf", "nan"})


print("🤖 Gerando resposta do modelo...")
inference_start_time = time.time()

try:
    output = _sample_tokens(
        model,
        inputs,
        tokenizer.pad_token_id,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    # Decode the first (only) generated sequence back to text.
    resposta = tokenizer.decode(output[0], skip_special_tokens=True)
    print("✅ Resposta gerada! Tempo de inferência:", time.time() - inference_start_time, "s")
    print("\n📝 **Resposta do Modelo:**\n", resposta)

except RuntimeError as e:
    print(f"❌ ERRO NA GERAÇÃO: {e}")

    if not _should_retry_on_cpu(e):
        # Nothing to recover from: stop with a failing exit status instead of
        # letting the script end as if it had succeeded.
        raise SystemExit(1) from e

    print("🔄 Tentando rodar na CPU como fallback...")
    # Cast to float32 while moving: inf/nan sampling failures typically come
    # from half-precision overflow, and half precision is poorly supported on
    # CPU. Note this doubles the memory needed for fp16 weights.
    # NOTE(review): after a genuine CUDA error the GPU context may be
    # unusable, in which case this move itself can fail — confirm on target
    # hardware.
    model.to(device="cpu", dtype=torch.float32)
    inputs = inputs.to("cpu")

    output = _sample_tokens(
        model,
        inputs,
        tokenizer.pad_token_id,
        top_k=30,
        top_p=0.85,
        temperature=0.5,
    )

    resposta = tokenizer.decode(output[0], skip_special_tokens=True)
    print("✅ Resposta gerada na CPU:")
    print(resposta)