Spaces:

rwayz
/

tributario

Build error

App Files Files Community

tributario / model.py

rwayz

Update model.py

51b6b2a verified 12 months ago

raw

history blame contribute delete

3.47 kB

	import os
	import time
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from huggingface_hub import login

	# Ask CUDA to run kernel launches synchronously, so a GPU failure (the
	# original author had VRAM errors in mind) surfaces at the call that caused it.
	os.environ.update(CUDA_LAUNCH_BLOCKING="1")

	# The Hugging Face access token comes from the environment; an unset or
	# empty value aborts the script before anything is downloaded.
	HF_API_KEY = os.environ.get("HF_API_KEY")
	if HF_API_KEY is None or HF_API_KEY == "":
	    raise ValueError("❌ ERRO: Token Hugging Face não encontrado no ambiente.")

	# Authenticate this process against the Hugging Face Hub.
	print("🔄 Conectando ao Hugging Face...")
	login(token=HF_API_KEY)
	print("✅ Conectado ao Hugging Face!")

	# Hub repository id of the fine-tuned model.
	MODEL_NAME = "rwayz/tributario-llama-8b-v1"

	# Load the tokenizer and the model, timing the whole step.
	print(f"🔄 Carregando modelo: {MODEL_NAME}...")
	start_time = time.time()

	try:
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_KEY)

	    model = AutoModelForCausalLM.from_pretrained(
	        MODEL_NAME,
	        # Half precision when a CUDA GPU is present, full precision otherwise.
	        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	        # Let the loader choose where the weights are placed.
	        device_map="auto",
	        # NOTE(review): kept from the original, whose comment says this is
	        # required for models trained with gradient checkpointing. That
	        # concern normally applies to training; disabling the cache may slow
	        # generation -- confirm whether it can be dropped.
	        use_cache=False,
	        token=HF_API_KEY,
	    )
	    print(f"✅ Modelo carregado! Tempo: {time.time() - start_time:.2f}s")
	except Exception as e:
	    # Top-level boundary of the script: report the failure and stop with a
	    # non-zero status so callers (shell, CI, Spaces runtime) can detect it.
	    # `SystemExit` is raised directly instead of calling `exit()`, which is
	    # only injected by the `site` module and would exit with status 0.
	    print(f"❌ ERRO AO CARREGAR O MODELO: {e}")
	    raise SystemExit(1) from e

	# Fall back to the end-of-sequence token when the tokenizer has no pad token.
	if tokenizer.pad_token is None:
	    tokenizer.pad_token = tokenizer.eos_token

	def _generate_answer(lm, tok, encoded, **sampling):
	    """Sample one completion for *encoded* and return it as text.

	    Args:
	        lm: Causal language model exposing ``generate``.
	        tok: Tokenizer that produced *encoded*; supplies the pad token id
	            and decodes the generated ids.
	        encoded: Tokenized prompt, unpacked as keyword arguments to
	            ``generate``.
	        **sampling: Sampling settings forwarded to ``generate``
	            (``top_k``, ``top_p``, ``temperature``).

	    Returns:
	        The first generated sequence, decoded with special tokens skipped.
	    """
	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        generated = lm.generate(
	            **encoded,
	            max_length=256,
	            do_sample=True,
	            pad_token_id=tok.pad_token_id,
	            **sampling,
	        )
	    return tok.decode(generated[0], skip_special_tokens=True)


	# Default question used as a smoke test.
	pergunta = "Quais são os principais tributos aplicáveis a empresas no Brasil?"
	print("📝 Pergunta de teste:", pergunta)

	# Tokenize the prompt and move it to the device the model actually lives on.
	# The model is loaded with device_map="auto", so its placement is decided by
	# the loader; asking the model avoids guessing from CUDA availability alone.
	device = model.device
	inputs = tokenizer(f"Pergunta: {pergunta}", return_tensors="pt").to(device)

	# Release cached GPU memory before inference.
	torch.cuda.empty_cache()

	print("🤖 Gerando resposta do modelo...")
	inference_start_time = time.time()

	try:
	    resposta = _generate_answer(
	        model, tokenizer, inputs, top_k=50, top_p=0.95, temperature=0.7
	    )
	except RuntimeError as e:
	    error_text = str(e)
	    print(f"❌ ERRO NA GERAÇÃO: {error_text}")

	    # Retry on the CPU for CUDA failures and inf/nan sampling errors.
	    # These are plain substring matches on the error message.
	    if "CUDA error" in error_text or "inf" in error_text or "nan" in error_text:
	        print("🔄 Tentando rodar na CPU como fallback...")
	        model.to("cpu")
	        # The weights may have been loaded as float16; cast to float32 so the
	        # retry does not repeat half-precision overflow and does not depend
	        # on float16 CPU kernels. This roughly doubles the weight memory.
	        # NOTE(review): moving a model that was dispatched across several
	        # devices may be rejected by the loader's hooks -- confirm on the
	        # target hardware.
	        model.float()
	        inputs = inputs.to("cpu")

	        # More conservative sampling settings for the retry.
	        resposta = _generate_answer(
	            model, tokenizer, inputs, top_k=30, top_p=0.85, temperature=0.5
	        )
	        print("✅ Resposta gerada na CPU:")
	        print(resposta)
	else:
	    print("✅ Resposta gerada! Tempo de inferência:", time.time() - inference_start_time, "s")
	    print("\n📝 Resposta do Modelo:\n", resposta)