#!/usr/bin/env python3
# Source: eda_trainning_lora/scripts/setup_huggingface.py
# Author: Ademir — "Initial clean commit: scripts and config without logs" (d4a00b2)
"""
Script para configurar reposit贸rios e pipeline de treinamento no HuggingFace.
Cria:
- Reposit贸rio de dataset
- Reposit贸rio de modelo (LoRA)
- Espa莽o de treinamento (se necess谩rio)
Autentica莽茫o: defina HF_TOKEN no ambiente ou use `huggingface-cli login`.
Nunca commite tokens no reposit贸rio.
"""
import json
import os
import sys
from pathlib import Path
from huggingface_hub import HfApi, create_repo, login, whoami
HF_USER = os.environ.get("HF_USER", "beAnalytic")
DATASET_REPO = os.environ.get("DATASET_REPO", f"{HF_USER}/eda-training-dataset")
MODEL_REPO = os.environ.get("OUTPUT_REPO", f"{HF_USER}/eda-llm-qwen2.5-lora")
TRAINING_SPACE = os.environ.get("TRAINING_SPACE", f"{HF_USER}/Training")
def _token() -> str | None:
return os.environ.get("HF_TOKEN")
def print_step(message: str, status: str = "info"):
    """Print *message* prefixed with an ANSI color and a status symbol.

    Known statuses: "info", "success", "warning", "error". Any other value
    falls back to the "info" presentation.
    """
    # Each status maps to a (color escape, symbol) pair.
    palette = {
        "info": ("\033[0;34m", "i"),
        "success": ("\033[0;32m", "ok"),
        "warning": ("\033[1;33m", "!"),
        "error": ("\033[0;31m", "x"),
    }
    color, symbol = palette.get(status, palette["info"])
    print(f"{color}[{symbol}] {message}\033[0m")
def _authenticate(hf_token: str) -> None:
    """Log in to HuggingFace with *hf_token*; exit(1) on failure.

    Warns (but continues) when the authenticated user differs from HF_USER.
    """
    print_step("Autenticando no HuggingFace...", "info")
    try:
        login(token=hf_token, add_to_git_credential=False)
        user_info = whoami()
        print_step(f"Autenticado como: {user_info['name']}", "success")
        if user_info["name"] != HF_USER:
            print_step(
                f"Aviso: usuario esperado '{HF_USER}' mas autenticado como '{user_info['name']}'",
                "warning",
            )
    except Exception as e:
        print_step(f"Erro ao autenticar: {e}", "error")
        sys.exit(1)


def _create_repos(hf_token: str) -> None:
    """Create the dataset, model (LoRA) and training Space repos.

    exist_ok=True makes creation idempotent; the "already exists" checks below
    are a belt-and-braces fallback in case the hub still raises. Dataset and
    model failures are fatal (exit 1); the Space is optional.
    """
    print_step(f"Criando repositorio de dataset: {DATASET_REPO}", "info")
    try:
        create_repo(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=hf_token,
            exist_ok=True,
            private=False,
        )
        print_step(f"Dataset: {DATASET_REPO}", "success")
    except Exception as e:
        print_step(f"Erro ao criar dataset: {e}", "error")
        if "already exists" not in str(e).lower():
            sys.exit(1)
        print_step("Repositorio ja existe, continuando...", "warning")
    print()

    print_step(f"Criando repositorio de modelo (LoRA): {MODEL_REPO}", "info")
    try:
        create_repo(
            repo_id=MODEL_REPO,
            repo_type="model",
            token=hf_token,
            exist_ok=True,
            private=False,
        )
        print_step(f"Modelo: {MODEL_REPO}", "success")
    except Exception as e:
        print_step(f"Erro ao criar modelo: {e}", "error")
        if "already exists" not in str(e).lower():
            sys.exit(1)
        print_step("Repositorio ja existe, continuando...", "warning")
    print()

    print_step(f"Criando espaco de treinamento: {TRAINING_SPACE}", "info")
    try:
        create_repo(
            repo_id=TRAINING_SPACE,
            repo_type="space",
            token=hf_token,
            exist_ok=True,
            private=False,
            space_sdk="docker",
        )
        print_step(f"Space: {TRAINING_SPACE}", "success")
    except Exception as e:
        print_step(f"Erro ao criar space: {e}", "error")
        # The Space is optional: an unexpected failure is logged but not fatal.
        if "already exists" not in str(e).lower():
            print_step("Continuando sem space...", "warning")
        else:
            print_step("Space ja existe, continuando...", "warning")
    print()


def _update_local_files() -> None:
    """Sync local config files with the selected repo ids.

    Touches (when present): ../training_config.json (dataset/output_dir keys),
    ../Dockerfile (DATASET_REPO / OUTPUT_REPO env lines), and always rewrites
    ./.env.example as a token-free template.
    """
    print_step("Atualizando arquivos de configuracao locais...", "info")
    script_dir = Path(__file__).parent

    config_file = script_dir.parent / "training_config.json"
    if config_file.exists():
        with open(config_file, encoding="utf-8") as f:
            config = json.load(f)
        config["dataset"] = DATASET_REPO
        config["output_dir"] = MODEL_REPO
        with open(config_file, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps any accented values readable in the file.
            json.dump(config, f, indent=2, ensure_ascii=False)
        print_step(f"Atualizado: {config_file.name}", "success")

    dockerfile = script_dir.parent / "Dockerfile"
    if dockerfile.exists():
        # Replace the historical hard-coded repo ids with the configured ones.
        content = dockerfile.read_text(encoding="utf-8")
        content = content.replace("DATASET_REPO=amarorn/eda-training-dataset", f"DATASET_REPO={DATASET_REPO}")
        content = content.replace("OUTPUT_REPO=amarorn/eda-llm-model", f"OUTPUT_REPO={MODEL_REPO}")
        dockerfile.write_text(content, encoding="utf-8")
        print_step(f"Atualizado: {dockerfile.name}", "success")

    env_example = script_dir / ".env.example"
    env_content = f"""# Copie para .env e preencha (nao commite .env)
# HF_TOKEN=obtenha em https://huggingface.co/settings/tokens
HF_USER={HF_USER}
DATASET_REPO={DATASET_REPO}
OUTPUT_REPO={MODEL_REPO}
MODEL_NAME=Qwen/Qwen2.5-1.5B-Instruct
NUM_EPOCHS=3
BATCH_SIZE=4
LEARNING_RATE=3e-05
"""
    env_example.write_text(env_content, encoding="utf-8")
    print_step(f"Atualizado: {env_example.name}", "success")


def _print_summary() -> None:
    """Print resulting repo URLs and the operator's next steps."""
    print_step("=" * 60, "info")
    print_step("Configuracao concluida.", "success")
    print_step("=" * 60, "info")
    print()
    print("Repositorios:")
    print(f" - Dataset: https://huggingface.co/datasets/{DATASET_REPO}")
    print(f" - Modelo: https://huggingface.co/{MODEL_REPO}")
    print(f" - Training: https://huggingface.co/spaces/{TRAINING_SPACE}")
    print()
    print("Registro auditavel: ml/configs/huggingface_training_config/hf_registry.json")
    print()
    print("Proximos passos:")
    print(" 1. Upload do dataset para o repositorio de dataset")
    print(" 2. No Space, secrets: HF_TOKEN, DATASET_REPO, OUTPUT_REPO, MODEL_NAME")
    print(" 3. Executar treinamento")
    print()


def main():
    """Full setup: authenticate, create HF repos, sync local config, print summary.

    Requires HF_TOKEN in the environment; exits with status 1 when it is
    missing or authentication fails.
    """
    print_step("Configurando HuggingFace - BeAnalytic", "info")
    print()

    hf_token = _token()
    if not hf_token:
        print_step(
            "HF_TOKEN nao definido. Exporte HF_TOKEN ou execute: huggingface-cli login",
            "error",
        )
        sys.exit(1)

    _authenticate(hf_token)
    print()
    # NOTE: the original built an unused HfApi(token=...) client here; removed.
    _create_repos(hf_token)
    _update_local_files()
    print()
    _print_summary()
if __name__ == "__main__":
main()