#!/usr/bin/env python3
"""Configure HuggingFace repositories and the training pipeline.

Creates:
- a dataset repository
- a model repository (LoRA)
- a training Space (if needed)

Authentication: set ``HF_TOKEN`` in the environment or run
``huggingface-cli login``. Never commit tokens to the repository.
"""
import json
import os
import sys
from pathlib import Path

from huggingface_hub import HfApi, create_repo, login, whoami

# Target account and repository ids; every one can be overridden via the
# environment so the script works for forks/other accounts.
HF_USER = os.environ.get("HF_USER", "beAnalytic")
DATASET_REPO = os.environ.get("DATASET_REPO", f"{HF_USER}/eda-training-dataset")
MODEL_REPO = os.environ.get("OUTPUT_REPO", f"{HF_USER}/eda-llm-qwen2.5-lora")
TRAINING_SPACE = os.environ.get("TRAINING_SPACE", f"{HF_USER}/Training")


def _token() -> str | None:
    """Return the HuggingFace token from the environment, or None if unset."""
    return os.environ.get("HF_TOKEN")


def print_step(message: str, status: str = "info"):
    """Print *message* with an ANSI color and symbol chosen by *status*.

    *status* is one of ``info``/``success``/``warning``/``error``; unknown
    values fall back to the ``info`` style.
    """
    colors = {
        "info": "\033[0;34m",
        "success": "\033[0;32m",
        "warning": "\033[1;33m",
        "error": "\033[0;31m",
    }
    symbols = {
        "info": "i",
        "success": "ok",
        "warning": "!",
        "error": "x",
    }
    reset = "\033[0m"
    print(f"{colors.get(status, colors['info'])}[{symbols.get(status, 'i')}] {message}{reset}")


def _authenticate(hf_token: str) -> None:
    """Log in to HuggingFace and warn when the account differs from HF_USER.

    Exits the process with status 1 when authentication fails.
    """
    print_step("Autenticando no HuggingFace...", "info")
    try:
        login(token=hf_token, add_to_git_credential=False)
        user_info = whoami()
        print_step(f"Autenticado como: {user_info['name']}", "success")
        if user_info["name"] != HF_USER:
            print_step(
                f"Aviso: usuario esperado '{HF_USER}' mas autenticado como '{user_info['name']}'",
                "warning",
            )
    except Exception as e:
        print_step(f"Erro ao autenticar: {e}", "error")
        sys.exit(1)


def _ensure_repo(
    repo_id: str,
    repo_type: str,
    hf_token: str,
    *,
    success_label: str,
    error_label: str,
    exists_msg: str,
    fatal: bool = True,
    space_sdk: str | None = None,
) -> None:
    """Create (or tolerate an existing) repository of *repo_type*.

    ``exist_ok=True`` already makes re-creation a no-op; the extra
    "already exists" check is belt-and-braces for older hub versions that
    still raise. When *fatal* is False (the training Space) a hard failure
    only prints a warning instead of exiting.
    """
    try:
        create_repo(
            repo_id=repo_id,
            repo_type=repo_type,
            token=hf_token,
            exist_ok=True,
            private=False,
            # space_sdk is ignored by the hub for non-space repo types.
            space_sdk=space_sdk,
        )
        print_step(f"{success_label}: {repo_id}", "success")
    except Exception as e:
        print_step(f"Erro ao criar {error_label}: {e}", "error")
        if "already exists" not in str(e).lower():
            if fatal:
                sys.exit(1)
            print_step("Continuando sem space...", "warning")
        else:
            print_step(exists_msg, "warning")


def _update_local_configs(script_dir: Path) -> None:
    """Rewrite local config files so they point at the resolved repo ids."""
    # training_config.json: sync dataset/output repo ids.
    config_file = script_dir.parent / "training_config.json"
    if config_file.exists():
        with open(config_file, encoding="utf-8") as f:
            config = json.load(f)
        config["dataset"] = DATASET_REPO
        config["output_dir"] = MODEL_REPO
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        print_step(f"Atualizado: {config_file.name}", "success")

    # Dockerfile: replace the hard-coded upstream repo ids, if still present.
    dockerfile = script_dir.parent / "Dockerfile"
    if dockerfile.exists():
        content = dockerfile.read_text(encoding="utf-8")
        content = content.replace("DATASET_REPO=amarorn/eda-training-dataset", f"DATASET_REPO={DATASET_REPO}")
        content = content.replace("OUTPUT_REPO=amarorn/eda-llm-model", f"OUTPUT_REPO={MODEL_REPO}")
        dockerfile.write_text(content, encoding="utf-8")
        print_step(f"Atualizado: {dockerfile.name}", "success")

    # .env.example: regenerate the template (never contains the real token).
    env_example = script_dir / ".env.example"
    env_content = f"""# Copie para .env e preencha (nao commite .env)
# HF_TOKEN=obtenha em https://huggingface.co/settings/tokens
HF_USER={HF_USER}
DATASET_REPO={DATASET_REPO}
OUTPUT_REPO={MODEL_REPO}
MODEL_NAME=Qwen/Qwen2.5-1.5B-Instruct
NUM_EPOCHS=3
BATCH_SIZE=4
LEARNING_RATE=3e-05
"""
    env_example.write_text(env_content, encoding="utf-8")
    print_step(f"Atualizado: {env_example.name}", "success")


def _print_summary() -> None:
    """Print the final report: created repos, registry path, next steps."""
    print()
    print_step("=" * 60, "info")
    print_step("Configuracao concluida.", "success")
    print_step("=" * 60, "info")
    print()
    print("Repositorios:")
    print(f"  - Dataset:  https://huggingface.co/datasets/{DATASET_REPO}")
    print(f"  - Modelo:   https://huggingface.co/{MODEL_REPO}")
    print(f"  - Training: https://huggingface.co/spaces/{TRAINING_SPACE}")
    print()
    print("Registro auditavel: ml/configs/huggingface_training_config/hf_registry.json")
    print()
    print("Proximos passos:")
    print("  1. Upload do dataset para o repositorio de dataset")
    print("  2. No Space, secrets: HF_TOKEN, DATASET_REPO, OUTPUT_REPO, MODEL_NAME")
    print("  3. Executar treinamento")
    print()


def main():
    """Authenticate, create the three repositories, and update local files."""
    print_step("Configurando HuggingFace - BeAnalytic", "info")
    print()

    hf_token = _token()
    if not hf_token:
        # Fixed: this message was previously split across a raw newline
        # inside the string literal.
        print_step(
            "HF_TOKEN nao definido. Exporte HF_TOKEN ou execute: huggingface-cli login",
            "error",
        )
        sys.exit(1)

    _authenticate(hf_token)
    # NOTE(review): the original built an HfApi(token=...) client here but
    # never used it; the unused local was removed.

    print()
    print_step(f"Criando repositorio de dataset: {DATASET_REPO}", "info")
    _ensure_repo(
        DATASET_REPO,
        "dataset",
        hf_token,
        success_label="Dataset",
        error_label="dataset",
        exists_msg="Repositorio ja existe, continuando...",
    )

    print()
    print_step(f"Criando repositorio de modelo (LoRA): {MODEL_REPO}", "info")
    _ensure_repo(
        MODEL_REPO,
        "model",
        hf_token,
        success_label="Modelo",
        error_label="modelo",
        exists_msg="Repositorio ja existe, continuando...",
    )

    print()
    print_step(f"Criando espaco de treinamento: {TRAINING_SPACE}", "info")
    _ensure_repo(
        TRAINING_SPACE,
        "space",
        hf_token,
        success_label="Space",
        error_label="space",
        exists_msg="Space ja existe, continuando...",
        fatal=False,
        space_sdk="docker",
    )

    print()
    print_step("Atualizando arquivos de configuracao locais...", "info")
    _update_local_configs(Path(__file__).parent)

    _print_summary()


if __name__ == "__main__":
    main()