| |
"""
Script para configurar repositórios e pipeline de treinamento no HuggingFace.
Cria:
- Repositório de dataset
- Repositório de modelo (LoRA)
- Espaço de treinamento (se necessário)

Autenticação: defina HF_TOKEN no ambiente ou use `huggingface-cli login`.
Nunca commite tokens no repositório.
"""
|
|
| import json |
| import os |
| import sys |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi, create_repo, login, whoami |
|
|
# Target HuggingFace account and repo identifiers. Each one can be overridden
# through the environment variable of the same name (note: MODEL_REPO reads
# the env var OUTPUT_REPO, matching the training pipeline's naming).
HF_USER = os.environ.get("HF_USER", "beAnalytic")
# Dataset repository that will hold the training data.
DATASET_REPO = os.environ.get("DATASET_REPO", f"{HF_USER}/eda-training-dataset")
# Model repository for the LoRA adapter weights.
MODEL_REPO = os.environ.get("OUTPUT_REPO", f"{HF_USER}/eda-llm-qwen2.5-lora")
# Docker-based training Space.
TRAINING_SPACE = os.environ.get("TRAINING_SPACE", f"{HF_USER}/Training")
|
|
|
|
| def _token() -> str | None: |
| return os.environ.get("HF_TOKEN") |
|
|
|
|
def print_step(message: str, status: str = "info"):
    """Print *message* as a colour-coded, symbol-prefixed status line.

    Known statuses: "info", "success", "warning", "error". Any other value
    falls back to the "info" colour and the "i" symbol.
    """
    reset = "\033[0m"
    # ANSI colour code and bracketed symbol, keyed by status.
    palette = {
        "info": ("\033[0;34m", "i"),
        "success": ("\033[0;32m", "ok"),
        "warning": ("\033[1;33m", "!"),
        "error": ("\033[0;31m", "x"),
    }
    color, symbol = palette.get(status, palette["info"])
    print(f"{color}[{symbol}] {message}{reset}")
|
|
|
|
def _ensure_repo(
    repo_id: str,
    repo_type: str,
    hf_token: str,
    *,
    ok_label: str,
    err_label: str,
    exists_msg: str,
    fatal: bool = True,
    fail_msg: str = "",
    **create_kwargs,
) -> None:
    """Idempotently create one HF repo, reproducing the per-repo messaging.

    On failure whose message does not contain "already exists": exit(1) when
    *fatal*, otherwise print *fail_msg* and continue (Space behavior).
    """
    try:
        create_repo(
            repo_id=repo_id,
            repo_type=repo_type,
            token=hf_token,
            exist_ok=True,
            private=False,
            **create_kwargs,
        )
        print_step(f"{ok_label}: {repo_id}", "success")
    except Exception as e:
        print_step(f"Erro ao criar {err_label}: {e}", "error")
        if "already exists" not in str(e).lower():
            if fatal:
                sys.exit(1)
            print_step(fail_msg, "warning")
        else:
            print_step(exists_msg, "warning")


def _update_training_config(config_file: Path) -> None:
    """Point training_config.json at the configured dataset/output repos (no-op if absent)."""
    if not config_file.exists():
        return
    with open(config_file, encoding="utf-8") as f:
        config = json.load(f)
    config["dataset"] = DATASET_REPO
    config["output_dir"] = MODEL_REPO
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    print_step(f"Atualizado: {config_file.name}", "success")


def _update_dockerfile(dockerfile: Path) -> None:
    """Rewrite the legacy hard-coded repo IDs in the Dockerfile (no-op if absent)."""
    if not dockerfile.exists():
        return
    content = dockerfile.read_text(encoding="utf-8")
    content = content.replace("DATASET_REPO=amarorn/eda-training-dataset", f"DATASET_REPO={DATASET_REPO}")
    content = content.replace("OUTPUT_REPO=amarorn/eda-llm-model", f"OUTPUT_REPO={MODEL_REPO}")
    dockerfile.write_text(content, encoding="utf-8")
    print_step(f"Atualizado: {dockerfile.name}", "success")


def _write_env_example(env_example: Path) -> None:
    """(Re)write .env.example with the current repo IDs and default hyperparameters."""
    env_content = f"""# Copie para .env e preencha (nao commite .env)
# HF_TOKEN=obtenha em https://huggingface.co/settings/tokens
HF_USER={HF_USER}
DATASET_REPO={DATASET_REPO}
OUTPUT_REPO={MODEL_REPO}
MODEL_NAME=Qwen/Qwen2.5-1.5B-Instruct

NUM_EPOCHS=3
BATCH_SIZE=4
LEARNING_RATE=3e-05
"""
    env_example.write_text(env_content, encoding="utf-8")
    print_step(f"Atualizado: {env_example.name}", "success")


def main():
    """Authenticate against HuggingFace, ensure the dataset/model/Space repos
    exist, and sync the local config files to the chosen repo IDs.

    Exits with status 1 on missing token, authentication failure, or a fatal
    repo-creation error (dataset/model). Space creation failures are non-fatal.
    """
    print_step("Configurando HuggingFace - BeAnalytic", "info")
    print()

    hf_token = _token()
    if not hf_token:
        print_step(
            "HF_TOKEN nao definido. Exporte HF_TOKEN ou execute: huggingface-cli login",
            "error",
        )
        sys.exit(1)

    print_step("Autenticando no HuggingFace...", "info")
    try:
        login(token=hf_token, add_to_git_credential=False)
        user_info = whoami()
        print_step(f"Autenticado como: {user_info['name']}", "success")
        # Warn (but do not abort) when the token belongs to another account.
        if user_info["name"] != HF_USER:
            print_step(
                f"Aviso: usuario esperado '{HF_USER}' mas autenticado como '{user_info['name']}'",
                "warning",
            )
    except Exception as e:
        print_step(f"Erro ao autenticar: {e}", "error")
        sys.exit(1)

    # NOTE: the original code built an HfApi client here but never used it;
    # the unused local has been removed.

    print()
    print_step(f"Criando repositorio de dataset: {DATASET_REPO}", "info")
    _ensure_repo(
        DATASET_REPO,
        "dataset",
        hf_token,
        ok_label="Dataset",
        err_label="dataset",
        exists_msg="Repositorio ja existe, continuando...",
    )

    print()
    print_step(f"Criando repositorio de modelo (LoRA): {MODEL_REPO}", "info")
    _ensure_repo(
        MODEL_REPO,
        "model",
        hf_token,
        ok_label="Modelo",
        err_label="modelo",
        exists_msg="Repositorio ja existe, continuando...",
    )

    print()
    print_step(f"Criando espaco de treinamento: {TRAINING_SPACE}", "info")
    # The Space is optional: failures only warn, and it needs the docker SDK.
    _ensure_repo(
        TRAINING_SPACE,
        "space",
        hf_token,
        ok_label="Space",
        err_label="space",
        exists_msg="Space ja existe, continuando...",
        fatal=False,
        fail_msg="Continuando sem space...",
        space_sdk="docker",
    )

    print()
    print_step("Atualizando arquivos de configuracao locais...", "info")

    script_dir = Path(__file__).parent
    _update_training_config(script_dir.parent / "training_config.json")
    _update_dockerfile(script_dir.parent / "Dockerfile")
    _write_env_example(script_dir / ".env.example")

    print()
    print_step("=" * 60, "info")
    print_step("Configuracao concluida.", "success")
    print_step("=" * 60, "info")
    print()
    print("Repositorios:")
    print(f"  - Dataset:  https://huggingface.co/datasets/{DATASET_REPO}")
    print(f"  - Modelo:   https://huggingface.co/{MODEL_REPO}")
    print(f"  - Training: https://huggingface.co/spaces/{TRAINING_SPACE}")
    print()
    print("Registro auditavel: ml/configs/huggingface_training_config/hf_registry.json")
    print()
    print("Proximos passos:")
    print("  1. Upload do dataset para o repositorio de dataset")
    print("  2. No Space, secrets: HF_TOKEN, DATASET_REPO, OUTPUT_REPO, MODEL_NAME")
    print("  3. Executar treinamento")
    print()
|
|
|
|
# Script entry point: only run the setup flow when executed directly.
if __name__ == "__main__":
    main()
|
|