|
|
|
|
|
""" |
|
|
Multi-Agent Training Deployment Script |
|
|
|
|
|
This script provides comprehensive deployment capabilities for the multi-agent |
|
|
training system, including Docker container management, environment setup, |
|
|
and training execution. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import json |
|
|
import yaml |
|
|
import argparse |
|
|
import subprocess |
|
|
import logging |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Optional, Any |
|
|
from dataclasses import dataclass |
|
|
|
|
|
|
|
|
sys.path.append('src') |
|
|
|
|
|
from training.multi_agent_trainer import MultiAgentTrainingConfig, MultiAgentTrainingPipeline |
|
|
from datasets.multi_agent_loader import MultiAgentDatasetConfig |
|
|
|
|
|
@dataclass
class DeploymentConfig:
    """Configuration for deployment.

    Bundles everything one deployment run needs: host paths, Hugging Face
    repository targets, the Docker image tag, and training knobs.
    """
    # Root directory of the project (callers pass os.getcwd()).
    project_root: str
    # Host path to the multi-agent training dataset.
    dataset_path: str
    # Hugging Face model repository ID the trained model is pushed to.
    model_repo_id: str
    # Optional Hugging Face dataset repository ID to push the dataset to.
    dataset_repo_id: Optional[str] = None
    # Optional YAML file describing the agents.
    agents_file: Optional[str] = None
    # Optional YAML configuration file that supplied these values, if any.
    config_file: Optional[str] = None
    # Docker image tag used for both building and running training.
    docker_image_name: str = "phi35moe-cpu:latest"
    # Host directory mounted into the container for training outputs.
    output_dir: str = "./outputs"
    # Host directory for deployment and training logs.
    logs_dir: str = "./logs"
    # Maximum number of training steps passed to the training script.
    max_steps: int = 50
    # Whether to balance the dataset across agents.
    balance_agents: bool = True
    # Whether to push the trained model to the Hugging Face Hub.
    push_to_hub: bool = True
    # When True, log intended actions without executing them.
    dry_run: bool = False
|
|
|
|
|
class MultiAgentTrainingDeployment:
    """
    Comprehensive deployment manager for multi-agent training.

    Orchestrates the full pipeline: environment validation, Docker image
    build, containerized training execution, and report generation.
    """

    def __init__(self, config: "DeploymentConfig"):
        """Store the deployment configuration and initialize logging.

        Args:
            config: Fully populated DeploymentConfig for this run.
        """
        self.config = config
        self.setup_logging()

    def setup_logging(self):
        """Setup logging configuration (console + file handler)."""
        # The logs directory must exist BEFORE logging.FileHandler is
        # constructed, otherwise FileHandler raises FileNotFoundError on a
        # fresh checkout. (The original created the handler first.)
        os.makedirs(self.config.logs_dir, exist_ok=True)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(),
                logging.FileHandler(os.path.join(self.config.logs_dir, 'deployment.log'))
            ]
        )
        self.logger = logging.getLogger(__name__)

    def validate_environment(self) -> bool:
        """Validate the deployment environment.

        Checks required environment variables, Docker availability, and the
        existence of the dataset path and the optional agents file.

        Returns:
            True when every check passes; False otherwise (details logged).
        """
        self.logger.info("Validating deployment environment")

        # Credentials required for Hugging Face Hub access.
        required_env_vars = ["HF_TOKEN"]
        missing_vars = [var for var in required_env_vars if not os.getenv(var)]

        if missing_vars:
            self.logger.error(f"Missing required environment variables: {missing_vars}")
            return False

        # Reuse the shared probe rather than duplicating the subprocess call.
        if not self._check_docker_available():
            self.logger.error("Docker not available")
            return False

        if not os.path.exists(self.config.dataset_path):
            self.logger.error(f"Dataset path not found: {self.config.dataset_path}")
            return False

        if self.config.agents_file and not os.path.exists(self.config.agents_file):
            self.logger.error(f"Agents file not found: {self.config.agents_file}")
            return False

        self.logger.info("Environment validation passed")
        return True

    def build_docker_image(self) -> bool:
        """Build the Docker image for training.

        Returns:
            True on success (or dry run); False when the Dockerfile is
            missing or `docker build` fails.
        """
        self.logger.info("Building Docker image")

        dockerfile_path = "docker/multi_agent_training/Dockerfile.cpu"
        if not os.path.exists(dockerfile_path):
            self.logger.error(f"Dockerfile not found: {dockerfile_path}")
            return False

        try:
            cmd = [
                "docker", "build",
                "-f", dockerfile_path,
                "-t", self.config.docker_image_name,
                "docker/multi_agent_training/"
            ]

            self.logger.info(f"Running command: {' '.join(cmd)}")

            if not self.config.dry_run:
                result = subprocess.run(cmd, check=True, capture_output=True, text=True)
                self.logger.info("Docker image built successfully")
            else:
                self.logger.info("Dry run: Would build Docker image")

            return True

        except subprocess.CalledProcessError as e:
            self.logger.error(f"Docker build failed: {e}")
            self.logger.error(f"Error output: {e.stderr}")
            return False

    def create_training_command(self) -> List[str]:
        """Create the training command executed inside the Docker container.

        Returns:
            argv-style list for the training script, including optional
            flags derived from the configuration.
        """
        cmd = [
            "python", "/app/train_lora_cpu_multiagent.py",
            "--dataset_path", self.config.dataset_path,
            "--hub_repo_id", self.config.model_repo_id,
            "--output_dir", "/app/outputs",
            "--max_steps", str(self.config.max_steps),
            "--logging_steps", "5",
            "--save_steps", "50",
            "--eval_steps", "25"
        ]

        if self.config.balance_agents:
            cmd.append("--balance_agents")

        if self.config.push_to_hub:
            cmd.append("--push_to_hub")

        if self.config.agents_file:
            cmd.extend(["--agents_file", self.config.agents_file])

        if self.config.dataset_repo_id:
            cmd.extend(["--push_dataset_repo", self.config.dataset_repo_id])

        return cmd

    def run_training(self) -> bool:
        """Run training in a Docker container.

        Mounts the dataset (read-only), output, and log directories into the
        container and executes the training command under `bash -lc`.

        Returns:
            True on success (or dry run); False when `docker run` fails.
        """
        import shlex  # local import: only needed here for safe shell quoting

        self.logger.info("Starting training in Docker container")

        training_cmd = self.create_training_command()

        # Create host-side mount points up front; otherwise Docker creates
        # missing host directories itself (root-owned on Linux).
        os.makedirs(self.config.output_dir, exist_ok=True)
        os.makedirs(self.config.logs_dir, exist_ok=True)

        # NOTE(review): `-it` requires a TTY and will fail in CI pipelines;
        # kept for interactive parity with the original — confirm if this
        # script must also run non-interactively.
        # NOTE(review): passing HF_TOKEN via `-e VAR=value` exposes it in the
        # host process list; `--env HF_TOKEN` (inherit) would be safer.
        docker_cmd = [
            "docker", "run", "--rm", "-it",
            "-e", f"HF_TOKEN={os.getenv('HF_TOKEN')}",
            "-v", f"{os.path.abspath(self.config.dataset_path)}:{self.config.dataset_path}:ro",
            "-v", f"{os.path.abspath(self.config.output_dir)}:/app/outputs",
            "-v", f"{os.path.abspath(self.config.logs_dir)}:/app/logs",
            self.config.docker_image_name,
            "bash", "-lc"
        ]

        # shlex.join quotes each argument, so paths containing spaces or
        # shell metacharacters survive the `bash -lc` round trip (a bare
        # " ".join would split them).
        full_cmd = shlex.join(training_cmd)
        docker_cmd.append(full_cmd)

        self.logger.info(f"Running command: {' '.join(docker_cmd[:-1])} '{full_cmd}'")

        try:
            if not self.config.dry_run:
                result = subprocess.run(docker_cmd, check=True)
                self.logger.info("Training completed successfully")
            else:
                self.logger.info("Dry run: Would execute training")

            return True

        except subprocess.CalledProcessError as e:
            self.logger.error(f"Training failed: {e}")
            return False

    def generate_deployment_report(self) -> Dict[str, Any]:
        """Generate a deployment report.

        Returns:
            JSON-serializable dict summarizing the configuration, the
            environment checks, and the final deployment status.
        """
        report = {
            "deployment_config": {
                "project_root": self.config.project_root,
                "dataset_path": self.config.dataset_path,
                "model_repo_id": self.config.model_repo_id,
                "dataset_repo_id": self.config.dataset_repo_id,
                "docker_image_name": self.config.docker_image_name,
                "max_steps": self.config.max_steps,
                "balance_agents": self.config.balance_agents,
                "push_to_hub": self.config.push_to_hub
            },
            "environment": {
                "hf_token_set": bool(os.getenv("HF_TOKEN")),
                "docker_available": self._check_docker_available(),
                "dataset_exists": os.path.exists(self.config.dataset_path),
                # None (not False) when no agents file was configured at all.
                "agents_file_exists": os.path.exists(self.config.agents_file) if self.config.agents_file else None
            },
            "deployment_status": "completed" if not self.config.dry_run else "dry_run"
        }

        return report

    def _check_docker_available(self) -> bool:
        """Check if the Docker CLI is installed and responding."""
        try:
            result = subprocess.run(["docker", "--version"], capture_output=True, text=True)
            return result.returncode == 0
        except FileNotFoundError:
            return False

    def save_deployment_report(self, report: Dict[str, Any]):
        """Save the deployment report as JSON under the logs directory."""
        report_file = os.path.join(self.config.logs_dir, "deployment_report.json")
        with open(report_file, 'w') as f:
            json.dump(report, f, indent=2)

        self.logger.info(f"Deployment report saved to {report_file}")

    def deploy(self) -> bool:
        """Execute the complete deployment process.

        Runs validation, image build, and training in order, stopping at the
        first failure, then writes the deployment report.

        Returns:
            True when every stage succeeds; False otherwise.
        """
        self.logger.info("Starting multi-agent training deployment")

        try:
            if not self.validate_environment():
                return False

            if not self.build_docker_image():
                return False

            if not self.run_training():
                return False

            report = self.generate_deployment_report()
            self.save_deployment_report(report)

            self.logger.info("Deployment completed successfully")
            return True

        except Exception as e:
            # Top-level boundary: log and report failure to the caller.
            self.logger.error(f"Deployment failed: {e}")
            return False
|
|
|
|
|
def load_config_from_file(config_file: str) -> Dict[str, Any]:
    """Parse a YAML configuration file and return its contents.

    Args:
        config_file: Path to the YAML file to read.

    Returns:
        The parsed document (a dict for mapping-style configs).
    """
    with open(config_file, 'r') as stream:
        parsed = yaml.safe_load(stream)
    return parsed
|
|
|
|
|
def create_deployment_config_from_yaml(config_data: Dict[str, Any],
                                       dataset_path: str,
                                       model_repo_id: str) -> DeploymentConfig:
    """Build a DeploymentConfig from parsed YAML data.

    The caller-supplied dataset path and model repo ID take precedence;
    everything else is read from the YAML sections with defaults matching
    the dataclass.

    Args:
        config_data: Parsed YAML configuration document.
        dataset_path: Host path to the training dataset.
        model_repo_id: Hugging Face model repository ID.

    Returns:
        A DeploymentConfig populated from the YAML sections.
    """
    training = config_data.get("training", {})
    multi_agent = config_data.get("multi_agent", {})
    huggingface = config_data.get("huggingface", {})
    dataset_section = config_data.get("dataset", {})

    return DeploymentConfig(
        project_root=os.getcwd(),
        dataset_path=dataset_path,
        model_repo_id=model_repo_id,
        dataset_repo_id=huggingface.get("push_dataset_repo"),
        agents_file=dataset_section.get("agents_file"),
        config_file=None,
        docker_image_name="phi35moe-cpu:latest",
        output_dir=training.get("output_dir", "./outputs"),
        logs_dir=training.get("logging", {}).get("logging_dir", "./logs"),
        max_steps=training.get("max_steps", 50),
        balance_agents=multi_agent.get("balance_agents", True),
        push_to_hub=huggingface.get("push_to_hub", True)
    )
|
|
|
|
|
def main():
    """Main deployment function.

    Parses CLI arguments, builds a DeploymentConfig (from a YAML file when
    one is given and exists, otherwise from the CLI flags), runs the
    deployment, and exits with status 1 on failure.
    """
    parser = argparse.ArgumentParser(description="Deploy Multi-Agent Training System")

    # Required arguments.
    parser.add_argument("--dataset_path", required=True,
                        help="Path to multi-agent dataset")
    parser.add_argument("--model_repo_id", required=True,
                        help="Hugging Face model repository ID")

    # Optional arguments.
    parser.add_argument("--dataset_repo_id", default="",
                        help="Optional dataset repository ID")
    parser.add_argument("--agents_file", default="",
                        help="Optional agents YAML file")
    parser.add_argument("--config_file", default="",
                        help="Optional configuration YAML file")
    parser.add_argument("--docker_image_name", default="phi35moe-cpu:latest",
                        help="Docker image name")
    parser.add_argument("--output_dir", default="./outputs",
                        help="Output directory")
    parser.add_argument("--logs_dir", default="./logs",
                        help="Logs directory")
    parser.add_argument("--max_steps", type=int, default=50,
                        help="Maximum training steps")
    parser.add_argument("--balance_agents", action="store_true",
                        help="Balance dataset across agents")
    parser.add_argument("--push_to_hub", action="store_true",
                        help="Push model to Hugging Face Hub")
    parser.add_argument("--dry_run", action="store_true",
                        help="Perform dry run without actual execution")
    parser.add_argument("--log_level", default="INFO",
                        help="Logging level")

    args = parser.parse_args()

    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    try:
        if args.config_file and os.path.exists(args.config_file):
            config_data = load_config_from_file(args.config_file)
            deployment_config = create_deployment_config_from_yaml(
                config_data, args.dataset_path, args.model_repo_id
            )
            # The YAML helper does not carry CLI-only flags; apply dry_run
            # explicitly so --dry_run also works with --config_file.
            deployment_config.dry_run = args.dry_run
        else:
            # Empty-string defaults map to None so the dataclass optionals
            # stay genuinely optional.
            deployment_config = DeploymentConfig(
                project_root=os.getcwd(),
                dataset_path=args.dataset_path,
                model_repo_id=args.model_repo_id,
                dataset_repo_id=args.dataset_repo_id if args.dataset_repo_id else None,
                agents_file=args.agents_file if args.agents_file else None,
                config_file=args.config_file if args.config_file else None,
                docker_image_name=args.docker_image_name,
                output_dir=args.output_dir,
                logs_dir=args.logs_dir,
                max_steps=args.max_steps,
                balance_agents=args.balance_agents,
                push_to_hub=args.push_to_hub,
                dry_run=args.dry_run
            )

        deployment = MultiAgentTrainingDeployment(deployment_config)

        success = deployment.deploy()

        if success:
            # The original status markers were mojibake-corrupted (one even
            # split a string literal across lines); restored as valid emoji.
            print("✅ Deployment completed successfully!")
            print(f"📁 Outputs: {deployment_config.output_dir}")
            print(f"📋 Logs: {deployment_config.logs_dir}")
            if deployment_config.push_to_hub:
                print(f"🤗 Model: https://huggingface.co/{deployment_config.model_repo_id}")
        else:
            print("❌ Deployment failed!")
            sys.exit(1)

    except Exception as e:
        print(f"❌ Deployment error: {e}")
        sys.exit(1)
|
|
|
|
|
# Entry point: run the deployment CLI when executed as a script.
if __name__ == "__main__":
    main()
|
|
|