mmrech
/

pitvqa-training-scripts

Model card Files Files and versions

xet

Community

mmrech committed on Jan 18

Commit

e4a4f4c

verified ·

1 Parent(s): 69375af

Upload pitvqa_agent_orchestrator.py with huggingface_hub

Browse files

Files changed (1) hide show

pitvqa_agent_orchestrator.py +913 -0

pitvqa_agent_orchestrator.py ADDED Viewed

	@@ -0,0 +1,913 @@

+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "huggingface_hub>=0.21.0",
+#     "requests",
+# ]
+# ///
+"""
+PitVQA Multi-Agent Orchestration System
+Specialized agents for methodologically rigorous VLM pipeline management:
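+1. JobMonitorAgent - Track HuggingFace Jobs status
+2. CurationAgent - Quality-filter showcase examples
+3. DatasetValidatorAgent - Validate image-embedded dataset
+4. ModelVerifierAgent - Check that the merged model repository has its files
+5. TrainingSpecialistAgent - Validate the LoRA adapter configuration
+6. EvaluationSpecialistAgent - Compute metrics and check them against thresholds
+7. DemoSyncAgent - Check the Gradio Space before syncing curated examples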
+Run with: python pitvqa_agent_orchestrator.py
+"""
+import os
+import json
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Any
+from datetime import datetime
+from enum import Enum
+# ============================================================
+# Agent Status Types
+# ============================================================
+class AgentStatus(Enum):
+    """States reported by agents and carried on each AgentResult."""
+    IDLE = "idle"  # initial state assigned in BaseAgent.__init__
+    RUNNING = "running"  # set by each agent at the start of run()
+    SUCCESS = "success"
+    FAILED = "failed"
+    WAITING = "waiting"  # used when an input or remote resource is not available yet
+@dataclass
+class AgentResult:
+    agent_name: str
+    status: AgentStatus
+    message: str
+    data: Optional[Dict] = None
+    timestamp: str = ""
+    def __post_init__(self):
+        if not self.timestamp:
+            self.timestamp = datetime.now().isoformat()
+# ============================================================
+# Base Agent
+# ============================================================
+class BaseAgent:
+    """Base class for all PitVQA agents."""
+    def __init__(self, name: str):
+        self.name = name
+        self.status = AgentStatus.IDLE
+        self.results: List[AgentResult] = []
+    def log(self, message: str, level: str = "INFO"):
+        icon = {"INFO": "ℹ️", "SUCCESS": "✅", "ERROR": "❌", "WARN": "⚠️"}.get(level, "📌")
+        print(f"[{self.name}] {icon} {message}")
+    def run(self) -> AgentResult:
+        raise NotImplementedError
+    def report(self) -> Dict:
+        return {
+            "agent": self.name,
+            "status": self.status.value,
+            "results": [r.__dict__ for r in self.results]
+        }
+# ============================================================
+# Agent 1: Job Monitor
+# ============================================================
+class JobMonitorAgent(BaseAgent):
+    """Monitors HuggingFace Jobs and reports status."""
+    def __init__(self, job_ids: List[str]):
+        super().__init__("JobMonitor")
+        self.job_ids = job_ids
+        self.job_status = {}
+    def check_job(self, job_id: str) -> Dict:
+        """Check single job status using HF API."""
+        try:
+            from huggingface_hub import HfApi
+            api = HfApi()
+            # Get job info
+            job = api.get_job(job_id)
+            return {
+                "id": job_id,
+                "status": job.status.stage if hasattr(job.status, 'stage') else str(job.status),
+                "message": job.status.message if hasattr(job.status, 'message') else None
+            }
+        except Exception as e:
+            return {"id": job_id, "status": "UNKNOWN", "error": str(e)}
+    def run(self) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log(f"Checking {len(self.job_ids)} jobs...")
+        all_complete = True
+        any_failed = False
+        for job_id in self.job_ids:
+            status = self.check_job(job_id)
+            self.job_status[job_id] = status
+            stage = status.get("status", "UNKNOWN")
+            self.log(f"Job {job_id[:8]}: {stage}")
+            if stage not in ["COMPLETED", "SUCCESS"]:
+                all_complete = False
+            if stage in ["FAILED", "ERROR"]:
+                any_failed = True
+        if any_failed:
+            self.status = AgentStatus.FAILED
+            return AgentResult(self.name, AgentStatus.FAILED, "Some jobs failed", self.job_status)
+        elif all_complete:
+            self.status = AgentStatus.SUCCESS
+            return AgentResult(self.name, AgentStatus.SUCCESS, "All jobs complete", self.job_status)
+        else:
+            self.status = AgentStatus.WAITING
+            return AgentResult(self.name, AgentStatus.WAITING, "Jobs still running", self.job_status)
+# ============================================================
+# Agent 2: Curation Agent
+# ============================================================
+class CurationAgent(BaseAgent):
+    """Curates showcase examples based on quality criteria."""
+    QUALITY_CRITERIA = {
+        "coordinate_validity": lambda x, y: 0 <= x <= 100 and 0 <= y <= 100,
+        "coordinate_diversity": lambda coords: len(set(coords)) > len(coords) * 0.5,
+        "video_diversity": lambda vids: len(set(vids)) >= min(5, len(vids)),
+        "frame_diversity": lambda frames: len(set(frames)) >= min(8, len(frames)),
+    }
+    def __init__(self, results_path: str = "./curation_review/all_results.json"):
+        super().__init__("Curation")
+        self.results_path = results_path
+        self.curated_examples = []
+    def load_results(self) -> List[Dict]:
+        """Load raw curation results."""
+        try:
+            with open(self.results_path) as f:
+                return json.load(f)
+        except FileNotFoundError:
+            self.log("Results file not found - job may still be running", "WARN")
+            return []
+    def score_example(self, example: Dict) -> float:
+        """Score a single example (0-1)."""
+        score = 0.0
+        # Basic validity
+        if example.get("success"):
+            score += 0.3
+        # Coordinate quality
+        if example.get("task") == "point":
+            x, y = example.get("x"), example.get("y")
+            if x and y:
+                # Penalize edge coordinates (likely failures)
+                if 10 < x < 90 and 10 < y < 90:
+                    score += 0.3
+                else:
+                    score += 0.1
+        elif example.get("task") == "bbox":
+            bbox = example.get("bbox")
+            if bbox and len(bbox) == 4:
+                # Penalize tiny or huge boxes
+                area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+                if 100 < area < 5000:
+                    score += 0.3
+                else:
+                    score += 0.1
+        # Response coherence
+        response = example.get("response", "")
+        if "<point" in response or "<box" in response:
+            score += 0.2
+        # Target relevance
+        target = example.get("target", "")
+        if target in response.lower():
+            score += 0.2
+        return min(score, 1.0)
+    def curate(self, results: List[Dict], top_k: int = 12) -> List[Dict]:
+        """Select best diverse examples."""
+        if not results:
+            return []
+        # Score all examples
+        scored = [(self.score_example(ex), ex) for ex in results if ex.get("success")]
+        scored.sort(key=lambda x: x[0], reverse=True)
+        # Ensure diversity
+        curated = []
+        used_videos = set()
+        used_frames = set()
+        used_tasks = {"point": 0, "bbox": 0}
+        for score, ex in scored:
+            if len(curated) >= top_k:
+                break
+            video = ex.get("video_id")
+            frame = ex.get("frame_idx")
+            task = ex.get("task")
+            # Diversity constraints
+            if used_videos.count(video) >= 2:  # Max 2 per video
+                continue
+            if (video, frame) in used_frames:  # Unique video+frame combos
+                continue
+            if used_tasks.get(task, 0) >= top_k // 2:  # Balance tasks
+                continue
+            curated.append({**ex, "quality_score": score})
+            used_videos.add(video)
+            used_frames.add((video, frame))
+            used_tasks[task] = used_tasks.get(task, 0) + 1
+        return curated
+    def run(self) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log("Loading curation results...")
+        results = self.load_results()
+        if not results:
+            self.status = AgentStatus.WAITING
+            return AgentResult(self.name, AgentStatus.WAITING, "No results available yet")
+        self.log(f"Scoring {len(results)} examples...")
+        self.curated_examples = self.curate(results)
+        if len(self.curated_examples) >= 8:
+            self.status = AgentStatus.SUCCESS
+            # Report diversity
+            videos = set(ex["video_id"] for ex in self.curated_examples)
+            frames = set(ex["frame_idx"] for ex in self.curated_examples)
+            self.log(f"Curated {len(self.curated_examples)} examples", "SUCCESS")
+            self.log(f"  Videos: {len(videos)} unique")
+            self.log(f"  Frames: {len(frames)} unique")
+            return AgentResult(
+                self.name,
+                AgentStatus.SUCCESS,
+                f"Curated {len(self.curated_examples)} high-quality diverse examples",
+                {"examples": self.curated_examples}
+            )
+        else:
+            self.status = AgentStatus.FAILED
+            return AgentResult(
+                self.name,
+                AgentStatus.FAILED,
+                f"Only {len(self.curated_examples)} examples passed quality checks"
+            )
+# ============================================================
+# Agent 3: Dataset Validator
+# ============================================================
+class DatasetValidatorAgent(BaseAgent):
+    """Validates image-embedded dataset quality."""
+    def __init__(self, dataset_id: str = "mmrech/pitvqa-spatial-with-images"):
+        super().__init__("DatasetValidator")
+        self.dataset_id = dataset_id
+    def run(self) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log(f"Validating dataset: {self.dataset_id}")
+        try:
+            from datasets import load_dataset
+            # Try to load dataset
+            ds = load_dataset(self.dataset_id, split="train[:10]")
+            # Check required fields
+            required_fields = ["image", "messages"]
+            missing = [f for f in required_fields if f not in ds.features]
+            if missing:
+                self.status = AgentStatus.FAILED
+                return AgentResult(
+                    self.name,
+                    AgentStatus.FAILED,
+                    f"Missing fields: {missing}"
+                )
+            # Validate image quality
+            valid_images = 0
+            for ex in ds:
+                img = ex.get("image")
+                if img and hasattr(img, "size") and img.size[0] > 0:
+                    valid_images += 1
+            if valid_images == len(ds):
+                self.status = AgentStatus.SUCCESS
+                return AgentResult(
+                    self.name,
+                    AgentStatus.SUCCESS,
+                    f"Dataset valid: {valid_images}/{len(ds)} images OK",
+                    {"sample_count": len(ds), "valid_images": valid_images}
+                )
+            else:
+                self.status = AgentStatus.FAILED
+                return AgentResult(
+                    self.name,
+                    AgentStatus.FAILED,
+                    f"Invalid images: {len(ds) - valid_images}/{len(ds)}"
+                )
+        except Exception as e:
+            self.status = AgentStatus.WAITING
+            return AgentResult(
+                self.name,
+                AgentStatus.WAITING,
+                f"Dataset not yet available: {e}"
+            )
+# ============================================================
+# Agent 4: Model Verifier
+# ============================================================
+class ModelVerifierAgent(BaseAgent):
+    """Verifies merged model outputs are correct."""
+    TEST_PROMPTS = [
+        ("Point to the suction device", "point"),
+        ("Draw a bounding box around the surgical instrument", "bbox"),
+        ("What surgical phase is this?", "classification"),
+    ]
+    def __init__(self, model_id: str = "mmrech/pitvqa-qwen2vl-merged"):
+        super().__init__("ModelVerifier")
+        self.model_id = model_id
+    def run(self) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log(f"Verifying model: {self.model_id}")
+        try:
+            from huggingface_hub import HfApi
+            api = HfApi()
+            # Check if model exists
+            try:
+                info = api.model_info(self.model_id)
+                self.log(f"Model found: {info.modelId}")
+                # Check for required files
+                files = [f.rfilename for f in info.siblings]
+                required = ["config.json", "model.safetensors"]
+                # Check if main model files exist
+                has_model = any("safetensors" in f or "pytorch" in f for f in files)
+                has_config = "config.json" in files
+                if has_model and has_config:
+                    self.status = AgentStatus.SUCCESS
+                    return AgentResult(
+                        self.name,
+                        AgentStatus.SUCCESS,
+                        f"Model verified: {len(files)} files present",
+                        {"files": files[:10]}  # First 10 files
+                    )
+                else:
+                    self.status = AgentStatus.FAILED
+                    return AgentResult(
+                        self.name,
+                        AgentStatus.FAILED,
+                        f"Missing model files (has_model={has_model}, has_config={has_config})"
+                    )
+            except Exception as e:
+                self.status = AgentStatus.WAITING
+                return AgentResult(
+                    self.name,
+                    AgentStatus.WAITING,
+                    f"Model not yet available: {e}"
+                )
+        except Exception as e:
+            self.status = AgentStatus.FAILED
+            return AgentResult(self.name, AgentStatus.FAILED, f"Error: {e}")
+# ============================================================
+# Agent 5: Training Specialist (HF-LLM-Trainer)
+# ============================================================
+class TrainingSpecialistAgent(BaseAgent):
+    """
+    Specialist in HuggingFace LLM Training (TRL/SFT/LoRA/DPO).
+    Responsibilities:
+    - Validate training configurations
+    - Check adapter quality
+    - Recommend training improvements
+    - Verify LoRA/PEFT setup
+    """
+    TRAINING_METHODS = {
+        "SFT": "Supervised Fine-Tuning - learning from (input, output) pairs",
+        "LoRA": "Low-Rank Adaptation - parameter-efficient adapters",
+        "DPO": "Direct Preference Optimization - learning from preferences",
+        "RLHF": "Reinforcement Learning from Human Feedback",
+    }
+    OPTIMAL_CONFIG = {
+        "lora_r": 16,
+        "lora_alpha": 32,
+        "learning_rate": 1e-4,
+        "batch_size": 1,
+        "gradient_accumulation_steps": 16,
+        "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
+    }
+    def __init__(self, adapter_repo: str = "mmrech/pitvqa-qwen2vl-unified-v2"):
+        super().__init__("TrainingSpecialist")
+        self.adapter_repo = adapter_repo
+    def validate_adapter_config(self) -> Dict:
+        """Validate adapter configuration."""
+        try:
+            from huggingface_hub import hf_hub_download
+            import json
+            # Download adapter config
+            config_path = hf_hub_download(
+                repo_id=self.adapter_repo,
+                filename="stage4/adapter_config.json"
+            )
+            with open(config_path) as f:
+                config = json.load(f)
+            # Check key parameters
+            issues = []
+            recommendations = []
+            # Check LoRA rank
+            if config.get("r", 0) < 8:
+                issues.append("LoRA rank too low (r < 8)")
+            elif config.get("r", 0) > 64:
+                recommendations.append("Consider reducing LoRA rank for efficiency")
+            # Check target modules
+            target_modules = config.get("target_modules", [])
+            if not any("proj" in m for m in target_modules):
+                issues.append("No projection layers targeted")
+            return {
+                "config": config,
+                "issues": issues,
+                "recommendations": recommendations,
+                "valid": len(issues) == 0
+            }
+        except Exception as e:
+            return {"error": str(e), "valid": False}
+    def recommend_next_training(self, current_metrics: Dict = None) -> Dict:
+        """Recommend next training steps based on current metrics."""
+        recommendations = []
+        if not current_metrics:
+            recommendations.append({
+                "priority": "HIGH",
+                "action": "Run evaluation to get baseline metrics",
+                "method": "scripts/evaluate_unified_vlm.py"
+            })
+        else:
+            accuracy = current_metrics.get("accuracy", 0)
+            if accuracy < 0.7:
+                recommendations.append({
+                    "priority": "HIGH",
+                    "action": "Increase training epochs or data",
+                    "method": "SFT with more epochs"
+                })
+            if accuracy >= 0.7 and accuracy < 0.85:
+                recommendations.append({
+                    "priority": "MEDIUM",
+                    "action": "Consider DPO for preference learning",
+                    "method": "Create chosen/rejected pairs from predictions"
+                })
+            if accuracy >= 0.85:
+                recommendations.append({
+                    "priority": "LOW",
+                    "action": "Model performing well - focus on inference optimization",
+                    "method": "Merge adapters, quantize for deployment"
+                })
+        return {"recommendations": recommendations}
+    def run(self) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log(f"Validating training setup: {self.adapter_repo}")
+        # Validate adapter
+        validation = self.validate_adapter_config()
+        if validation.get("valid"):
+            self.status = AgentStatus.SUCCESS
+            recommendations = self.recommend_next_training()
+            return AgentResult(
+                self.name,
+                AgentStatus.SUCCESS,
+                f"Training config valid. LoRA r={validation['config'].get('r')}",
+                {
+                    "config": validation["config"],
+                    "recommendations": recommendations["recommendations"]
+                }
+            )
+        elif validation.get("error"):
+            self.status = AgentStatus.WAITING
+            return AgentResult(
+                self.name,
+                AgentStatus.WAITING,
+                f"Could not load adapter: {validation['error']}"
+            )
+        else:
+            self.status = AgentStatus.FAILED
+            return AgentResult(
+                self.name,
+                AgentStatus.FAILED,
+                f"Issues found: {validation['issues']}",
+                validation
+            )
+# ============================================================
+# Agent 6: Evaluation Specialist
+# ============================================================
+class EvaluationSpecialistAgent(BaseAgent):
+    """
+    Specialist in Model Evaluation (metrics, benchmarks, validation).
+    Responsibilities:
+    - Compute accuracy, F1, precision, recall
+    - Validate coordinate predictions (MAE, quadrant accuracy)
+    - Compare against baselines
+    - Generate evaluation reports
+    """
+    METRICS = {
+        "classification": ["accuracy", "f1", "precision", "recall"],
+        "localization": ["mae", "quadrant_accuracy", "distance_error"],
+        "detection": ["iou", "ap", "ar"],
+    }
+    THRESHOLDS = {
+        "quadrant_accuracy": 0.75,  # Minimum acceptable
+        "mae": 15.0,  # Maximum acceptable (percentage)
+        "classification_accuracy": 0.80,
+    }
+    def __init__(self, model_repo: str = "mmrech/pitvqa-qwen2vl-unified-v2"):
+        super().__init__("EvaluationSpecialist")
+        self.model_repo = model_repo
+        self.metrics = {}
+    def load_evaluation_results(self) -> Dict:
+        """Load existing evaluation results if available."""
+        try:
+            with open("evaluation_results.json") as f:
+                return json.load(f)
+        except FileNotFoundError:
+            return {}
+    def compute_quick_metrics(self, predictions: List[Dict]) -> Dict:
+        """Compute quick metrics from predictions."""
+        if not predictions:
+            return {}
+        metrics = {}
+        # Coordinate predictions
+        coord_preds = [p for p in predictions if p.get("task") in ["point", "pointing"]]
+        if coord_preds:
+            valid = [p for p in coord_preds if p.get("x") is not None]
+            metrics["valid_rate"] = len(valid) / len(coord_preds)
+            # Calculate MAE if ground truth available
+            errors = []
+            for p in valid:
+                if p.get("gt_x") and p.get("gt_y"):
+                    err = ((p["x"] - p["gt_x"])**2 + (p["y"] - p["gt_y"])**2)**0.5
+                    errors.append(err)
+            if errors:
+                metrics["mae"] = sum(errors) / len(errors)
+                metrics["quadrant_accuracy"] = sum(1 for e in errors if e < 25) / len(errors)
+        # Classification predictions
+        class_preds = [p for p in predictions if p.get("task") == "classification"]
+        if class_preds:
+            correct = sum(1 for p in class_preds if p.get("prediction") == p.get("ground_truth"))
+            metrics["classification_accuracy"] = correct / len(class_preds)
+        return metrics
+    def evaluate_against_thresholds(self, metrics: Dict) -> Dict:
+        """Check metrics against quality thresholds."""
+        results = {"passed": [], "failed": [], "warnings": []}
+        for metric, threshold in self.THRESHOLDS.items():
+            if metric in metrics:
+                value = metrics[metric]
+                if metric == "mae":
+                    passed = value <= threshold
+                else:
+                    passed = value >= threshold
+                entry = {"metric": metric, "value": value, "threshold": threshold}
+                if passed:
+                    results["passed"].append(entry)
+                else:
+                    results["failed"].append(entry)
+        return results
+    def generate_report(self, metrics: Dict, threshold_results: Dict) -> str:
+        """Generate evaluation report."""
+        report = []
+        report.append("=" * 50)
+        report.append("EVALUATION REPORT")
+        report.append("=" * 50)
+        report.append("\n📊 METRICS:")
+        for k, v in metrics.items():
+            report.append(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")
+        report.append("\n✅ PASSED:")
+        for item in threshold_results["passed"]:
+            report.append(f"  {item['metric']}: {item['value']:.4f} (threshold: {item['threshold']})")
+        if threshold_results["failed"]:
+            report.append("\n❌ FAILED:")
+            for item in threshold_results["failed"]:
+                report.append(f"  {item['metric']}: {item['value']:.4f} (threshold: {item['threshold']})")
+        return "\n".join(report)
+    def run(self, predictions: List[Dict] = None) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log("Running evaluation...")
+        # Try to load existing results
+        existing = self.load_evaluation_results()
+        if existing:
+            self.log("Found existing evaluation results")
+            self.metrics = existing
+        elif predictions:
+            self.log(f"Computing metrics from {len(predictions)} predictions")
+            self.metrics = self.compute_quick_metrics(predictions)
+        else:
+            self.status = AgentStatus.WAITING
+            return AgentResult(
+                self.name,
+                AgentStatus.WAITING,
+                "No predictions available for evaluation"
+            )
+        # Check against thresholds
+        threshold_results = self.evaluate_against_thresholds(self.metrics)
+        # Generate report
+        report = self.generate_report(self.metrics, threshold_results)
+        self.log(f"\n{report}")
+        if threshold_results["failed"]:
+            self.status = AgentStatus.FAILED
+            return AgentResult(
+                self.name,
+                AgentStatus.FAILED,
+                f"{len(threshold_results['failed'])} metrics below threshold",
+                {"metrics": self.metrics, "thresholds": threshold_results}
+            )
+        else:
+            self.status = AgentStatus.SUCCESS
+            return AgentResult(
+                self.name,
+                AgentStatus.SUCCESS,
+                f"All {len(threshold_results['passed'])} metrics passed",
+                {"metrics": self.metrics, "thresholds": threshold_results}
+            )
+# ============================================================
+# Agent 7: Demo Sync Agent
+# ============================================================
+class DemoSyncAgent(BaseAgent):
+    """Syncs curated examples to Gradio Space."""
+    def __init__(self, space_id: str = "mmrech/pitvqa-surgical-vlm"):
+        super().__init__("DemoSync")
+        self.space_id = space_id
+    def run(self, curated_examples: List[Dict] = None) -> AgentResult:
+        self.status = AgentStatus.RUNNING
+        self.log(f"Syncing to Space: {self.space_id}")
+        if not curated_examples:
+            self.status = AgentStatus.WAITING
+            return AgentResult(
+                self.name,
+                AgentStatus.WAITING,
+                "No curated examples to sync"
+            )
+        try:
+            from huggingface_hub import HfApi
+            api = HfApi()
+            # Check Space status
+            try:
+                info = api.space_info(self.space_id)
+                runtime = info.runtime
+                if runtime and runtime.stage == "RUNNING":
+                    self.log(f"Space is running", "SUCCESS")
+                    # Create examples JSON for sync
+                    examples_json = json.dumps(curated_examples, indent=2)
+                    self.status = AgentStatus.SUCCESS
+                    return AgentResult(
+                        self.name,
+                        AgentStatus.SUCCESS,
+                        f"Space running, {len(curated_examples)} examples ready for sync",
+                        {"space_status": "RUNNING", "examples_count": len(curated_examples)}
+                    )
+                else:
+                    self.status = AgentStatus.WAITING
+                    return AgentResult(
+                        self.name,
+                        AgentStatus.WAITING,
+                        f"Space not running: {runtime.stage if runtime else 'unknown'}"
+                    )
+            except Exception as e:
+                self.status = AgentStatus.FAILED
+                return AgentResult(self.name, AgentStatus.FAILED, f"Space error: {e}")
+        except Exception as e:
+            self.status = AgentStatus.FAILED
+            return AgentResult(self.name, AgentStatus.FAILED, f"Error: {e}")
+# ============================================================
+# Orchestrator
+# ============================================================
+class PitVQAOrchestrator:
+    """Coordinates all agents for the PitVQA pipeline."""
+    def __init__(self, job_ids: List[str]):
+        self.agents = {
+            "monitor": JobMonitorAgent(job_ids),
+            "curation": CurationAgent(),
+            "dataset": DatasetValidatorAgent(),
+            "model": ModelVerifierAgent(),
+            "training": TrainingSpecialistAgent(),  # HF-LLM-Trainer specialist
+            "evaluation": EvaluationSpecialistAgent(),  # Eval-Model specialist
+            "demo": DemoSyncAgent(),
+        }
+        self.results = {}
+        self.run_count = 0
+    def run_cycle(self) -> Dict:
+        """Run one orchestration cycle."""
+        self.run_count += 1
+        print(f"\n{'='*60}")
+        print(f"🔄 ORCHESTRATION CYCLE {self.run_count}")
+        print(f"{'='*60}")
+        # Phase 1: Check job status
+        print("\n📊 Phase 1: Job Monitoring")
+        monitor_result = self.agents["monitor"].run()
+        self.results["monitor"] = monitor_result
+        # Phase 2: Training Specialist - Validate adapter config
+        print("\n🎓 Phase 2: Training Validation (HF-LLM-Trainer)")
+        training_result = self.agents["training"].run()
+        self.results["training"] = training_result
+        # Phase 3: If jobs complete, run downstream agents
+        if monitor_result.status in [AgentStatus.SUCCESS, AgentStatus.WAITING]:
+            # Run curation
+            print("\n🎨 Phase 3: Curation")
+            curation_result = self.agents["curation"].run()
+            self.results["curation"] = curation_result
+            # Run dataset validation
+            print("\n📦 Phase 4: Dataset Validation")
+            dataset_result = self.agents["dataset"].run()
+            self.results["dataset"] = dataset_result
+            # Run model verification
+            print("\n🤖 Phase 5: Model Verification")
+            model_result = self.agents["model"].run()
+            self.results["model"] = model_result
+            # Run evaluation specialist
+            print("\n📈 Phase 6: Evaluation (Metrics & Quality)")
+            curated = curation_result.data.get("examples", []) if curation_result.data else []
+            eval_result = self.agents["evaluation"].run(predictions=curated)
+            self.results["evaluation"] = eval_result
+            # Run demo sync if curation succeeded
+            print("\n🌐 Phase 7: Demo Sync")
+            demo_result = self.agents["demo"].run(curated)
+            self.results["demo"] = demo_result
+        return self.generate_report()
+    def generate_report(self) -> Dict:
+        """Generate comprehensive status report."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "cycle": self.run_count,
+            "overall_status": self._compute_overall_status(),
+            "agents": {}
+        }
+        for name, result in self.results.items():
+            report["agents"][name] = {
+                "status": result.status.value,
+                "message": result.message
+            }
+        return report
+    def _compute_overall_status(self) -> str:
+        """Compute overall pipeline status."""
+        statuses = [r.status for r in self.results.values()]
+        if all(s == AgentStatus.SUCCESS for s in statuses):
+            return "COMPLETE"
+        elif any(s == AgentStatus.FAILED for s in statuses):
+            return "NEEDS_ATTENTION"
+        elif any(s == AgentStatus.WAITING for s in statuses):
+            return "IN_PROGRESS"
+        else:
+            return "UNKNOWN"
+    def print_summary(self, report: Dict):
+        """Print human-readable summary."""
+        print(f"\n{'='*60}")
+        print("📋 ORCHESTRATION SUMMARY")
+        print(f"{'='*60}")
+        print(f"Time: {report['timestamp']}")
+        print(f"Cycle: {report['cycle']}")
+        print(f"Overall: {report['overall_status']}")
+        print("\nAgent Status:")
+        for name, info in report["agents"].items():
+            icon = {"success": "✅", "failed": "❌", "waiting": "⏳", "running": "🔄"}.get(info["status"], "❓")
+            print(f"  {icon} {name}: {info['status']} - {info['message'][:50]}")
+# ============================================================
+# Main
+# ============================================================
+def main():
+    print("🚀 PitVQA Multi-Agent Orchestrator Starting...")
+    # Current job IDs
+    job_ids = [
+        "696cfe9946affbb321046bd9",  # Curation job
+        "696cfebf57a10a9d296ca042",  # Merge job
+    ]
+    orchestrator = PitVQAOrchestrator(job_ids)
+    # Run orchestration cycle
+    report = orchestrator.run_cycle()
+    orchestrator.print_summary(report)
+    # Save report
+    with open("orchestration_report.json", "w") as f:
+        json.dump(report, f, indent=2)
+    print(f"\n💾 Report saved to orchestration_report.json")
+    return report
+# Run the orchestrator only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()