File size: 15,079 Bytes

24c2665

#!/usr/bin/env python3
"""
TTRLVR + AZR 환경 검증 스크립트

실제 실행 환경에서 필요한 모든 컴포넌트가 올바르게 설정되어 있는지 확인합니다:
1. Python 패키지 및 버전 확인
2. GPU 및 CUDA 환경 확인  
3. 파일 경로 및 권한 확인
4. 모델 로딩 테스트
5. AZR 설정 파일 검증
6. 간단한 파이프라인 실행 테스트
"""

import os
import sys
import json
import subprocess
import tempfile
import traceback
from pathlib import Path
from datetime import datetime

# Path setup: add the project checkout to the import search path.
# NOTE(review): presumably this is what lets the `absolute_zero_reasoner`
# imports in test_simple_pipeline resolve - confirm against the checkout layout.
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')

class EnvironmentValidator:
    """환경 검증 클래스"""
    
    def __init__(self):
        self.results = {
            'timestamp': datetime.now().isoformat(),
            'tests': {},
            'overall_success': False,
            'recommendations': []
        }
        
    def log_test(self, test_name: str, success: bool, message: str, details: str = None):
        """테스트 결과 로깅"""
        
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{status} {test_name}: {message}")
        
        if details:
            print(f"   Details: {details}")
            
        self.results['tests'][test_name] = {
            'success': success,
            'message': message,
            'details': details
        }
        
        if not success:
            print()
    
    def add_recommendation(self, recommendation: str):
        """권장사항 추가"""
        self.results['recommendations'].append(recommendation)
        print(f"💡 Recommendation: {recommendation}")
    
    def test_python_packages(self):
        """Python 패키지 확인"""
        
        required_packages = {
            'torch': '2.0.0',
            'transformers': '4.30.0', 
            'pandas': '1.5.0',
            'numpy': '1.21.0',
            'vllm': '0.3.0'
        }
        
        missing_packages = []
        version_issues = []
        
        for package, min_version in required_packages.items():
            try:
                if package == 'vllm':
                    # vLLM은 선택적 패키지
                    try:
                        import vllm
                        version = vllm.__version__
                    except ImportError:
                        self.add_recommendation(f"Consider installing vLLM for better GPU performance: pip install vllm")
                        continue
                else:
                    exec(f"import {package}")
                    version = eval(f"{package}.__version__")
                
                # 버전 비교는 간단히 문자열로 (정확한 비교는 packaging 모듈 필요)
                if version < min_version:
                    version_issues.append(f"{package}: {version} < {min_version}")
                    
            except ImportError:
                missing_packages.append(package)
            except Exception as e:
                version_issues.append(f"{package}: Error checking version - {e}")
        
        if missing_packages:
            self.log_test(
                "Python Packages", 
                False, 
                f"Missing packages: {', '.join(missing_packages)}",
                f"Install with: pip install {' '.join(missing_packages)}"
            )
            return False
        elif version_issues:
            self.log_test(
                "Python Packages",
                False,
                f"Version issues: {', '.join(version_issues)}",
                "Update packages to meet minimum requirements"
            )
            return False
        else:
            self.log_test("Python Packages", True, "All required packages installed")
            return True
    
    def test_gpu_environment(self):
        """GPU 및 CUDA 환경 확인"""
        
        try:
            import torch
            
            # CUDA 사용 가능성 확인
            if not torch.cuda.is_available():
                self.log_test("GPU Environment", False, "CUDA not available")
                self.add_recommendation("Install CUDA toolkit and PyTorch with CUDA support")
                return False
            
            # GPU 개수 및 메모리 확인
            gpu_count = torch.cuda.device_count()
            current_device = torch.cuda.current_device()
            device_name = torch.cuda.get_device_name(current_device)
            
            # 메모리 정보
            memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
            memory_reserved = torch.cuda.memory_reserved() / 1024**3   # GB
            memory_total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3  # GB
            
            details = f"GPUs: {gpu_count}, Current: {device_name}, Memory: {memory_total:.1f}GB total, {memory_reserved:.1f}GB reserved"
            
            if memory_total < 8.0:  # 8GB 미만
                self.log_test("GPU Environment", False, f"GPU memory insufficient: {memory_total:.1f}GB", details)
                self.add_recommendation("Use a GPU with at least 8GB VRAM for 7B models")
                return False
            
            self.log_test("GPU Environment", True, f"GPU environment ready", details)
            return True
            
        except Exception as e:
            self.log_test("GPU Environment", False, f"Error checking GPU: {e}")
            return False
    
    def test_file_paths_and_permissions(self):
        """파일 경로 및 권한 확인"""
        
        critical_paths = {
            '/home/ubuntu/RLVR/TestTime-RLVR-v2': 'Main project directory',
            '/home/ubuntu/RLVR/TestTime-RLVR-v2/test/configs/ttrlvr_azr_7b_single_gpu.sh': 'AZR config script',
            '/data/RLVR/checkpoints': 'Checkpoint directory (will be created)',
            '/tmp': 'Temporary directory'
        }
        
        issues = []
        
        for path, description in critical_paths.items():
            if not os.path.exists(path):
                if 'checkpoints' in path:
                    # 체크포인트 디렉토리는 생성 시도
                    try:
                        os.makedirs(path, exist_ok=True)
                        self.log_test(f"Path: {description}", True, f"Created directory: {path}")
                    except Exception as e:
                        issues.append(f"{description}: Cannot create {path} - {e}")
                else:
                    issues.append(f"{description}: Not found - {path}")
            else:
                # 읽기/쓰기 권한 확인
                readable = os.access(path, os.R_OK)
                writable = os.access(path, os.W_OK)
                
                if not readable:
                    issues.append(f"{description}: No read permission - {path}")
                elif os.path.isdir(path) and not writable:
                    issues.append(f"{description}: No write permission - {path}")
                else:
                    self.log_test(f"Path: {description}", True, f"Accessible: {path}")
        
        if issues:
            self.log_test("File Paths", False, f"{len(issues)} path issues", "; ".join(issues))
            return False
        else:
            self.log_test("File Paths", True, "All critical paths accessible")
            return True
    
    def test_model_loading(self):
        """모델 로딩 테스트 (간단한 확인)"""
        
        try:
            # 빠른 테스트를 위해 transformers 라이브러리만 확인
            from transformers import AutoTokenizer
            
            # 실제 모델 로딩 대신 라이브러리 기능만 테스트
            self.log_test("Model Loading", True, "Transformers library available for model loading")
            self.add_recommendation("Model loading test skipped to avoid timeout. Run full model test separately if needed.")
            return True
                
        except Exception as e:
            self.log_test("Model Loading", False, f"Failed to import transformers: {e}")
            self.add_recommendation("Install transformers library: pip install transformers")
            return False
    
    def test_azr_config(self):
        """AZR 설정 파일 검증"""
        
        config_path = '/home/ubuntu/RLVR/TestTime-RLVR-v2/test/configs/ttrlvr_azr_7b_single_gpu.sh'
        
        try:
            if not os.path.exists(config_path):
                self.log_test("AZR Config", False, f"Config file not found: {config_path}")
                return False
            
            # 스크립트 실행 권한 확인
            if not os.access(config_path, os.X_OK):
                self.log_test("AZR Config", False, f"Config file not executable: {config_path}")
                self.add_recommendation(f"Make config executable: chmod +x {config_path}")
                return False
            
            # 설정 파일 내용 기본 검증
            with open(config_path, 'r') as f:
                content = f.read()
                
            required_settings = [
                'trainer.project_name=ttrlvr_azr',
                'azr.train_propose=False',
                'data.train_batch_size=8',
                'actor_rollout_ref.actor.ppo_mini_batch_size=24'
            ]
            
            missing_settings = []
            for setting in required_settings:
                if setting not in content:
                    missing_settings.append(setting)
            
            if missing_settings:
                self.log_test(
                    "AZR Config", 
                    False, 
                    f"Missing settings: {', '.join(missing_settings)}",
                    f"Check config file: {config_path}"
                )
                return False
            
            self.log_test("AZR Config", True, f"Config file validated: {config_path}")
            return True
            
        except Exception as e:
            self.log_test("AZR Config", False, f"Error validating config: {e}")
            return False
    
    def test_simple_pipeline(self):
        """간단한 파이프라인 실행 테스트"""
        
        try:
            from absolute_zero_reasoner.testtime.config import TestTimeConfig
            from absolute_zero_reasoner.testtime.logger import TestTimeLogger
            from absolute_zero_reasoner.testtime.task_generator import TestTimeTaskGenerator
            
            # 기본 설정 생성
            config = TestTimeConfig()
            config.model_name = "Qwen/Qwen2.5-7B"
            logger = TestTimeLogger()
            
            # Task Generator 인스턴스 생성
            task_generator = TestTimeTaskGenerator(config, logger)
            
            # 테스트용 IPO 트리플
            test_ipo_triples = [
                {
                    'id': 'test_triple_0',
                    'input': '[1, 2, 3]',
                    'actual_output': '[2, 4, 6]',
                    'program': 'def test_func(lst):\n    return [x * 2 for x in lst]',
                    'full_input_str': 'test_func([1, 2, 3])',
                    'source_program_id': 'program_0',
                    'ipo_index': 0
                }
            ]
            
            # Task 생성 테스트
            tasks = task_generator.generate_tasks(test_ipo_triples, "TestProblem", 1)
            
            # 결과 검증
            if not tasks or not any(len(task_list) > 0 for task_list in tasks.values()):
                self.log_test("Simple Pipeline", False, "No tasks generated")
                return False
            
            # AZR 메타데이터 확인
            for task_type, task_list in tasks.items():
                if task_list:
                    task = task_list[0]
                    required_fields = ['uid', 'ipo_group_id', 'basic_accuracy', 'ground_truth']
                    missing_fields = [field for field in required_fields if field not in task]
                    
                    if missing_fields:
                        self.log_test(
                            "Simple Pipeline", 
                            False, 
                            f"Missing AZR metadata: {missing_fields}"
                        )
                        return False
            
            total_tasks = sum(len(task_list) for task_list in tasks.values())
            self.log_test("Simple Pipeline", True, f"Generated {total_tasks} tasks successfully")
            return True
            
        except Exception as e:
            self.log_test("Simple Pipeline", False, f"Pipeline test failed: {e}")
            return False
    
    def run_all_tests(self):
        """모든 테스트 실행"""
        
        print("🔍 TTRLVR + AZR 환경 검증 시작")
        print("=" * 60)
        
        tests = [
            self.test_python_packages,
            self.test_gpu_environment, 
            self.test_file_paths_and_permissions,
            self.test_model_loading,
            self.test_azr_config,
            self.test_simple_pipeline
        ]
        
        passed_tests = 0
        total_tests = len(tests)
        
        for test in tests:
            try:
                if test():
                    passed_tests += 1
                print()  # 빈 줄 추가
            except Exception as e:
                print(f"❌ Test {test.__name__} crashed: {e}")
                print(f"   Traceback: {traceback.format_exc()}")
                print()
        
        # 최종 결과
        success_rate = passed_tests / total_tests * 100
        self.results['overall_success'] = passed_tests == total_tests
        
        print("=" * 60)
        print("📊 환경 검증 결과:")
        print(f"   - 통과한 테스트: {passed_tests}/{total_tests} ({success_rate:.1f}%)")
        
        if self.results['recommendations']:
            print(f"\n💡 권장사항 ({len(self.results['recommendations'])}개):")
            for i, rec in enumerate(self.results['recommendations'], 1):
                print(f"   {i}. {rec}")
        
        if self.results['overall_success']:
            print("\n🎉 환경 검증 완료! TTRLVR + AZR 실행 준비가 완료되었습니다.")
        else:
            print(f"\n⚠️ 환경 검증 실패: {total_tests - passed_tests}개 테스트 실패")
            print("   위의 권장사항을 참고하여 문제를 해결한 후 다시 시도하세요.")
        
        return self.results


def main():
    """Run the full validation, save the results as JSON and return an exit code.

    The results are written to a timestamped file under ``/tmp``.

    Returns:
        int: 0 when every check passed, 1 otherwise.
    """
    results = EnvironmentValidator().run_all_tests()

    # Save the results to a timestamped file.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"/tmp/ttrlvr_azr_validation_{stamp}.json"
    with open(output_file, 'w') as handle:
        json.dump(results, handle, indent=2)

    print(f"\n📄 상세 결과 저장: {output_file}")

    if results['overall_success']:
        return 0
    return 1


# When executed as a script, run main() and use its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())