#!/usr/bin/env python3
"""
MBPP와 HumanEval 벤치마크 문제 난이도 분석 도구

각 벤치마크의 문제들을 다양한 기준으로 분석하고 난이도 분포를 확인합니다.
- 코드 복잡도 (함수 길이, 조건문 수, 루프 수)
- 문제 설명 길이 및 복잡도
- 테스트 케이스 수
- 필요한 알고리즘/데이터 구조 유형
"""

import argparse
import ast
import json
import os
import re
import statistics
import sys
import textwrap
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

# Add the TestTime-RLVR-v2 directory (hard-coded absolute path) to the module
# search path so its modules can be imported.
# NOTE(review): no project module is imported in the visible part of this
# file — confirm this path tweak is still required.
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')


def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        One decoded JSON value per non-blank line, in file order. An empty
        list is returned when ``file_path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                # Blank / whitespace-only lines are skipped, not parsed.
                if stripped:
                    records.append(json.loads(stripped))
    return records


def _parse_python_source(code):
    """Return the AST for ``code``, or ``None`` when it cannot be parsed."""
    # The second candidate strips common leading whitespace so that a uniformly
    # indented fragment (e.g. a function body without its ``def`` line) is
    # still analysed instead of failing with "unexpected indent".
    for candidate in (code, textwrap.dedent(code)):
        try:
            return ast.parse(candidate)
        except (SyntaxError, ValueError, RecursionError, MemoryError):
            # Only parse-related failures are tolerated; anything else
            # (KeyboardInterrupt, programming errors) must propagate.
            continue
    return None


def analyze_code_complexity(code):
    """Compute simple structural complexity metrics for Python source text.

    The text is parsed with ``ast``. If it does not parse as given, a second
    attempt is made on the ``textwrap.dedent``-ed text. If both attempts fail,
    the counters stay at zero and ``complexity_score`` keeps its placeholder
    value.

    Args:
        code: Python source text.

    Returns:
        dict with the keys:
            lines: number of newline-separated lines in ``code``.
            functions: number of ``def`` / ``async def`` definitions.
            conditionals: number of ``if`` statements (``elif`` included) and
                conditional expressions.
            loops: number of ``for`` / ``async for`` / ``while`` statements.
            complexity_score: ``lines * 0.1 + functions * 2 +
                conditionals * 1.5 + loops * 2``, or ``1`` when the code
                could not be parsed.
    """
    stats = {
        'lines': len(code.split('\n')),
        'functions': 0,
        'conditionals': 0,
        'loops': 0,
        'complexity_score': 1
    }

    tree = _parse_python_source(code)
    if tree is None:
        return stats

    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            stats['functions'] += 1
        elif isinstance(node, (ast.If, ast.IfExp)):
            stats['conditionals'] += 1
        elif isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
            stats['loops'] += 1

    # Weighted sum of the structural counters.
    stats['complexity_score'] = (
        stats['lines'] * 0.1 +
        stats['functions'] * 2 +
        stats['conditionals'] * 1.5 +
        stats['loops'] * 2
    )

    return stats


# Keywords this short routinely occur inside unrelated words ("or" in "for",
# "if" in "diff", "set" in "reset"), so they must match as standalone tokens.
_WHOLE_WORD_MAX_LEN = 3


def _contains_keyword(text_lower, word):
    """Return True when ``word`` occurs in the already lower-cased text."""
    if len(word) > _WHOLE_WORD_MAX_LEN:
        # Longer keywords keep substring semantics so that inflected forms
        # such as "sorted" or "subarray" still count.
        return word in text_lower
    # Letters and digits delimit a token; "_" does not, so identifiers such as
    # ``is_set`` still match. An optional plural "s" is tolerated ("sets").
    pattern = rf'(?<![a-z0-9]){re.escape(word)}s?(?![a-z0-9])'
    return re.search(pattern, text_lower) is not None


def extract_keywords(text):
    """Find algorithm / data-structure related keywords mentioned in ``text``.

    Matching is case-insensitive. Keywords longer than three letters match as
    substrings; shorter ones ("or", "if", "set", "gcd", ...) must appear as a
    standalone token so that they are not triggered by unrelated words.

    Args:
        text: Free text and/or source code to scan.

    Returns:
        dict mapping each category that had at least one hit to the list of
        matched keywords, in the order they are declared for that category.
        Categories without a hit are omitted.
    """
    keywords = {
        'data_structures': ['list', 'array', 'dict', 'set', 'tuple', 'stack', 'queue', 'heap'],
        'algorithms': ['sort', 'search', 'binary', 'recursive', 'dynamic', 'greedy', 'graph'],
        'math': ['prime', 'fibonacci', 'factorial', 'gcd', 'lcm', 'sqrt', 'power'],
        'string': ['string', 'substring', 'regex', 'pattern', 'replace', 'split', 'join'],
        'logic': ['condition', 'boolean', 'logic', 'and', 'or', 'not', 'if', 'else']
    }

    text_lower = text.lower()
    found_keywords = defaultdict(list)

    for category, words in keywords.items():
        for word in words:
            if _contains_keyword(text_lower, word):
                found_keywords[category].append(word)

    return dict(found_keywords)


def analyze_problem_difficulty(problem):
    """Compute heuristic difficulty metrics for one benchmark problem.

    Args:
        problem: Mapping describing one problem. The keys ``task_id``,
            ``prompt``, ``canonical_solution`` and ``test`` are read; a
            missing or ``None`` text field is treated as an empty string.

    Returns:
        dict containing ``task_id``, ``prompt_length``, ``solution_length``,
        ``test_length``, ``test_cases``, ``keywords``, ``keyword_count``,
        ``difficulty_score`` (clamped to 0-100, rounded to 2 decimals) and
        ``difficulty_level`` ('Easy' / 'Medium' / 'Hard' / 'Very Hard').
        When a solution is present, the keys returned by
        ``analyze_code_complexity`` are merged in as well.
    """
    # ``or ''`` also covers keys that are present but null in the record;
    # ``dict.get(key, '')`` would hand back None and crash len() below.
    prompt = problem.get('prompt') or ''
    canonical_solution = problem.get('canonical_solution') or ''
    test = problem.get('test') or ''

    # Basic size information.
    analysis = {
        'task_id': problem.get('task_id', 'Unknown'),
        'prompt_length': len(prompt),
        'solution_length': len(canonical_solution),
        'test_length': len(test)
    }

    # Structural code metrics are only available when a solution is given;
    # without one, the score below falls back to a complexity of 1.
    if canonical_solution:
        analysis.update(analyze_code_complexity(canonical_solution))

    # Test-case estimate: occurrences of the substring "assert" in the test
    # code (identifiers that merely contain "assert" are counted too).
    analysis['test_cases'] = test.count('assert')

    # Keyword hits over the prompt and the solution together.
    keywords = extract_keywords(prompt + ' ' + canonical_solution)
    analysis['keywords'] = keywords
    analysis['keyword_count'] = sum(len(words) for words in keywords.values())

    # Weighted sum, clamped to the 0-100 range.
    raw_score = (
        analysis.get('complexity_score', 1) * 10 +
        analysis['prompt_length'] * 0.01 +
        analysis['test_cases'] * 5 +
        analysis['keyword_count'] * 2
    )
    difficulty_score = min(100, max(0, raw_score))
    analysis['difficulty_score'] = round(difficulty_score, 2)

    # The level is derived from the unrounded score.
    if difficulty_score < 20:
        analysis['difficulty_level'] = 'Easy'
    elif difficulty_score < 50:
        analysis['difficulty_level'] = 'Medium'
    elif difficulty_score < 80:
        analysis['difficulty_level'] = 'Hard'
    else:
        analysis['difficulty_level'] = 'Very Hard'

    return analysis


def analyze_benchmark_difficulty(problems, benchmark_name):
    """Analyse every problem of a benchmark, print a report and return stats.

    Args:
        problems: Sequence of problem records, each passed to
            ``analyze_problem_difficulty``.
        benchmark_name: Label used in the printed header.

    Returns:
        An empty dict when ``problems`` is empty. Otherwise a dict with the
        keys ``total_problems``, ``difficulty_distribution``,
        ``difficulty_stats`` (min / max / mean / median),
        ``complexity_stats`` and ``prompt_stats`` (min / max / mean),
        ``keyword_frequency`` (top 5 keywords per category) and
        ``detailed_analyses`` (the per-problem results).
    """
    print(f"\n🔍 {benchmark_name.upper()} 난이도 분석")
    print("="*60)

    if not problems:
        print("❌ 분석할 문제가 없습니다.")
        return {}

    analyses = [analyze_problem_difficulty(problem) for problem in problems]

    # Per-problem series used for the summary statistics.
    difficulty_scores = [a['difficulty_score'] for a in analyses]
    complexity_scores = [a.get('complexity_score', 1) for a in analyses]
    prompt_lengths = [a['prompt_length'] for a in analyses]

    # Number of problems per difficulty level.
    level_counts = Counter(a['difficulty_level'] for a in analyses)

    # Per category: in how many problems each keyword was found.
    keyword_freq = {}
    for analysis in analyses:
        for category, words in analysis['keywords'].items():
            keyword_freq.setdefault(category, Counter()).update(words)

    stats = {
        'total_problems': len(analyses),
        'difficulty_distribution': dict(level_counts),
        'difficulty_stats': {
            'min': min(difficulty_scores),
            'max': max(difficulty_scores),
            'mean': round(sum(difficulty_scores) / len(difficulty_scores), 2),
            # A true median: for an even number of problems this averages the
            # two middle values instead of picking the upper one.
            'median': round(statistics.median(difficulty_scores), 2)
        },
        'complexity_stats': {
            'min': min(complexity_scores),
            'max': max(complexity_scores),
            'mean': round(sum(complexity_scores) / len(complexity_scores), 2)
        },
        'prompt_stats': {
            'min': min(prompt_lengths),
            'max': max(prompt_lengths),
            'mean': round(sum(prompt_lengths) / len(prompt_lengths), 2)
        },
        'keyword_frequency': {k: dict(v.most_common(5)) for k, v in keyword_freq.items()},
        'detailed_analyses': analyses
    }

    # Console report.
    print(f"📊 총 {stats['total_problems']}개 문제 분석 완료")

    print(f"\n📈 난이도 분포:")
    for level, count in level_counts.items():
        percentage = round(count / len(analyses) * 100, 1)
        print(f"  {level}: {count}개 ({percentage}%)")

    print(f"\n📋 난이도 점수 통계:")
    print(f"  최소: {stats['difficulty_stats']['min']}")
    print(f"  최대: {stats['difficulty_stats']['max']}")
    print(f"  평균: {stats['difficulty_stats']['mean']}")
    print(f"  중위값: {stats['difficulty_stats']['median']}")

    print(f"\n🔧 코드 복잡도 통계:")
    print(f"  최소: {stats['complexity_stats']['min']}")
    print(f"  최대: {stats['complexity_stats']['max']}")
    print(f"  평균: {stats['complexity_stats']['mean']}")

    print(f"\n📝 문제 설명 길이 통계:")
    print(f"  최소: {stats['prompt_stats']['min']} 글자")
    print(f"  최대: {stats['prompt_stats']['max']} 글자")
    print(f"  평균: {stats['prompt_stats']['mean']} 글자")

    print(f"\n🏷️ 주요 키워드 (상위 3개):")
    for category, freq_dict in keyword_freq.items():
        if freq_dict:
            top_words = freq_dict.most_common(3)
            print(f"  {category}: {', '.join([f'{word}({count})' for word, count in top_words])}")

    # Show the highest-scoring problems among the 'Hard' / 'Very Hard' ones.
    hard_problems = [a for a in analyses if a['difficulty_level'] in ['Hard', 'Very Hard']]
    if hard_problems:
        print(f"\n🔥 어려운 문제 샘플 (상위 5개):")
        hard_problems_sorted = sorted(hard_problems, key=lambda x: x['difficulty_score'], reverse=True)
        for i, problem in enumerate(hard_problems_sorted[:5]):
            print(f"  {i+1}. {problem['task_id']} (점수: {problem['difficulty_score']}, 레벨: {problem['difficulty_level']})")

    return stats


def save_analysis_results(stats, benchmark_name, output_dir):
    """Write the analysis of one benchmark to ``output_dir/benchmark_name``.

    Two files are produced (the directory is created when needed):
    ``<benchmark_name>_difficulty_analysis.json`` holding ``stats`` as JSON,
    and ``<benchmark_name>_difficulty_summary.txt`` holding a short text
    report. The paths of both files are printed afterwards.

    Args:
        stats: Statistics dict; the keys ``total_problems``,
            ``difficulty_stats``, ``complexity_stats``,
            ``difficulty_distribution`` and ``keyword_frequency`` are read
            for the summary.
        benchmark_name: Used as sub-directory name and file-name prefix.
        output_dir: Parent directory for the results.
    """
    target_dir = os.path.join(output_dir, benchmark_name)
    os.makedirs(target_dir, exist_ok=True)

    # Full analysis as JSON.
    full_analysis_file = os.path.join(target_dir, f"{benchmark_name}_difficulty_analysis.json")
    with open(full_analysis_file, 'w', encoding='utf-8') as json_out:
        json.dump(stats, json_out, indent=2, ensure_ascii=False)

    # Human-readable summary report.
    summary_file = os.path.join(target_dir, f"{benchmark_name}_difficulty_summary.txt")
    with open(summary_file, 'w', encoding='utf-8') as report:
        total = stats['total_problems']
        lines = [
            f"{benchmark_name.upper()} 난이도 분석 요약\n",
            "="*60 + "\n\n",
            f"생성 시간: {datetime.now().isoformat()}\n\n",
            f"📊 전체 통계:\n",
            f"  총 문제 수: {total}개\n",
            f"  평균 난이도 점수: {stats['difficulty_stats']['mean']}\n",
            f"  평균 코드 복잡도: {stats['complexity_stats']['mean']}\n\n",
            f"📈 난이도 분포:\n",
        ]
        for level, count in stats['difficulty_distribution'].items():
            percentage = round(count / total * 100, 1)
            lines.append(f"  {level}: {count}개 ({percentage}%)\n")

        lines.append(f"\n🏷️ 주요 키워드:\n")
        for category, freq_dict in stats['keyword_frequency'].items():
            # Categories with an empty frequency table are left out.
            if not freq_dict:
                continue
            lines.append(f"  {category}: {', '.join(freq_dict.keys())}\n")

        report.writelines(lines)

    for message in (
        f"\n💾 분석 결과가 저장되었습니다:",
        f"   전체 분석: {full_analysis_file}",
        f"   요약 보고서: {summary_file}",
    ):
        print(message)


def main():
    """Command-line entry point: analyse the selected benchmark(s).

    Parses the command line, analyses HumanEval and/or MBPP (in that order)
    from their JSONL files, optionally saves each result, prints a comparison
    when more than one benchmark produced statistics, and finally prints
    usage hints.

    A benchmark whose data file is missing or empty is reported through
    ``analyze_benchmark_difficulty``'s "no problems" message and is left out
    of the comparison and of ``--save``.
    """
    parser = argparse.ArgumentParser(description='벤치마크 문제 난이도 분석')
    parser.add_argument('--benchmark', type=str, default='all',
                       choices=['all', 'humaneval', 'mbpp'],
                       help='분석할 벤치마크 (all=모든 벤치마크)')
    parser.add_argument('--save', action='store_true',
                       help='결과를 파일로 저장')
    parser.add_argument('--output_dir', type=str, 
                       default='/home/ubuntu/RLVR/TestTime-RLVR-v2/test/analysis_results',
                       help='출력 디렉토리')
    # NOTE(review): --detailed is accepted but never read in this function.
    parser.add_argument('--detailed', action='store_true',
                       help='상세 분석 결과 출력')

    args = parser.parse_args()

    # Data locations, in the order the benchmarks are processed.
    base_dir = '/home/ubuntu/RLVR/TestTime-RLVR-v2'
    benchmark_paths = {
        'humaneval': f'{base_dir}/evaluation/code_eval/data/HumanEvalPlus.jsonl',
        'mbpp': f'{base_dir}/evaluation/code_eval/data/MbppPlus.jsonl',
    }

    os.makedirs(args.output_dir, exist_ok=True)

    print("🚀 TestTime RLVR 벤치마크 난이도 분석 도구")
    print("="*80)

    all_results = {}

    for benchmark, data_path in benchmark_paths.items():
        if args.benchmark not in ('all', benchmark):
            continue

        print("\n")
        # Always call the analyser so that a missing/empty data file is
        # reported to the user instead of being skipped silently; it returns
        # an empty dict in that case.
        stats = analyze_benchmark_difficulty(load_jsonl(data_path), benchmark)
        if not stats:
            continue

        all_results[benchmark] = stats
        if args.save:
            save_analysis_results(stats, benchmark, args.output_dir)

    # Side-by-side comparison, only meaningful with at least two benchmarks.
    if len(all_results) > 1:
        print("\n" + "="*80)
        print("🔄 벤치마크 비교 분석")
        print("="*80)

        for benchmark, stats in all_results.items():
            print(f"\n📊 {benchmark.upper()}:")
            print(f"  총 문제: {stats['total_problems']}개")
            print(f"  평균 난이도: {stats['difficulty_stats']['mean']}")
            print(f"  평균 복잡도: {stats['complexity_stats']['mean']}")

            distribution = stats['difficulty_distribution']
            easy_count = distribution.get('Easy', 0)
            hard_count = distribution.get('Hard', 0) + distribution.get('Very Hard', 0)
            print(f"  쉬운 문제: {easy_count}개, 어려운 문제: {hard_count}개")

    # Usage hints, printed on every run.
    print("\n" + "="*80)
    print("💡 사용법")
    print("="*80)
    print("특정 벤치마크만 분석:")
    print("  python test/analyze_difficulty.py --benchmark mbpp --save")
    print("  python test/analyze_difficulty.py --benchmark humaneval --detailed")
    print("\n전체 분석 및 저장:")
    print("  python test/analyze_difficulty.py --save")


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()