File size: 6,540 Bytes
a92080e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
from huggingface_hub import HfApi, login
import pandas as pd
import json
from pathlib import Path
from jsonschema import validate, ValidationError, Draft7Validator


# Load schema once at module level so every validation call reuses the same
# parsed dict instead of re-reading the file from disk.
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
# JSON is UTF-8 by spec (RFC 8259); pin the encoding explicitly rather than
# relying on the platform's locale default.
EVAL_SCHEMA = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))

def validate_json_against_schema(json_data, schema=None):
    """
    Validate a JSON object against a JSON Schema.

    Args:
        json_data: Dict containing the evaluation data
        schema: Optional schema dict to validate against. Defaults to the
            module-level EVAL_SCHEMA loaded from eval.schema.json, so
            existing callers are unaffected.

    Returns:
        (bool, str): (is_valid, error_message)
    """
    if schema is None:
        schema = EVAL_SCHEMA
    try:
        validate(instance=json_data, schema=schema)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Point the user at the failing location inside the document,
        # e.g. "results → 0 → score"; fall back to "root" for top-level errors.
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        # jsonschema may also raise SchemaError (malformed schema) and other
        # errors; report them to the caller instead of crashing.
        return False, f"❌ Validation error: {str(e)}"


def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.
    
    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: Implement upload logic
    # NOTE(review): currently a silent no-op — callers receive None and may
    # wrongly assume the upload succeeded; consider raising
    # NotImplementedError until this is implemented.
    pass


def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        # whoami() raises when no valid token is configured.
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        # The exception detail is unused (removed the dead `as e` binding);
        # any failure here — missing token, network error — is reported as
        # "not authenticated" with a remediation hint.
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Scans the dataset repo's discussions for an open pull request whose
    title mentions this leaderboard (case-insensitive).

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        client = HfApi()
        wanted_title = f"add new leaderboard: {leaderboard_name.lower()}"
        for disc in client.get_repo_discussions(repo_id=repo_id, repo_type="dataset"):
            # Only open pull requests count as duplicates.
            if not (disc.is_pull_request and disc.status == "open"):
                continue
            if wanted_title not in disc.title.lower():
                continue
            return True, f"https://huggingface.co/datasets/{repo_id}/discussions/{disc.num}"
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Runs pre-flight checks (authentication, duplicate PR, file presence and
    non-emptiness) before uploading the parquet file with create_pr=True.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # Bail out early when the user has no HF credentials.
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # Avoid opening a second PR for the same leaderboard.
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # The upload is pointless without a non-empty parquet file.
    if not Path(parquet_file).exists():
        return False, "❌ Parquet file not found"

    df = pd.read_parquet(parquet_file)
    if df.empty:
        return False, "❌ Parquet file is empty"

    try:
        api = HfApi()
        # upload_file with create_pr=True commits the file and opens the
        # pull request in a single call.
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add new leaderboard: {leaderboard_name}",
            create_pr=True,
        )

        # Some hub versions don't expose pr_url on the commit info; fall
        # back to the repo's discussions page.
        fallback = f"https://huggingface.co/datasets/{repo_id}/discussions"
        pr_url = getattr(commit_info, 'pr_url', fallback)

        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"


def validate_schema(parquet_file):
    """
    Validate that a parquet file (or DataFrame) matches the expected schema.

    Args:
        parquet_file: Path to a parquet file to validate, or an
            already-loaded pandas DataFrame (generalization — callers that
            already hold the data skip the parquet round-trip; path inputs
            behave exactly as before).

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        # Accept a DataFrame directly; otherwise read it from disk.
        if isinstance(parquet_file, pd.DataFrame):
            df = parquet_file
        else:
            df = pd.read_parquet(parquet_file)

        # Columns every evaluation row must carry.
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer',
            'evaluation_results'
        ]

        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # All columns must be string-typed: 'object' is the classic pandas
        # string dtype, 'string' the extension dtype.
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"

    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.
    Uses the parquet_to_folder function from json_to_parquet.py
    
    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    # Imported lazily inside the function — presumably to avoid a circular
    # import with json_to_parquet at module load time; TODO confirm.
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)