mle-bench-tabular / src /utils.py
Sunmarinup's picture
Load leaderboard from GitHub (#1)
f2c74ae verified
"""Utility functions for downloading content from Git repositories."""
import base64
import requests
def download_github_file_content(api_url: str, timeout: int = 30) -> str:
    """Download a file's text content via the GitHub contents API.

    The API response is used in one of two ways:

    * If it carries inline base64 ``content``, that content is decoded as
      UTF-8 and returned -- unless it turns out to be a Git LFS pointer, in
      which case the real file is fetched from ``download_url``.
    * If it carries no usable inline content (missing, null, empty, or an
      ``encoding`` other than ``"base64"``), the file is fetched from
      ``download_url`` instead.

    Args:
        api_url: GitHub API URL for the file (e.g., contents API endpoint)
        timeout: Timeout in seconds applied to every HTTP request made here
            (default: 30)

    Returns:
        File content as a string

    Raises:
        requests.HTTPError: If any HTTP request returns an error status
        ValueError: If the inline content cannot be decoded, or the response
            provides neither usable content nor a download_url
    """
    response = requests.get(api_url, timeout=timeout)
    response.raise_for_status()
    api_data = response.json()

    # Anything that is not a JSON object (e.g. a list) has no file payload
    # to read; report it the same way as a response with no usable fields.
    if not isinstance(api_data, dict):
        raise ValueError("No content or download_url found in API response")

    download_url = api_data.get("download_url")
    inline_content = api_data.get("content")
    # Responses without an explicit encoding are treated as base64.
    encoding = api_data.get("encoding", "base64")

    # Files too large to inline come back with content == "" and
    # encoding == "none"; those must be fetched from download_url rather
    # than decoded into an empty string. An empty base64 payload is only
    # trusted as a genuinely empty file when there is nowhere else to look.
    use_inline = (
        inline_content is not None
        and encoding == "base64"
        and (bool(inline_content) or not download_url)
    )

    if use_inline:
        try:
            file_content = base64.b64decode(inline_content).decode("utf-8")
        except (ValueError, TypeError) as e:
            # binascii.Error and UnicodeDecodeError are both ValueError
            # subclasses; TypeError covers a non-string payload.
            raise ValueError(f"Failed to decode file content: {e}") from e

        if not file_content.startswith("version https://git-lfs.github.com/spec/v1"):
            return file_content

        # The inline content is only an LFS pointer; the real object is
        # expected to be served from download_url.
        if not download_url:
            raise ValueError("Git LFS file found but no download_url available")
    elif not download_url:
        raise ValueError("No content or download_url found in API response")

    download_response = requests.get(download_url, timeout=timeout)
    download_response.raise_for_status()
    return download_response.text