DaCrow13
Deploy to HF Spaces (Clean)
225af6a
"""Scripts to download or generate data"""
from pathlib import Path
import shutil
import zipfile
from huggingface_hub import hf_hub_download
from hopcroft_skill_classification_tool_competition.config import (
HF_FILENAME,
HF_REPO_ID,
RAW_DATA_DIR,
)
def _remove_download_artifacts(output_dir: Path, zip_path: Path) -> None:
    """Delete the downloaded archive and the helper folders created beside it."""
    print("Cleaning up temporary files...")
    # Guarded so cleanup still works if the archive was already removed.
    if zip_path.exists():
        zip_path.unlink()

    # Folders that may appear inside output_dir as a by-product of the download.
    for folder_name in (".cache", "download"):
        leftover = output_dir / folder_name
        if leftover.exists():
            shutil.rmtree(leftover)
            print(f"Removed {folder_name} folder")
    print("Cleanup completed")


def download_skillscope_dataset(output_dir: "Path | str | None" = None) -> Path:
    """
    Download and extract the SkillScope dataset from Hugging Face Hub.

    The dataset contains a SQLite database (skillscope_data.db) with:
    - nlbse_tool_competition_data_by_issue: Main table with PR features and skill labels
    - vw_nlbse_tool_competition_data_by_file: View with file-level labels

    Dataset details:
    - 7,245 merged pull requests from 11 Java repositories
    - 57,206 source files; 59,644 methods; 13,097 classes
    - 217 skill labels (domain/sub-domain pairs)

    If the database file is already present in ``output_dir`` nothing is
    downloaded and the existing path is returned.

    Args:
        output_dir: Directory where to save the dataset. Accepts a ``Path`` or
            a plain string; defaults to ``RAW_DATA_DIR`` when ``None``.

    Returns:
        Path to the extracted database file.

    Raises:
        FileNotFoundError: If the archive was extracted but does not contain
            ``skillscope_data.db`` at the top level of ``output_dir``.
    """
    # Normalise once so string paths behave exactly like Path objects below.
    output_dir = Path(RAW_DATA_DIR if output_dir is None else output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    db_path = output_dir / "skillscope_data.db"
    if db_path.exists():
        print(f"Database already exists at: {db_path}")
        return db_path

    print("Downloading SkillScope dataset from Hugging Face...")
    # Download without using cache - use local_dir to avoid .cache folder
    downloaded = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        repo_type="dataset",
        local_dir=output_dir,
        # Don't create symlinks, copy directly.
        # NOTE(review): newer huggingface_hub releases may deprecate this
        # argument - confirm against the pinned version before removing it.
        local_dir_use_symlinks=False,
    )
    print(f"Downloaded to: {downloaded}")
    zip_path = Path(downloaded)

    try:
        print("Extracting database...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(output_dir)

        if not db_path.exists():
            raise FileNotFoundError(f"Database file not found at: {db_path}")
        print(f"Database extracted to: {db_path}")
    finally:
        # Runs on failure too, so a corrupt or unexpected archive does not
        # leave the zip and download folders behind in output_dir.
        _remove_download_artifacts(output_dir, zip_path)

    print("\nDataset info:")
    print(" - Table: nlbse_tool_competition_data_by_issue")
    print(" - View: vw_nlbse_tool_competition_data_by_file")
    return db_path
if __name__ == "__main__":
    # Script entry point: run the download with default settings, framed by
    # an 80-column rule above and below each banner line.
    rule = "=" * 80
    print(rule, "SKILLSCOPE DATASET DOWNLOAD", rule, sep="\n")
    download_skillscope_dataset()
    print(rule, "DOWNLOAD COMPLETED", rule, sep="\n")