Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

File size: 2,854 Bytes

225af6a

"""Scripts to download or generate data"""

from pathlib import Path
import shutil
from typing import Optional, Union
import zipfile

from huggingface_hub import hf_hub_download

from hopcroft_skill_classification_tool_competition.config import (
    HF_FILENAME,
    HF_REPO_ID,
    RAW_DATA_DIR,
)


def _remove_download_artifacts(output_dir: Path, zip_path: Path) -> None:
    """Delete the downloaded archive and the helper folders left in *output_dir*."""
    print("Cleaning up temporary files...")

    # The archive may already be gone (e.g. removed by a concurrent run);
    # a missing file must not turn cleanup into a failure.
    if zip_path.exists():
        zip_path.unlink()

    # Clean up: remove .cache folder if exists
    cache_dir = output_dir / ".cache"
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
        print("Removed .cache folder")

    # Clean up: remove download folder if exists
    download_dir = output_dir / "download"
    if download_dir.exists():
        shutil.rmtree(download_dir)
        print("Removed download folder")

    print("Cleanup completed")


def download_skillscope_dataset(output_dir: Optional[Union[str, Path]] = None) -> Path:
    """
    Download and extract SkillScope dataset from Hugging Face Hub.

    The dataset contains a SQLite database (skillscope_data.db) with:
    - nlbse_tool_competition_data_by_issue: Main table with PR features and skill labels
    - vw_nlbse_tool_competition_data_by_file: View with file-level labels

    Dataset details:
    - 7,245 merged pull requests from 11 Java repositories
    - 57,206 source files; 59,644 methods; 13,097 classes
    - 217 skill labels (domain/sub-domain pairs)

    If the database file is already present in the output directory, nothing
    is downloaded and its path is returned immediately.

    Args:
        output_dir: Directory where to save the dataset, given as a ``Path``
            or a plain string. Defaults to ``RAW_DATA_DIR`` from the project
            config when ``None``.

    Returns:
        Path to the extracted database file

    Raises:
        FileNotFoundError: If ``skillscope_data.db`` is not present directly
            inside the output directory after extracting the archive.
    """
    # Coerce to Path so callers may pass a string as well as a Path object.
    output_dir = Path(RAW_DATA_DIR if output_dir is None else output_dir)

    output_dir.mkdir(parents=True, exist_ok=True)
    db_path = output_dir / "skillscope_data.db"

    if db_path.exists():
        print(f"Database already exists at: {db_path}")
        return db_path

    print("Downloading SkillScope dataset from Hugging Face...")

    # Download without using cache - use local_dir to avoid .cache folder
    # NOTE(review): recent huggingface_hub releases appear to deprecate
    # local_dir_use_symlinks - confirm against the pinned version before removing.
    zip_path = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        repo_type="dataset",
        local_dir=output_dir,
        local_dir_use_symlinks=False,  # Don't create symlinks, copy directly
    )

    print(f"Downloaded to: {zip_path}")
    print("Extracting database...")

    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(output_dir)

        if not db_path.exists():
            raise FileNotFoundError(f"Database file not found at: {db_path}")

        print(f"Database extracted to: {db_path}")
    finally:
        # Run cleanup even when extraction fails, so a corrupt or unexpected
        # archive is not left behind in the data directory.
        _remove_download_artifacts(output_dir, Path(zip_path))

    print("\nDataset info:")
    print("  - Table: nlbse_tool_competition_data_by_issue")
    print("  - View: vw_nlbse_tool_competition_data_by_file")

    return db_path


if __name__ == "__main__":
    # Script entry point: run the download between two banner-framed headings.
    banner = "=" * 80
    print(banner, "SKILLSCOPE DATASET DOWNLOAD", banner, sep="\n")
    download_skillscope_dataset()
    print(banner, "DOWNLOAD COMPLETED", banner, sep="\n")