|
|
"""Scripts to download or generate data""" |
|
|
|
|
|
from pathlib import Path
import shutil
from typing import Optional, Union
import zipfile

from huggingface_hub import hf_hub_download

from hopcroft_skill_classification_tool_competition.config import (
    HF_FILENAME,
    HF_REPO_ID,
    RAW_DATA_DIR,
)
|
|
|
|
|
|
|
|
def _remove_download_artifacts(zip_path: Path, output_dir: Path) -> None:
    """Delete the downloaded archive and the scratch folders left beside it."""
    print("Cleaning up temporary files...")

    # missing_ok: this runs from a ``finally`` block, so it must not raise a
    # secondary error that would hide the real failure.
    zip_path.unlink(missing_ok=True)

    for folder_name in (".cache", "download"):
        folder = output_dir / folder_name
        if folder.exists():
            shutil.rmtree(folder)
            print(f"Removed {folder_name} folder")

    print("Cleanup completed")


def download_skillscope_dataset(
    output_dir: Optional[Union[str, Path]] = None,
) -> Path:
    """
    Download and extract SkillScope dataset from Hugging Face Hub.

    The dataset contains a SQLite database (skillscope_data.db) with:
    - nlbse_tool_competition_data_by_issue: Main table with PR features and skill labels
    - vw_nlbse_tool_competition_data_by_file: View with file-level labels

    Dataset details:
    - 7,245 merged pull requests from 11 Java repositories
    - 57,206 source files; 59,644 methods; 13,097 classes
    - 217 skill labels (domain/sub-domain pairs)

    If ``skillscope_data.db`` is already present in ``output_dir`` nothing is
    downloaded and the existing path is returned. Otherwise the archive is
    fetched, extracted into ``output_dir``, and the archive plus the
    ``.cache`` and ``download`` folders are removed afterwards, whether or
    not extraction succeeded.

    Args:
        output_dir: Directory where to save the dataset, as a ``Path`` or a
            string (default: ``RAW_DATA_DIR``). Created if it does not exist.

    Returns:
        Path to the extracted database file

    Raises:
        FileNotFoundError: If the archive was extracted but did not produce
            ``skillscope_data.db`` directly inside ``output_dir``.
    """
    # Coerce so callers may pass a plain string as well as a Path; the
    # default is only looked up when no directory was supplied.
    output_dir = Path(RAW_DATA_DIR if output_dir is None else output_dir)

    output_dir.mkdir(parents=True, exist_ok=True)
    db_path = output_dir / "skillscope_data.db"

    # Skip the network round-trip entirely when the database is already there.
    if db_path.exists():
        print(f"Database already exists at: {db_path}")
        return db_path

    print("Downloading SkillScope dataset from Hugging Face...")

    # NOTE(review): ``local_dir_use_symlinks`` is reported as deprecated in
    # recent huggingface_hub releases; it is kept so older releases still
    # write a real file into ``output_dir`` -- confirm against the pinned
    # version before removing it.
    zip_path = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        repo_type="dataset",
        local_dir=output_dir,
        local_dir_use_symlinks=False,
    )

    print(f"Downloaded to: {zip_path}")
    print("Extracting database...")

    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(output_dir)

        if not db_path.exists():
            raise FileNotFoundError(f"Database file not found at: {db_path}")

        print(f"Database extracted to: {db_path}")
    finally:
        # Clean up on failure too, so a broken archive is not left behind
        # to be picked up again by the next run.
        _remove_download_artifacts(Path(zip_path), output_dir)

    print("\nDataset info:")
    print("  - Table: nlbse_tool_competition_data_by_issue")
    print("  - View: vw_nlbse_tool_competition_data_by_file")

    return db_path
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the download framed by 80-column banner rules.
    rule = "=" * 80
    print(rule, "SKILLSCOPE DATASET DOWNLOAD", rule, sep="\n")
    download_skillscope_dataset()
    print(rule, "DOWNLOAD COMPLETED", rule, sep="\n")
|
|
|