"""Scripts to download or generate data""" from pathlib import Path import shutil import zipfile from huggingface_hub import hf_hub_download from hopcroft_skill_classification_tool_competition.config import ( HF_FILENAME, HF_REPO_ID, RAW_DATA_DIR, ) def download_skillscope_dataset(output_dir: Path = None) -> Path: """ Download and extract SkillScope dataset from Hugging Face Hub. The dataset contains a SQLite database (skillscope_data.db) with: - nlbse_tool_competition_data_by_issue: Main table with PR features and skill labels - vw_nlbse_tool_competition_data_by_file: View with file-level labels Dataset details: - 7,245 merged pull requests from 11 Java repositories - 57,206 source files; 59,644 methods; 13,097 classes - 217 skill labels (domain/sub-domain pairs) Args: output_dir: Directory where to save the dataset (default: data/raw) Returns: Path to the extracted database file """ if output_dir is None: output_dir = RAW_DATA_DIR output_dir.mkdir(parents=True, exist_ok=True) db_path = output_dir / "skillscope_data.db" if db_path.exists(): print(f"Database already exists at: {db_path}") return db_path print("Downloading SkillScope dataset from Hugging Face...") # Download without using cache - use local_dir to avoid .cache folder zip_path = hf_hub_download( repo_id=HF_REPO_ID, filename=HF_FILENAME, repo_type="dataset", local_dir=output_dir, local_dir_use_symlinks=False, # Don't create symlinks, copy directly ) print(f"Downloaded to: {zip_path}") print("Extracting database...") with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(output_dir) if not db_path.exists(): raise FileNotFoundError(f"Database file not found at: {db_path}") print(f"Database extracted to: {db_path}") # Clean up: remove zip file print("Cleaning up temporary files...") Path(zip_path).unlink() # Clean up: remove .cache folder if exists cache_dir = output_dir / ".cache" if cache_dir.exists(): shutil.rmtree(cache_dir) print("Removed .cache folder") # Clean up: remove download folder if exists download_dir = output_dir / "download" if download_dir.exists(): shutil.rmtree(download_dir) print("Removed download folder") print("Cleanup completed") print("\nDataset info:") print(" - Table: nlbse_tool_competition_data_by_issue") print(" - View: vw_nlbse_tool_competition_data_by_file") return db_path if __name__ == "__main__": print("=" * 80) print("SKILLSCOPE DATASET DOWNLOAD") print("=" * 80) download_skillscope_dataset() print("=" * 80) print("DOWNLOAD COMPLETED") print("=" * 80)