Spaces:
Sleeping
Sleeping
| """Scripts to download or generate data""" | |
| from pathlib import Path | |
| import shutil | |
| import zipfile | |
| from huggingface_hub import hf_hub_download | |
| from hopcroft_skill_classification_tool_competition.config import ( | |
| HF_FILENAME, | |
| HF_REPO_ID, | |
| RAW_DATA_DIR, | |
| ) | |
def download_skillscope_dataset(output_dir: "Path | str | None" = None) -> Path:
    """
    Download and extract SkillScope dataset from Hugging Face Hub.

    The dataset contains a SQLite database (skillscope_data.db) with:
    - nlbse_tool_competition_data_by_issue: Main table with PR features and skill labels
    - vw_nlbse_tool_competition_data_by_file: View with file-level labels

    Dataset details:
    - 7,245 merged pull requests from 11 Java repositories
    - 57,206 source files; 59,644 methods; 13,097 classes
    - 217 skill labels (domain/sub-domain pairs)

    If ``skillscope_data.db`` is already present in ``output_dir`` nothing is
    downloaded and the existing path is returned immediately.

    Args:
        output_dir: Directory where to save the dataset. May be a ``Path`` or
            a plain string; when ``None`` the configured ``RAW_DATA_DIR`` is
            used.

    Returns:
        Path to the extracted database file

    Raises:
        FileNotFoundError: If the archive was extracted but
            ``skillscope_data.db`` is not directly inside ``output_dir``.
            The downloaded archive is left in place in that case.
    """
    # Normalise to a Path so callers may pass a string as well as a Path;
    # wrapping an existing Path is a no-op.
    output_dir = Path(RAW_DATA_DIR if output_dir is None else output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    db_path = output_dir / "skillscope_data.db"
    if db_path.exists():
        # Nothing to do: skip the network round-trip entirely.
        print(f"Database already exists at: {db_path}")
        return db_path

    print("Downloading SkillScope dataset from Hugging Face...")
    # Download without using cache - use local_dir to avoid .cache folder
    # NOTE(review): recent huggingface_hub releases appear to deprecate
    # local_dir_use_symlinks - confirm against the pinned version.
    zip_path = hf_hub_download(
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        repo_type="dataset",
        local_dir=output_dir,
        local_dir_use_symlinks=False,  # Don't create symlinks, copy directly
    )
    print(f"Downloaded to: {zip_path}")

    print("Extracting database...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(output_dir)

    # The archive is expected to hold the database at its top level.
    if not db_path.exists():
        raise FileNotFoundError(f"Database file not found at: {db_path}")
    print(f"Database extracted to: {db_path}")

    # Clean up: remove zip file
    print("Cleaning up temporary files...")
    Path(zip_path).unlink()

    # Clean up: remove the helper folders the download may have left behind
    for leftover_name in (".cache", "download"):
        leftover_dir = output_dir / leftover_name
        if leftover_dir.exists():
            shutil.rmtree(leftover_dir)
            print(f"Removed {leftover_name} folder")
    print("Cleanup completed")

    print("\nDataset info:")
    print(" - Table: nlbse_tool_competition_data_by_issue")
    print(" - View: vw_nlbse_tool_competition_data_by_file")
    return db_path
if __name__ == "__main__":
    # Frame the run between 80-column rules so it stands out in console output.
    banner_rule = "=" * 80

    print(banner_rule)
    print("SKILLSCOPE DATASET DOWNLOAD")
    print(banner_rule)

    # Use the default output directory.
    download_skillscope_dataset()

    print(banner_rule)
    print("DOWNLOAD COMPLETED")
    print(banner_rule)