# %% Imports
import io
import math
import os
import sys
import threading
import time

from huggingface_hub import HfApi, login
from IPython import get_ipython


# %% Authentication
# SECURITY: never paste an access token into a notebook. The token that was
# previously hard-coded in this cell had write permission and must be treated
# as compromised: revoke it in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable; when it is not
# set, `login` falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))


# %% Folder selection
def get_folder_subset(repo_id, start_percent, end_percent, repo_type="dataset"):
    """Return the repo files that live in a percentage slice of the ``data/`` folders.

    The first-level folders under ``data/`` are sorted by name and the slice
    ``[floor(n * start_percent / 100), floor(n * end_percent / 100))`` of that
    sorted list is selected. Because both bounds use the same rounding,
    adjacent ranges such as (0, 25), (25, 50), (50, 75), (75, 100) split the
    folders with no overlap and no gap.

    Args:
        repo_id: Hub repository identifier, e.g. ``"litagin/moe-speech"``.
        start_percent: Inclusive lower bound of the slice, in percent (0-100).
        end_percent: Exclusive upper bound of the slice, in percent (0-100).
        repo_type: Repository type passed to ``HfApi.list_repo_files``.

    Returns:
        list[str]: Paths (relative to the repo root) of every file under
        ``data/`` whose first-level folder is in the selected slice.

    Raises:
        ValueError: If the bounds do not satisfy
            ``0 <= start_percent <= end_percent <= 100``. Without this check a
            negative or reversed range would silently select the wrong folders
            (or none at all).
    """
    if not 0 <= start_percent <= end_percent <= 100:
        raise ValueError(
            "expected 0 <= start_percent <= end_percent <= 100, "
            f"got start_percent={start_percent!r}, end_percent={end_percent!r}"
        )

    api = HfApi()

    # List the contents of the repository
    repo_contents = api.list_repo_files(repo_id, repo_type=repo_type)

    # Keep only files inside the "data" directory
    data_contents = [path for path in repo_contents if path.startswith("data/")]

    # Unique first-level folders inside "data". Files sitting directly in
    # "data/" contain a single '/', so they are not mistaken for folders.
    folders_in_data = sorted(
        {path.split('/')[1] for path in data_contents if path.count('/') > 1}
    )

    total_folders = len(folders_in_data)
    start_index = math.floor(total_folders * start_percent / 100)
    end_index = math.floor(total_folders * end_percent / 100)

    selected_folders = folders_in_data[start_index:end_index]

    print(f"Total folders: {total_folders}")
    print(f"Selected folders ({start_percent}% - {end_percent}%): {len(selected_folders)}")
    print("Folders:", ', '.join(selected_folders))

    # Set membership keeps the final filter linear in the number of files
    # instead of scanning the folder list once per file.
    selected_lookup = set(selected_folders)
    return [path for path in data_contents if path.split('/')[1] in selected_lookup]


# %% Select this notebook's share of the dataset
repo_id = "litagin/moe-speech"

# Each notebook processes one quarter of the folders; the other quarters are
# (0, 25), (50, 75) and (75, 100).
second_quarter = get_folder_subset(repo_id, 25, 50)


# %% Output logging
class _LogStream(io.TextIOBase):
    """Write-only text stream that appends everything written to it to a file.

    No in-memory copy of the output is kept, so memory use stays flat however
    much a long-running cell prints.
    """

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self._lock = threading.Lock()
        self._stop_event = threading.Event()
        self._flusher = None
        self._saved_streams = None
        # Pre-set so flush()/close() stay safe even if open() below raises.
        self._file = None
        # Line-buffered append: complete lines reach the file immediately,
        # partial lines (e.g. "\r" progress updates) wait for the next flush.
        # UTF-8 is explicit so non-ASCII output does not depend on the locale.
        self._file = open(filename, 'a', buffering=1, encoding='utf-8')

    def writable(self):
        return True

    def write(self, text):
        """Append ``text`` to the log file and return the number of characters written."""
        with self._lock:
            return self._file.write(text)

    def flush(self):
        """Push any buffered text to the log file (no-op once closed)."""
        with self._lock:
            if self._file is not None and not self._file.closed:
                self._file.flush()

    def close(self):
        """Flush pending text and close the underlying file."""
        try:
            super().close()  # io.IOBase.close() flushes before marking closed
        finally:
            with self._lock:
                if self._file is not None:
                    self._file.close()

    def start(self, interval):
        """Redirect stdout/stderr to this stream and start the periodic flusher."""
        self._saved_streams = (sys.stdout, sys.stderr)
        sys.stdout = self
        sys.stderr = self

        def save_output():
            # Event.wait doubles as an interruptible sleep: it returns True
            # (ending the loop) as soon as stop_logging() sets the event.
            while not self._stop_event.wait(interval):
                self.flush()

        self._flusher = threading.Thread(target=save_output, daemon=True)
        self._flusher.start()

    def stop_logging(self):
        """Restore the previous stdout/stderr, stop the flusher and close the file."""
        self._stop_event.set()
        if self._saved_streams is not None:
            saved_stdout, saved_stderr = self._saved_streams
            # Only undo the redirection if nobody replaced the streams since.
            if sys.stdout is self:
                sys.stdout = saved_stdout
            if sys.stderr is self:
                sys.stderr = saved_stderr
            self._saved_streams = None
        self.close()


def start_logging(log_file_path='cell_50%_output.log', interval=5):
    """Redirect ``sys.stdout`` and ``sys.stderr`` to an append-only log file.

    After this call, printed output no longer appears in the notebook; it is
    appended to ``log_file_path``. Complete lines are written through
    immediately and a daemon thread flushes any remaining buffered text every
    ``interval`` seconds.

    Calling the function again (e.g. re-running the cell) first stops the
    redirection installed by the previous call, so redirections and flusher
    threads do not pile up.

    Args:
        log_file_path: File that receives the output (opened in append mode).
        interval: Seconds between periodic flushes.

    Returns:
        The installed stream. Call its ``stop_logging()`` method to restore the
        original ``sys.stdout``/``sys.stderr`` and close the log file.
    """
    # Duck-typed check: re-running the class-definition cell creates a new
    # class object, so isinstance() would not recognise an older stream.
    stop_previous = getattr(sys.stdout, 'stop_logging', None)
    if callable(stop_previous):
        stop_previous()

    log_stream = _LogStream(log_file_path)
    log_stream.start(interval)

    # stdout is already redirected, so this confirmation lands in the log file.
    print(f"Logging started. Output will be saved to {log_file_path} every {interval} seconds.")
    return log_stream


# %% Start logging
# Assigned (rather than left as a bare expression) so the stream's repr is not
# echoed as cell output; use `log_stream.stop_logging()` to undo the redirect.
log_stream = start_logging()