walidsobhie-code committed on
Commit
468b3cf
·
1 Parent(s): de15016

fix: create sample data if real data not found in repo

Browse files
Files changed (1) hide show
  1. colab_train_stack29.ipynb +1 -23
colab_train_stack29.ipynb CHANGED
@@ -100,29 +100,7 @@
100
  "execution_count": null,
101
  "metadata": {},
102
  "outputs": [],
103
- "source": [
104
- "# STEP 5: Find training data\n",
105
- "REPO_DIR = os.path.join(ROOT_DIR, \"stack-2.9\")\n",
106
- "DATA_PATH = None\n",
107
- "\n",
108
- "# Check multiple possible locations\n",
109
- "possible_paths = [\n",
110
- " os.path.join(REPO_DIR, \"data/final/train.jsonl\"),\n",
111
- " os.path.join(REPO_DIR, \"training-data/final/train.jsonl\"),\n",
112
- " os.path.join(REPO_DIR, \"data_mini/train_mini.jsonl\"),\n",
113
- "]\n",
114
- "\n",
115
- "for path in possible_paths:\n",
116
- " if os.path.exists(path):\n",
117
- " DATA_PATH = path\n",
118
- " print(f\"✅ Found data at: {path}\")\n",
119
- " break\n",
120
- "\n",
121
- "if DATA_PATH is None:\n",
122
- " print(\"❌ No training data found!\")\n",
123
- " print(\"\\nSearching for jsonl files:\")\n",
124
- " !find {REPO_DIR} -name \"*.jsonl\" | head -10"
125
- ]
126
  },
127
  {
128
  "cell_type": "code",
 
100
  "execution_count": null,
101
  "metadata": {},
102
  "outputs": [],
103
+ "source": "# STEP 5: Find or download training data\nREPO_DIR = os.path.join(ROOT_DIR, \"stack-2.9\")\nDATA_PATH = None\n\n# Check multiple possible locations\npossible_paths = [\n os.path.join(REPO_DIR, \"data/final/train.jsonl\"),\n os.path.join(REPO_DIR, \"training-data/final/train.jsonl\"),\n os.path.join(REPO_DIR, \"data_mini/train_mini.jsonl\"),\n]\n\nfor path in possible_paths:\n if os.path.exists(path):\n DATA_PATH = path\n print(f\"✅ Found data at: {path}\")\n break\n\n# If not found, try to download from a URL or create small sample\nif DATA_PATH is None:\n print(\"⚠️ Data not found in repo!\")\n print(\"The training data (data/final/train.jsonl) is not in the GitHub repo.\")\n print(\"Options:\")\n print(\" 1. Upload train.jsonl to your Drive at: /content/drive/MyDrive/stack-2.9/data/final/train.jsonl\")\n print(\" 2. Use a smaller dataset\")\n \n # Create minimal sample data for testing (just 100 examples)\n print(\"\\n📝 Creating minimal sample data (100 examples) for testing...\")\n sample_data = []\n sample_prompt = \"\"\"Write a Python function to reverse a string.\n```python\ndef reverse_string(s):\n return s[::-1]\n```\"\"\"\n sample_response = \"\"\"Here's the function:\n```python\ndef reverse_string(s):\n return s[::-1]\n```\nThis uses Python slicing to reverse the string.\"\"\"\n \n for i in range(100):\n sample_data.append({\n \"messages\": [\n {\"role\": \"user\", \"content\": sample_prompt},\n {\"role\": \"assistant\", \"content\": sample_response}\n ]\n })\n \n # Save sample\n import json\n sample_path = os.path.join(REPO_DIR, \"data_mini/sample.jsonl\")\n os.makedirs(os.path.dirname(sample_path), exist_ok=True)\n with open(sample_path, 'w') as f:\n for item in sample_data:\n f.write(json.dumps(item) + '\\n')\n \n DATA_PATH = sample_path\n print(f\"✅ Created sample data: {DATA_PATH}\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  },
105
  {
106
  "cell_type": "code",