walidsobhie-code Claude Opus 4.6 committed on
Commit
12c2955
·
1 Parent(s): 235cb20

fix: correct data path from training-data/final to data/final

Browse files
Files changed (1) hide show
  1. scripts/create_mini_dataset.py +5 -5
scripts/create_mini_dataset.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
  Create a minimal training dataset for rapid prototyping.
4
- Samples N examples from the full training-data/final/train.jsonl ensuring tool diversity.
5
  """
6
 
7
  import argparse
@@ -11,11 +11,11 @@ from pathlib import Path
11
  from typing import List, Dict
12
  from collections import defaultdict, Counter
13
 
14
- def load_full_dataset(train_path: str = "training-data/final/train.jsonl") -> List[Dict]:
15
  """Load the full dataset."""
16
  path = Path(train_path)
17
  if not path.exists():
18
- raise FileNotFoundError(f"Training data not found at {path}. Please ensure training-data/final/train.jsonl exists.")
19
 
20
  data = []
21
  with open(path, 'r') as f:
@@ -39,7 +39,7 @@ def extract_tool_calls(example: Dict) -> List[str]:
39
  def create_mini_dataset(
40
  output_path: str,
41
  n_samples: int = 5000,
42
- train_source: str = "training-data/final/train.jsonl",
43
  seed: int = 42
44
  ):
45
  """Create a stratified mini dataset."""
@@ -164,7 +164,7 @@ def main():
164
  parser = argparse.ArgumentParser(description="Create mini dataset for fast prototyping")
165
  parser.add_argument("--size", type=int, default=5000, help="Number of examples in mini dataset")
166
  parser.add_argument("--output", type=str, default="./data_mini/train_mini.jsonl", help="Output file path")
167
- parser.add_argument("--source", type=str, default="training-data/final/train.jsonl", help="Source full dataset")
168
  parser.add_argument("--seed", type=int, default=42, help="Random seed for sampling")
169
 
170
  args = parser.parse_args()
 
1
  #!/usr/bin/env python3
2
  """
3
  Create a minimal training dataset for rapid prototyping.
4
+ Samples N examples from the full data/final/train.jsonl ensuring tool diversity.
5
  """
6
 
7
  import argparse
 
11
  from typing import List, Dict
12
  from collections import defaultdict, Counter
13
 
14
+ def load_full_dataset(train_path: str = "data/final/train.jsonl") -> List[Dict]:
15
  """Load the full dataset."""
16
  path = Path(train_path)
17
  if not path.exists():
18
+ raise FileNotFoundError(f"Training data not found at {path}. Please ensure data/final/train.jsonl exists.")
19
 
20
  data = []
21
  with open(path, 'r') as f:
 
39
  def create_mini_dataset(
40
  output_path: str,
41
  n_samples: int = 5000,
42
+ train_source: str = "data/final/train.jsonl",
43
  seed: int = 42
44
  ):
45
  """Create a stratified mini dataset."""
 
164
  parser = argparse.ArgumentParser(description="Create mini dataset for fast prototyping")
165
  parser.add_argument("--size", type=int, default=5000, help="Number of examples in mini dataset")
166
  parser.add_argument("--output", type=str, default="./data_mini/train_mini.jsonl", help="Output file path")
167
+ parser.add_argument("--source", type=str, default="data/final/train.jsonl", help="Source full dataset")
168
  parser.add_argument("--seed", type=int, default=42, help="Random seed for sampling")
169
 
170
  args = parser.parse_args()