tazwarrrr committed on
Commit
f4be021
·
1 Parent(s): d61bf98
Files changed (2) hide show
  1. README.md +6 -0
  2. dataset/upload_dataset.py +118 -0
README.md CHANGED
@@ -27,6 +27,12 @@ ROCmPort AI orchestrates a migration loop:
27
 
28
  Complex kernels can fail conversion due to architecture assumptions, undefined behavior, inline PTX, or handcrafted memory logic. The value is reduced migration time and faster debug loops.
29
 
 
 
 
 
 
 
30
  ## Target User and Business Case
31
 
32
  Primary product position:
 
27
 
28
  Complex kernels can fail conversion due to architecture assumptions, undefined behavior, inline PTX, or handcrafted memory logic. The value is reduced migration time and faster debug loops.
29
 
30
+ ## Dataset
31
+
32
+ Training data: [tazwarrrr/cuda-to-rocm-wavefront-bugs](https://huggingface.co/datasets/tazwarrrr/cuda-to-rocm-wavefront-bugs)
33
+ 170 expert-curated CUDA→ROCm wavefront-64 bug examples across 6 categories.
34
+ See `dataset/upload_dataset.py` for how it was compiled and uploaded.
35
+
36
  ## Target User and Business Case
37
 
38
  Primary product position:
dataset/upload_dataset.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload cuda-to-rocm-wavefront-bugs dataset to HuggingFace Hub.
3
+
4
+ Supports either:
5
+ - 17 individual batch JSON files (recommended)
6
+ - A single combined JSONL file
7
+
8
+ Usage (individual files in current dir):
9
+ python upload_dataset.py --token hf_xxxx --files_dir .
10
+
11
+ Usage (specific directory):
12
+ python upload_dataset.py --token hf_xxxx --files_dir ./my_batches/
13
+
14
+ Usage (single JSONL fallback):
15
+ python upload_dataset.py --token hf_xxxx --jsonl cuda_rocm_bugs.jsonl
16
+ """
17
+ import json
18
+ import os
19
+ import glob
20
+ import argparse
21
+ from collections import Counter
22
+ from datasets import Dataset, DatasetDict, Features, Value
23
+
24
# Explicit schema for every example record: ten free-text string columns
# followed by two boolean flag columns. Column order is the order listed here.
FEATURES = Features(
    dict(
        [
            (text_column, Value("string"))
            for text_column in (
                "id",
                "bug_category",
                "risk_level",
                "kernel_type",
                "cuda_snippet",
                "hip_naive",
                "hip_corrected",
                "explanation",
                "amd_hardware",
                "rocm_version",
            )
        ]
        + [
            (flag_column, Value("bool"))
            for flag_column in (
                "verified_on_mi300x",
                "hipify_catches_this",
            )
        ]
    )
)
38
+
39
+
40
def load_from_files(files_dir):
    """Load and concatenate every batch ``.json`` file found in a directory.

    Files are processed in sorted filename order, and each file must contain
    a JSON array of example objects.

    Args:
        files_dir: Directory searched (non-recursively) for ``*.json`` files.

    Returns:
        One list holding the examples of all files, in file order.

    Raises:
        FileNotFoundError: If the directory contains no ``.json`` files.
        ValueError: If a file's top-level JSON value is not a list.
    """
    pattern = os.path.join(files_dir, "*.json")
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No .json files found in {files_dir}")

    all_examples = []
    for path in files:
        # Read as UTF-8 explicitly: the platform default encoding (e.g.
        # cp1252 on Windows) would mis-decode non-ASCII text in the examples.
        with open(path, encoding="utf-8") as fp:
            batch = json.load(fp)
        # list.extend() on a dict would silently append its keys as
        # "examples"; reject anything that is not a JSON array up front.
        if not isinstance(batch, list):
            raise ValueError(
                f"{os.path.basename(path)}: expected a JSON array of examples, "
                f"got {type(batch).__name__}")
        all_examples.extend(batch)
        print(f" Loaded {len(batch):>3} examples from {os.path.basename(path)}")

    return all_examples
55
+
56
+
57
def load_from_jsonl(jsonl_path):
    """Load examples from a JSON Lines file (one JSON document per line).

    Args:
        jsonl_path: Path to the ``.jsonl`` file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so non-ASCII text decodes the same way on every
    # platform, and skip blank lines (e.g. a trailing empty line), which
    # json.loads() would otherwise reject.
    with open(jsonl_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
60
+
61
+
62
def main():
    """Command-line entry point: load the examples, split them, upload them.

    Steps:
      1. Parse ``--token``, ``--repo``, ``--files_dir`` and ``--jsonl``.
      2. If neither input option is given, auto-detect ``batch_*.json`` files
         in the current directory, then ``cuda_rocm_bugs.jsonl``.
      3. Load the examples and print a per-category count.
      4. Build a dataset with the ``FEATURES`` schema, split it 90/10
         (seed 42) and push it to the Hub as a public dataset.

    The token may be supplied via the ``HF_TOKEN`` environment variable
    instead of ``--token`` so that it does not end up in shell history or
    process listings.

    Exits through ``parser.error`` on missing token/input, and through
    ``SystemExit`` when no examples were loaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--token", default=os.environ.get("HF_TOKEN"),
                        help="HuggingFace write token (default: $HF_TOKEN)")
    parser.add_argument(
        "--repo", default="tazwarrrr/cuda-to-rocm-wavefront-bugs")
    parser.add_argument("--files_dir", default=None,
                        help="Directory with batch .json files")
    parser.add_argument("--jsonl", default=None,
                        help="Single combined JSONL file")
    args = parser.parse_args()

    # --token used to be mandatory; it is still required unless the
    # environment provides one.
    if not args.token:
        parser.error("Provide --token or set the HF_TOKEN environment variable")

    # Auto-detect if nothing specified
    if not args.files_dir and not args.jsonl:
        json_files = sorted(glob.glob("batch_*.json"))
        if json_files:
            print(
                f"Auto-detected {len(json_files)} batch files in current directory")
            args.files_dir = "."
        elif os.path.exists("cuda_rocm_bugs.jsonl"):
            args.jsonl = "cuda_rocm_bugs.jsonl"
        else:
            parser.error("Provide --files_dir or --jsonl")

    # Load data (--files_dir takes precedence when both options are given)
    print("\nLoading data...")
    if args.files_dir:
        data = load_from_files(args.files_dir)
    else:
        data = load_from_jsonl(args.jsonl)

    # Stop early with a clear message instead of letting the train/test
    # split fail on an empty dataset.
    if not data:
        raise SystemExit("No examples were loaded; nothing to upload.")

    print(f"\nTotal: {len(data)} examples")
    print("\nBy category:")
    for cat, count in sorted(Counter(e["bug_category"] for e in data).items()):
        print(f" {cat}: {count}")

    # Build dataset with 90/10 split; the fixed seed keeps it reproducible
    ds = Dataset.from_list(data, features=FEATURES)
    split = ds.train_test_split(test_size=0.1, seed=42)
    dataset_dict = DatasetDict(
        {"train": split["train"], "test": split["test"]})

    print(
        f"\nSplit: {len(dataset_dict['train'])} train / {len(dataset_dict['test'])} test")
    print(f"\nUploading to https://huggingface.co/datasets/{args.repo} ...")

    dataset_dict.push_to_hub(args.repo, token=args.token, private=False)

    print("\n✅ Done!")
    print(f" https://huggingface.co/datasets/{args.repo}")
    print("\nNext steps:")
    print(" 1. Paste dataset_README.md as the Dataset Card on HuggingFace")
    print(" 2. Link dataset in lablab.ai submission under 'HuggingFace' track")


if __name__ == "__main__":
    main()