romcmu863 committed on
Commit
7fec244
·
verified ·
1 Parent(s): 8cfbba6

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - code
4
+ - programming
5
+ - dataset
6
+ pretty_name: "Coding Dataset"
7
+ ---
8
+
9
+ # Coding Dataset
10
+
11
+ Dataset of synthetic code snippets for training AI coding agents (currently a small demo release).
12
+
13
+ ## Dataset Summary
14
+
15
+ - **Total Examples**: 6 (demo)
16
+ - **Languages**: Python, JavaScript, Java
17
+ - **Task Types**: Code Generation
18
+ - **License**: CC0-1.0
19
+
20
+ ## Dataset Structure
21
+
22
+ ### Data Splits
23
+
24
+ - train: ~70% of data (4 of the 6 demo examples)
25
+ - validation: ~15% of data (1 of the 6 demo examples)
26
+ - test: ~15% of data (1 of the 6 demo examples)
27
+
28
+ ### Features
29
+
30
+ - `id` (string): Unique identifier
31
+ - `code` (string): Source code snippet
32
+ - `code_description` (string): Natural language description
33
+ - `programming_language` (string): Language (python, javascript, java, etc.)
34
+ - `task_type` (string): Type of task
35
+ - `difficulty_level` (string): Difficulty (beginner, intermediate, advanced, expert)
36
+ - `quality_score` (float): Quality score 0.0-1.0
37
+ - `is_tested` (bool): Whether the code has been tested
37
+ - `has_bugs` (bool): Whether the code has known bugs
39
+ - `lines_of_code` (int): Number of lines
40
+ - `collected_at` (string): Collection timestamp
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ from datasets import load_dataset
46
+
47
+ # Load dataset
48
+ dataset = load_dataset("romcmu863/code-dataset")
49
+
50
+ # Access splits
51
+ train = dataset['train']
52
+ validation = dataset['validation']
53
+ test = dataset['test']
54
+
55
+ # Get first example
56
+ example = train[0]
57
+ print(example['code_description'])
58
+ print(example['code'])
59
+ ```
60
+
61
+ ## License
62
+
63
+ CC0-1.0
64
+
65
+ ## Created
66
+
67
+ 2025-10-25
dataset/test/test.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"id":"35492ed8-a43e-4cd3-afac-4795a4519483","programming_language":"python","code":"class Stack:\n def __init__(self):\n self.items = []\n def push(self, item):\n self.items.append(item)\n def pop(self):\n return self.items.pop() if self.items else None","code_description":"Synthetic python code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":7,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.450096Z","collected_at":"2025-10-25T07:54:31.450096Z"}
dataset/train/train.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"id":"bbb35932-067d-49f6-8080-3f33d78f13c1","programming_language":"java","code":"public class Calculator {\n public static int add(int a, int b) {\n return a + b;\n }\n public static int multiply(int a, int b) {\n return a * b;\n }\n}","code_description":"Synthetic java code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":8,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.453096Z","collected_at":"2025-10-25T07:54:31.453096Z"}
2
+ {"id":"e9a75eec-f094-4753-a051-cb8072bd7787","programming_language":"javascript","code":"const debounce = (func, delay) => {\n let timeoutId;\n return (...args) => {\n clearTimeout(timeoutId);\n timeoutId = setTimeout(() => func(...args), delay);\n };\n}","code_description":"Synthetic javascript code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":7,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.451096Z","collected_at":"2025-10-25T07:54:31.451096Z"}
3
+ {"id":"b83c937a-2b86-4e83-9579-789e2f8f859f","programming_language":"python","code":"def binary_search(arr, target):\n left, right = 0, len(arr) - 1\n while left <= right:\n mid = (left + right) \/\/ 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid + 1\n else:\n right = mid - 1\n return -1","code_description":"Synthetic python code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":11,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.450096Z","collected_at":"2025-10-25T07:54:31.450096Z"}
4
+ {"id":"4bbd21cc-29ac-47e0-8497-e6aacf6aa6dc","programming_language":"python","code":"def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)","code_description":"Synthetic python code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":4,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.449100Z","collected_at":"2025-10-25T07:54:31.449100Z"}
dataset/validation/validation.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"id":"726137f5-5551-48d4-b7ce-2e2f7a2b8167","programming_language":"javascript","code":"function isPrime(num) {\n if (num <= 1) return false;\n for (let i = 2; i < num; i++) {\n if (num % i === 0) return false;\n }\n return true;\n}","code_description":"Synthetic javascript code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":7,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.451096Z","collected_at":"2025-10-25T07:54:31.451096Z"}
dataset_info.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "code-dataset",
3
+ "version": "1.0.0",
4
+ "description": "Coding dataset for AI agents",
5
+ "homepage": "https://huggingface.co",
6
+ "license": "CC0-1.0"
7
+ }
training_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "seq2seq",
3
+ "task": "code-generation",
4
+ "vocab_size": 50000,
5
+ "batch_size": 32,
6
+ "learning_rate": 0.0001,
7
+ "num_epochs": 3,
8
+ "max_length": 512
9
+ }