Upload folder using huggingface_hub
Browse files
- README.md +67 -0
- dataset/test/test.jsonl +1 -0
- dataset/train/train.jsonl +4 -0
- dataset/validation/validation.jsonl +1 -0
- dataset_info.json +7 -0
- training_config.json +9 -0
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- code
|
| 4 |
+
- programming
|
| 5 |
+
- dataset
|
| 6 |
+
pretty_name: "Coding Dataset"
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# Coding Dataset
|
| 10 |
+
|
| 11 |
+
Production-grade dataset for training AI coding agents.
|
| 12 |
+
|
| 13 |
+
## Dataset Summary
|
| 14 |
+
|
| 15 |
+
- **Total Examples**: 6 (demo)
|
| 16 |
+
- **Languages**: Python, JavaScript, Java
|
| 17 |
+
- **Task Types**: Code Generation
|
| 18 |
+
- **License**: CC0-1.0
|
| 19 |
+
|
| 20 |
+
## Dataset Structure
|
| 21 |
+
|
| 22 |
+
### Data Splits
|
| 23 |
+
|
| 24 |
+
- train: ~70% of data (4 of the 6 demo examples)
|
| 25 |
+
- validation: ~15% of data (1 of the 6 demo examples)
|
| 26 |
+
- test: ~15% of data (1 of the 6 demo examples)
|
| 27 |
+
|
| 28 |
+
### Features
|
| 29 |
+
|
| 30 |
+
- `id` (string): Unique identifier
|
| 31 |
+
- `code` (string): Source code snippet
|
| 32 |
+
- `code_description` (string): Natural language description
|
| 33 |
+
- `programming_language` (string): Language (python, javascript, java, etc.)
|
| 34 |
+
- `task_type` (string): Type of task
|
| 35 |
+
- `difficulty_level` (string): Difficulty (beginner, intermediate, advanced, expert)
|
| 36 |
+
- `quality_score` (float): Quality score 0.0-1.0
|
| 37 |
+
- `is_tested` (bool): Whether the code has been tested
|
| 38 |
+
- `has_bugs` (bool): Whether the code has known bugs
|
| 39 |
+
- `lines_of_code` (int): Number of lines in `code`
|
| 40 |
+
- `collected_at` (string): Collection timestamp (ISO 8601, UTC)
|
| 41 |
+
|
| 42 |
+
## Usage
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
from datasets import load_dataset
|
| 46 |
+
|
| 47 |
+
# Load dataset
|
| 48 |
+
dataset = load_dataset("romcmu863/code-dataset")
|
| 49 |
+
|
| 50 |
+
# Access splits
|
| 51 |
+
train = dataset['train']
|
| 52 |
+
validation = dataset['validation']
|
| 53 |
+
test = dataset['test']
|
| 54 |
+
|
| 55 |
+
# Get first example
|
| 56 |
+
example = train[0]
|
| 57 |
+
print(example['code_description'])
|
| 58 |
+
print(example['code'])
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## License
|
| 62 |
+
|
| 63 |
+
CC0-1.0
|
| 64 |
+
|
| 65 |
+
## Created
|
| 66 |
+
|
| 67 |
+
2025-10-25
|
dataset/test/test.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"id":"35492ed8-a43e-4cd3-afac-4795a4519483","programming_language":"python","code":"class Stack:\n def __init__(self):\n self.items = []\n def push(self, item):\n self.items.append(item)\n def pop(self):\n return self.items.pop() if self.items else None","code_description":"Synthetic python code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":7,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.450096Z","collected_at":"2025-10-25T07:54:31.450096Z"}
|
dataset/train/train.jsonl
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id":"bbb35932-067d-49f6-8080-3f33d78f13c1","programming_language":"java","code":"public class Calculator {\n public static int add(int a, int b) {\n return a + b;\n }\n public static int multiply(int a, int b) {\n return a * b;\n }\n}","code_description":"Synthetic java code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":8,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.453096Z","collected_at":"2025-10-25T07:54:31.453096Z"}
|
| 2 |
+
{"id":"e9a75eec-f094-4753-a051-cb8072bd7787","programming_language":"javascript","code":"const debounce = (func, delay) => {\n let timeoutId;\n return (...args) => {\n clearTimeout(timeoutId);\n timeoutId = setTimeout(() => func(...args), delay);\n };\n}","code_description":"Synthetic javascript code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":7,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.451096Z","collected_at":"2025-10-25T07:54:31.451096Z"}
|
| 3 |
+
{"id":"b83c937a-2b86-4e83-9579-789e2f8f859f","programming_language":"python","code":"def binary_search(arr, target):\n left, right = 0, len(arr) - 1\n while left <= right:\n mid = (left + right) \/\/ 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid + 1\n else:\n right = mid - 1\n return -1","code_description":"Synthetic python code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":11,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.450096Z","collected_at":"2025-10-25T07:54:31.450096Z"}
|
| 4 |
+
{"id":"4bbd21cc-29ac-47e0-8497-e6aacf6aa6dc","programming_language":"python","code":"def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)","code_description":"Synthetic python code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":4,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.449100Z","collected_at":"2025-10-25T07:54:31.449100Z"}
|
dataset/validation/validation.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"id":"726137f5-5551-48d4-b7ce-2e2f7a2b8167","programming_language":"javascript","code":"function isPrime(num) {\n if (num <= 1) return false;\n for (let i = 2; i < num; i++) {\n if (num % i === 0) return false;\n }\n return true;\n}","code_description":"Synthetic javascript code snippet demonstrating common patterns","task_type":"code_generation","difficulty_level":"beginner","dataset_source":"synthetic","quality_score":0.9,"is_tested":true,"has_bugs":false,"lines_of_code":7,"complexity_score":0.4,"functions_count":1,"dependencies":null,"tags":["algorithm","example"],"docstring":null,"source_url":null,"source_repository":null,"license":"CC0-1.0","author":"synthetic","created_at":"2025-10-25T07:54:31.451096Z","collected_at":"2025-10-25T07:54:31.451096Z"}
|
dataset_info.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "code-dataset",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "Coding dataset for AI agents",
|
| 5 |
+
"homepage": "https://huggingface.co",
|
| 6 |
+
"license": "CC0-1.0"
|
| 7 |
+
}
|
training_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "seq2seq",
|
| 3 |
+
"task": "code-generation",
|
| 4 |
+
"vocab_size": 50000,
|
| 5 |
+
"batch_size": 32,
|
| 6 |
+
"learning_rate": 0.0001,
|
| 7 |
+
"num_epochs": 3,
|
| 8 |
+
"max_length": 512
|
| 9 |
+
}
|