File size: 3,420 Bytes
1c58cca | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | """
data/code_dataset.py
Dataset preparation for the "AI Assistant for Programmers" use case.
Uses: iamtarun/python_code_instructions_18k_alpaca
- 18K Python instruction-answer pairs
- Covers algorithms, data structures, debugging, APIs, best practices
- Each example: instruction + optional context + code answer
Why this dataset for fine-tuning:
- Domain-specific data → model learns programmer vocabulary
- Code-heavy outputs → model learns to format code blocks properly
- QA format matches how developers actually ask questions
- 18K examples → enough for meaningful domain adaptation via LoRA
Run: python -m data.code_dataset
"""
import json
import logging
from pathlib import Path
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import sys
sys.path.append(str(Path(__file__).parent.parent))
from config import cfg
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
# Mistral instruct format with code-aware system prompt
CODE_PROMPT_TEMPLATE = """<s>[INST] {system}
### Question:
{instruction}
{context_block}[/INST]
### Answer:
{output} </s>"""
SYSTEM_PROMPT = cfg.usecase.system_prompt
def format_code_example(example: dict) -> dict:
"""Format a code instruction example into Mistral chat format."""
instruction = example.get("instruction", "").strip()
context = example.get("input", "").strip()
output = example.get("output", "").strip()
if not instruction or not output:
return None
# Ensure code blocks are properly fenced
if "def " in output or "import " in output or "class " in output:
if "```" not in output:
output = f"```python\n{output}\n```"
context_block = f"### Context:\n{context}\n\n" if context else ""
text = CODE_PROMPT_TEMPLATE.format(
system=SYSTEM_PROMPT,
instruction=instruction,
context_block=context_block,
output=output,
)
return {
"text": text,
"instruction": instruction,
"context": context,
"output": output,
"domain": "programming",
}
def prepare_code_dataset():
"""Download and format the programming dataset."""
cfg.ensure_dirs()
out_dir = Path(cfg.data.processed_data_dir)
log.info(f"Loading dataset: {cfg.usecase.finetune_dataset}")
raw = load_dataset(cfg.usecase.finetune_dataset, split="train", trust_remote_code=True)
log.info(f"Raw size: {len(raw):,}")
formatted = []
skipped = 0
for ex in raw:
result = format_code_example(ex)
if result:
formatted.append(result)
else:
skipped += 1
log.info(f"Formatted: {len(formatted):,} | Skipped: {skipped:,}")
train_data, val_data = train_test_split(
formatted, test_size=cfg.data.val_size, random_state=cfg.data.seed
)
for split_name, split_data in [("train", train_data), ("val", val_data)]:
path = out_dir / f"{split_name}_code.jsonl"
with open(path, "w") as f:
for item in split_data:
f.write(json.dumps(item) + "\n")
log.info(f"Saved {split_name} → {path} ({len(split_data):,} examples)")
log.info("✅ Code dataset ready for fine-tuning!")
return train_data, val_data
if __name__ == "__main__":
prepare_code_dataset()
|