Spaces:
Sleeping
Sleeping
| """Phase 4: fine-tune CodeT5+ on docstring -> code. | |
| This is the second experimental arm (a tuned small model) to compare against | |
| frozen-LLM + RAG. Runs on a single mid-range GPU; raise subset/epochs for the | |
| real result. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| import pandas as pd | |
| sys.path.append(str(Path(__file__).resolve().parents[2])) | |
| from src.config import load_config # noqa: E402 | |
| CHECKPOINT = "Salesforce/codet5p-220m" | |
def finetune(subset_size: int = 5000, epochs: int = 1, out_dir: str = "data/codet5p-ft",
             cfg=None):
    """Fine-tune the CodeT5+ checkpoint on docstring -> code pairs and save it.

    Reads ``train.parquet`` from ``cfg.paths.processed_dir``, keeps the first
    ``subset_size`` rows, tokenizes the ``docstring`` column as model input and
    the ``code`` column as the target, trains with ``Seq2SeqTrainer`` and writes
    the model and tokenizer to ``out_dir``.

    Args:
        subset_size: Number of leading rows of the training parquet to use.
        epochs: Number of training epochs.
        out_dir: Output directory for checkpoints, the final model and the
            tokenizer.
        cfg: Project configuration; ``load_config()`` is called when this is
            falsy. Only ``cfg.paths.processed_dir`` is read here.

    Returns:
        ``out_dir``, so the caller can hand it to a loader.
    """
    # Imported inside the function, so importing this module does not pull in
    # the ML stack.
    from datasets import Dataset
    from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                              DataCollatorForSeq2Seq, Seq2SeqTrainer,
                              Seq2SeqTrainingArguments)
    import torch

    cfg = cfg or load_config()
    train_path = Path(cfg.paths.processed_dir) / "train.parquet"
    df = pd.read_parquet(train_path).head(subset_size)

    tok = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)

    def to_features(batch):
        # Truncate only; do NOT pad here. Padding the targets to max_length
        # would store real pad-token ids in `labels`, and the loss would then
        # be computed on every pad position. The collator below pads each
        # batch and fills label padding with -100, which the loss ignores.
        features = tok(batch["docstring"], max_length=64, truncation=True)
        targets = tok(text_target=batch["code"], max_length=256, truncation=True)
        features["labels"] = targets["input_ids"]
        return features

    ds = Dataset.from_pandas(df[["docstring", "code"]]).map(
        to_features, batched=True, remove_columns=["docstring", "code"])

    args = Seq2SeqTrainingArguments(
        output_dir=out_dir, per_device_train_batch_size=8, num_train_epochs=epochs,
        learning_rate=5e-5, logging_steps=50, save_strategy="epoch",
        # Mixed precision only when a CUDA device is present.
        fp16=torch.cuda.is_available(), report_to="none")

    # Pads inputs to the longest sequence in each batch; label padding is set
    # to -100 so padded target positions do not contribute to the loss.
    collator = DataCollatorForSeq2Seq(tok, model=model, label_pad_token_id=-100)

    trainer = Seq2SeqTrainer(
        model=model, args=args, train_dataset=ds,
        data_collator=collator)
    trainer.train()

    # Save the tokenizer next to the model so out_dir can be loaded on its own.
    trainer.save_model(out_dir)
    tok.save_pretrained(out_dir)
    print(f"[finetune] saved to {out_dir}")
    return out_dir
def make_t5_generate_fn(model_dir: str):
    """Build a ``generate_fn(intent) -> code`` closure around a saved CodeT5+ model.

    The tokenizer and the seq2seq model are loaded once from ``model_dir`` and
    the model is put into eval mode. The returned callable can be plugged into
    the evaluation code in place of another generator.

    Args:
        model_dir: Directory (or model id) accepted by ``from_pretrained``.

    Returns:
        A function that takes an intent string and returns the decoded
        generation as a string.
    """
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # nn.Module.eval() returns the module itself, so the call can be chained.
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_dir).eval()

    def generate_fn(intent: str) -> str:
        # The prompt is truncated to 64 tokens; generation is capped at 256.
        encoded = tokenizer(intent, return_tensors="pt", truncation=True, max_length=64)
        prompt_ids = encoded.input_ids.to(seq2seq.device)
        generated = seq2seq.generate(prompt_ids, max_length=256)
        # A single prompt goes in, so only the first returned sequence is decoded.
        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)

    return generate_fn