from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_id = "bigcode/starcoderbase-7b"
dataset_repo = "Percy3822/python_ai_coder"  # Your HF dataset repo

# Load dataset
dataset = load_dataset(dataset_repo, split="train")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# The StarCoder tokenizer ships without a pad token; the data collator needs one to pad batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize: join each prompt/completion pair and append EOS so the model learns where a completion ends.
def tokenize(example):
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)

tokenized = dataset.map(tokenize, remove_columns=["prompt", "completion"])

# Causal-LM collator (mlm=False): pads batches and derives labels from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training config
args = TrainingArguments(
    output_dir="./python-ai-model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="no",
    fp16=True,
    push_to_hub=True,
    hub_model_id="Percy3822/python_ai_coder",
    hub_token="<your_HF_token_here>",  # Optional if you run in a linked HF Space
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
)

trainer.train()
trainer.push_to_hub()
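
# Optional sanity check (a sketch, not part of the original script): reload the
# fine-tuned model from the Hub and generate from a sample prompt. The repo id
# assumes the push_to_hub call above succeeded; the prompt text and generation
# settings below are illustrative assumptions only.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="Percy3822/python_ai_coder",
    tokenizer="Percy3822/python_ai_coder",
)
sample = generator(
    "Write a Python function that reverses a string.\n",
    max_new_tokens=128,
    do_sample=True,
    temperature=0.2,
)
print(sample[0]["generated_text"])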