from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

model_id = "bigcode/starcoderbase-7b"
dataset_repo = "Percy3822/python_ai_coder"  # Your HF dataset repo

# Load dataset
dataset = load_dataset(dataset_repo, split="train")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Code LMs like StarCoder often ship without a pad token; reuse EOS so the
# causal-LM collator can pad batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize prompt + completion as one causal-LM training sequence
def tokenize(example):
    text = example["prompt"] + "\n" + example["completion"]
    return tokenizer(text, truncation=True, max_length=512)

tokenized = dataset.map(tokenize, remove_columns=["prompt", "completion"])

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training config
args = TrainingArguments(
    output_dir="./python-ai-model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="no",
    fp16=True,
    push_to_hub=True,
    hub_model_id="Percy3822/python_ai_coder",
    # hub_token="...",  # Optional: omit if you run in a linked HF Space or are already logged in
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
)

trainer.train()
trainer.push_to_hub()
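
# Optional sanity check (a minimal sketch, not part of the original recipe):
# after training, generate one completion with the fine-tuned model still in
# memory. The prompt text below is purely illustrative.
sample_prompt = "Write a Python function that reverses a string.\n"
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))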