# NOTE: the original paste began with build-log residue ("Spaces:", "Build error");
# it was not part of the script and has been converted to this comment.
# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")
# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
# return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])
# # Step 4: Set up training arguments
# training_args = TrainingArguments(
# output_dir="./flutter_codegen_model",
# evaluation_strategy="epoch",
# learning_rate=5e-5,
# per_device_train_batch_size=4, # Adjust based on GPU memory
# num_train_epochs=3,
# save_steps=500,
# save_total_limit=2,
# fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
# logging_dir="./logs",
# logging_steps=10,
# report_to="none"
# )
# # Step 5: Initialize the Trainer
# trainer = Trainer(
# model=model,
# args=training_args,
# train_dataset=tokenized_dataset["train"],
# eval_dataset=tokenized_dataset["validation"],
# tokenizer=tokenizer,
# )
# # Step 6: Train the model
# trainer.train()
# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")
# # # # # # # # # # # # # # # # #
# Train on multiple datasets   #
# # # # # # # # # # # # # # # # #
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Prefer the GPU when one is available; the model is moved to `device` below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load the three Flutter-related datasets from the Hugging Face Hub.
print("Loading datasets...")
dataset1 = load_dataset("wraps/codegen-flutter-v1")
dataset2 = load_dataset("limcheekin/flutter-website-3.7")
dataset3 = load_dataset("deepklarity/top-flutter-packages")
# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    """Map a codegen-flutter-v1 row to the unified ``{"text": ...}`` schema.

    Args:
        example: A dataset row containing a ``"content"`` field (source code).

    Returns:
        A dict with a single ``"text"`` key holding the row's content.
    """
    return {"text": example["content"]}
def preprocess_dataset2(example):
    """Map a flutter-website-3.7 row to the unified ``{"text": ...}`` schema.

    The source field is already named ``"text"``; this re-emits it so the
    mapped dataset keeps only that column after ``remove_columns``.
    """
    return {"text": example["text"]}
def preprocess_dataset3(example):
    """Join a package's title and description into one training string.

    Args:
        example: A dataset row with ``"title"`` and ``"description"`` fields.

    Returns:
        A dict whose ``"text"`` value is ``"<title> - <description>"``.
    """
    # Combine title and description into one text entry
    return {"text": f"{example['title']} - {example['description']}"}
# Normalize every dataset to a single "text" column so they can be concatenated.
print("Preprocessing datasets...")
dataset1_train = dataset1["train"].map(
    preprocess_dataset1,
    remove_columns=["repo_id", "file_path", "content", "__index_level_0__"],
)
dataset2_train = dataset2["train"].map(
    preprocess_dataset2,
    remove_columns=["id", "source"],
)
dataset3_train = dataset3["train"].map(
    preprocess_dataset3,
    remove_columns=["title", "description", "likes", "dependencies"],
)

# Combine all datasets into a single dataset (schemas now match: just "text").
print("Combining datasets...")
combined_dataset = concatenate_datasets([dataset1_train, dataset2_train, dataset3_train])

# Step 3: Create a 90/10 train-validation split; fixed seed for reproducibility.
print("Creating train-validation split...")
train_test_split = combined_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]
# Step 4: Load the tokenizer and model from a previous fine-tuning checkpoint
# rather than the base model, so training continues from its weights.
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# CodeGen has no dedicated pad token; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint_path).to(device)
# Step 5: Tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of texts to a fixed length and attach causal-LM labels.

    Args:
        examples: A batched mapping with a ``"text"`` list (``batched=True``).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) plus
        ``labels`` for causal language modeling.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Labels are the input ids (the model shifts them internally), but padding
    # positions are set to -100 so cross-entropy ignores them. Without this,
    # loss is computed on pad tokens (which equal EOS here) and skews training.
    tokenized["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
# Encode both splits; drop the raw "text" column once it has been tokenized.
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Step 6: Set up training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./flutter_codegen_model",
    evaluation_strategy="epoch",        # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,      # Adjust based on GPU memory
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,                 # keep only the two newest checkpoints
    fp16=torch.cuda.is_available(),     # Use mixed precision if GPU is available
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): per HF docs this field is NOT read by Trainer itself — the
    # checkpoint must also be passed to trainer.train() to actually resume.
    resume_from_checkpoint=checkpoint_path,
    report_to="none",                   # disable W&B/TensorBoard reporting
)
# Step 7: Initialize the Trainer with the combined train/validation splits.
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # validation split from the combined data
    tokenizer=tokenizer,  # saved alongside checkpoints; enables padding-aware collation
)
# Step 8: Train the model, resuming optimizer/scheduler/trainer state.
# TrainingArguments.resume_from_checkpoint is not consumed by Trainer; the
# checkpoint path must be passed to train() explicitly for the resume to happen.
print("Starting training from checkpoint...")
trainer.train(resume_from_checkpoint=checkpoint_path)
# Step 9: Persist the fine-tuned model and tokenizer for later inference.
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")
print("Training complete. Model saved to './flutter_codegen_model'.")