# NOTE: the original paste began with build-log residue ("Spaces:", "Build error");
# it was not part of the script and has been converted to this comment.
# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")
# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
# return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])
# # Step 4: Set up training arguments
# training_args = TrainingArguments(
# output_dir="./flutter_codegen_model",
# evaluation_strategy="epoch",
# learning_rate=5e-5,
# per_device_train_batch_size=4, # Adjust based on GPU memory
# num_train_epochs=3,
# save_steps=500,
# save_total_limit=2,
# fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
# logging_dir="./logs",
# logging_steps=10,
# report_to="none"
# )
# # Step 5: Initialize the Trainer
# trainer = Trainer(
# model=model,
# args=training_args,
# train_dataset=tokenized_dataset["train"],
# eval_dataset=tokenized_dataset["validation"],
# tokenizer=tokenizer,
# )
# # Step 6: Train the model
# trainer.train()
# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")
# # # # # # # # # # # # # # # # #
# Train on multiple datasets   #
# # # # # # # # # # # # # # # # #
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Prefer the GPU when one is available; the model is moved to `device` below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load the three Flutter-related datasets from the Hugging Face Hub.
print("Loading datasets...")
dataset1 = load_dataset("wraps/codegen-flutter-v1")
dataset2 = load_dataset("limcheekin/flutter-website-3.7")
dataset3 = load_dataset("deepklarity/top-flutter-packages")
# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    """Map a codegen-flutter-v1 row to the unified ``{"text": ...}`` schema.

    Args:
        example: A dataset row containing a ``"content"`` field (source code).

    Returns:
        A dict with a single ``"text"`` key holding the row's content.
    """
    return {"text": example["content"]}
def preprocess_dataset2(example):
    """Map a flutter-website-3.7 row to the unified ``{"text": ...}`` schema.

    The source field is already named ``"text"``; this re-emits it so the
    mapped dataset keeps only that column after ``remove_columns``.
    """
    return {"text": example["text"]}
def preprocess_dataset3(example):
    """Join a package's title and description into one training string.

    Args:
        example: A dataset row with ``"title"`` and ``"description"`` fields.

    Returns:
        A dict whose ``"text"`` value is ``"<title> - <description>"``.
    """
    # Combine title and description into one text entry
    return {"text": f"{example['title']} - {example['description']}"}
# Normalize every dataset to a single "text" column so they can be concatenated.
print("Preprocessing datasets...")
dataset1_train = dataset1["train"].map(
    preprocess_dataset1,
    remove_columns=["repo_id", "file_path", "content", "__index_level_0__"],
)
dataset2_train = dataset2["train"].map(
    preprocess_dataset2,
    remove_columns=["id", "source"],
)
dataset3_train = dataset3["train"].map(
    preprocess_dataset3,
    remove_columns=["title", "description", "likes", "dependencies"],
)

# Combine all datasets into a single dataset (schemas now match: just "text").
print("Combining datasets...")
combined_dataset = concatenate_datasets([dataset1_train, dataset2_train, dataset3_train])

# Step 3: Create a 90/10 train-validation split; fixed seed for reproducibility.
print("Creating train-validation split...")
train_test_split = combined_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]
# Step 4: Load the tokenizer and model from a previous fine-tuning checkpoint
# rather than the base model, so training continues from its weights.
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# CodeGen has no dedicated pad token; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint_path).to(device)
# Step 5: Tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of texts to a fixed length and attach causal-LM labels.

    Args:
        examples: A batched mapping with a ``"text"`` list (``batched=True``).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) plus
        ``labels`` for causal language modeling.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Labels are the input ids (the model shifts them internally), but padding
    # positions are set to -100 so cross-entropy ignores them. Without this,
    # loss is computed on pad tokens (which equal EOS here) and skews training.
    tokenized["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
# Encode both splits; drop the raw "text" column once it has been tokenized.
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Step 6: Set up training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./flutter_codegen_model",
    evaluation_strategy="epoch",        # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,      # Adjust based on GPU memory
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,                 # keep only the two newest checkpoints
    fp16=torch.cuda.is_available(),     # Use mixed precision if GPU is available
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): per HF docs this field is NOT read by Trainer itself — the
    # checkpoint must also be passed to trainer.train() to actually resume.
    resume_from_checkpoint=checkpoint_path,
    report_to="none",                   # disable W&B/TensorBoard reporting
)
# Step 7: Initialize the Trainer with the combined train/validation splits.
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # validation split from the combined data
    tokenizer=tokenizer,  # saved alongside checkpoints; enables padding-aware collation
)
# Step 8: Train the model, resuming optimizer/scheduler/trainer state.
# TrainingArguments.resume_from_checkpoint is not consumed by Trainer; the
# checkpoint path must be passed to train() explicitly for the resume to happen.
print("Starting training from checkpoint...")
trainer.train(resume_from_checkpoint=checkpoint_path)
# Step 9: Persist the fine-tuned model and tokenizer for later inference.
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")
print("Training complete. Model saved to './flutter_codegen_model'.")