Spaces:
Runtime error
Runtime error
| import json | |
| from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling | |
| from torch.utils.data import Dataset | |
| import os | |
| # Step 1: Load and Preprocess Data | |
class SpiderDataset(Dataset):
    """Text-to-SQL examples rendered as single causal-LM training sequences.

    Every JSON file in ``file_paths`` is read with ``json.load`` and its
    entries are appended to ``self.data``. Each entry is accessed through its
    ``'question'`` and ``'query'`` keys.

    A decoder-only language model predicts token ``t + 1`` from tokens
    ``0..t`` of the *same* sequence, so the question and the SQL query are
    joined into one text and the labels are a copy of the input ids (padding
    excluded) rather than a separately tokenized target.

    Args:
        file_paths: Iterable of paths to JSON files, each containing a list
            of example dicts.
        tokenizer: Callable that accepts ``(text, max_length=, padding=,
            truncation=, return_tensors="pt")`` and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors of shape
            ``(1, max_length)``. Its ``eos_token`` attribute, when set, is
            appended to every sequence.
        max_length: Fixed length every sequence is padded or truncated to.
            Long question/query pairs lose their tail (including the EOS
            marker) to truncation.
    """

    # Text inserted between the question and the SQL query so the model can
    # learn where the query begins.
    SEPARATOR = "\nSQL: "

    # Label value assigned to padded positions so they are left out of the
    # loss (the default ignore index of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, file_paths, tokenizer, max_length=128):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        for file_path in file_paths:
            # Explicit encoding: the platform default is not always UTF-8.
            with open(file_path, "r", encoding="utf-8") as f:
                self.data.extend(json.load(f))

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        Returns:
            A dict with 1-D tensors of length ``max_length``:
            ``"input_ids"`` (question + separator + SQL + EOS, padded),
            ``"attention_mask"`` (1 for real tokens, 0 for padding) and
            ``"labels"`` (``input_ids`` with padded positions replaced by
            ``IGNORE_INDEX``).

        Raises:
            KeyError: If the example lacks a ``'question'`` or ``'query'`` key.
        """
        item = self.data[idx]
        eos = getattr(self.tokenizer, "eos_token", None) or ""
        text = f"{item['question']}{self.SEPARATOR}{item['query']}{eos}"

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Mask by attention rather than by token id: the pad token may be the
        # same id as EOS, and the real EOS must still be learned.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
# Steps 2-7: fine-tune distilgpt2 on Spider question/SQL pairs and save the
# resulting model and tokenizer.
import inspect  # local import: used only to pick the right TrainingArguments keyword

# Single source of truth for where checkpoints and the final model are written.
OUTPUT_DIR = "./distilgpt2-sql-converter"

# Step 2: Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
# The dataset pads every sample to a fixed length, which requires a pad token;
# reuse EOS for that purpose.
tokenizer.pad_token = tokenizer.eos_token

# Load model with language model head
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
# Keep the model config in agreement with the tokenizer's padding choice.
model.config.pad_token_id = tokenizer.pad_token_id

# Step 3: Load Datasets
# The Spider JSON files are expected under `text2sql_pepe/`, relative to the
# working directory.
file_paths = [
    "text2sql_pepe/train_others.json",
    "text2sql_pepe/train_spider.json",
]
# dev.json is held out so the per-epoch evaluation measures generalization.
eval_file_paths = ["text2sql_pepe/dev.json"]
# NOTE: text2sql_pepe/test.json is intentionally not used for training, so it
# stays available as an untouched test set.
train_dataset = SpiderDataset(file_paths, tokenizer)
eval_dataset = SpiderDataset(eval_file_paths, tokenizer)

# Step 4: Define Training Arguments
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer Transformers releases; use whichever this install
# accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    **{eval_strategy_key: "epoch"},
)

# Step 5: Initialize Trainer with Data Collator
# With mlm=False the collator builds causal-LM labels from input_ids and
# excludes pad tokens from the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Per-epoch evaluation cannot run without an evaluation dataset.
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Step 6: Train the Model
trainer.train()

# Step 7: Save the Model and Tokenizer
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)