Spaces:
Runtime error
Runtime error
| import logging | |
| from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments | |
| from datasets import Dataset | |
| from sklearn.model_selection import train_test_split | |
| import re | |
# Emit timestamped, level-tagged records at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Common English function words that clean_text drops from its input.
stop_words = {
    "a", "an", "the",
    "and", "or", "but",
    "is", "are", "was", "were",
    "in", "on", "at", "of", "to", "with",
}
def stem_word(word):
    """Strip one common English suffix from ``word``.

    This is a crude rule-based stemmer: the first matching suffix among
    ``ing``, ``ed``, ``ly``, ``es``, ``s`` and ``er`` is removed. No
    linguistic normalisation is attempted (``"running"`` becomes ``"runn"``).

    Args:
        word: A single token.

    Returns:
        The token without its suffix, or the token unchanged when no suffix
        matches or when removing the suffix would leave an empty string.
    """
    # 'es' must be tried before 's': every word ending in 'es' also ends in
    # 's', so with the shorter suffix first the 'es' rule can never fire.
    suffixes = ('ing', 'ed', 'ly', 'es', 's', 'er')
    for suffix in suffixes:
        # Require a non-empty stem so tokens such as "ed" or "s" survive
        # instead of collapsing to "".
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word
def clean_text(text):
    """Normalise ``text`` into a space-joined string of stemmed tokens.

    Punctuation and digit runs are deleted, the result is lower-cased,
    tokens listed in ``stop_words`` are dropped, and each remaining token is
    passed through ``stem_word``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    normalized = re.sub(r'\d+', '', without_punctuation).lower()
    # Stop words are filtered on the un-stemmed token, then survivors are stemmed.
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return " ".join(map(stem_word, kept_tokens))
def read_prompts(file_path):
    """Load input and target texts from a tagged plain-text file.

    A line starting with ``input:`` contributes one input text and a line
    starting with ``target:`` contributes one target text. The tag and
    surrounding whitespace are removed; every other line is ignored.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(input_texts, target_texts)`` tuple of lists in file order. The
        two lists are not checked for equal length here.

    Raises:
        OSError: If the file cannot be opened.
    """
    input_prefix = "input:"
    target_prefix = "target:"
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        # Iterate the file directly rather than loading every line up front.
        for line in file:
            # Slice off only the leading tag. str.replace() would also delete
            # any later "input:" / "target:" inside the text itself.
            if line.startswith(input_prefix):
                input_texts.append(line[len(input_prefix):].strip())
            elif line.startswith(target_prefix):
                target_texts.append(line[len(target_prefix):].strip())
    return input_texts, target_texts
def prepare_data(input_texts, target_texts, tokenizer, max_length=512):
    """Tokenize paired texts into the columns used for seq2seq training.

    Args:
        input_texts: List of source strings.
        target_texts: List of target strings, parallel to ``input_texts``.
        tokenizer: Callable tokenizer that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` and exposes
            ``pad_token_id``. It is an explicit parameter because no
            module-level ``tokenizer`` exists and the callers in this file
            already pass one as the third argument.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list with one fixed-length row per example.
    """
    inputs = tokenizer(input_texts, max_length=max_length, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=max_length, truncation=True, padding="max_length")

    # Padding positions in the labels are set to -100, the index the loss
    # ignores, so the model is not trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in targets["input_ids"]
    ]
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }
# Fine-tuning
def fine_tune_model(model_name="t5-base", prompts_path="prompts.txt", output_dir="./fine_tuned_model"):
    """Fine-tune a T5 model on the prompt pairs stored in ``prompts_path``.

    The prompts are read, cleaned with ``clean_text``, split 90/10 into
    training and validation sets, tokenized, and handed to a ``Trainer``.
    The trained model and its tokenizer are then saved to ``output_dir``.
    ``TrainingArguments`` is pointed at ``./results`` for output and
    ``./logs`` for logging.

    Args:
        model_name: Pretrained checkpoint to start from.
        prompts_path: Text file with ``input:`` / ``target:`` lines.
        output_dir: Directory that receives the fine-tuned model and tokenizer.

    Returns:
        None. Errors raised after the model is loaded are logged with their
        traceback and not re-raised. Errors while loading the tokenizer or
        model propagate to the caller.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    try:
        logger.info("Reading and cleaning prompts.")
        input_texts, target_texts = read_prompts(prompts_path)

        # Fail early with a readable message instead of an opaque error from
        # the splitter further down.
        if len(input_texts) != len(target_texts):
            raise ValueError(
                f"{prompts_path} has {len(input_texts)} input lines but "
                f"{len(target_texts)} target lines; they must be paired."
            )
        if len(input_texts) < 2:
            raise ValueError(
                f"{prompts_path} must contain at least 2 input/target pairs "
                "to build a train/validation split."
            )

        # NOTE(review): targets are stemmed and stop-word-filtered as well, so
        # the model learns to emit cleaned text -- confirm this is intended.
        input_texts_cleaned = [clean_text(text) for text in input_texts]
        target_texts_cleaned = [clean_text(text) for text in target_texts]

        logger.info("Splitting dataset into training and validation sets.")
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            input_texts_cleaned, target_texts_cleaned, test_size=0.1
        )

        logger.info("Preparing datasets for training.")
        train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels, tokenizer))
        val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="steps",
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            num_train_epochs=3,
            save_steps=500,
            logging_dir="./logs",
            logging_steps=10,
        )

        logger.info("Starting model training.")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )
        trainer.train()

        logger.info("Saving fine-tuned model.")
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    except Exception as e:
        # logger.exception keeps the traceback, which str(e) alone discards.
        logger.exception("An error occurred during fine-tuning: %s", e)
# Start training only when this file is executed as a script, so importing
# the module does not kick off a fine-tuning run as a side effect.
if __name__ == "__main__":
    fine_tune_model()