Version 15.29 | 19 September 2024
Curiosity-15.29 - LLM
Four general-purpose Hugging Face datasets for training and one 5 MB JSONL file of inquiries (seed_tasks_5MB.jsonl) for fine-tuning.
Run in your IDE.
No context-window functionality: each prompt is handled independently, with no conversation history carried between turns.
- config.json +39 -0
- finetune.py +87 -0
- interactalt.py +17 -0
- json_read.py +8 -0
- main.py +107 -0
- model.safetensors +3 -0
- optimizer.pt +3 -0
- requirements.txt +6 -0
- seed_tasks_5MB.jsonl +0 -0
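
The run order implied by the scripts is main.py (training on the four Hugging Face datasets), then finetune.py (fine-tuning on seed_tasks_5MB.jsonl), then interactalt.py (interactive prompting). A minimal sketch, assuming seed_tasks_5MB.jsonl sits in the working directory just as finetune.py expects, for peeking at the inquiry records before fine-tuning:

from datasets import load_dataset

# Assumption: seed_tasks_5MB.jsonl is in the working directory (finetune.py loads it the same way).
seed_ds = load_dataset('json', data_files='seed_tasks_5MB.jsonl', split='train')

print(seed_ds.column_names)  # columns that finetune.py's preprocess_function will search
print(seed_ds[0])            # first inquiry record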
config.json
ADDED
@@ -0,0 +1,39 @@
{
  "_name_or_path": "/Users/kharazmimac/PycharmProjects/Curiosity-Test14/results/checkpoint-1500",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 50257
}
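
config.json records the standard 124M-parameter GPT-2 layout (12 layers, 12 heads, 768-dimensional embeddings, 1024-token context) plus text-generation defaults (do_sample=True, max_length=50) that pipelines typically pick up. A minimal sketch for inspecting it; the relative path is an assumption, so point it at wherever this config.json actually lives:

from transformers import AutoConfig

# Assumption: config.json is in the current directory; adjust the path as needed.
config = AutoConfig.from_pretrained('./')

print(config.model_type, config.n_layer, config.n_head, config.n_embd, config.n_ctx)
print(config.task_specific_params['text-generation'])  # sampling defaults recorded above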
finetune.py
ADDED
@@ -0,0 +1,87 @@
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import transformers

transformers.logging.set_verbosity_info()  # show training / fine-tuning details

# Load the fine-tuning dataset
fine_tune_ds = load_dataset('json', data_files='seed_tasks_5MB.jsonl', split='train')

# Load the pre-trained model and the tokenizer. The model comes from the training checkpoint
# at /Users/kharazmimac/PycharmProjects/Curiosity-Test14/results/checkpoint-1500.
checkpoint_dir = '/Users/kharazmimac/PycharmProjects/Curiosity-Test14/results/checkpoint-1500'  # Adjust the path as necessary
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)

# Set the padding token to the EOS token for consistency
tokenizer.pad_token = tokenizer.eos_token

# Preprocess function for fine-tuning
def preprocess_function(dataset_column_examples):
    # Adjust this list based on your dataset columns
    text_fields = ['text', 'prompt', 'response', 'chosen', 'rejected', 'content',
                   'sentence', 'concept_name', 'context',
                   'column', 'id', 'name', 'instruction', 'instances',
                   'input', 'noinput', 'output']
    for field in text_fields:
        if field in dataset_column_examples:
            texts = dataset_column_examples[field]
            break
    else:
        raise ValueError(f"No available text fields were found: {dataset_column_examples.keys()}")

    texts = [str(text) if text is not None else "" for text in texts]
    return tokenizer(texts, truncation=True, padding='max_length', max_length=256)

# Tokenize the fine-tuning dataset
tokenized_datasets = fine_tune_ds.map(preprocess_function, batched=True, remove_columns=fine_tune_ds.column_names)
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask'])

dataset_size = len(tokenized_datasets)

# Define the size of the evaluation subset
eval_size = min(200, dataset_size)

# Shuffle the dataset and take a small evaluation split
shuffled_dataset = tokenized_datasets.shuffle(seed=42)
small_eval_dataset = shuffled_dataset.select(range(eval_size))

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir='./fine_tuned_results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
)

# Data collator for language modeling (not using MLM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=small_eval_dataset,
)

# Resume from a checkpoint if fine-tuning is run in separate intervals.
# Pass resume_from_checkpoint to trainer.train() only when such a checkpoint exists, e.g.
# trainer.train(resume_from_checkpoint="/Users/kharazmimac/PycharmProjects/Curiosity-Test14/fine_tuned_results/checkpoint-9500")
trainer.train(resume_from_checkpoint="/Users/kharazmimac/PycharmProjects/Curiosity-Test14/fine_tuned_results/checkpoint-21000")

# Save the model
trainer.save_model('./fine_tuned_model')

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)
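
With mlm=False, DataCollatorForLanguageModeling builds causal-LM labels by copying input_ids and setting pad positions to -100 so the loss ignores them; because the pad token is the EOS token here, genuine EOS tokens are masked as well. A minimal sketch to observe the batch it produces (the sample sentence and the shorter max_length are made up for illustration):

from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hypothetical sample, tokenized the same way as preprocess_function above (short max_length for readability).
features = [tokenizer("A short example sentence.", truncation=True, padding='max_length', max_length=16)]
batch = collator(features)

print(batch['input_ids'][0])  # token ids, padded with the EOS id (50256)
print(batch['labels'][0])     # same ids, with pad/EOS positions replaced by -100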
interactalt.py
ADDED
@@ -0,0 +1,17 @@
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

# Load GPT-2 + GPT-2 tokenizer from the fine-tuned checkpoint path
model_path = '/Users/kharazmimac/PycharmProjects/Curiosity-Test14/fine_tuned_results/checkpoint-26394'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set up the pipeline for text generation from the user's prompt
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Interactive prompt: generate text from whatever the user enters, until the user types "exit"
while True:
    user_text = input("Enter Prompt: ")
    if user_text.lower() == 'exit':
        print("Exiting Chat...")
        break
    result = text_generator(user_text, num_return_sequences=1, truncation=True, max_length=224)
    print(result[0]['generated_text'])
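
The call above overrides max_length; other sampling knobs can be passed the same way, since the text-generation pipeline forwards generation keyword arguments to model.generate(). A minimal sketch reusing the text_generator defined above; the prompt and values are illustrative, not tuned:

result = text_generator(
    "Tell me something curious about space.",  # hypothetical prompt
    num_return_sequences=1,
    truncation=True,
    max_length=224,
    do_sample=True,
    temperature=0.8,         # lower values make output more deterministic
    top_p=0.9,               # nucleus sampling
    repetition_penalty=1.2,  # discourages repeated phrases
)
print(result[0]['generated_text'])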
json_read.py
ADDED
@@ -0,0 +1,8 @@
import json

# Validate seed_tasks_5MB.jsonl line by line and report any malformed JSON records
with open('seed_tasks_5MB.jsonl', 'r') as file:
    for i, line in enumerate(file, 1):
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error in line {i}: {e}")
main.py
ADDED
@@ -0,0 +1,107 @@
import torch  # PyTorch, used by the Trainer for training
from accelerate import Accelerator  # Needed to address NumPy compatibility issues
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling  # Hugging Face classes for the model, training configs, and data batch preparation
from datasets import load_dataset, concatenate_datasets  # Hugging Face datasets
import json
import pandas as pd

# Load the GPT-2 tokenizer and model. GPT-2 allows local training/fine-tuning with no API key, which makes it well suited to a student project.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the padding token to the EOS token -- this keeps sequence tokenization consistent and attention masking simple.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation = True

# Local dataset (seed_tasks.jsonl)
# json_ds = load_dataset('json', data_files='seed_tasks_2MB.jsonl', split='train')

# Load datasets from Hugging Face.
# Working datasets
open_ds = load_dataset("OpenAssistant/oasst1", split='train[:100%]', trust_remote_code=True)
comb_ds = load_dataset("yoonholee/combined-preference-dataset", split='train[:100%]', trust_remote_code=True)
pref_ds = load_dataset("OpenRLHF/preference_dataset_mixture2_and_safe_pku", split='train[:100%]', trust_remote_code=True)
com_ds = load_dataset("community-datasets/generics_kb", "generics_kb_simplewiki", split='train[:100%]', trust_remote_code=True)

# Datasets that do not work in conjunction with the ones above.
# congpt_ds = load_dataset("routellm/gpt4_dataset", split='train[:5%]', trust_remote_code=True)
# reward_ds = load_dataset("allenai/reward-bench", split='filtered[:5%]', trust_remote_code=True)

# Combine the dataset(s); make sure your datasets are compatible with each other
combined_dataset = concatenate_datasets([open_ds, comb_ds, pref_ds, com_ds])

# Preprocess function for the combined dataset(s)
def preprocess_function(dataset_column_examples):  # Receives a batch of examples as a dictionary of columns
    # Text fields can be adjusted based on the columns of your dataset(s)
    text_fields = [
        'text', 'prompt', 'response', 'chosen', 'rejected', 'content',
        'sentence', 'concept_name', 'context',
        'column', 'id', 'name', 'instruction', 'instances',
        'input', 'noinput', 'output']  # Keyword columns to look for in the examples dictionary
    for field in text_fields:  # Loop through the candidate fields; the first one present is used as the text column
        if field in dataset_column_examples:
            texts = dataset_column_examples[field]
            break
    else:
        raise ValueError(f"No available text fields were found: {dataset_column_examples.keys()}")  # Abort if no usable column exists
    # Elements MUST be strings (or tokenization will break)
    texts = [str(text) if text is not None else "" for text in texts]
    return tokenizer(texts, truncation=True, padding='max_length', max_length=256)  # Uniform sequence length; longer sequences are truncated

# Print dataset (column) information (also useful for debugging when combined datasets are incompatible)
print("Dataset columns:", combined_dataset.column_names)
print("Sample data from datasets:")
print(combined_dataset[:5])

# Tokenize the combined dataset(s)
tokenized_datasets = combined_dataset.map(preprocess_function, batched=True, remove_columns=combined_dataset.column_names)
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask'])

# Find the size of the dataset(s) for partitioning into smaller sets
dataset_size = len(tokenized_datasets)

# Define the sizes of the training, eval, and test subsets
train_size = min(1000, dataset_size)
eval_size = min(200, dataset_size)
test_size = min(200, dataset_size)

# Shuffle and split the dataset
shuffled_dataset = tokenized_datasets.shuffle(seed=42)
small_train_dataset = shuffled_dataset.select(range(train_size))
small_eval_dataset = shuffled_dataset.select(range(train_size, train_size + eval_size))
small_test_dataset = shuffled_dataset.select(range(train_size + eval_size, train_size + eval_size + test_size))

# Define training args
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Smaller batch size for faster per-step processing
    per_device_eval_batch_size=2,   # Smaller batch size for faster per-step processing
    num_train_epochs=3,             # Number of passes over the training subset
    weight_decay=0.01,              # L2 regularization
    save_total_limit=2,             # Number of checkpoints kept on disk
)

# Data collator (batches samples from the training set); Masked Language Modeling is disabled (causal LM, not BERT)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal language modeling, not BERT-style masking
)

# Trainer set up to work with the smaller subsets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model on the test set after training
test_results = trainer.evaluate(eval_dataset=small_test_dataset)
print("Test results:", test_results)
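
concatenate_datasets only works when every dataset shares the same features, which is why some of the commented-out datasets above "do not work in conjunction" with the others. A minimal sketch of one way around that: map each source down to a single 'text' column before concatenating. The column names passed to to_text_column are assumptions; check ds.column_names for each dataset first:

from datasets import load_dataset, concatenate_datasets

def to_text_column(ds, source_column):
    # Keep only a unified 'text' column so features match across datasets.
    return ds.map(lambda ex: {'text': str(ex[source_column])}, remove_columns=ds.column_names)

# Assumed column names -- verify with ds.column_names before relying on them.
open_ds = load_dataset("OpenAssistant/oasst1", split='train')
com_ds = load_dataset("community-datasets/generics_kb", "generics_kb_simplewiki", split='train')

normalized = [
    to_text_column(open_ds, 'text'),
    to_text_column(com_ds, 'generic_sentence'),
]
combined = concatenate_datasets(normalized)
print(combined.column_names)  # ['text']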
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:288d516d44483fcb9ccd86f8fc09dbf5233f074d59ede25bc945b21a243a8609
size 497774208
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:492b640ba5653375023c8f5da073c5c420f632c2357d001a0ab1759853209cd2
size 995638202
requirements.txt
ADDED
@@ -0,0 +1,6 @@
pandas~=2.2.2
accelerate~=0.34.2
transformers~=4.44.2
datasets~=3.0.0
numpy~=1.26.4
torch~=2.2.2
seed_tasks_5MB.jsonl
ADDED
The diff for this file is too large to render.