ariankharazmi committed on
Commit 7fcc512 · verified
1 Parent(s): 98f33fe

Version 15.29 | 19 September 2024


Curiosity-15.29 - LLM

Four general-purpose Hugging Face datasets for training* and one 5 MB JSONL file of inquiries for fine-tuning.

Run in your IDE

No context window functionality.
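To try the released weights directly (config.json plus model.safetensors in this commit), here is a minimal sketch, assuming the repository has been cloned locally and the packages from requirements.txt are installed; the tokenizer falls back to stock GPT-2 because no tokenizer files are included in this commit:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained(".")       # directory containing config.json + model.safetensors
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # stock GPT-2 tokenizer (not shipped in this commit)

inputs = tokenizer("Curiosity is", return_tensors="pt")
outputs = model.generate(**inputs, max_length=50, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))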

Files changed (9)
  1. config.json +39 -0
  2. finetune.py +87 -0
  3. interactalt.py +17 -0
  4. json_read.py +8 -0
  5. main.py +107 -0
  6. model.safetensors +3 -0
  7. optimizer.pt +3 -0
  8. requirements.txt +6 -0
  9. seed_tasks_5MB.jsonl +0 -0
config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "_name_or_path": "/Users/kharazmimac/PycharmProjects/Curiosity-Test14/results/checkpoint-1500",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.44.2",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
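This config matches the standard GPT-2 small layout (12 layers, 12 heads, 768-dimensional embeddings, 1024 positions, roughly 124M parameters). A minimal sketch for inspecting it with transformers, assuming this repository is cloned and used as the working directory:

from transformers import GPT2Config

config = GPT2Config.from_pretrained(".")  # directory containing config.json
print(config.n_layer, config.n_head, config.n_embd, config.n_positions)  # expected: 12 12 768 1024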
finetune.py ADDED
@@ -0,0 +1,87 @@
+ import torch
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+ from datasets import load_dataset
+ import transformers
+ transformers.logging.set_verbosity_info()  # Log training / fine-tuning details
+ # Load the fine-tuning dataset
+ fine_tune_ds = load_dataset('json', data_files='seed_tasks_5MB.jsonl', split='train')
+
+ # Load the pre-trained model and tokenizer from the training checkpoint at /Users/kharazmimac/PycharmProjects/Curiosity-Test14/results/checkpoint-1500
+ checkpoint_dir = '/Users/kharazmimac/PycharmProjects/Curiosity-Test14/results/checkpoint-1500'  # Adjust the path as necessary
+ model_name = 'gpt2'
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
+
+ # Set padding token for consistency
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Preprocess function for fine-tuning
+ def preprocess_function(dataset_column_examples):
+     # Adjust this list based on your dataset columns
+     text_fields = ['text', 'prompt', 'response', 'chosen', 'rejected', 'content',
+                    'sentence', 'concept_name', 'context',
+                    'column', 'id', 'name', 'instruction', 'instances',
+                    'input', 'noinput', 'output']
+     for field in text_fields:
+         if field in dataset_column_examples:
+             texts = dataset_column_examples[field]
+             break
+     else:
+         raise ValueError(f"No available text fields were found: {dataset_column_examples.keys()}")
+
+     texts = [str(text) if text is not None else "" for text in texts]
+     return tokenizer(texts, truncation=True, padding='max_length', max_length=256)
+
+ # Tokenize the fine-tuning dataset
+ tokenized_datasets = fine_tune_ds.map(preprocess_function, batched=True, remove_columns=fine_tune_ds.column_names)
+ tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask'])
+
+ dataset_size = len(tokenized_datasets)
+
+ # Define the size of the evaluation subset
+ eval_size = min(200, dataset_size)
+
+ # Shuffle and split the dataset
+ shuffled_dataset = tokenized_datasets.shuffle(seed=42)
+ small_eval_dataset = shuffled_dataset.select(range(eval_size))
+
+ # Fine-tuning arguments
+ training_args = TrainingArguments(
+     output_dir='./fine_tuned_results',
+     num_train_epochs=3,
+     per_device_train_batch_size=2,
+     save_total_limit=2,
+     learning_rate=2e-5,
+     weight_decay=0.01,
+     eval_strategy='epoch',
+     logging_dir='./logs',
+     logging_steps=10,
+     save_steps=500,
+ )
+
+ # Data collator for language modeling (not using MLM)
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False,
+ )
+
+ # Trainer setup
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets,
+     data_collator=data_collator,
+     tokenizer=tokenizer,
+     eval_dataset=small_eval_dataset
+ )
+
+ # Resume from a checkpoint if fine-tuning is run in separate intervals.
+ # Pass e.g. resume_from_checkpoint="/Users/kharazmimac/PycharmProjects/Curiosity-Test14/fine_tuned_results/checkpoint-9500" to trainer.train() if needed.
+ trainer.train(resume_from_checkpoint="/Users/kharazmimac/PycharmProjects/Curiosity-Test14/fine_tuned_results/checkpoint-21000")
+
+ # Save the model
+ trainer.save_model('./fine_tuned_model')
+
+ # Evaluate the model
+ eval_results = trainer.evaluate()
+ print("Evaluation results:", eval_results)
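The dictionary returned by trainer.evaluate() includes eval_loss (token-level cross-entropy). A small follow-on sketch, assuming the eval_results variable from the script above, for reporting perplexity:

import math

perplexity = math.exp(eval_results["eval_loss"])  # perplexity = exp(cross-entropy loss)
print(f"Eval perplexity: {perplexity:.2f}")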
interactalt.py ADDED
@@ -0,0 +1,17 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
+
+ # Load GPT-2 + GPT-2 tokenizer from the fine-tuned checkpoint path
+ model_path = '/Users/kharazmimac/PycharmProjects/Curiosity-Test14/fine_tuned_results/checkpoint-26394'
+ tokenizer = GPT2Tokenizer.from_pretrained(model_path)
+ model = GPT2LMHeadModel.from_pretrained(model_path)
+
+ # Set up a pipeline for text generation (responding to the user prompt)
+ text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
+
+ # Interactive prompt: generate text based on the user's entered prompt
+ while True:
+     user_text = input("Enter Prompt: ")
+     if user_text.lower() == 'exit':  # type "exit" to end the chat
+         break
+     result = text_generator(user_text, num_return_sequences=1, truncation=True, max_length=224)
+     print(result[0]['generated_text'])
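The pipeline call above relies on default generation settings. A hedged sketch of passing explicit sampling parameters (standard generate() keyword arguments forwarded by the pipeline; the prompt and values are illustrative, not tuned):

result = text_generator(
    "What is a neural network?",  # example prompt
    num_return_sequences=1,
    truncation=True,
    max_length=224,
    do_sample=True,    # sample instead of greedy decoding
    temperature=0.8,   # lower values make output more deterministic
    top_p=0.9,         # nucleus sampling cutoff
)
print(result[0]["generated_text"])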
json_read.py ADDED
@@ -0,0 +1,8 @@
+ import json
+
+ with open('seed_tasks_5MB.jsonl', 'r') as file:
+     for i, line in enumerate(file, 1):
+         try:
+             json.loads(line)
+         except json.JSONDecodeError as e:
+             print(f"Error in line {i}: {e}")
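json_read.py only reports lines that fail to parse. A small extension sketch (not part of this commit) that also skips blank lines and counts valid records:

import json

valid, invalid = 0, 0
with open('seed_tasks_5MB.jsonl', 'r') as file:
    for i, line in enumerate(file, 1):
        if not line.strip():  # tolerate blank lines
            continue
        try:
            json.loads(line)
            valid += 1
        except json.JSONDecodeError as e:
            invalid += 1
            print(f"Error in line {i}: {e}")
print(f"{valid} valid records, {invalid} invalid records")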
main.py ADDED
@@ -0,0 +1,107 @@
+ import torch  # PyTorch for training purposes
+ from accelerate import Accelerator  # Needed to address numpy issues
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling  # Hugging Face packages for the GPT-2 model classes, training, training configs, and data batch preparation
+ from datasets import load_dataset, concatenate_datasets  # Hugging Face datasets
+ import json
+ import pandas as pd
+
+ # Load the tokenizer and GPT-2 model. GPT-2 allows local training/fine-tuning with no API key, which suits a student project.
+ model_name = 'gpt2'
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+ # Set the padding token to the EOS token -- keeps sequence tokenization consistent and attention masking simple.
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.truncation = True
+
+ # Local dataset (seed_tasks.jsonl)
+ #json_ds = load_dataset('json', data_files='seed_tasks_2MB.jsonl', split='train')
+
+ # Load datasets from Hugging Face.
+ # Working datasets
+ open_ds = load_dataset("OpenAssistant/oasst1", split='train[:100%]', trust_remote_code=True)
+ comb_ds = load_dataset("yoonholee/combined-preference-dataset", split='train[:100%]', trust_remote_code=True)
+ pref_ds = load_dataset("OpenRLHF/preference_dataset_mixture2_and_safe_pku", split='train[:100%]', trust_remote_code=True)
+ com_ds = load_dataset("community-datasets/generics_kb", "generics_kb_simplewiki", split='train[:100%]', trust_remote_code=True)
+
+ # Datasets that do not work in conjunction with the others.
+ # congpt_ds = load_dataset("routellm/gpt4_dataset", split='train[:5%]', trust_remote_code=True)
+ # reward_ds = load_dataset("allenai/reward-bench", split='filtered[:5%]', trust_remote_code=True)
+
+ # Combine the dataset(s); make sure your datasets are compatible with each other
+ combined_dataset = concatenate_datasets([open_ds, comb_ds, pref_ds, com_ds])
+
+ # Preprocess function for the combined dataset(s)
+ def preprocess_function(dataset_column_examples):  # Receives a batch of examples as a dictionary
+     # Text fields can be adjusted based on the data columns of your dataset(s)
+     text_fields = [
+         'text', 'prompt', 'response', 'chosen', 'rejected', 'content',
+         'sentence', 'concept_name', 'context',
+         'column', 'id', 'name', 'instruction', 'instances',
+         'input', 'noinput', 'output']  # Adjusted for the dataset(s) columns; looks for these keys in the examples dictionary
+     for field in text_fields:  # Loop through the text fields; if a field exists, assign its values to texts and leave the loop
+         if field in dataset_column_examples:
+             texts = dataset_column_examples[field]
+             break
+     else:
+         raise ValueError(f"No available text fields were found: {dataset_column_examples.keys()}")  # If no matching field is found, stop
+     # Elements MUST be strings (or tokenization will break)
+     texts = [str(text) if text is not None else "" for text in texts]
+     return tokenizer(texts, truncation=True, padding='max_length', max_length=256)  # Adjust if needed -- uniform sequence tokenization; longer sequences are truncated
+
+ # Print dataset (column) information (also useful for debugging when your combined dataset(s) don't work together)
+ print("Dataset columns:", combined_dataset.column_names)
+ print("Sample data from datasets:")
+ print(combined_dataset[:5])
+
+ # Tokenize the combined dataset(s)
+ tokenized_datasets = combined_dataset.map(preprocess_function, batched=True, remove_columns=combined_dataset.column_names)
+ tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask'])
+
+ # Find the (len) size of the dataset(s) for partitioning into smaller sets
+ dataset_size = len(tokenized_datasets)
+
+ # Define the sizes of the training, eval, and test subsets
+ train_size = min(1000, dataset_size)
+ eval_size = min(200, dataset_size)
+ test_size = min(200, dataset_size)
+
+ # Shuffle and split the dataset
+ shuffled_dataset = tokenized_datasets.shuffle(seed=42)
+ small_train_dataset = shuffled_dataset.select(range(train_size))
+ small_eval_dataset = shuffled_dataset.select(range(train_size, train_size + eval_size))
+ small_test_dataset = shuffled_dataset.select(range(train_size + eval_size, train_size + eval_size + test_size))
+
+ # Define training args
+ training_args = TrainingArguments(
+     output_dir='./results',
+     eval_strategy='epoch',
+     learning_rate=2e-5,
+     per_device_train_batch_size=2,  # Smaller batch size for faster processing
+     per_device_eval_batch_size=2,  # Smaller batch size for faster processing
+     num_train_epochs=3,  # Number of passes over the training subset
+     weight_decay=0.01,  # L2 regularization
+     save_total_limit=2,  # Number of checkpoints kept on disk
+ )
+
+ # Data collator (batches samples from the training set); Masked Language Modeling disabled (causal LM, not BERT)
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False,  # Causal LM objective, not masked LM
+ )
+
+ # Trainer is set up to work with the smaller subsets
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=small_train_dataset,
+     eval_dataset=small_eval_dataset,
+     data_collator=data_collator,
+     tokenizer=tokenizer,
+ )
+
+ # Train the model
+ trainer.train()
+
+ # Evaluate the model on the test set after training
+ test_results = trainer.evaluate(eval_dataset=small_test_dataset)
+ print("Test results:", test_results)
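main.py relies on the Trainer's periodic checkpoints under ./results (finetune.py later loads results/checkpoint-1500). A hedged sketch for also saving a final copy to a stable path; the './results/final' directory name is illustrative and not part of this commit:

trainer.save_model('./results/final')         # final weights + config
tokenizer.save_pretrained('./results/final')  # tokenizer files alongside them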
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:288d516d44483fcb9ccd86f8fc09dbf5233f074d59ede25bc945b21a243a8609
+ size 497774208
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:492b640ba5653375023c8f5da073c5c420f632c2357d001a0ab1759853209cd2
+ size 995638202
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pandas~=2.2.2
+ accelerate~=0.34.2
+ transformers~=4.44.2
+ datasets~=3.0.0
+ numpy~=1.26.0
+ torch~=2.2.2
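A quick sanity check sketch after installing the pins above (assumes pip install -r requirements.txt has already been run; package names mirror this file):

from importlib.metadata import version

for pkg in ("pandas", "accelerate", "transformers", "datasets", "numpy", "torch"):
    print(pkg, version(pkg))  # compare against the versions pinned in requirements.txt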
seed_tasks_5MB.jsonl ADDED
The diff for this file is too large to render. See raw diff