cranky-coder08 committed on
Commit
2b16304
·
verified ·
1 Parent(s): 9e9b4a6

Add files using upload-large-folder tool

Browse files
data/data_set1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/log_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
Binary file (2.7 kB). View file
 
scripts/train_dolphin_phi.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, os
2
+ from datasets import load_dataset
3
+ from transformers import EarlyStoppingCallback
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
5
+ from peft import LoraConfig, get_peft_model
6
+ from trl import SFTTrainer, SFTConfig, setup_chat_format
7
+ import torch
8
+
9
# Report the accelerator state first so a CPU-only or mismatched CUDA install
# is visible before any model weights are loaded.
print("Is a CUDA GPU available? ", torch.cuda.is_available())
print("The CUDA version is: ", torch.version.cuda)

# Base checkpoint, training data and output location.
# Each value can be overridden through an environment variable so the script
# can run on another machine without editing the source; the defaults are the
# original hard-coded values.
# NOTE(review): the default OUTPUT_DIR is an absolute path at the filesystem
# root, so os.makedirs below needs write permission there -- confirm this is
# intended (e.g. a container mount) rather than a repo-relative directory.
NAME_OF_MODEL = os.environ.get("DOLPHI_MODEL_NAME", "microsoft/phi-2")
DATASET_PATH = os.environ.get("DOLPHI_DATASET_PATH", "data/data_set1.jsonl")
OUTPUT_DIR = os.environ.get("DOLPHI_OUTPUT_DIR", "/model_output/dolphi_round_1")

# Create the output directory up front; exist_ok avoids failing on re-runs.
os.makedirs(OUTPUT_DIR, exist_ok=True)
17
+
18
# 4-bit NF4 quantisation (with double quantisation) for the base model weights.
# The compute dtype is bfloat16 so that de-quantised matmuls run in the same
# precision as the trainer, which is configured with bf16=True / fp16=False.
# The previous float16 setting mixed two half-precision formats in one run.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
24
+
25
+
26
# LoRA adapter settings: rank-32 updates (alpha 64, dropout 0.15) attached to
# the q/k/v projection modules of a causal LM; bias terms are left untouched.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=32,
    lora_alpha=64,
    lora_dropout=0.15,
    bias="none",
)
34
+
35
# Load the JSON-lines file and hold out 10% of the rows for evaluation.
# The fixed seed keeps the train/eval split reproducible between runs.
try:
    dataset = load_dataset("json", data_files=DATASET_PATH)
    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
except Exception as e:
    print(f"Error loading dataset from {DATASET_PATH}: {e}")
    # SystemExit instead of the interactive-only `exit` helper, which is
    # injected by the `site` module and is not guaranteed to exist.
    raise SystemExit(1)
print("Dataset loaded and split successfully!")

# The rest of the script reads 'prompt' and 'completion'. Validate the schema
# explicitly so a wrong column name is reported as such instead of being
# presented as a generic loading failure.
missing_columns = {"prompt", "response"} - set(split_dataset["train"].column_names)
if missing_columns:
    print(
        f"Error loading dataset from {DATASET_PATH}: "
        f"missing required column(s) {sorted(missing_columns)}"
    )
    raise SystemExit(1)

train_dataset = split_dataset["train"].rename_column("response", "completion")
eval_dataset = split_dataset["test"].rename_column("response", "completion")
print("Renamed 'response' column to 'completion' in datasets.")
49
+
50
def formatting_func(
    example,
    *,
    instruction="Summarize the following log entry in the specified format.",
):
    """Render one dataset row as a single training string.

    Args:
        example: Mapping with a 'prompt' entry (the log text) and a
            'completion' entry (the target summary).
        instruction: Text placed under the "### System Prompt:" header.
            Keyword-only; the default reproduces the original fixed
            instruction, so callers that pass only ``example`` get the same
            output as before.

    Returns:
        The instruction, log entry and summary joined under their
        "### ..." section headers.

    Raises:
        KeyError: If 'prompt' or 'completion' is missing from ``example``.
    """
    return (
        f"### System Prompt:\n{instruction}\n\n"
        f"### Log Entry:\n{example['prompt']}\n\n"
        f"### Summary:\n{example['completion']}"
    )
53
+
54
+
55
# Load the quantised base model and its tokenizer, then apply TRL's chat
# format helper. Any failure here is fatal for the run.
try:
    model = AutoModelForCausalLM.from_pretrained(
        NAME_OF_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        # bfloat16 to agree with the trainer configuration (bf16=True,
        # fp16=False); the previous float16 value mixed half-precision formats.
        torch_dtype=torch.bfloat16,
        attn_implementation="eager",
    )
    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)

    # Per the original author's note, setup_chat_format configures the
    # tokenizer and model and resizes the embedding layer; it also sets the
    # chat template, so no manual template definition is needed.
    # NOTE(review): formatting_func builds plain "### ..." text rather than
    # using a chat template, and the LoRA config only targets q/k/v
    # projections, so any tokens this helper adds would keep untrained
    # embeddings -- confirm this call is actually wanted.
    model, tokenizer = setup_chat_format(
        model,
        tokenizer,
        resize_to_multiple_of=8,
    )

    # Note: when the model object is passed directly to SFTTrainer, the
    # model_init_kwargs in SFTConfig are ignored.
    print("Model and Tokenizer loaded and configured successfully!")

except Exception as e:
    print(f'ERROR LOADING MODEL OR TOKENIZER: {e}')
    # SystemExit instead of the interactive-only `exit` helper from `site`.
    raise SystemExit(1)
82
+
83
+
84
+
85
# Supervised fine-tuning configuration, grouped by concern.
sft_config = SFTConfig(
    # Where checkpoints and logs are written.
    output_dir=OUTPUT_DIR,
    report_to=["tensorboard"],
    logging_steps=10,
    # Batch shape: 4 samples per device, accumulated over 16 steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    group_by_length=True,
    packing=False,
    max_length=2048,
    completion_only_loss=False,
    # Optimiser and learning-rate schedule.
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.001,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    num_train_epochs=2,
    # Mixed precision: bfloat16 on, float16 off.
    bf16=True,
    fp16=False,
    # Evaluate and checkpoint on the same 25-step cadence and keep the
    # checkpoint with the lowest eval loss.
    eval_strategy="steps",
    eval_steps=25,
    save_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
110
+
111
# Build the trainer around the already-loaded model. Early stopping ends the
# run after 7 consecutive evaluations without improvement.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    args=sft_config,
    formatting_func=formatting_func,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
)

print("training started")

trainer.train()

print("fine tuning complete")

# Trainer.save_model() takes the output directory only; the previous
# `merge_adapter_layers=True` keyword is not part of its signature and raised
# a TypeError after training had already finished.
# NOTE(review): if merged full weights are required, that presumably needs a
# separate step (reload the base model unquantised, attach the saved adapter,
# then merge) -- confirm against the PEFT documentation.
trainer.save_model(OUTPUT_DIR)
upload_to_hf.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
# Define the local folder to upload and the target repository name.
# NOTE(review): local_folder_path is an absolute path on one specific machine;
# adjust it before running elsewhere.
local_folder_path = "C:/Users/aravi/Desktop/dolphin phi summarizer"
repo_id = "cranky-coder08/dolphin-phi-summarizer"  # You can change this to your desired repo name

# Ensure you are logged in to the Hugging Face Hub.
# Run `huggingface-cli login` in your terminal first; that command stores
# your token for later use by this script.
try:
    api = HfApi()
    api.whoami()
    print("Successfully logged in to Hugging Face.")
except Exception as e:
    print("Please log in to Hugging Face first by running 'huggingface-cli login' in your terminal.")
    print(f"Error: {e}")
    # Exit with a non-zero status so callers can detect the failure; a bare
    # exit() reported success (status 0).
    raise SystemExit(1)
19
+
20
# Create the repository if it doesn't already exist (exist_ok makes this a
# no-op when it does).
try:
    api.create_repo(repo_id, exist_ok=True, repo_type="model")
    print(f"Repository '{repo_id}' created or already exists.")
except Exception as e:
    print(f"An error occurred while creating the repository: {e}")
    # Non-zero status: a bare exit() reported success (status 0) on failure.
    raise SystemExit(1)
27
+
28
# Upload the entire folder to the repository.
print(f"Uploading folder '{local_folder_path}' to '{repo_id}'...")
try:
    api.upload_large_folder(
        folder_path=local_folder_path,
        repo_id=repo_id,
        repo_type="model",
    )
except Exception as e:
    print(f"An error occurred during the upload: {e}")
    # Previously the error was only printed and the script still finished
    # with status 0; signal the failure to the caller instead.
    raise SystemExit(1)

print("Upload complete! The folder has been successfully pushed to the Hugging Face Hub.")
print(f"You can view your repository here: https://huggingface.co/{repo_id}")