Add files using upload-large-folder tool

Browse files

Files changed (5) hide show

data/data_set1.jsonl +0 -0
data/log_dataset.json +0 -0
requirements.txt +0 -0
scripts/train_dolphin_phi.py +128 -0
upload_to_hf.py +39 -0

data/data_set1.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

data/log_dataset.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

Binary file (2.7 kB). View file

scripts/train_dolphin_phi.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import torch, os
+from datasets import load_dataset
+from transformers import EarlyStoppingCallback
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
+from peft import LoraConfig, get_peft_model
+from trl import SFTTrainer, SFTConfig, setup_chat_format
+import torch
+print("Is a CUDA GPU available? ", torch.cuda.is_available())
+print("The CUDA version is: ", torch.version.cuda)
+NAME_OF_MODEL = "microsoft/phi-2"
+DATASET_PATH = "data/data_set1.jsonl"
+OUTPUT_DIR = "/model_output/dolphi_round_1"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.float16
+)
+lora_config = LoraConfig(
+    r=32,
+    lora_alpha=64,
+    bias='none',
+    target_modules=["q_proj", "k_proj", "v_proj"],
+    lora_dropout=0.15,
+    task_type="CAUSAL_LM"
+)
+try:
+    # Load dataset with your 'prompt' and 'response' keys
+    dataset = load_dataset("json", data_files=DATASET_PATH)
+    split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
+    train_dataset = split_dataset["train"]
+    eval_dataset = split_dataset["test"]
+    print("Dataset loaded and split successfully!")
+    train_dataset = train_dataset.rename_column("response", "completion")
+    eval_dataset = eval_dataset.rename_column("response", "completion")
+    print("Renamed 'response' column to 'completion' in datasets.")
+except Exception as e:
+    print(f"Error loading dataset from {DATASET_PATH}: {e}")
+    exit(1)
+def formatting_func(example):
+    text = f"### System Prompt:\nSummarize the following log entry in the specified format.\n\n### Log Entry:\n{example['prompt']}\n\n### Summary:\n{example['completion']}"
+    return text
+try:
+    # Use setup_chat_format to automatically configure the tokenizer and model.
+    # This prevents manual syntax errors and resizes the embedding layer.
+    model = AutoModelForCausalLM.from_pretrained(
+        NAME_OF_MODEL,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+        torch_dtype=torch.float16,
+        attn_implementation="eager"
+    )
+    tokenizer = AutoTokenizer.from_pretrained(NAME_OF_MODEL, trust_remote_code=True)
+    model, tokenizer = setup_chat_format(
+        model,
+        tokenizer,
+        resize_to_multiple_of=8
+    )
+    # Note: When passing the model object directly to SFTTrainer,
+    # the model_init_kwargs in SFTConfig are ignored.
+    # The setup_chat_format function also correctly sets the chat template,
+    # making the manual definition unnecessary.
+    print("Model and Tokenizer loaded and configured successfully!")
+except Exception as e:
+    print(f'ERROR LOADING MODEL OR TOKENIZER: {e}')
+    exit(1)
+sft_config = SFTConfig(
+    output_dir=OUTPUT_DIR,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    learning_rate=1e-4,
+    weight_decay=0.001,
+    bf16=True,
+    warmup_ratio=0.03,
+    group_by_length=True,
+    lr_scheduler_type='cosine',
+    num_train_epochs=2,
+    logging_steps=10,
+    save_steps=25,
+    fp16=False,
+    optim="paged_adamw_8bit",
+    report_to=["tensorboard"],
+    eval_strategy="steps",
+    eval_steps=25,
+    packing=False,
+    completion_only_loss=False,
+    max_length=2048,
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    greater_is_better=False
+)
+trainer=SFTTrainer(
+    model=model,
+    processing_class=tokenizer,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    peft_config=lora_config,
+    args=sft_config,
+    formatting_func=formatting_func,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)]
+)
+print("training started")
+trainer.train()
+print("fine tuning complete")
+trainer.save_model(OUTPUT_DIR, merge_adapter_layers=True)

upload_to_hf.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os
+from huggingface_hub import HfApi
+# Define the local folder to upload and the target repository name
+local_folder_path = "C:/Users/aravi/Desktop/dolphin phi summarizer"
+repo_id = "cranky-coder08/dolphin-phi-summarizer" # You can change this to your desired repo name
+# Ensure you are logged in to the Hugging Face Hub
+# You need to run `huggingface-cli login` in your terminal first.
+# This command stores your token securely.
+try:
+    api = HfApi()
+    api.whoami()
+    print("Successfully logged in to Hugging Face.")
+except Exception as e:
+    print("Please log in to Hugging Face first by running 'huggingface-cli login' in your terminal.")
+    print(f"Error: {e}")
+    exit()
+# Create the repository if it doesn't already exist
+try:
+    api.create_repo(repo_id, exist_ok=True, repo_type="model")
+    print(f"Repository '{repo_id}' created or already exists.")
+except Exception as e:
+    print(f"An error occurred while creating the repository: {e}")
+    exit()
+# Upload the entire folder to the repository
+print(f"Uploading folder '{local_folder_path}' to '{repo_id}'...")
+try:
+    api.upload_large_folder(
+        folder_path=local_folder_path,
+        repo_id=repo_id,
+        repo_type="model",
+    )
+    print("Upload complete! The folder has been successfully pushed to the Hugging Face Hub.")
+    print(f"You can view your repository here: https://huggingface.co/{repo_id}")
+except Exception as e:
+    print(f"An error occurred during the upload: {e}")