Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py CHANGED (+57 -9)
@@ -2,8 +2,7 @@
 
 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
-- Optimized for L40S GPU
-- Works with pre-tokenized datasets
+- Optimized for L40S GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
@@ -13,6 +12,8 @@ import logging
 import json
 import torch
 import argparse
+import shutil
+from pathlib import Path
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, AutoConfig, BitsAndBytesConfig
 from transformers.data.data_collator import DataCollatorMixin
@@ -27,6 +28,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"
 
+# Create triton directory to avoid warning
+os.makedirs(os.path.expanduser("~/.triton/autotune"), exist_ok=True)
+
 # Default dataset with proper namespace
 DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 
@@ -294,8 +298,43 @@ class PreTokenizedCollator(DataCollatorMixin):
 
         return batch
 
+# Preprocess dataset to ensure all entries are pre-tokenized
+def preprocess_dataset(dataset, tokenizer):
+    """Ensure dataset is fully pre-tokenized to avoid tokenization during training"""
+    logger.info("Pre-processing dataset to ensure all entries are tokenized")
+
+    def process_example(example):
+        # If already has input_ids as list of integers, keep as is
+        if 'input_ids' in example and isinstance(example['input_ids'], list) and all(isinstance(x, int) for x in example['input_ids']):
+            return example
+
+        # If has conversations with content field
+        if 'conversations' in example:
+            conversations = example['conversations']
+            if isinstance(conversations, list) and len(conversations) > 0:
+                # If conversations has content field, tokenize it
+                if isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                    content = conversations[0]['content']
+                    if isinstance(content, str):
+                        example['input_ids'] = tokenizer.encode(content, add_special_tokens=False)
+                    return example
+
+        # For any other format, try to extract text and tokenize
+        text = None
+        if 'text' in example:
+            text = example['text']
+        elif 'content' in example:
+            text = example['content']
+
+        if text and isinstance(text, str):
+            example['input_ids'] = tokenizer.encode(text, add_special_tokens=False)
+
+        return example
+
+    return dataset.map(process_example)
+
 # Load and prepare dataset with proper sorting
-def load_and_prepare_dataset(dataset_name, config):
+def load_and_prepare_dataset(dataset_name, config, tokenizer=None):
     """Load and prepare the dataset for fine-tuning with proper sorting"""
     # Use the default dataset if the provided one matches the default name without namespace
     if dataset_name == "phi4-cognitive-dataset":
@@ -323,6 +362,10 @@ def load_and_prepare_dataset(dataset_name, config):
     dataset_config = config.get("dataset_config", {})
     sort_field = dataset_config.get("sort_by_field", "prompt_number")
 
+    # Preprocess dataset to ensure all entries are pre-tokenized
+    if tokenizer is not None:
+        dataset = preprocess_dataset(dataset, tokenizer)
+
     # Sort in ascending order by specified field
     logger.info(f"Sorting dataset by {sort_field} in ascending order")
     dataset = dataset.sort(sort_field)
@@ -377,9 +420,6 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     if dataset_name != DEFAULT_DATASET and "phi4-cognitive-dataset" in dataset_name:
         logger.warning(f"Dataset name may need namespace prefix. Current: {dataset_name}")
 
-    # Load and prepare dataset with proper sorting
-    dataset = load_and_prepare_dataset(dataset_name, config)
-
     # Load model settings
     original_model_name = model_config.get("model_name_or_path")
 
@@ -408,6 +448,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     )
     tokenizer.pad_token = tokenizer.eos_token
 
+    # Load and prepare dataset with proper sorting
+    dataset = load_and_prepare_dataset(dataset_name, config, tokenizer)
+
     # Get quantization config
     quant_config = config.get("quantization_config", {})
 
@@ -525,7 +568,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
         gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True)
         dataloader_workers = training_config.get("dataloader_num_workers", 4)
-        evaluation_strategy = training_config.get("evaluation_strategy", "steps")
+        eval_strategy = training_config.get("eval_strategy", "steps")  # Updated from evaluation_strategy
         load_best_model_at_end = training_config.get("load_best_model_at_end", True)
         logger.info("Using full training parameters for GPU mode")
     else:
@@ -536,7 +579,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         bf16 = False
         gradient_checkpointing = False
         dataloader_workers = 0
-        evaluation_strategy = "no"
+        eval_strategy = "no"
         load_best_model_at_end = False
         logger.warning("Using minimal parameters for CPU training in Space")
 
@@ -561,7 +604,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        evaluation_strategy=evaluation_strategy,
+        eval_strategy=eval_strategy,  # Updated from evaluation_strategy
         load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
@@ -581,6 +624,11 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             pad_token_id=tokenizer.pad_token_id,
             tokenizer=tokenizer
         ),
+        # Add label_names to avoid warning
+        compute_metrics=None,
+        tokenizer=tokenizer,  # Provide tokenizer for proper padding
+        # Define label_names to fix warning
+        label_names=["labels"]
     )
 
     # Start training