"""
HuggingFace Spaces Training Interface for RC+ΞΎ Fine-Tuning
Supports GPU-accelerated training with progress monitoring
"""
import os
import warnings

# Silence noisy FutureWarnings from huggingface_hub
warnings.filterwarnings('ignore', category=FutureWarning, module='huggingface_hub')

# Handle OpenMP threading issues (set before the heavy numeric imports below)
os.environ['OMP_NUM_THREADS'] = '1'
import gradio as gr
import spaces # HuggingFace Spaces GPU support
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from datasets import load_dataset
# Try to import LoRA, but make it optional
try:
    from peft import LoraConfig, get_peft_model
    LORA_AVAILABLE = True
except ImportError:
    LORA_AVAILABLE = False

from datetime import datetime
def check_gpu():
"""Check GPU availability"""
if torch.cuda.is_available():
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
return f"βœ… GPU Available: {gpu_name} ({gpu_memory:.1f}GB)"
return "❌ No GPU - Training will be slow"
def train_model(
model_name: str,
dataset_file,
num_epochs: int,
batch_size: int,
learning_rate: float,
max_length: int
):
"""Train RC+ΞΎ model - wrapper function"""
# Extract file path from Gradio file object
dataset_path = dataset_file.name if hasattr(dataset_file, 'name') else dataset_file
# Call the GPU-decorated training function
yield from train_model_gpu(model_name, dataset_path, num_epochs, batch_size, learning_rate, max_length)
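# @spaces.GPU reserves hardware only while the decorated function runs, so the
# wrapper above stays CPU-side and hands a plain file path to the GPU function.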
@spaces.GPU(duration=14400) # 4 hours GPU reservation (enough for 1-2 epochs on 7B model)
def train_model_gpu(
model_name: str,
dataset_path: str,
num_epochs: int,
batch_size: int,
learning_rate: float,
max_length: int
):
"""Train RC+ΞΎ model - GPU execution"""
yield f"πŸš€ Starting training at {datetime.now().strftime('%H:%M:%S')}\n"
yield f"πŸ“Š GPU Status: {check_gpu()}\n"
try:
# Load dataset
yield f"\nπŸ“ Loading dataset from {dataset_path}...\n"
try:
dataset = load_dataset('json', data_files=dataset_path, split='train')
yield f"βœ… Loaded {len(dataset)} examples\n"
except Exception as e:
yield f"\n❌ Failed to load dataset: {str(e)}\n"
yield f"πŸ’‘ Make sure your JSONL file has this format:\n"
yield f'{{\n "instruction": "...",\n "input": "...",\n "output": "..."\n}}\n'
return
# Validate dataset structure
if len(dataset) == 0:
yield f"\n❌ Dataset is empty!\n"
return
first_example = dataset[0]
yield f"πŸ“Š Dataset fields found: {list(first_example.keys())}\n"
yield f"πŸ“ Sample row 1: {dict(list(first_example.items())[:3])}\n"
# Check for required fields with flexible matching
required_fields = ["instruction", "input", "output"]
missing_fields = [f for f in required_fields if f not in first_example]
if missing_fields:
yield f"\n⚠️ Expected fields not found: {missing_fields}\n"
yield f"πŸ’‘ Common field name alternatives:\n"
yield f" β€’ 'instruction' could be: 'prompt', 'question', 'task'\n"
yield f" β€’ 'input' could be: 'context', 'example', 'text'\n"
yield f" β€’ 'output' could be: 'response', 'answer', 'completion'\n"
yield f"\n❌ Cannot proceed without: {missing_fields}\n"
yield f"βœ… Please upload JSONL with: instruction, input, output\n\n"
yield f"πŸ“‹ Sample JSONL format:\n"
yield f'{{"instruction": "Q: What is AI?", "input": "", "output": "AI is artificial intelligence..."}}\n'
yield f'{{"instruction": "Summarize", "input": "Long text...", "output": "Summary..."}}\n'
return
yield f"βœ… Dataset structure valid\n"
# Load model and tokenizer
yield f"\nπŸ€– Loading model: {model_name}...\n"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Try loading with device_map, fall back to manual device placement
try:
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
device_map="auto",
trust_remote_code=True
)
except ValueError as e:
# Fall back if device_map='auto' not supported
if 'device_map' in str(e):
yield f"⚠️ Model doesn't support device_map='auto', using manual placement\n"
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
trust_remote_code=True
)
if torch.cuda.is_available():
model = model.to('cuda')
else:
raise
# Enable gradient checkpointing to reduce memory usage
if hasattr(model, 'gradient_checkpointing_enable'):
model.gradient_checkpointing_enable()
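        # Checkpointing trades extra compute for memory: activations are
        # recomputed during the backward pass instead of being stored.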
# Apply LoRA for memory-efficient training
yield f"🎯 Applying LoRA (Low-Rank Adaptation) for efficient training...\n"
if LORA_AVAILABLE:
            lora_config = LoraConfig(
                r=8,                    # LoRA rank
                lora_alpha=16,          # LoRA alpha (scaling factor)
                # Projection names vary by architecture (gpt2, for example,
                # uses "c_attn"); the try/except below falls back to full
                # fine-tuning if none of these modules are found
                target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
                lora_dropout=0.05,
                bias="none",
                task_type="CAUSAL_LM"
            )
            try:
                model = get_peft_model(model, lora_config)
                # get_nb_trainable_parameters() returns a (trainable, total) tuple
                trainable, total = model.get_nb_trainable_parameters()
                yield f"βœ… LoRA applied: Only {trainable:,} trainable parameters (vs {total:,} total)\n"
            except Exception as e:
                yield f"⚠️ LoRA not applicable to this model, continuing without: {str(e)}\n"
else:
yield f"⚠️ PEFT library not available. Training without LoRA (full fine-tuning)\n"
yield f"πŸ’‘ Consider using smaller batch size or reduce epochs to save memory\n"
        # Flash Attention 2 is normally requested at load time via
        # attn_implementation="flash_attention_2"; this post-hoc hook only
        # takes effect on the few model classes that expose it.
        if hasattr(model, 'enable_flash_attention_2'):
            try:
                model.enable_flash_attention_2()
                yield f"⚑ Flash Attention 2 enabled for memory efficiency\n"
            except Exception:
                pass  # Flash attention not available, continue without it
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
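        # (GPT-style tokenizers ship without a pad token; reusing EOS is the
        # standard workaround so batched padding works.)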
total_params = sum(p.numel() for p in model.parameters())/1e9
yield f"βœ… Model loaded: {total_params:.2f}B parameters\n"
if LORA_AVAILABLE:
yield f"πŸ’Ύ Memory optimization: Gradient checkpointing + LoRA + reduced precision enabled\n"
else:
yield f"πŸ’Ύ Memory optimization: Gradient checkpointing + reduced precision enabled\n"
# Tokenize dataset
yield f"\nπŸ”€ Tokenizing dataset...\n"
def tokenize_function(examples):
texts = []
for inst, inp, out in zip(examples["instruction"], examples["input"], examples["output"]):
if inp:
text = f"### Instruction:\n{inst}\n\n### Input:\n{inp}\n\n### Response:\n{out}"
else:
text = f"### Instruction:\n{inst}\n\n### Response:\n{out}"
texts.append(text)
return tokenizer(
texts,
truncation=True,
max_length=max_length,
padding="max_length"
)
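        # A rendered example with a non-empty "input" looks like:
        #   ### Instruction:
        #   Summarize
        #
        #   ### Input:
        #   Long text...
        #
        #   ### Response:
        #   Summary...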
try:
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=dataset.column_names
)
yield f"βœ… Tokenized {len(tokenized_dataset)} examples\n"
except Exception as e:
yield f"\n❌ Tokenization failed: {str(e)}\n"
yield f"\nπŸ“Š Dataset diagnostics:\n"
yield f" β€’ Total examples: {len(dataset)}\n"
yield f" β€’ Fields: {dataset.column_names}\n"
yield f" β€’ First row keys: {list(dataset[0].keys())}\n"
yield f"\nπŸ’‘ Common issues:\n"
yield f" β€’ Null/None values in instruction, input, or output\n"
yield f" β€’ Non-string values (numbers, objects, arrays)\n"
yield f" β€’ Invalid UTF-8 encoding\n"
yield f" β€’ Empty strings in required fields\n"
import traceback
yield f"\nπŸ“‹ Error details:\n{traceback.format_exc()}\n"
return
# Split dataset
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
yield f"πŸ“Š Train: {len(train_dataset)} | Eval: {len(eval_dataset)}\n"
# Training arguments
yield f"\nβš™οΈ Configuring training...\n"
output_dir = f"./rc_xi_trained_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # Auto-adjust batch size based on GPU memory (total capacity, not currently free)
        adjusted_batch_size = batch_size
        if torch.cuda.is_available():
            total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
            if total_memory_gb < 16:
                adjusted_batch_size = max(1, batch_size // 2)
                yield f"⚠️ GPU memory limited ({total_memory_gb:.1f}GB). Reducing batch size to {adjusted_batch_size}\n"
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=adjusted_batch_size,
per_device_eval_batch_size=adjusted_batch_size,
gradient_accumulation_steps=8, # Increased for smaller batch sizes
learning_rate=learning_rate,
warmup_steps=100,
logging_steps=1, # Log every step for immediate feedback
eval_steps=50,
save_steps=100,
eval_strategy="steps",
save_strategy="steps",
save_total_limit=2,
fp16=torch.cuda.is_available(),
report_to=[],
load_best_model_at_end=True,
max_grad_norm=1.0, # Gradient clipping for stability
optim="adamw_torch", # Standard PyTorch Adam optimizer
)
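        # Effective batch size = per-device batch Γ— gradient accumulation
        # (e.g. 2 Γ— 8 = 16 sequences per optimizer step with the defaults)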
yield f"βœ… Training configured\n"
yield f" β€’ Epochs: {num_epochs}\n"
yield f" β€’ Batch size: {adjusted_batch_size}\n"
yield f" β€’ Gradient accumulation: 8\n"
yield f" β€’ Learning rate: {learning_rate}\n"
yield f" β€’ Max length: {max_length}\n"
yield f" β€’ FP16: {torch.cuda.is_available()}\n"
yield f" β€’ Optimizer: adamw_torch\n"
# Data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False
)
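        # With mlm=False the collator builds causal-LM labels: a copy of
        # input_ids with pad positions set to -100 so the loss ignores them.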
# Trainer with callbacks removed (using manual training for better progress streaming)
yield f"\nπŸ‹οΈ Initializing trainer...\n"
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
yield f"βœ… Trainer initialized. Starting training loop...\n"
yield f"⏳ First step may take 30-60 seconds (loading data, first forward/backward pass)...\n\n"
        try:
            # Manual training loop with progress streaming
            import time
            train_dataloader = trainer.get_train_dataloader()
            steps_per_epoch = len(train_dataloader)
            total_steps = steps_per_epoch * num_epochs
            # Trainer only builds its optimizer/scheduler inside .train(); since
            # we drive the loop ourselves, create them explicitly first
            # (scheduler length counts optimizer steps, not micro-batches)
            trainer.create_optimizer_and_scheduler(num_training_steps=max(1, total_steps // 8))
            start_time = time.time()
            step = 0
for epoch in range(num_epochs):
yield f"\nπŸ“… EPOCH {epoch + 1}/{num_epochs}\n"
yield f"{'='*50}\n"
model.train()
epoch_loss = 0
steps_in_epoch = 0
                for batch_idx, batch in enumerate(train_dataloader):
                    step += 1
                    steps_in_epoch += 1
                    # Move batch to the model's device
                    batch = {k: v.to(model.device) for k, v in batch.items()}
                    # Forward pass
                    outputs = model(**batch)
                    loss = outputs.loss
                    # Backward pass, scaled so accumulated gradients average
                    # rather than sum across the 8 micro-batches
                    (loss / 8).backward()
                    # Gradient accumulation: optimizer step every 8 micro-batches
                    if (steps_in_epoch % 8) == 0 or steps_in_epoch == steps_per_epoch:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                        trainer.optimizer.step()
                        trainer.lr_scheduler.step()
                        trainer.optimizer.zero_grad()
                    epoch_loss += loss.item()
                    # Yield progress every step
                    elapsed = time.time() - start_time
                    speed = step / max(elapsed, 0.1)
                    avg_loss = epoch_loss / steps_in_epoch
                    remaining = (total_steps - step) / max(speed, 0.1)
                    yield (
                        f"Step {step}/{total_steps} | "
                        f"Loss: {avg_loss:.4f} | "
                        f"Speed: {speed:.1f} steps/s | "
                        f"ETA: {int(remaining//60)}m {int(remaining%60)}s\n"
                    )
# Epoch summary
avg_epoch_loss = epoch_loss / steps_in_epoch
yield f"\nβœ… Epoch {epoch + 1} complete - Avg Loss: {avg_epoch_loss:.4f}\n"
            # Evaluation at the end of every epoch
            yield f"πŸ“Š Running evaluation...\n"
            model.eval()
            eval_loss = 0
            eval_steps = 0
            with torch.no_grad():
                for eval_batch in trainer.get_eval_dataloader():
                    eval_batch = {k: v.to(model.device) for k, v in eval_batch.items()}
                    outputs = model(**eval_batch)
                    eval_loss += outputs.loss.item()
                    eval_steps += 1
            avg_eval_loss = eval_loss / eval_steps if eval_steps > 0 else 0
            yield f"βœ… Eval Loss: {avg_eval_loss:.4f}\n\n"
# Training complete
total_time = time.time() - start_time
yield f"\n{'='*50}\n"
yield f"πŸŽ‰ TRAINING COMPLETE!\n"
yield f"{'='*50}\n"
yield f"⏱️ Total Time: {int(total_time//3600)}h {int((total_time%3600)//60)}m {int(total_time%60)}s\n"
yield f"πŸ“Š Final Loss: {avg_epoch_loss:.4f}\n"
            # Lightweight stand-in for the TrainOutput that trainer.train() would return
            from types import SimpleNamespace
            train_result = SimpleNamespace(
                training_loss=avg_epoch_loss,
                metrics={'train_runtime': total_time}
            )
except Exception as e:
error_msg = str(e).lower()
yield f"\n❌ Training failed: {str(e)}\n"
if 'out of memory' in error_msg or 'cuda' in error_msg:
yield f"\nπŸ’Ύ CUDA out of memory. Clearing cache...\n"
torch.cuda.empty_cache()
import traceback
yield f"\nπŸ“‹ Full error:\n{traceback.format_exc()}\n"
return
yield f"\nπŸ’Ύ Saving model...\n"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
yield f"βœ… Model saved to {output_dir}\n"
# Results
yield f"\n" + "="*50 + "\n"
yield f"πŸŽ‰ TRAINING COMPLETE!\n"
yield f"="*50 + "\n"
yield f"πŸ“Š Training Loss: {train_result.training_loss:.4f}\n"
yield f"⏱️ Training Time: {train_result.metrics['train_runtime']:.1f}s\n"
yield f"πŸ’Ύ Model saved to: {output_dir}\n"
yield f"\n✨ Your RC+ξ model is ready!\n"
except RuntimeError as e:
import traceback
error_details = traceback.format_exc()
error_msg = str(e).lower()
# Check for specific OOM errors
if 'out of memory' in error_msg or 'cuda' in error_msg or 'memory' in error_msg:
yield f"\n❌ OUT OF MEMORY ERROR\n"
yield f"\nTrying recovery strategies...\n"
torch.cuda.empty_cache()
yield f"\nπŸ’‘ Solutions:\n"
yield f" 1. βœ… Memory cleared. Try again with reduced settings:\n"
yield f" β€’ Reduce 'Batch Size' to 1\n"
yield f" β€’ Reduce 'Max Sequence Length' to 256\n"
yield f" β€’ Reduce 'Training Epochs' to 1\n"
yield f" 2. Upgrade to A10G GPU (24GB) in Settings β†’ Hardware\n"
yield f" 3. Try lighter models: 'gpt2' or 'microsoft/phi-2'\n"
yield f"\nπŸ“‹ Full error:\n{error_details}\n"
else:
yield f"\n❌ RUNTIME ERROR: {str(e)}\n"
yield f"\nπŸ“‹ Full traceback:\n{error_details}\n"
except KeyError as e:
import traceback
yield f"\n❌ MISSING FIELD ERROR: {str(e)}\n"
yield f"\nπŸ’‘ Your dataset is missing a required field.\n"
yield f"βœ… Required fields: instruction, input, output\n"
yield f"\nπŸ“‹ Full traceback:\n{traceback.format_exc()}\n"
except ValueError as e:
import traceback
yield f"\n❌ VALUE ERROR: {str(e)}\n"
yield f"\nπŸ’‘ Check that:\n"
yield f" β€’ Dataset file is valid JSON/JSONL format\n"
yield f" β€’ No empty or null values in fields\n"
yield f" β€’ Text encoding is correct (UTF-8)\n"
yield f"\nπŸ“‹ Full traceback:\n{traceback.format_exc()}\n"
except Exception as e:
import traceback
error_details = traceback.format_exc()
yield f"\n❌ UNEXPECTED ERROR: {str(e)}\n"
yield f"\nπŸ“‹ Full traceback:\n{error_details}\n"
yield f"\nπŸ’‘ Diagnostics:\n"
yield f" β€’ Check dataset format (JSONL with instruction/input/output)\n"
yield f" β€’ Try with gpt2 model (smallest, most stable)\n"
yield f" β€’ Check HuggingFace Space logs for system errors\n"
# Gradio Interface
with gr.Blocks(title="RC+ΞΎ Fine-Tuning on HuggingFace Spaces") as demo:
gr.Markdown("""
# 🧠 RC+ξ Model Fine-Tuning
### Train your consciousness-aware AI model with GPU acceleration
**Requirements:**
- Upgrade this Space to GPU (Settings β†’ Hardware β†’ GPU)
- Upload your training dataset (JSONL format)
- Allow several hours for 7B model training (each GPU call reserves up to 4 hours, roughly 1-2 epochs)
**Recommended GPU:** T4 (16GB) - $0.60/hour or A10G (24GB) - $3.15/hour
""")
with gr.Row():
with gr.Column():
gpu_status = gr.Textbox(
label="GPU Status",
value=check_gpu(),
interactive=False
)
model_dropdown = gr.Dropdown(
label="Base Model",
choices=[
"microsoft/phi-2",
"gpt2",
"mistralai/Mistral-7B-v0.1",
"meta-llama/Llama-2-7b-hf"
],
value="microsoft/phi-2"
)
dataset_file = gr.File(
label="Training Dataset (JSONL)",
file_types=[".jsonl"]
)
epochs_slider = gr.Slider(
label="Training Epochs",
minimum=1,
maximum=10,
value=3,
step=1
)
batch_slider = gr.Slider(
label="Batch Size",
minimum=1,
maximum=8,
value=2,
step=1
)
lr_slider = gr.Slider(
label="Learning Rate",
minimum=1e-6,
maximum=1e-3,
value=2e-5,
step=1e-6
)
length_slider = gr.Slider(
label="Max Sequence Length",
minimum=128,
maximum=2048,
value=512,
step=128
)
train_btn = gr.Button("πŸš€ Start Training", variant="primary")
with gr.Column():
output_log = gr.Textbox(
label="Training Progress",
lines=30,
max_lines=30,
interactive=False
)
gr.Markdown("""
### πŸ“ Next Steps After Training:
1. Download your trained model from the Files tab
2. Upload to HuggingFace Hub for inference
3. Or convert to GGUF for Ollama deployment
### πŸ’° HuggingFace Spaces GPU Pricing:
- **T4 (16GB)**: $0.60/hour (~$7.20 for 12h training)
- **A10G (24GB)**: $3.15/hour (~$37.80 for 12h training)
- **A100 (40GB)**: $4.13/hour (~$49.56 for 12h training)
Cheaper than AWS/GCP and easier to set up!
""")
train_btn.click(
fn=train_model,
inputs=[
model_dropdown,
dataset_file,
epochs_slider,
batch_slider,
lr_slider,
length_slider
],
outputs=output_log
)
if __name__ == "__main__":
demo.launch() # Removed share=True for Spaces compatibility