# NOTE: "Spaces: Runtime error" below is the Hugging Face Spaces status-page
# header accidentally captured when this file was copied — not application code.
# Spaces: Runtime error
| """ | |
| Hugging Face Space App for Free H200 Training | |
| This app runs nano-coder training on HF's free H200 GPU (4 minutes daily) | |
| """ | |
| import os | |
| import subprocess | |
| import time | |
| import gradio as gr | |
| from datetime import datetime, timedelta | |
# Configuration
MAX_TRAINING_TIME = 3.5 * 60  # seconds; stop at 3.5 min to stay safely under the 4-min daily H200 quota
TRAINING_SCRIPT = "hf_free_training.py"  # launched in a subprocess by run_training()
DATA_PREP_SCRIPT = "prepare_code_dataset.py"  # builds data/python-codes-25k/train.bin if missing
def check_daily_limit():
    """Check if we've used today's free H200 time.

    Returns:
        tuple[bool, str]: ``(can_run, message)`` — whether training may start
        today, plus a human-readable status string for the UI.
    """
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    # For debugging, let's check what's in the file
    if not os.path.exists(limit_file):
        print(f"Debug: No limit file found for today: {today}")
        return True, "Ready to train!"
    try:
        with open(limit_file, 'r') as f:
            last_run = f.read().strip()
        print(f"Debug: Found limit file with content: '{last_run}' for date: {today}")
        if last_run == str(today):
            return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {last_run})"
        # BUG FIX: the original fell through here with no return statement and
        # implicitly returned None, crashing callers that unpack
        # ``can_run, message``. A stale marker (content != today) now allows
        # training, consistent with the no-file case.
        return True, "Ready to train!"
    except Exception as e:
        print(f"Debug: Error reading limit file: {e}")
        # If there's an error reading the file, let's allow training
        return True, "Ready to train! (Limit file error, allowing training)"
def mark_daily_usage():
    """Record that today's free H200 allowance has been consumed.

    Writes today's ISO date into a per-day marker file that
    ``check_daily_limit`` reads back.
    """
    current_date = datetime.now().date()
    marker_path = f"daily_limit_{current_date}.txt"
    with open(marker_path, 'w') as marker:
        marker.write(str(current_date))
    print(f"Debug: Marked daily usage for {current_date}")
def reset_daily_limit():
    """Delete today's daily-limit marker file (testing helper).

    Returns a status string describing whether a marker was removed.
    """
    current_date = datetime.now().date()
    marker_path = f"daily_limit_{current_date}.txt"
    # Guard clause: nothing to remove.
    if not os.path.exists(marker_path):
        return f"βΉοΈ No limit file found for {current_date}"
    os.remove(marker_path)
    return f"β Daily limit reset for {current_date}"
def run_training():
    """Run the free H200 training.

    Checks the daily quota, prepares the dataset if missing, launches the
    training script in a subprocess, streams its output while enforcing a
    wall-clock time cap, and returns a status string (with the last 20 log
    lines) for display in the UI.
    """
    # Check daily limit
    can_run, message = check_daily_limit()
    if not can_run:
        return message
    try:
        # Mark usage up front.
        # NOTE(review): this consumes the daily slot even if dataset prep or
        # training fails below — confirm that is intentional.
        mark_daily_usage()
        # Prepare dataset if not already done
        if not os.path.exists("data/python-codes-25k/train.bin"):
            print("Preparing dataset...")
            subprocess.run(["python", DATA_PREP_SCRIPT], check=True)
        # Run training
        print("Starting free H200 training...")
        start_time = time.time()
        # Set environment variables for HF
        env = os.environ.copy()
        # HF Spaces automatically provides HF_TOKEN
        if 'HF_TOKEN' not in env:
            env['HF_TOKEN'] = os.environ.get('HF_TOKEN', '')
        # Run training with timeout; stderr merged into stdout so the UI shows
        # one interleaved log stream.
        process = subprocess.Popen(
            ["python", TRAINING_SCRIPT],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            env=env
        )
        output_lines = []
        # Stream child output line-by-line while enforcing the time cap.
        # NOTE(review): readline() blocks, so a child that stays silent can
        # overshoot MAX_TRAINING_TIME until its next line arrives — confirm
        # the training script logs frequently enough.
        while True:
            elapsed = time.time() - start_time
            if elapsed > MAX_TRAINING_TIME:
                process.terminate()
                output_lines.append(f"\nβ° Time limit reached ({elapsed/60:.1f} minutes)")
                break
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break  # child exited and its pipe is fully drained
            if line:
                output_lines.append(line.strip())
                print(line.strip())
        # Wait for process to finish
        process.wait()
        # Check if training completed successfully
        if process.returncode == 0:
            result = "β Training completed successfully!\n\n" + "\n".join(output_lines[-20:])  # Last 20 lines
        else:
            result = "β Training failed or was interrupted.\n\n" + "\n".join(output_lines[-20:])
        return result
    except Exception as e:
        return f"β Error during training: {str(e)}"
def check_model_status():
    """Report whether a trained checkpoint exists on disk.

    Returns a status string, including the checkpoint size in MB when found.
    """
    model_path = "out-nano-coder-free/ckpt.pt"
    # Guard clause: no checkpoint yet.
    if not os.path.exists(model_path):
        return "β No trained model found. Run training first."
    size = os.path.getsize(model_path) / (1024 * 1024)  # bytes -> MB
    return f"β Model found! Size: {size:.1f} MB"
def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate Python code from *prompt* using the trained model.

    Returns the generated completion as a display string, or an error
    message when no checkpoint exists or sampling fails.
    """
    # Guard clause: cannot sample without a trained checkpoint.
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "β No trained model found. Please run training first."
    try:
        # Deferred import: sampling dependencies are only needed here.
        from sample_nano_coder import load_model, load_vocab, generate_code
        model, checkpoint = load_model()
        stoi, itos = load_vocab()
        completion = generate_code(
            model, stoi, itos, prompt, max_tokens, temperature, 200
        )
        return f"Generated code:\n\n{completion}"
    except Exception as e:
        return f"β Error generating code: {str(e)}"
# Create Gradio interface: two rows — training/status controls on top,
# code-generation controls below — followed by static instructions.
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# π Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π― Training Control")
            train_button = gr.Button("π Start Free H200 Training", variant="primary")
            reset_button = gr.Button("π Reset Daily Limit", variant="secondary")
            # Shared output box for both training and reset status messages.
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)
        with gr.Column():
            gr.Markdown("### π Model Status")
            model_status_button = gr.Button("π Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π¨ Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt",
                placeholder="def fibonacci(n):\n    ",
                lines=3
            )
            with gr.Row():
                # Slider(min, max, default): sampling controls passed straight
                # through to generate_sample_code().
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("β¨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)
    # Event handlers: wire each button to its backing function.
    train_button.click(
        fn=run_training,
        outputs=status_text
    )
    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )
    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )
    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )
    # Static help text rendered below the controls.
    gr.Markdown("""
    ### π Instructions
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code
    ### βοΈ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12)
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)
    ### π‘ Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
    """)
# Launch the Gradio server when executed directly (HF Spaces runs this file).
if __name__ == "__main__":
    demo.launch()