# NOTE: "Spaces: Runtime error" below is the Hugging Face Spaces status-page
# header accidentally captured when this file was copied — not application code.
# Spaces: Runtime error
| """ | |
| Hugging Face Space App for Free H200 Training | |
| This app runs nano-coder training on HF's free H200 GPU (4 minutes daily) | |
| """ | |
| import os | |
| import subprocess | |
| import time | |
| import gradio as gr | |
| from datetime import datetime, timedelta | |
# Configuration
MAX_TRAINING_TIME = 3.5 * 60  # seconds; stop at 3.5 min to stay safely under the 4-min daily H200 quota
TRAINING_SCRIPT = "hf_free_training.py"  # launched in a subprocess by run_training()
DATA_PREP_SCRIPT = "prepare_code_dataset.py"  # builds data/python-codes-25k/train.bin if missing
def check_daily_limit():
    """Check if we've used today's free H200 time.

    Returns:
        tuple[bool, str]: ``(can_run, message)`` — whether training may start
        today, plus a human-readable status string for the UI.
    """
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    # For debugging, let's check what's in the file
    if not os.path.exists(limit_file):
        print(f"Debug: No limit file found for today: {today}")
        return True, "Ready to train!"
    try:
        with open(limit_file, 'r') as f:
            last_run = f.read().strip()
        print(f"Debug: Found limit file with content: '{last_run}' for date: {today}")
        if last_run == str(today):
            return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {last_run})"
        # BUG FIX: the original fell through here with no return statement and
        # implicitly returned None, crashing callers that unpack
        # ``can_run, message``. A stale marker (content != today) now allows
        # training, consistent with the no-file case.
        return True, "Ready to train!"
    except Exception as e:
        print(f"Debug: Error reading limit file: {e}")
        # If there's an error reading the file, let's allow training
        return True, "Ready to train! (Limit file error, allowing training)"
def mark_daily_usage():
    """Record that today's free H200 allowance has been consumed.

    Writes today's ISO date into a per-day marker file that
    ``check_daily_limit`` reads back.
    """
    current_date = datetime.now().date()
    marker_path = f"daily_limit_{current_date}.txt"
    with open(marker_path, 'w') as marker:
        marker.write(str(current_date))
    print(f"Debug: Marked daily usage for {current_date}")
def reset_daily_limit():
    """Delete today's daily-limit marker file (testing helper).

    Returns a status string describing whether a marker was removed.
    """
    current_date = datetime.now().date()
    marker_path = f"daily_limit_{current_date}.txt"
    # Guard clause: nothing to remove.
    if not os.path.exists(marker_path):
        return f"βΉοΈ No limit file found for {current_date}"
    os.remove(marker_path)
    return f"β Daily limit reset for {current_date}"
def run_training():
    """Run the free H200 training.

    Checks the daily quota, prepares the dataset if missing, launches the
    training script in a subprocess, streams its output while enforcing a
    wall-clock time cap, and returns a status string (with the last 20 log
    lines) for display in the UI.
    """
    # Check daily limit
    can_run, message = check_daily_limit()
    if not can_run:
        return message
    try:
        # Mark usage up front.
        # NOTE(review): this consumes the daily slot even if dataset prep or
        # training fails below — confirm that is intentional.
        mark_daily_usage()
        # Prepare dataset if not already done
        if not os.path.exists("data/python-codes-25k/train.bin"):
            print("Preparing dataset...")
            subprocess.run(["python", DATA_PREP_SCRIPT], check=True)
        # Run training
        print("Starting free H200 training...")
        start_time = time.time()
        # Set environment variables for HF
        env = os.environ.copy()
        # HF Spaces automatically provides HF_TOKEN
        if 'HF_TOKEN' not in env:
            env['HF_TOKEN'] = os.environ.get('HF_TOKEN', '')
        # Run training with timeout; stderr merged into stdout so the UI shows
        # one interleaved log stream.
        process = subprocess.Popen(
            ["python", TRAINING_SCRIPT],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            env=env
        )
        output_lines = []
        # Stream child output line-by-line while enforcing the time cap.
        # NOTE(review): readline() blocks, so a child that stays silent can
        # overshoot MAX_TRAINING_TIME until its next line arrives — confirm
        # the training script logs frequently enough.
        while True:
            elapsed = time.time() - start_time
            if elapsed > MAX_TRAINING_TIME:
                process.terminate()
                output_lines.append(f"\nβ° Time limit reached ({elapsed/60:.1f} minutes)")
                break
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break  # child exited and its pipe is fully drained
            if line:
                output_lines.append(line.strip())
                print(line.strip())
        # Wait for process to finish
        process.wait()
        # Check if training completed successfully
        if process.returncode == 0:
            result = "β Training completed successfully!\n\n" + "\n".join(output_lines[-20:])  # Last 20 lines
        else:
            result = "β Training failed or was interrupted.\n\n" + "\n".join(output_lines[-20:])
        return result
    except Exception as e:
        return f"β Error during training: {str(e)}"
def check_model_status():
    """Report whether a trained checkpoint exists on disk.

    Returns a status string, including the checkpoint size in MB when found.
    """
    model_path = "out-nano-coder-free/ckpt.pt"
    # Guard clause: no checkpoint yet.
    if not os.path.exists(model_path):
        return "β No trained model found. Run training first."
    size = os.path.getsize(model_path) / (1024 * 1024)  # bytes -> MB
    return f"β Model found! Size: {size:.1f} MB"
def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate Python code from *prompt* using the trained model.

    Returns the generated completion as a display string, or an error
    message when no checkpoint exists or sampling fails.
    """
    # Guard clause: cannot sample without a trained checkpoint.
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "β No trained model found. Please run training first."
    try:
        # Deferred import: sampling dependencies are only needed here.
        from sample_nano_coder import load_model, load_vocab, generate_code
        model, checkpoint = load_model()
        stoi, itos = load_vocab()
        completion = generate_code(
            model, stoi, itos, prompt, max_tokens, temperature, 200
        )
        return f"Generated code:\n\n{completion}"
    except Exception as e:
        return f"β Error generating code: {str(e)}"
# Create Gradio interface: two rows — training/status controls on top,
# code-generation controls below — followed by static instructions.
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# π Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π― Training Control")
            train_button = gr.Button("π Start Free H200 Training", variant="primary")
            reset_button = gr.Button("π Reset Daily Limit", variant="secondary")
            # Shared output box for both training and reset status messages.
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)
        with gr.Column():
            gr.Markdown("### π Model Status")
            model_status_button = gr.Button("π Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π¨ Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt",
                placeholder="def fibonacci(n):\n    ",
                lines=3
            )
            with gr.Row():
                # Slider(min, max, default): sampling controls passed straight
                # through to generate_sample_code().
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("β¨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)
    # Event handlers: wire each button to its backing function.
    train_button.click(
        fn=run_training,
        outputs=status_text
    )
    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )
    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )
    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )
    # Static help text rendered below the controls.
    gr.Markdown("""
    ### π Instructions
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code
    ### βοΈ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12)
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)
    ### π‘ Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
    """)
# Launch the Gradio server when executed directly (HF Spaces runs this file).
if __name__ == "__main__":
    demo.launch()