# Hugging Face Space — Running on ZeroGPU
#!/usr/bin/env python3
"""
Francis Botcon - Chat Application
Hugging Face Spaces with ZeroGPU support
"""
import os
import sys
import time

import torch
import gradio as gr

# Resolve the Hub auth token BEFORE the `spaces` import below runs —
# ZeroGPU reads its credentials at import time.
# Priority: francis_botcon_space (Space secret) > HF_TOKEN > HUGGING_FACE_HUB_TOKEN
HF_TOKEN = None
for _token_var in ("francis_botcon_space", "HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
    HF_TOKEN = os.getenv(_token_var)
    if HF_TOKEN:
        break

if HF_TOKEN:
    # Mirror the token into both conventional variables so every HF client
    # library picks it up, and pin the cache location for model downloads.
    # This is the Space owner's token (for private model access).
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
    os.environ["HF_HOME"] = os.path.expanduser("~/.cache/huggingface")
    print(f"✓ Using token from environment (length: {len(HF_TOKEN)})")
else:
    print("⚠️ WARNING: No HF_TOKEN found in environment!")
    print(" Set 'francis_botcon_space' or 'HF_TOKEN' secret in Space settings")
# The `spaces` module only exists inside a Hugging Face Space; probe for
# it so the same file also runs in local development.
HAS_SPACES = False
try:
    import spaces
except ImportError:
    print("spaces module not available (local development)")
else:
    HAS_SPACES = True
| BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" | |
| LORA_MODEL = "rojaldo/francis-botcon-lora" | |
| # Global model state | |
| model = None | |
| tokenizer = None | |
# ============================================================================
# INITIALIZATION — startup diagnostics banner (printed once at import time)
# ============================================================================
_BANNER = "=" * 70

print(_BANNER)
if HAS_SPACES:
    print("INITIALIZATION")
else:
    print("LOCAL DEVELOPMENT MODE")
print(_BANNER)
print(f"HF_TOKEN (Space Secret) configured: {bool(HF_TOKEN)}")
print(f"HF_HOME: {os.getenv('HF_HOME')}")
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print(f"PyTorch version: {torch.__version__}")
print(f"Gradio version: {gr.__version__}")
print(_BANNER)

# Important note about ZeroGPU authentication
if HAS_SPACES:
    print("\n✓ Running in HF Spaces")
    print(" ZeroGPU User Authentication:")
    print(" - Uses Gradio 5.12.0+ automatic user token passing")
    print(" - User's login status is detected automatically")
    print(" - PRO users get 5x more ZeroGPU quota (1500s vs 300s)")
    print(" - No need for manual token passing")
else:
    print("\n⚠️ Running locally - No ZeroGPU GPU limits")
print(_BANNER)
def load_model():
    """Load base model, tokenizer, and LoRA adapter with detailed logging.

    Idempotent: populates the module-level ``model`` and ``tokenizer``
    globals and returns immediately when both are already set. The LoRA
    adapter load is best-effort — on failure the base model is kept.

    Raises:
        Exception: re-raised if the base model or tokenizer cannot load.
    """
    global model, tokenizer
    if model is not None and tokenizer is not None:
        print("✓ Model already loaded, skipping...")
        return
    # Single source of truth for Hub auth kwargs (previously duplicated
    # three times for the model, tokenizer, and LoRA loads).
    auth_kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
    try:
        print("\n[1/4] Importing transformers...")
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from peft import PeftModel
        print("✓ Imports successful")
        print("\n[2/4] Loading base model from HF Hub...")
        print(f" Model: {BASE_MODEL}")
        print(f" Device map: auto")
        print(f" Dtype: float16")
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,  # half precision halves GPU memory use
            device_map="auto",
            **auth_kwargs,
        )
        print(f"✓ Base model loaded")
        print(f" Device: {model.device}")
        print(f" Parameters: {model.num_parameters():,}")
        print("\n[3/4] Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, **auth_kwargs)
        print("✓ Tokenizer loaded")
        print("\n[4/4] Loading LoRA adapter...")
        print(f" LoRA Model: {LORA_MODEL}")
        try:
            model = PeftModel.from_pretrained(model, LORA_MODEL, **auth_kwargs)
            # Fold the adapter weights into the base model so inference runs
            # without the PEFT wrapper overhead.
            model = model.merge_and_unload()
            print("✓ LoRA adapter merged successfully")
        except Exception as e:
            # Best-effort: the chatbot still works (degraded) on the base model.
            print(f"⚠️ Could not load LoRA: {e}")
            print(" Continuing with base model only...")
        print("\n" + "=" * 70)
        print("✓ ALL MODELS LOADED SUCCESSFULLY!")
        print("=" * 70 + "\n")
    except Exception as e:
        print(f"\n❌ ERROR loading models: {e}")
        import traceback
        print(f"\nTraceback:")
        traceback.print_exc()
        raise
def generate_response_impl(message: str, request: gr.Request = None) -> str:
    """Generate response using the model with timing info

    Args:
        message: User message
        request: Gradio request object (automatically passed when hf_oauth is enabled)

    Returns:
        The model's reply, or a formatted diagnostic report if generation failed.

    Note:
        When hf_oauth is enabled in README.md, Gradio 5.12.0+ automatically passes
        the request object which contains user OAuth info. ZeroGPU uses this for
        quota management.
    """
    try:
        # Log user authentication status (informational only — quota is
        # enforced by ZeroGPU, not by this code).
        if request is not None:
            username = getattr(request, "username", None)
            if username:
                print(f"[AUTH] User: {username}")
            else:
                print("[AUTH] User not logged in - using anonymous quota")
        else:
            print("[AUTH] No request object - local development")
        # Lazy-load the weights on first call so module import stays cheap.
        if model is None or tokenizer is None:
            print("\n[GENERATION] Models not loaded, loading now...")
            load_model()
        start_time = time.time()
        # Persona instructions injected ahead of every user turn.
        system_prompt = """You are Francis Bacon (1561-1626).
Respond eruditely with period-appropriate language.
Emphasize empirical observation and the scientific method.
All responses must be in English."""
        # Mistral-Instruct "[INST] ... [/INST]" prompt format.
        prompt = f"[INST] {system_prompt}\n\nUser: {message}\n\nFrancis Bacon: [/INST]"
        print(f"\n[GENERATION] Processing message ({len(message)} chars)...")
        # Tokenization (input truncated to at most 2048 tokens).
        token_start = time.time()
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        token_time = time.time() - token_start
        print(f" ✓ Tokenized in {token_time:.2f}s (input_ids: {inputs['input_ids'].shape})")
        # Move input tensors onto the model's device (GPU when available).
        device_start = time.time()
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        device_time = time.time() - device_start
        print(f" ✓ Moved to device in {device_time:.2f}s")
        # Sampled generation; no_grad avoids building autograd state.
        gen_start = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.6,
                top_p=0.85,
                top_k=40,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
            )
        gen_time = time.time() - gen_start
        print(f" ✓ Generated in {gen_time:.2f}s (output shape: {outputs.shape})")
        # Decode, then strip the echoed prompt: keep only the text after the
        # final "[/INST]" / "Francis Bacon:" markers.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "[/INST]" in response:
            response = response.split("[/INST]")[-1].strip()
        if "Francis Bacon:" in response:
            response = response.split("Francis Bacon:")[-1].strip()
        total_time = time.time() - start_time
        print(f" ✓ Total time: {total_time:.2f}s")
        return response
    except Exception as e:
        # Surface a rich diagnostic report in the chat window instead of
        # crashing the event handler.
        import traceback
        error_details = f"""
❌ ERROR DURING GENERATION:
**Error Message:**
{str(e)}
**Diagnostic Information:**
- Model loaded: {model is not None}
- Tokenizer loaded: {tokenizer is not None}
- Model device: {model.device if model is not None else 'N/A'}
- CUDA available: {torch.cuda.is_available()}
- HF_TOKEN configured: {bool(HF_TOKEN)}
**Full Traceback:**
```
{traceback.format_exc()}
```
**Next Steps:**
1. Check HuggingFace Hub credentials
2. Verify model and LoRA repo permissions
3. Check available disk space for model cache
4. Try with base model only (disable LoRA)
"""
        print(error_details)
        return error_details
if HAS_SPACES:
    # BUG FIX: ZeroGPU only attaches a GPU while a @spaces.GPU-decorated
    # function is executing — without this decorator the model never gets
    # CUDA on a ZeroGPU Space and the quota handling described below never
    # applies. `duration` caps each call's GPU reservation (seconds).
    @spaces.GPU(duration=120)
    def generate_response(message: str, request: gr.Request = None) -> str:
        """GPU-accelerated response generation

        When OAuth is enabled (hf_oauth: true in README.md) and Gradio 5.12.0+,
        the request parameter is automatically populated by Gradio for all users.
        ZeroGPU uses the authentication info from the request to apply proper
        quota limits (300s for anonymous/free, 1500s for PRO users).
        """
        return generate_response_impl(message, request)
else:
    def generate_response(message: str, request: gr.Request = None) -> str:
        """Wrapper for local development (no ZeroGPU)"""
        return generate_response_impl(message, request)
def create_interface():
    """Build and return the Gradio Blocks UI (chat, quick questions, login).

    Returns:
        gr.Blocks: the assembled demo, ready for ``.launch()``.
    """
    with gr.Blocks(title="Francis Botcon", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Francis Botcon")
        gr.Markdown("A chatbot emulating Francis Bacon (1561-1626)")
        # Display status info and login button (only meaningful on Spaces).
        if HAS_SPACES:
            with gr.Row():
                with gr.Column(scale=3):
                    status_info = """
**🚀 ZeroGPU Status:**
- Sign in for full ZeroGPU quota
- PRO users get 5x more quota (1500s vs 300s)
- Your login status is automatically detected
"""
                    gr.Markdown(status_info)
                with gr.Column(scale=1):
                    # HF OAuth sign-in button; login raises the user's quota.
                    gr.LoginButton()
        else:
            gr.Markdown("**Status:** 🏠 Running locally - No HF Spaces GPU limits")
        gr.Markdown("""
Francis Bacon was a pioneering British philosopher who revolutionized thinking
about the scientific method. He emphasized empirical observation and experimentation.
**His major works:**
- Novum Organum (1620)
- The Advancement of Learning (1605)
- Essays (1597, 1625)
- New Atlantis (1627)
""")
        example_questions = [
            "What is the true nature of knowledge?",
            "How should we conduct scientific inquiry?",
            "What are the idols of the mind?",
            "Can you share your thoughts on ethics?",
            "What is the purpose of learning?",
        ]
        chatbot = gr.Chatbot(label="Conversation with Francis Bacon", height=400)
        msg = gr.Textbox(label="Your Message", placeholder="Ask Francis Bacon...")
        submit_btn = gr.Button("Send")
        clear_btn = gr.Button("Clear")
        gr.Markdown("**Quick Questions:**")
        # Quick-question buttons fill the textbox. `q=question` binds the
        # loop variable at definition time (avoids late-binding closures).
        with gr.Row():
            for question in example_questions[:3]:
                btn = gr.Button(question, size="sm")
                btn.click(lambda q=question: q, outputs=msg)
        with gr.Row():
            for question in example_questions[3:]:
                btn = gr.Button(question, size="sm")
                btn.click(lambda q=question: q, outputs=msg)
        def respond(message, chat_history, request: gr.Request):
            # Event handler: append (user, bot) tuple to the chat history.
            # The gr.Request annotation makes Gradio inject the request.
            if not message.strip():
                return chat_history
            chat_history.append((message, None))
            # Pass the request object to generate_response for ZeroGPU auth
            response = generate_response(message, request)
            chat_history[-1] = (message, response)
            return chat_history
        # Gradio automatically passes user OAuth info via gr.Request to functions.
        # The .then(...) chain clears the textbox after each submission.
        msg.submit(respond, [msg, chatbot], chatbot).then(lambda: "", outputs=msg)
        submit_btn.click(respond, [msg, chatbot], chatbot).then(lambda: "", outputs=msg)
        clear_btn.click(lambda: [], outputs=chatbot)
    return demo
| if __name__ == "__main__": | |
| print("\nStarting Francis Botcon...") | |
| try: | |
| demo = create_interface() | |
| print("Launching Gradio interface...") | |
| demo.launch( | |
| server_name="0.0.0.0", | |
| server_port=7860, | |
| share=False, | |
| show_error=True, | |
| ) | |
| except Exception as e: | |
| print(f"\n❌ ERROR: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |