# francis-botcon / app.py — Francis Botcon Publisher
# Deploy Francis Botcon chatbot (commit 50446a3)
#!/usr/bin/env python3
"""
Francis Botcon - Chat Application
Hugging Face Spaces with ZeroGPU support
"""
import os
import sys
import torch
import gradio as gr
import time
# Get token BEFORE importing spaces - critical for ZeroGPU authentication
# Priority: francis_botcon_space (Space secret) > HF_TOKEN > HUGGING_FACE_HUB_TOKEN
HF_TOKEN = os.getenv("francis_botcon_space") or os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
# Set both environment variables to ensure ZeroGPU picks up authentication
# This is the Space owner's token (for private model access)
if HF_TOKEN:
    # Mirror the token under both names so every downstream library
    # (huggingface_hub, transformers, spaces) sees it regardless of which
    # variable it checks.
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
    # Pin the HF cache location explicitly; downloaded model weights land here.
    os.environ["HF_HOME"] = os.path.expanduser("~/.cache/huggingface")
    print(f"✓ Using token from environment (length: {len(HF_TOKEN)})")
else:
    print("⚠️ WARNING: No HF_TOKEN found in environment!")
    print(" Set 'francis_botcon_space' or 'HF_TOKEN' secret in Space settings")
# Try to import spaces (only available in HF Spaces)
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    # Local development: run without ZeroGPU decoration.
    HAS_SPACES = False
    print("spaces module not available (local development)")
# Model repositories: public Mistral instruct base + the LoRA adapter that
# carries the Francis Bacon persona (may be a private repo — needs HF_TOKEN).
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
LORA_MODEL = "rojaldo/francis-botcon-lora"
# Global model state
# Populated lazily by load_model(); both stay None until first generation.
model = None
tokenizer = None
# ============================================================================
# INITIALIZATION
# ============================================================================
# Startup diagnostics: token presence, cache path, CUDA, library versions.
print("=" * 70)
print("INITIALIZATION" if HAS_SPACES else "LOCAL DEVELOPMENT MODE")
print("=" * 70)
print(f"HF_TOKEN (Space Secret) configured: {bool(HF_TOKEN)}")
print(f"HF_HOME: {os.getenv('HF_HOME')}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print(f"PyTorch version: {torch.__version__}")
print(f"Gradio version: {gr.__version__}")
print("=" * 70)
# Important note about ZeroGPU authentication
if HAS_SPACES:
    print("\n✓ Running in HF Spaces")
    print(" ZeroGPU User Authentication:")
    print(" - Uses Gradio 5.12.0+ automatic user token passing")
    print(" - User's login status is detected automatically")
    print(" - PRO users get 5x more ZeroGPU quota (1500s vs 300s)")
    print(" - No need for manual token passing")
else:
    print("\n⚠️ Running locally - No ZeroGPU GPU limits")
print("=" * 70)
def load_model():
    """Load the base model, tokenizer, and LoRA adapter into module globals.

    Idempotent: returns immediately when both ``model`` and ``tokenizer``
    are already set. On any loading error, the globals are reset to ``None``
    so a later retry starts from a clean state (previously a failure could
    leave a half-loaded model behind), then the exception is re-raised.

    Raises:
        Exception: whatever transformers/peft raised while downloading or
            instantiating the models (logged with a full traceback first).
    """
    global model, tokenizer
    if model is not None and tokenizer is not None:
        print("✓ Model already loaded, skipping...")
        return
    try:
        print("\n[1/4] Importing transformers...")
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from peft import PeftModel
        print("✓ Imports successful")
        # One shared auth dict (replaces three duplicated *_kwargs dicts):
        # pass the Space token only when configured so public access works
        # without a secret.
        auth_kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
        print("\n[2/4] Loading base model from HF Hub...")
        print(f" Model: {BASE_MODEL}")
        print(f" Device map: auto")
        print(f" Dtype: float16")
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",  # let accelerate place layers on GPU/CPU
            **auth_kwargs,
        )
        print(f"✓ Base model loaded")
        print(f" Device: {model.device}")
        print(f" Parameters: {model.num_parameters():,}")
        print("\n[3/4] Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            BASE_MODEL,
            **auth_kwargs,
        )
        print("✓ Tokenizer loaded")
        print("\n[4/4] Loading LoRA adapter...")
        print(f" LoRA Model: {LORA_MODEL}")
        try:
            model = PeftModel.from_pretrained(
                model,
                LORA_MODEL,
                **auth_kwargs,
            )
            # Merge adapter weights into the base so inference runs without
            # the PEFT wrapper overhead.
            model = model.merge_and_unload()
            print("✓ LoRA adapter merged successfully")
        except Exception as e:
            # Deliberate best-effort: the base model alone is still usable.
            print(f"⚠️ Could not load LoRA: {e}")
            print(" Continuing with base model only...")
        print("\n" + "=" * 70)
        print("✓ ALL MODELS LOADED SUCCESSFULLY!")
        print("=" * 70 + "\n")
    except Exception as e:
        print(f"\n❌ ERROR loading models: {e}")
        import traceback
        print(f"\nTraceback:")
        traceback.print_exc()
        # Reset partial state so the next call does not see a half-loaded
        # model and skip or double-load.
        model = None
        tokenizer = None
        raise
def generate_response_impl(message: str, request: gr.Request = None) -> str:
    """Produce Francis Bacon's reply to *message*, logging per-stage timings.

    Args:
        message: User message text.
        request: Gradio request object (automatically passed when hf_oauth
            is enabled); used here only to log the caller's login status.

    Returns:
        The generated reply, or a formatted diagnostic report string if
        anything failed during generation.

    Note:
        When hf_oauth is enabled in README.md, Gradio 5.12.0+ automatically
        passes the request object which contains user OAuth info. ZeroGPU
        uses this for quota management.
    """
    try:
        # Log user authentication status.
        if request is None:
            print("[AUTH] No request object - local development")
        else:
            user = getattr(request, "username", None)
            if user:
                print(f"[AUTH] User: {user}")
            else:
                print("[AUTH] User not logged in - using anonymous quota")
        # Lazily load the model on the first request.
        if model is None or tokenizer is None:
            print("\n[GENERATION] Models not loaded, loading now...")
            load_model()
        t_start = time.time()
        system_prompt = """You are Francis Bacon (1561-1626).
Respond eruditely with period-appropriate language.
Emphasize empirical observation and the scientific method.
All responses must be in English."""
        prompt = f"[INST] {system_prompt}\n\nUser: {message}\n\nFrancis Bacon: [/INST]"
        print(f"\n[GENERATION] Processing message ({len(message)} chars)...")
        # Tokenization (clamped to the model's context budget).
        t_tok = time.time()
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        tok_elapsed = time.time() - t_tok
        print(f" ✓ Tokenized in {tok_elapsed:.2f}s (input_ids: {inputs['input_ids'].shape})")
        # Move every input tensor onto the model's device.
        t_dev = time.time()
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
        dev_elapsed = time.time() - t_dev
        print(f" ✓ Moved to device in {dev_elapsed:.2f}s")
        # Sampled generation; inference needs no gradient tracking.
        t_gen = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.6,
                top_p=0.85,
                top_k=40,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
            )
        gen_elapsed = time.time() - t_gen
        print(f" ✓ Generated in {gen_elapsed:.2f}s (output shape: {outputs.shape})")
        # Decode, then strip the echoed prompt scaffolding in order.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        for marker in ("[/INST]", "Francis Bacon:"):
            if marker in response:
                response = response.split(marker)[-1].strip()
        total_elapsed = time.time() - t_start
        print(f" ✓ Total time: {total_elapsed:.2f}s")
        return response
    except Exception as e:
        import traceback
        # Return the diagnostics as the chat reply so the failure is visible
        # in the UI, not just the server log.
        error_details = f"""
❌ ERROR DURING GENERATION:
**Error Message:**
{str(e)}
**Diagnostic Information:**
- Model loaded: {model is not None}
- Tokenizer loaded: {tokenizer is not None}
- Model device: {model.device if model is not None else 'N/A'}
- CUDA available: {torch.cuda.is_available()}
- HF_TOKEN configured: {bool(HF_TOKEN)}
**Full Traceback:**
```
{traceback.format_exc()}
```
**Next Steps:**
1. Check HuggingFace Hub credentials
2. Verify model and LoRA repo permissions
3. Check available disk space for model cache
4. Try with base model only (disable LoRA)
"""
        print(error_details)
        return error_details
# Select the public generate_response entry point at import time:
# inside HF Spaces the implementation is wrapped in a 60-second ZeroGPU
# allocation; locally it is called directly.
if HAS_SPACES:
    @spaces.GPU(duration=60)
    def generate_response(message: str, request: gr.Request = None) -> str:
        """GPU-accelerated response generation.

        When OAuth is enabled (hf_oauth: true in README.md) and Gradio 5.12.0+,
        the request parameter is automatically populated by Gradio for all users.
        ZeroGPU uses the authentication info from the request to apply proper
        quota limits (300s for anonymous/free, 1500s for PRO users).
        """
        return generate_response_impl(message, request)
else:
    def generate_response(message: str, request: gr.Request = None) -> str:
        """Wrapper for local development (no ZeroGPU)."""
        return generate_response_impl(message, request)
def create_interface():
    """Build and return the Gradio Blocks UI for the chatbot.

    Wires up the chat panel, send/clear buttons, quick-question shortcut
    buttons, and (when running in HF Spaces) the login button that ZeroGPU
    uses for per-user quota.
    """
    with gr.Blocks(title="Francis Botcon", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Francis Botcon")
        gr.Markdown("A chatbot emulating Francis Bacon (1561-1626)")
        # Status banner: ZeroGPU info + login button on Spaces, plain note locally.
        if HAS_SPACES:
            with gr.Row():
                with gr.Column(scale=3):
                    gr.Markdown("""
**🚀 ZeroGPU Status:**
- Sign in for full ZeroGPU quota
- PRO users get 5x more quota (1500s vs 300s)
- Your login status is automatically detected
""")
                with gr.Column(scale=1):
                    gr.LoginButton()
        else:
            gr.Markdown("**Status:** 🏠 Running locally - No HF Spaces GPU limits")
        gr.Markdown("""
Francis Bacon was a pioneering British philosopher who revolutionized thinking
about the scientific method. He emphasized empirical observation and experimentation.
**His major works:**
- Novum Organum (1620)
- The Advancement of Learning (1605)
- Essays (1597, 1625)
- New Atlantis (1627)
""")
        example_questions = [
            "What is the true nature of knowledge?",
            "How should we conduct scientific inquiry?",
            "What are the idols of the mind?",
            "Can you share your thoughts on ethics?",
            "What is the purpose of learning?",
        ]
        chatbot = gr.Chatbot(label="Conversation with Francis Bacon", height=400)
        msg = gr.Textbox(label="Your Message", placeholder="Ask Francis Bacon...")
        submit_btn = gr.Button("Send")
        clear_btn = gr.Button("Clear")
        gr.Markdown("**Quick Questions:**")
        # Quick-question shortcuts, three on the first row and the rest on the
        # second; clicking one copies its text into the message box.
        for row_questions in (example_questions[:3], example_questions[3:]):
            with gr.Row():
                for question in row_questions:
                    shortcut = gr.Button(question, size="sm")
                    # Bind via default arg so each lambda captures its own question.
                    shortcut.click(lambda q=question: q, outputs=msg)

        def respond(message, chat_history, request: gr.Request):
            """Append the user turn, generate the bot turn, return new history."""
            if not message.strip():
                return chat_history
            chat_history.append((message, None))
            # Forward the request so ZeroGPU can apply per-user quota.
            reply = generate_response(message, request)
            chat_history[-1] = (message, reply)
            return chat_history

        # Gradio injects the gr.Request argument into respond automatically.
        msg.submit(respond, [msg, chatbot], chatbot).then(lambda: "", outputs=msg)
        submit_btn.click(respond, [msg, chatbot], chatbot).then(lambda: "", outputs=msg)
        clear_btn.click(lambda: [], outputs=chatbot)
    return demo
if __name__ == "__main__":
    print("\nStarting Francis Botcon...")
    try:
        demo = create_interface()
        print("Launching Gradio interface...")
        # Bind to all interfaces on the standard HF Spaces port.
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,  # Spaces provides its own public URL
            show_error=True,  # surface server-side errors in the UI
        )
    except Exception as e:
        # Fail loudly: log the traceback and exit nonzero so the Space
        # supervisor can restart the app.
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)