# Hugging Face Space — Running on ZeroGPU
#!/usr/bin/env python3
"""
Francis Botcon - Chat Application
Hugging Face Spaces with ZeroGPU support
"""
import os
import sys
import time

import torch
import gradio as gr

# Resolve the Hub auth token BEFORE the `spaces` import below runs —
# ZeroGPU reads its credentials at import time.
# Priority: francis_botcon_space (Space secret) > HF_TOKEN > HUGGING_FACE_HUB_TOKEN
HF_TOKEN = None
for _token_var in ("francis_botcon_space", "HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
    HF_TOKEN = os.getenv(_token_var)
    if HF_TOKEN:
        break

if HF_TOKEN:
    # Mirror the token into both conventional variables so every HF client
    # library picks it up, and pin the cache location for model downloads.
    # This is the Space owner's token (for private model access).
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
    os.environ["HF_HOME"] = os.path.expanduser("~/.cache/huggingface")
    print(f"✓ Using token from environment (length: {len(HF_TOKEN)})")
else:
    print("⚠️ WARNING: No HF_TOKEN found in environment!")
    print(" Set 'francis_botcon_space' or 'HF_TOKEN' secret in Space settings")
# The `spaces` module only exists inside a Hugging Face Space; probe for
# it so the same file also runs in local development.
HAS_SPACES = False
try:
    import spaces
except ImportError:
    print("spaces module not available (local development)")
else:
    HAS_SPACES = True
| BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" | |
| LORA_MODEL = "rojaldo/francis-botcon-lora" | |
| # Global model state | |
| model = None | |
| tokenizer = None | |
# ============================================================================
# INITIALIZATION — startup diagnostics banner (printed once at import time)
# ============================================================================
_BANNER = "=" * 70

print(_BANNER)
if HAS_SPACES:
    print("INITIALIZATION")
else:
    print("LOCAL DEVELOPMENT MODE")
print(_BANNER)
print(f"HF_TOKEN (Space Secret) configured: {bool(HF_TOKEN)}")
print(f"HF_HOME: {os.getenv('HF_HOME')}")
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print(f"PyTorch version: {torch.__version__}")
print(f"Gradio version: {gr.__version__}")
print(_BANNER)

# Important note about ZeroGPU authentication
if HAS_SPACES:
    print("\n✓ Running in HF Spaces")
    print(" ZeroGPU User Authentication:")
    print(" - Uses Gradio 5.12.0+ automatic user token passing")
    print(" - User's login status is detected automatically")
    print(" - PRO users get 5x more ZeroGPU quota (1500s vs 300s)")
    print(" - No need for manual token passing")
else:
    print("\n⚠️ Running locally - No ZeroGPU GPU limits")
print(_BANNER)
def load_model():
    """Load base model, tokenizer, and LoRA adapter with detailed logging.

    Idempotent: populates the module-level ``model`` and ``tokenizer``
    globals and returns immediately when both are already set. The LoRA
    adapter load is best-effort — on failure the base model is kept.

    Raises:
        Exception: re-raised if the base model or tokenizer cannot load.
    """
    global model, tokenizer
    if model is not None and tokenizer is not None:
        print("✓ Model already loaded, skipping...")
        return
    # Single source of truth for Hub auth kwargs (previously duplicated
    # three times for the model, tokenizer, and LoRA loads).
    auth_kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
    try:
        print("\n[1/4] Importing transformers...")
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from peft import PeftModel
        print("✓ Imports successful")
        print("\n[2/4] Loading base model from HF Hub...")
        print(f" Model: {BASE_MODEL}")
        print(f" Device map: auto")
        print(f" Dtype: float16")
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,  # half precision halves GPU memory use
            device_map="auto",
            **auth_kwargs,
        )
        print(f"✓ Base model loaded")
        print(f" Device: {model.device}")
        print(f" Parameters: {model.num_parameters():,}")
        print("\n[3/4] Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, **auth_kwargs)
        print("✓ Tokenizer loaded")
        print("\n[4/4] Loading LoRA adapter...")
        print(f" LoRA Model: {LORA_MODEL}")
        try:
            model = PeftModel.from_pretrained(model, LORA_MODEL, **auth_kwargs)
            # Fold the adapter weights into the base model so inference runs
            # without the PEFT wrapper overhead.
            model = model.merge_and_unload()
            print("✓ LoRA adapter merged successfully")
        except Exception as e:
            # Best-effort: the chatbot still works (degraded) on the base model.
            print(f"⚠️ Could not load LoRA: {e}")
            print(" Continuing with base model only...")
        print("\n" + "=" * 70)
        print("✓ ALL MODELS LOADED SUCCESSFULLY!")
        print("=" * 70 + "\n")
    except Exception as e:
        print(f"\n❌ ERROR loading models: {e}")
        import traceback
        print(f"\nTraceback:")
        traceback.print_exc()
        raise
def generate_response_impl(message: str, request: gr.Request = None) -> str:
    """Generate response using the model with timing info

    Args:
        message: User message
        request: Gradio request object (automatically passed when hf_oauth is enabled)

    Returns:
        The model's reply, or a formatted diagnostic report if generation failed.

    Note:
        When hf_oauth is enabled in README.md, Gradio 5.12.0+ automatically passes
        the request object which contains user OAuth info. ZeroGPU uses this for
        quota management.
    """
    try:
        # Log user authentication status (informational only — quota is
        # enforced by ZeroGPU, not by this code).
        if request is not None:
            username = getattr(request, "username", None)
            if username:
                print(f"[AUTH] User: {username}")
            else:
                print("[AUTH] User not logged in - using anonymous quota")
        else:
            print("[AUTH] No request object - local development")
        # Lazy-load the weights on first call so module import stays cheap.
        if model is None or tokenizer is None:
            print("\n[GENERATION] Models not loaded, loading now...")
            load_model()
        start_time = time.time()
        # Persona instructions injected ahead of every user turn.
        system_prompt = """You are Francis Bacon (1561-1626).
Respond eruditely with period-appropriate language.
Emphasize empirical observation and the scientific method.
All responses must be in English."""
        # Mistral-Instruct "[INST] ... [/INST]" prompt format.
        prompt = f"[INST] {system_prompt}\n\nUser: {message}\n\nFrancis Bacon: [/INST]"
        print(f"\n[GENERATION] Processing message ({len(message)} chars)...")
        # Tokenization (input truncated to at most 2048 tokens).
        token_start = time.time()
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        token_time = time.time() - token_start
        print(f" ✓ Tokenized in {token_time:.2f}s (input_ids: {inputs['input_ids'].shape})")
        # Move input tensors onto the model's device (GPU when available).
        device_start = time.time()
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        device_time = time.time() - device_start
        print(f" ✓ Moved to device in {device_time:.2f}s")
        # Sampled generation; no_grad avoids building autograd state.
        gen_start = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.6,
                top_p=0.85,
                top_k=40,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
            )
        gen_time = time.time() - gen_start
        print(f" ✓ Generated in {gen_time:.2f}s (output shape: {outputs.shape})")
        # Decode, then strip the echoed prompt: keep only the text after the
        # final "[/INST]" / "Francis Bacon:" markers.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "[/INST]" in response:
            response = response.split("[/INST]")[-1].strip()
        if "Francis Bacon:" in response:
            response = response.split("Francis Bacon:")[-1].strip()
        total_time = time.time() - start_time
        print(f" ✓ Total time: {total_time:.2f}s")
        return response
    except Exception as e:
        # Surface a rich diagnostic report in the chat window instead of
        # crashing the event handler.
        import traceback
        error_details = f"""
❌ ERROR DURING GENERATION:
**Error Message:**
{str(e)}
**Diagnostic Information:**
- Model loaded: {model is not None}
- Tokenizer loaded: {tokenizer is not None}
- Model device: {model.device if model is not None else 'N/A'}
- CUDA available: {torch.cuda.is_available()}
- HF_TOKEN configured: {bool(HF_TOKEN)}
**Full Traceback:**
```
{traceback.format_exc()}
```
**Next Steps:**
1. Check HuggingFace Hub credentials
2. Verify model and LoRA repo permissions
3. Check available disk space for model cache
4. Try with base model only (disable LoRA)
"""
        print(error_details)
        return error_details
if HAS_SPACES:
    # BUG FIX: ZeroGPU only attaches a GPU while a @spaces.GPU-decorated
    # function is executing — without this decorator the model never gets
    # CUDA on a ZeroGPU Space and the quota handling described below never
    # applies. `duration` caps each call's GPU reservation (seconds).
    @spaces.GPU(duration=120)
    def generate_response(message: str, request: gr.Request = None) -> str:
        """GPU-accelerated response generation

        When OAuth is enabled (hf_oauth: true in README.md) and Gradio 5.12.0+,
        the request parameter is automatically populated by Gradio for all users.
        ZeroGPU uses the authentication info from the request to apply proper
        quota limits (300s for anonymous/free, 1500s for PRO users).
        """
        return generate_response_impl(message, request)
else:
    def generate_response(message: str, request: gr.Request = None) -> str:
        """Wrapper for local development (no ZeroGPU)"""
        return generate_response_impl(message, request)
def create_interface():
    """Build and return the Gradio Blocks UI (chat, quick questions, login).

    Returns:
        gr.Blocks: the assembled demo, ready for ``.launch()``.
    """
    with gr.Blocks(title="Francis Botcon", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Francis Botcon")
        gr.Markdown("A chatbot emulating Francis Bacon (1561-1626)")
        # Display status info and login button (only meaningful on Spaces).
        if HAS_SPACES:
            with gr.Row():
                with gr.Column(scale=3):
                    status_info = """
**🚀 ZeroGPU Status:**
- Sign in for full ZeroGPU quota
- PRO users get 5x more quota (1500s vs 300s)
- Your login status is automatically detected
"""
                    gr.Markdown(status_info)
                with gr.Column(scale=1):
                    # HF OAuth sign-in button; login raises the user's quota.
                    gr.LoginButton()
        else:
            gr.Markdown("**Status:** 🏠 Running locally - No HF Spaces GPU limits")
        gr.Markdown("""
Francis Bacon was a pioneering British philosopher who revolutionized thinking
about the scientific method. He emphasized empirical observation and experimentation.
**His major works:**
- Novum Organum (1620)
- The Advancement of Learning (1605)
- Essays (1597, 1625)
- New Atlantis (1627)
""")
        example_questions = [
            "What is the true nature of knowledge?",
            "How should we conduct scientific inquiry?",
            "What are the idols of the mind?",
            "Can you share your thoughts on ethics?",
            "What is the purpose of learning?",
        ]
        chatbot = gr.Chatbot(label="Conversation with Francis Bacon", height=400)
        msg = gr.Textbox(label="Your Message", placeholder="Ask Francis Bacon...")
        submit_btn = gr.Button("Send")
        clear_btn = gr.Button("Clear")
        gr.Markdown("**Quick Questions:**")
        # Quick-question buttons fill the textbox. `q=question` binds the
        # loop variable at definition time (avoids late-binding closures).
        with gr.Row():
            for question in example_questions[:3]:
                btn = gr.Button(question, size="sm")
                btn.click(lambda q=question: q, outputs=msg)
        with gr.Row():
            for question in example_questions[3:]:
                btn = gr.Button(question, size="sm")
                btn.click(lambda q=question: q, outputs=msg)
        def respond(message, chat_history, request: gr.Request):
            # Event handler: append (user, bot) tuple to the chat history.
            # The gr.Request annotation makes Gradio inject the request.
            if not message.strip():
                return chat_history
            chat_history.append((message, None))
            # Pass the request object to generate_response for ZeroGPU auth
            response = generate_response(message, request)
            chat_history[-1] = (message, response)
            return chat_history
        # Gradio automatically passes user OAuth info via gr.Request to functions.
        # The .then(...) chain clears the textbox after each submission.
        msg.submit(respond, [msg, chatbot], chatbot).then(lambda: "", outputs=msg)
        submit_btn.click(respond, [msg, chatbot], chatbot).then(lambda: "", outputs=msg)
        clear_btn.click(lambda: [], outputs=chatbot)
    return demo
| if __name__ == "__main__": | |
| print("\nStarting Francis Botcon...") | |
| try: | |
| demo = create_interface() | |
| print("Launching Gradio interface...") | |
| demo.launch( | |
| server_name="0.0.0.0", | |
| server_port=7860, | |
| share=False, | |
| show_error=True, | |
| ) | |
| except Exception as e: | |
| print(f"\n❌ ERROR: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |