# phi35-moe-demo / app.py
# Uploaded by ianshank via huggingface_hub (commit 2716eda, verified).
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import json
import warnings
from typing import List, Dict, Any

# Suppress library warnings for cleaner Space logs.
warnings.filterwarnings("ignore")

# Model identifier on the Hugging Face Hub.
model_name = "microsoft/Phi-3.5-MoE-instruct"

# Load model and tokenizer with error handling. On any failure the app keeps
# running in a degraded "fallback" mode: pipe/tokenizer are set to None and
# generate_response() serves canned replies instead of crashing the Space.
try:
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    print("Loading model...")
    # CPU-compatible settings for Hugging Face Spaces: float16 instead of
    # bfloat16 for broader hardware compatibility, device_map="auto" to place
    # weights wherever memory allows, low_cpu_mem_usage to reduce peak RAM.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    print("Creating pipeline...")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    print("Model loaded successfully!")
except Exception as e:
    # Broad catch is deliberate here: any load failure (missing deps such as
    # einops/flash_attn, OOM, network) should degrade to fallback mode.
    print(f"Error loading model: {e}")
    print("This is likely due to missing dependencies (einops, flash_attn) or memory constraints.")
    print("The model will run in fallback mode.")
    # Sentinels consumed by generate_response(): None triggers fallback replies.
    pipe = None
    tokenizer = None
def classify_query_type(query: str) -> str:
    """Classify a user query to pick the best-matching expert persona.

    Scores the query against per-expert keyword lists and returns the
    name of the expert with the most whole-word keyword hits: one of
    "Code", "Math", "Reasoning", "Multilingual" or "General". Falls back
    to "General" when no keyword matches (including empty input).
    Ties are broken by dict insertion order (Code first), matching the
    original max() behavior.
    """
    import re  # local import keeps this function self-contained

    query_lower = query.lower()
    expert_keywords = {
        "Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code"],
        "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate"],
        "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because"],
        "Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german"],
        "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where"],
    }

    # Whole-word matching (\b...\b) instead of plain substring search:
    # a naive `kw in query_lower` let e.g. "code" match "decoder" and
    # "hi" match "this", misrouting queries to the wrong expert.
    scores = {
        expert: sum(
            1
            for kw in keywords
            if re.search(rf"\b{re.escape(kw)}\b", query_lower)
        )
        for expert, keywords in expert_keywords.items()
    }

    best_expert = max(scores, key=scores.get)
    return best_expert if scores[best_expert] > 0 else "General"
def generate_fallback_response(query: str, expert_type: str) -> str:
    """Produce the canned reply used when the model failed to load.

    The reply names the expert persona, echoes the user's query, and
    describes what that expert would typically do. Unknown expert types
    degrade to the "General" wording.
    """
    # What each expert "would typically" do — spliced into a shared frame.
    typical_actions = {
        "Code": "provide detailed code examples and programming guidance",
        "Math": "solve mathematical problems step-by-step",
        "Reasoning": "provide logical analysis and systematic problem-solving",
        "Multilingual": "help with translations and language learning",
        "General": "provide helpful and informative responses",
    }

    if expert_type not in typical_actions:
        expert_type = "General"
    action = typical_actions[expert_type]

    return (
        f"I'm a {expert_type} Expert, but the Phi-3.5-MoE model is currently "
        f"unavailable. For your question about '{query}', I would typically "
        f"{action}. Please try again later when the model is loaded."
    )
def generate_response(query: str, max_tokens: int = 500, temperature: float = 0.7) -> str:
    """Route *query* to an expert persona and generate a markdown reply.

    Args:
        query: The user's question.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature for generation.

    Returns:
        Markdown naming the detected expert type followed by the model's
        answer; a canned fallback notice when the model is unavailable;
        or an error message if generation raises.
    """
    try:
        # Pick the expert persona from the query's keywords.
        expert_type = classify_query_type(query)

        # Model failed to load at startup -> canned fallback reply.
        # NOTE: the original used literal "\\n" escapes here, which rendered
        # as backslash-n text in the markdown output instead of line breaks.
        if pipe is None or tokenizer is None:
            return (
                f"**Expert Type:** {expert_type}\n\n"
                f"**Response:**\n{generate_fallback_response(query, expert_type)}"
            )

        # System prompt per expert persona.
        system_messages = {
            "Code": "You are an expert software engineer and programming assistant. Provide clear, well-commented code examples and explain programming concepts thoroughly.",
            "Math": "You are a mathematics expert. Solve problems step-by-step, show your work, and explain mathematical concepts clearly.",
            "Reasoning": "You are a logical reasoning expert. Break down complex problems, analyze them systematically, and provide clear explanations.",
            "Multilingual": "You are a multilingual expert. Help with translations, language learning, and cross-cultural communication.",
            "General": "You are a helpful AI assistant. Provide accurate, helpful, and informative responses to user questions.",
        }
        system_message = system_messages.get(expert_type, system_messages["General"])

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": query},
        ]

        response = pipe(
            messages,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        generated_text = response[0]["generated_text"]
        # With chat-style (messages) input, the transformers pipeline returns
        # the whole conversation as a list of {"role", "content"} dicts; the
        # assistant's reply is the last entry. Plain-string outputs keep the
        # original "Assistant:" split as a best-effort fallback.
        if isinstance(generated_text, list):
            assistant_response = str(generated_text[-1].get("content", "")).strip()
        elif "Assistant:" in generated_text:
            assistant_response = generated_text.split("Assistant:")[-1].strip()
        else:
            assistant_response = generated_text

        return f"**Expert Type:** {expert_type}\n\n**Response:**\n{assistant_response}"
    except Exception as e:
        # Surface the error to the UI rather than crashing the handler.
        return (
            f"❌ **Error generating response:** {str(e)}\n\n"
            "Please try again or check the logs for more details."
        )
def create_interface():
    """Build and return the (unlaunched) Gradio Blocks UI.

    Layout: header markdown, then a row with an input column (question
    textbox, generation sliders, submit button) beside an output column,
    followed by clickable example queries. Both the button click and
    pressing Enter in the textbox invoke generate_response().
    """
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown("""
This is a specialized AI assistant powered by Microsoft's Phi-3.5-MoE model.
It automatically routes your queries to the most appropriate expert:
- **Code Expert**: Programming, software development, algorithms
- **Math Expert**: Mathematics, calculations, problem solving
- **Reasoning Expert**: Logic, analysis, critical thinking
- **Multilingual Expert**: Translation and language assistance
- **General Expert**: General purpose assistance
""")

        with gr.Row():
            # Left column: question input and generation controls.
            with gr.Column(scale=3):
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=3,
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=1000,
                        value=500,
                        step=50,
                        label="Max Tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )
                submit_btn = gr.Button("Generate Response", variant="primary")

            # Right column: rendered markdown answer.
            with gr.Column(scale=2):
                response_output = gr.Markdown(label="Response")

        # Clickable example queries that prefill the input box.
        gr.Markdown("### 💡 Example Queries")
        examples = [
            "How do I implement a binary search algorithm in Python?",
            "What is the derivative of x² + 3x + 1?",
            "Explain the logical reasoning behind the Monty Hall problem",
            "Translate 'Hello, how are you?' to Spanish",
            "What are the benefits of renewable energy?",
        ]
        gr.Examples(
            examples=examples,
            inputs=query_input,
        )

        # Event handlers: button click and textbox Enter both generate.
        submit_btn.click(
            fn=generate_response,
            inputs=[query_input, max_tokens, temperature],
            outputs=response_output,
        )
        query_input.submit(
            fn=generate_response,
            inputs=[query_input, max_tokens, temperature],
            outputs=response_output,
        )

    return demo
# Create and launch the interface when run as a script (HF Spaces entry point).
if __name__ == "__main__":
    demo = create_interface()
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)