# phi35-moe-demo / app.py
# Uploaded by ianshank via huggingface_hub (commit 2716eda, verified).
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import json
import warnings
from typing import List, Dict, Any

# Suppress library warnings for cleaner Space logs.
warnings.filterwarnings("ignore")

# Model identifier on the Hugging Face Hub.
model_name = "microsoft/Phi-3.5-MoE-instruct"

# Load model and tokenizer with error handling. On any failure the app keeps
# running in a degraded "fallback" mode: pipe/tokenizer are set to None and
# generate_response() serves canned replies instead of crashing the Space.
try:
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    print("Loading model...")
    # CPU-compatible settings for Hugging Face Spaces: float16 instead of
    # bfloat16 for broader hardware compatibility, device_map="auto" to place
    # weights wherever memory allows, low_cpu_mem_usage to reduce peak RAM.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    print("Creating pipeline...")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    print("Model loaded successfully!")
except Exception as e:
    # Broad catch is deliberate here: any load failure (missing deps such as
    # einops/flash_attn, OOM, network) should degrade to fallback mode.
    print(f"Error loading model: {e}")
    print("This is likely due to missing dependencies (einops, flash_attn) or memory constraints.")
    print("The model will run in fallback mode.")
    # Sentinels consumed by generate_response(): None triggers fallback replies.
    pipe = None
    tokenizer = None
def classify_query_type(query: str) -> str:
    """Classify a user query to pick the best-matching expert persona.

    Scores the query against per-expert keyword lists and returns the
    name of the expert with the most whole-word keyword hits: one of
    "Code", "Math", "Reasoning", "Multilingual" or "General". Falls back
    to "General" when no keyword matches (including empty input).
    Ties are broken by dict insertion order (Code first), matching the
    original max() behavior.
    """
    import re  # local import keeps this function self-contained

    query_lower = query.lower()
    expert_keywords = {
        "Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code"],
        "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate"],
        "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because"],
        "Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german"],
        "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where"],
    }

    # Whole-word matching (\b...\b) instead of plain substring search:
    # a naive `kw in query_lower` let e.g. "code" match "decoder" and
    # "hi" match "this", misrouting queries to the wrong expert.
    scores = {
        expert: sum(
            1
            for kw in keywords
            if re.search(rf"\b{re.escape(kw)}\b", query_lower)
        )
        for expert, keywords in expert_keywords.items()
    }

    best_expert = max(scores, key=scores.get)
    return best_expert if scores[best_expert] > 0 else "General"
def generate_fallback_response(query: str, expert_type: str) -> str:
    """Produce the canned reply used when the model failed to load.

    The reply names the expert persona, echoes the user's query, and
    describes what that expert would typically do. Unknown expert types
    degrade to the "General" wording.
    """
    # What each expert "would typically" do — spliced into a shared frame.
    typical_actions = {
        "Code": "provide detailed code examples and programming guidance",
        "Math": "solve mathematical problems step-by-step",
        "Reasoning": "provide logical analysis and systematic problem-solving",
        "Multilingual": "help with translations and language learning",
        "General": "provide helpful and informative responses",
    }

    if expert_type not in typical_actions:
        expert_type = "General"
    action = typical_actions[expert_type]

    return (
        f"I'm a {expert_type} Expert, but the Phi-3.5-MoE model is currently "
        f"unavailable. For your question about '{query}', I would typically "
        f"{action}. Please try again later when the model is loaded."
    )
def generate_response(query: str, max_tokens: int = 500, temperature: float = 0.7) -> str:
    """Route *query* to an expert persona and generate a markdown reply.

    Args:
        query: The user's question.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature for generation.

    Returns:
        Markdown naming the detected expert type followed by the model's
        answer; a canned fallback notice when the model is unavailable;
        or an error message if generation raises.
    """
    try:
        # Pick the expert persona from the query's keywords.
        expert_type = classify_query_type(query)

        # Model failed to load at startup -> canned fallback reply.
        # NOTE: the original used literal "\\n" escapes here, which rendered
        # as backslash-n text in the markdown output instead of line breaks.
        if pipe is None or tokenizer is None:
            return (
                f"**Expert Type:** {expert_type}\n\n"
                f"**Response:**\n{generate_fallback_response(query, expert_type)}"
            )

        # System prompt per expert persona.
        system_messages = {
            "Code": "You are an expert software engineer and programming assistant. Provide clear, well-commented code examples and explain programming concepts thoroughly.",
            "Math": "You are a mathematics expert. Solve problems step-by-step, show your work, and explain mathematical concepts clearly.",
            "Reasoning": "You are a logical reasoning expert. Break down complex problems, analyze them systematically, and provide clear explanations.",
            "Multilingual": "You are a multilingual expert. Help with translations, language learning, and cross-cultural communication.",
            "General": "You are a helpful AI assistant. Provide accurate, helpful, and informative responses to user questions.",
        }
        system_message = system_messages.get(expert_type, system_messages["General"])

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": query},
        ]

        response = pipe(
            messages,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        generated_text = response[0]["generated_text"]
        # With chat-style (messages) input, the transformers pipeline returns
        # the whole conversation as a list of {"role", "content"} dicts; the
        # assistant's reply is the last entry. Plain-string outputs keep the
        # original "Assistant:" split as a best-effort fallback.
        if isinstance(generated_text, list):
            assistant_response = str(generated_text[-1].get("content", "")).strip()
        elif "Assistant:" in generated_text:
            assistant_response = generated_text.split("Assistant:")[-1].strip()
        else:
            assistant_response = generated_text

        return f"**Expert Type:** {expert_type}\n\n**Response:**\n{assistant_response}"
    except Exception as e:
        # Surface the error to the UI rather than crashing the handler.
        return (
            f"❌ **Error generating response:** {str(e)}\n\n"
            "Please try again or check the logs for more details."
        )
def create_interface():
    """Build and return the (unlaunched) Gradio Blocks UI.

    Layout: header markdown, then a row with an input column (question
    textbox, generation sliders, submit button) beside an output column,
    followed by clickable example queries. Both the button click and
    pressing Enter in the textbox invoke generate_response().
    """
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown("""
This is a specialized AI assistant powered by Microsoft's Phi-3.5-MoE model.
It automatically routes your queries to the most appropriate expert:
- **Code Expert**: Programming, software development, algorithms
- **Math Expert**: Mathematics, calculations, problem solving
- **Reasoning Expert**: Logic, analysis, critical thinking
- **Multilingual Expert**: Translation and language assistance
- **General Expert**: General purpose assistance
""")

        with gr.Row():
            # Left column: question input and generation controls.
            with gr.Column(scale=3):
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=3,
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=1000,
                        value=500,
                        step=50,
                        label="Max Tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )
                submit_btn = gr.Button("Generate Response", variant="primary")

            # Right column: rendered markdown answer.
            with gr.Column(scale=2):
                response_output = gr.Markdown(label="Response")

        # Clickable example queries that prefill the input box.
        gr.Markdown("### 💡 Example Queries")
        examples = [
            "How do I implement a binary search algorithm in Python?",
            "What is the derivative of x² + 3x + 1?",
            "Explain the logical reasoning behind the Monty Hall problem",
            "Translate 'Hello, how are you?' to Spanish",
            "What are the benefits of renewable energy?",
        ]
        gr.Examples(
            examples=examples,
            inputs=query_input,
        )

        # Event handlers: button click and textbox Enter both generate.
        submit_btn.click(
            fn=generate_response,
            inputs=[query_input, max_tokens, temperature],
            outputs=response_output,
        )
        query_input.submit(
            fn=generate_response,
            inputs=[query_input, max_tokens, temperature],
            outputs=response_output,
        )

    return demo
# Create and launch the interface when run as a script (HF Spaces entry point).
if __name__ == "__main__":
    demo = create_interface()
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)