|
|
|
|
|
""" |
|
|
Polished chat interface for GPT-OSS-120B with proper response parsing |
|
|
""" |
|
|
|
|
|
from mlx_lm import load, generate |
|
|
import logging |
|
|
import re |
|
|
import time |
|
|
from typing import List, Dict |
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class GPTOSSChat:
    """Terminal chat wrapper around the MLX-quantized GPT-OSS-120B model.

    The model is loaded once in ``__init__``; ``generate_response`` wraps
    prompt formatting + generation, and ``interactive_chat`` runs a
    blocking REPL-style conversation loop.
    """

    def __init__(self):
        # Loading the 120B checkpoint is slow and memory hungry, so it is
        # done exactly once per instance.
        logger.info("π Loading GPT-OSS-120B...")
        self.model, self.tokenizer = load("mlx-community/gpt-oss-120b-MXFP4-Q4")
        # BUG FIX: this log message was previously split across two source
        # lines, leaving an unterminated string literal (SyntaxError).
        logger.info("β Model loaded successfully!")

    def extract_final_response(self, response: str) -> str:
        """Extract the final assistant message from raw chat-template output.

        The raw generation may contain harmony-style control tokens such as
        ``<|start|>assistant``, ``<|channel|>...``, ``<|message|>`` and
        ``<|end|>``. This strips them and returns only the visible text.

        Args:
            response: Raw text produced by ``generate``.

        Returns:
            The cleaned assistant message (may be empty if nothing remains).
        """
        if "<|start|>assistant" in response:
            parts = response.split("<|start|>assistant")
            if len(parts) > 1:
                # Keep only the text after the LAST assistant turn.
                final_part = parts[-1]

                # Drop channel routing markers (e.g. "<|channel|>final").
                final_part = re.sub(r'<\|channel\|>[^<]+', '', final_part)
                final_part = final_part.replace('<|message|>', '')
                final_part = final_part.replace('<|end|>', '')

                # Strip any remaining angle-bracket special tokens.
                final_part = re.sub(r'<[^>]+>', '', final_part)
                final_part = final_part.strip()

                if final_part:
                    return final_part

        # Fallback: scrub special tokens from the whole response.
        cleaned = re.sub(r'<\|[^>]+\|>', '', response)
        cleaned = re.sub(r'<[^>]+>', '', cleaned)
        return cleaned.strip()

    def generate_response(self, prompt: str, max_tokens: int = 2048) -> str:
        """Generate a cleaned model response for a single user prompt.

        Args:
            prompt: The user's message (single turn; no history included).
            max_tokens: Upper bound on generated tokens.

        Returns:
            The cleaned assistant text, or an error description on failure.
        """
        try:
            messages = [{"role": "user", "content": prompt}]
            # NOTE(review): apply_chat_template tokenizes by default, so
            # this is likely token ids rather than text; mlx_lm.generate
            # appears to accept either — confirm against mlx_lm docs.
            formatted_prompt = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True
            )

            response = generate(
                self.model,
                self.tokenizer,
                prompt=formatted_prompt,
                max_tokens=max_tokens,
                verbose=False
            )

            return self.extract_final_response(response)

        except Exception as e:
            # Deliberate best-effort: surface the failure as chat text
            # instead of crashing the interactive loop.
            logger.error(f"Generation error: {e}")
            return f"I encountered an error: {str(e)}"

    def interactive_chat(self):
        """Beautiful interactive chat interface (blocking REPL loop)."""
        print("\n" + "=" * 60)
        print("π€ GPT-OSS-120B Chat Interface")
        print("=" * 60)
        print("π‘ Your M3 Ultra is running a 120B parameter model locally!")
        print("π― Type your messages below (type '/quit' to exit)")
        print("=" * 60)

        # NOTE(review): history is recorded but never fed back into the
        # prompt, so the model sees each message in isolation.
        conversation_history = []

        while True:
            try:
                user_input = input("\nπ€ You: ").strip()

                if user_input.lower() in ['/quit', '/exit', '/bye']:
                    print("π Goodbye! It was amazing chatting with you!")
                    break

                if user_input.lower() == '/clear':
                    conversation_history = []
                    print("π§Ή Conversation cleared!")
                    continue

                if user_input.lower() == '/help':
                    print("\nπ Available commands:")
                    print(" /quit - Exit the chat")
                    print(" /clear - Clear conversation history")
                    print(" /help - Show this help message")
                    continue

                if not user_input:
                    continue

                # "\r" so the status line is overwritten by the answer.
                print("π Thinking...", end="\r")
                start_time = time.time()

                response = self.generate_response(user_input, max_tokens=2048)

                generation_time = time.time() - start_time

                conversation_history.append({"user": user_input, "ai": response})

                print(f"π€ AI ({generation_time:.1f}s): {response}")

            except KeyboardInterrupt:
                print("\n\nπ Thanks for chatting! Goodbye!")
                break
            except Exception as e:
                print(f"\nβ Error: {e}")
|
def demonstration_mode(ai=None):
    """Showcase the model's capabilities with beautiful formatting.

    Args:
        ai: Optional pre-loaded ``GPTOSSChat`` instance. Previously this
            function always constructed its own instance, which forced a
            second full model load when the caller (e.g. the ``__main__``
            block) already had one. Pass an existing instance to reuse it;
            defaults to None, preserving the old load-a-new-model behavior.
    """
    print("\n" + "=" * 60)
    print("π GPT-OSS-120B Capabilities Demonstration")
    print("=" * 60)

    if ai is None:
        ai = GPTOSSChat()

    demonstrations = [
        {
            "prompt": "Explain quantum computing like I'm 10 years old",
            "description": "Simplified explanation"
        },
        {
            "prompt": "Write a beautiful haiku about the ocean and technology",
            "description": "Creative writing"
        },
        {
            "prompt": "What are the most exciting recent developments in AI?",
            "description": "Technical knowledge"
        },
        {
            "prompt": "How would you describe the feeling of wonder to an alien?",
            "description": "Philosophical reasoning"
        },
        {
            "prompt": "Create a short story about a robot who discovers poetry",
            "description": "Creative fiction"
        }
    ]

    for i, demo in enumerate(demonstrations, 1):
        print(f"\n{i}. π {demo['description']}")
        print(f" π '{demo['prompt']}'")

        response = ai.generate_response(demo['prompt'], max_tokens=2048)

        # Print each line of the answer indented under the prompt.
        lines = response.split('\n')
        for line in lines:
            print(f" π€ {line}")

        print(" " + "β" * 50)
        time.sleep(2)  # brief pause so each demo is readable before the next
|
|
if __name__ == "__main__":
    # Startup banner describing the runtime configuration.
    banner = (
        "π Starting GPT-OSS-120B Chat System",
        "πΎ Model: 120B parameters, 4-bit quantized",
        "π Hardware: Apple M3 Ultra with 512GB RAM",
        "β‘ Performance: ~95 tokens/second",
    )
    for banner_line in banner:
        print(banner_line)

    # NOTE(review): demonstration_mode() constructs its own GPTOSSChat,
    # so the 120B model is loaded twice by this script — confirm intent.
    chat = GPTOSSChat()

    demonstration_mode()

    divider = "=" * 60
    print("\n" + divider)
    print("π¬ Starting Interactive Chat Mode...")
    print(divider)
    chat.interactive_chat()