|
|
|
|
|
""" |
|
|
Polished chat interface for GPT-OSS-120B with proper response parsing |
|
|
""" |
|
|
|
|
|
from mlx_lm import load, generate |
|
|
import logging |
|
|
import re |
|
|
import time |
|
|
from typing import List, Dict |
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class GPTOSSChat:
    """Terminal chat wrapper around the MLX-quantized GPT-OSS-120B model.

    The model is loaded once in ``__init__``; ``generate_response`` wraps
    prompt formatting + generation, and ``interactive_chat`` runs a
    blocking REPL-style conversation loop.
    """

    def __init__(self):
        # Loading the 120B checkpoint is slow and memory hungry, so it is
        # done exactly once per instance.
        logger.info("π Loading GPT-OSS-120B...")
        self.model, self.tokenizer = load("mlx-community/gpt-oss-120b-MXFP4-Q4")
        # BUG FIX: this log message was previously split across two source
        # lines, leaving an unterminated string literal (SyntaxError).
        logger.info("β Model loaded successfully!")

    def extract_final_response(self, response: str) -> str:
        """Extract the final assistant message from raw chat-template output.

        The raw generation may contain harmony-style control tokens such as
        ``<|start|>assistant``, ``<|channel|>...``, ``<|message|>`` and
        ``<|end|>``. This strips them and returns only the visible text.

        Args:
            response: Raw text produced by ``generate``.

        Returns:
            The cleaned assistant message (may be empty if nothing remains).
        """
        if "<|start|>assistant" in response:
            parts = response.split("<|start|>assistant")
            if len(parts) > 1:
                # Keep only the text after the LAST assistant turn.
                final_part = parts[-1]

                # Drop channel routing markers (e.g. "<|channel|>final").
                final_part = re.sub(r'<\|channel\|>[^<]+', '', final_part)
                final_part = final_part.replace('<|message|>', '')
                final_part = final_part.replace('<|end|>', '')

                # Strip any remaining angle-bracket special tokens.
                final_part = re.sub(r'<[^>]+>', '', final_part)
                final_part = final_part.strip()

                if final_part:
                    return final_part

        # Fallback: scrub special tokens from the whole response.
        cleaned = re.sub(r'<\|[^>]+\|>', '', response)
        cleaned = re.sub(r'<[^>]+>', '', cleaned)
        return cleaned.strip()

    def generate_response(self, prompt: str, max_tokens: int = 2048) -> str:
        """Generate a cleaned model response for a single user prompt.

        Args:
            prompt: The user's message (single turn; no history included).
            max_tokens: Upper bound on generated tokens.

        Returns:
            The cleaned assistant text, or an error description on failure.
        """
        try:
            messages = [{"role": "user", "content": prompt}]
            # NOTE(review): apply_chat_template tokenizes by default, so
            # this is likely token ids rather than text; mlx_lm.generate
            # appears to accept either — confirm against mlx_lm docs.
            formatted_prompt = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True
            )

            response = generate(
                self.model,
                self.tokenizer,
                prompt=formatted_prompt,
                max_tokens=max_tokens,
                verbose=False
            )

            return self.extract_final_response(response)

        except Exception as e:
            # Deliberate best-effort: surface the failure as chat text
            # instead of crashing the interactive loop.
            logger.error(f"Generation error: {e}")
            return f"I encountered an error: {str(e)}"

    def interactive_chat(self):
        """Beautiful interactive chat interface (blocking REPL loop)."""
        print("\n" + "=" * 60)
        print("π€ GPT-OSS-120B Chat Interface")
        print("=" * 60)
        print("π‘ Your M3 Ultra is running a 120B parameter model locally!")
        print("π― Type your messages below (type '/quit' to exit)")
        print("=" * 60)

        # NOTE(review): history is recorded but never fed back into the
        # prompt, so the model sees each message in isolation.
        conversation_history = []

        while True:
            try:
                user_input = input("\nπ€ You: ").strip()

                if user_input.lower() in ['/quit', '/exit', '/bye']:
                    print("π Goodbye! It was amazing chatting with you!")
                    break

                if user_input.lower() == '/clear':
                    conversation_history = []
                    print("π§Ή Conversation cleared!")
                    continue

                if user_input.lower() == '/help':
                    print("\nπ Available commands:")
                    print(" /quit - Exit the chat")
                    print(" /clear - Clear conversation history")
                    print(" /help - Show this help message")
                    continue

                if not user_input:
                    continue

                # "\r" so the status line is overwritten by the answer.
                print("π Thinking...", end="\r")
                start_time = time.time()

                response = self.generate_response(user_input, max_tokens=2048)

                generation_time = time.time() - start_time

                conversation_history.append({"user": user_input, "ai": response})

                print(f"π€ AI ({generation_time:.1f}s): {response}")

            except KeyboardInterrupt:
                print("\n\nπ Thanks for chatting! Goodbye!")
                break
            except Exception as e:
                print(f"\nβ Error: {e}")
|
def demonstration_mode(ai=None):
    """Showcase the model's capabilities with beautiful formatting.

    Args:
        ai: Optional pre-loaded ``GPTOSSChat`` instance. Previously this
            function always constructed its own instance, which forced a
            second full model load when the caller (e.g. the ``__main__``
            block) already had one. Pass an existing instance to reuse it;
            defaults to None, preserving the old load-a-new-model behavior.
    """
    print("\n" + "=" * 60)
    print("π GPT-OSS-120B Capabilities Demonstration")
    print("=" * 60)

    if ai is None:
        ai = GPTOSSChat()

    demonstrations = [
        {
            "prompt": "Explain quantum computing like I'm 10 years old",
            "description": "Simplified explanation"
        },
        {
            "prompt": "Write a beautiful haiku about the ocean and technology",
            "description": "Creative writing"
        },
        {
            "prompt": "What are the most exciting recent developments in AI?",
            "description": "Technical knowledge"
        },
        {
            "prompt": "How would you describe the feeling of wonder to an alien?",
            "description": "Philosophical reasoning"
        },
        {
            "prompt": "Create a short story about a robot who discovers poetry",
            "description": "Creative fiction"
        }
    ]

    for i, demo in enumerate(demonstrations, 1):
        print(f"\n{i}. π {demo['description']}")
        print(f" π '{demo['prompt']}'")

        response = ai.generate_response(demo['prompt'], max_tokens=2048)

        # Print each line of the answer indented under the prompt.
        lines = response.split('\n')
        for line in lines:
            print(f" π€ {line}")

        print(" " + "β" * 50)
        time.sleep(2)  # brief pause so each demo is readable before the next
|
|
if __name__ == "__main__":
    # Startup banner describing the runtime configuration.
    banner = (
        "π Starting GPT-OSS-120B Chat System",
        "πΎ Model: 120B parameters, 4-bit quantized",
        "π Hardware: Apple M3 Ultra with 512GB RAM",
        "β‘ Performance: ~95 tokens/second",
    )
    for banner_line in banner:
        print(banner_line)

    # NOTE(review): demonstration_mode() constructs its own GPTOSSChat,
    # so the 120B model is loaded twice by this script — confirm intent.
    chat = GPTOSSChat()

    demonstration_mode()

    divider = "=" * 60
    print("\n" + divider)
    print("π¬ Starting Interactive Chat Mode...")
    print(divider)
    chat.interactive_chat()