""" Client for Qwen ONNX Model API Use this to interact with the api_server.py """ import requests import json from typing import List, Dict, Optional class QwenAPIClient: def __init__(self, base_url: str = "http://localhost:8000"): """ Initialize the API client. Args: base_url: Base URL of the API server (default: localhost:8000) """ self.base_url = base_url.rstrip("/") self.session = requests.Session() def health_check(self) -> Dict: """Check if the API is healthy""" response = self.session.get(f"{self.base_url}/health") response.raise_for_status() return response.json() def generate( self, prompt: str, max_length: int = 100, temperature: float = 0.6, top_p: float = 0.95, top_k: int = 20 ) -> Dict: """ Generate text from a prompt. Args: prompt: Input prompt max_length: Maximum tokens to generate temperature: Sampling temperature top_p: Top-p sampling parameter top_k: Top-k sampling parameter Returns: Response with generated text """ payload = { "prompt": prompt, "max_length": max_length, "temperature": temperature, "top_p": top_p, "top_k": top_k } response = self.session.post( f"{self.base_url}/generate", json=payload ) response.raise_for_status() return response.json() def chat( self, messages: List[Dict[str, str]], max_length: int = 200, temperature: float = 0.6, top_p: float = 0.95, top_k: int = 20 ) -> Dict: """ Chat with the model. Args: messages: List of message dicts with 'role' and 'content' max_length: Maximum tokens to generate temperature: Sampling temperature top_p: Top-p sampling parameter top_k: Top-k sampling parameter Returns: Response with assistant message """ payload = { "messages": messages, "max_length": max_length, "temperature": temperature, "top_p": top_p, "top_k": top_k } response = self.session.post( f"{self.base_url}/chat", json=payload ) response.raise_for_status() return response.json() def tokenize(self, text: str) -> Dict: """ Tokenize text. Args: text: Text to tokenize Returns: Response with token IDs """ payload = {"text": text} response = self.session.post( f"{self.base_url}/tokenize", json=payload ) response.raise_for_status() return response.json() def model_info(self) -> Dict: """Get model information""" response = self.session.get(f"{self.base_url}/info") response.raise_for_status() return response.json() def main(): """Example usage of the API client""" # Initialize client client = QwenAPIClient("http://localhost:8000") # Check health print("Checking API health...") try: health = client.health_check() print(f"Status: {health['status']}\n") except requests.exceptions.ConnectionError: print("ERROR: Could not connect to API server.") print("Make sure to run: python api_server.py") return # Example 1: Generate text print("="*60) print("Example 1: Text Generation") print("="*60) try: result = client.generate( prompt="What is artificial intelligence?", max_length=150, temperature=0.7 ) print(f"Prompt: {result['prompt']}") print(f"Response: {result['generated_text']}\n") except Exception as e: print(f"Error: {e}\n") # Example 2: Chat print("="*60) print("Example 2: Chat") print("="*60) try: messages = [ {"role": "system", "content": "You are a helpful Python assistant."}, {"role": "user", "content": "How do I read a file in Python?"} ] result = client.chat(messages, max_length=200) print(f"User: {messages[-1]['content']}") print(f"Assistant: {result['assistant_response']}\n") except Exception as e: print(f"Error: {e}\n") # Example 3: Multi-turn conversation print("="*60) print("Example 3: Multi-turn Chat") print("="*60) try: conversation = [ {"role": "system", "content": "You are an expert programmer."} ] # Turn 1 conversation.append({"role": "user", "content": "What is recursion?"}) result1 = client.chat(conversation, max_length=150) response1 = result1['assistant_response'] print(f"User: {conversation[-1]['content']}") print(f"Assistant: {response1}\n") # Add to conversation and continue conversation.append({"role": "assistant", "content": response1}) conversation.append({"role": "user", "content": "Can you give a code example?"}) result2 = client.chat(conversation, max_length=200) response2 = result2['assistant_response'] print(f"User: {conversation[-1]['content']}") print(f"Assistant: {response2}\n") except Exception as e: print(f"Error: {e}\n") # Example 4: Tokenization print("="*60) print("Example 4: Tokenization") print("="*60) try: result = client.tokenize("Hello, world!") print(f"Text: {result['text']}") print(f"Tokens: {result['token_ids']}") print(f"Number of tokens: {result['num_tokens']}\n") except Exception as e: print(f"Error: {e}\n") # Example 5: Model info print("="*60) print("Example 5: Model Information") print("="*60) try: info = client.model_info() print(f"Model Type: {info['model_type']}") print(f"Context Length: {info['context_length']}") print(f"Vocabulary Size: {info['vocab_size']}") print(f"Default Max Length: {info['default_max_length']}") print(f"Default Temperature: {info['default_temperature']}\n") except Exception as e: print(f"Error: {e}\n") # Interactive chat mode print("="*60) print("Interactive Chat Mode") print("="*60) print("Enter 'quit' to exit\n") conversation = [ {"role": "system", "content": "You are a helpful assistant."} ] while True: user_input = input("You: ").strip() if user_input.lower() == "quit": break if not user_input: continue try: conversation.append({"role": "user", "content": user_input}) result = client.chat(conversation, max_length=300, temperature=0.7) response = result['assistant_response'] print(f"Assistant: {response}\n") conversation.append({"role": "assistant", "content": response}) except Exception as e: print(f"Error: {e}\n") if __name__ == "__main__": main()