| import requests |
| import json |
| import os |
| import argparse |
|
|
def query_endpoint(endpoint_url, api_token=None, prompt="Hello, how are you?",
                   system_message="You are a helpful assistant.",
                   max_tokens=256, temperature=0.7,
                   format_type="openai", timeout=60):
    """
    Query the Phi-4 Mini model at the specified HuggingFace Inference Endpoint.

    Args:
        endpoint_url: The URL of your HuggingFace Inference Endpoint
        api_token: Your HuggingFace API token (if needed)
        prompt: The user message to send to the model
        system_message: The system message to include
        max_tokens: Maximum number of tokens to generate
        temperature: Temperature for generation (0.0 to 1.0)
        format_type: Type of request format to use:
                     "openai" - Standard OpenAI format
                     "hf_wrapped" - HuggingFace format with OpenAI format wrapped in "inputs"
                     "simple" - Simple text input in "inputs" field
        timeout: Seconds to wait for the server before giving up (default 60)

    Returns:
        The parsed JSON response from the model, or None if the request failed.

    Raises:
        ValueError: If format_type is not one of the supported values.
    """

    headers = {
        "Content-Type": "application/json"
    }

    if api_token:
        headers["Authorization"] = f"Bearer {api_token}"

    # Both chat-style formats share the same message list; build it once.
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ]

    if format_type == "openai":
        payload = {
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature
        }
    elif format_type == "hf_wrapped":
        payload = {
            "inputs": {
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature
            }
        }
    elif format_type == "simple":
        payload = {
            "inputs": prompt
        }
    else:
        raise ValueError(f"Invalid format type: {format_type}")

    try:
        print(f"Request payload: {json.dumps(payload, indent=2)}")
        # json= serializes the payload and sets Content-Type automatically;
        # timeout= prevents the call from hanging forever on a stalled server.
        response = requests.post(endpoint_url, headers=headers, json=payload,
                                 timeout=timeout)
        response.raise_for_status()

        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        # Must compare against None: a requests.Response is falsy for 4xx/5xx
        # statuses (bool(resp) == resp.ok), so a bare truthiness check would
        # skip printing the error body in exactly the cases we care about.
        if getattr(e, 'response', None) is not None:
            print(f"Response content: {e.response.text}")
        return None
|
|
if __name__ == "__main__":
    # Command-line driver: gather endpoint/query options, send one request,
    # and print either the assistant's reply or the raw JSON response.
    cli = argparse.ArgumentParser(description="Query a Phi-4 Mini HuggingFace Inference Endpoint")
    cli.add_argument("--url", type=str, required=True, help="The endpoint URL")
    cli.add_argument("--token", type=str, default=os.environ.get("HF_API_TOKEN"), help="HuggingFace API token")
    cli.add_argument("--prompt", type=str, default="Explain quantum computing in simple terms.", help="User prompt")
    cli.add_argument("--system", type=str, default="You are a helpful assistant.", help="System message")
    cli.add_argument("--max_tokens", type=int, default=256, help="Maximum tokens to generate")
    cli.add_argument("--temperature", type=float, default=0.7, help="Temperature (0.0 to 1.0)")
    cli.add_argument("--format", type=str, default="openai",
                     choices=["openai", "hf_wrapped", "simple"],
                     help="Format to use for the request")

    opts = cli.parse_args()

    print(f"Querying endpoint: {opts.url}")
    print(f"Prompt: {opts.prompt}")
    print(f"Format: {opts.format}")

    result = query_endpoint(
        opts.url,
        api_token=opts.token,
        prompt=opts.prompt,
        system_message=opts.system,
        max_tokens=opts.max_tokens,
        temperature=opts.temperature,
        format_type=opts.format,
    )

    if result:
        print("\nResponse:")
        # An OpenAI-style reply carries the text under choices[0].message.content;
        # anything else is dumped verbatim for inspection.
        has_choice = "choices" in result and len(result["choices"]) > 0
        if has_choice:
            print(result["choices"][0]["message"]["content"])
        else:
            print(json.dumps(result, indent=2))
    else:
        print("Failed to get a valid response")