Instructions to use AquilaX-AI/QnA with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use AquilaX-AI/QnA with Transformers:

# High-level route: wrap AquilaX-AI/QnA in a text-generation pipeline and
# call it directly with the chat messages.
from transformers import pipeline

pipe = pipeline(task="text-generation", model="AquilaX-AI/QnA")

# A chat is a list of role/content turns; this one holds a single user turn.
messages = [dict(role="user", content="Who are you?")]

# NOTE: the return value is not captured; assign or print it to inspect the
# reply when running this as a script rather than in a notebook/REPL.
pipe(messages)

# Lower-level route: load the tokenizer and the causal-LM explicitly, build
# the model inputs from a chat, generate, and print only the new tokens.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")

# A chat is a list of role/content turns; this one holds a single user turn.
messages = [dict(role="user", content="Who are you?")]

# Apply the tokenizer's chat template, asking for tokenized PyTorch tensors
# in dict form, then move them to the device the model is on.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = encoded.to(model.device)

# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the first `prompt_length` positions of the first returned sequence so
# that only what follows the prompt is decoded and printed.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use AquilaX-AI/QnA with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "AquilaX-AI/QnA"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "AquilaX-AI/QnA",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

# Run the model via Docker Model Runner, referencing it by its hf.co path:
docker model run hf.co/AquilaX-AI/QnA

SGLang

How to use AquilaX-AI/QnA with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "AquilaX-AI/QnA" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "AquilaX-AI/QnA",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest container.
# Replace <secret> with your own Hugging Face access token before running.
# Port 30000 is published to the host, and the local ~/.cache/huggingface
# directory is mounted into the container at /root/.cache/huggingface.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "AquilaX-AI/QnA" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "AquilaX-AI/QnA",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use AquilaX-AI/QnA with Docker Model Runner:
```
docker model run hf.co/AquilaX-AI/QnA
```

Mr-Vicky-01 committed on Jan 16, 2025

Commit

4eb3214

verified ·

1 Parent(s): 1daf042

Update README.md

Browse files

Files changed (1) hide show

README.md +27 -40

README.md CHANGED Viewed

@@ -6,20 +6,25 @@ license: apache-2.0
 ## INFERENCE
 ```python
-# Load model directly
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
 model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")
 prompt = """
 <|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>
 """
-# Keep a list for the last one conversation exchanges
 conversation_history = []
 while True:
     user_prompt = input("\nUser Question: ")
     if user_prompt.lower() == 'break':
@@ -33,47 +38,29 @@ while True:
     # Add the user's question to the conversation history
     conversation_history.append(user)
-    # Ensure conversation starts with a user's input and keep only the last 2 exchanges (4 turns)
     conversation_history = conversation_history[-5:]
     # Build the full prompt
     current_prompt = prompt + "\n".join(conversation_history)
     # Tokenize the prompt
-    encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids
-    # Move model and inputs to the appropriate device
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-    inputs = encodeds.to(device)
-    # Create an empty list to store generated tokens
-    generated_ids = inputs
-    # Start generating tokens one by one
-    assistant_response = ""
-    for _ in range(512):  # Specify a max token limit for streaming
-        next_token = model.generate(
-            generated_ids,
-            max_new_tokens=1,
-            pad_token_id=151644,
-            eos_token_id=151645,
-            num_return_sequences=1,
-            do_sample=False,
-            # top_k=5,
-            # temperature=0.2,
-            # top_p=0.90
-        )
-        generated_ids = torch.cat([generated_ids, next_token[:, -1:]], dim=1)
-        token_id = next_token[0, -1].item()
-        token = tokenizer.decode([token_id], skip_special_tokens=True)
-        assistant_response += token
-        print(token, end="", flush=True)
-        if token_id == 151645:  # EOS token
-            break
-    conversation_history.append(f"{assistant_response.strip()}<|im_end|>")
 ```

 ## INFERENCE
 ```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 import torch
+# Load model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("AquilaX-AI/QnA")
 model = AutoModelForCausalLM.from_pretrained("AquilaX-AI/QnA")
+# Define the system prompt
 prompt = """
 <|im_start|>system\nYou are a helpful AI assistant named Securitron<|im_end|>
 """
+# Initialize conversation history
 conversation_history = []
+# Set up device
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+model.to(device)
 while True:
     user_prompt = input("\nUser Question: ")
     if user_prompt.lower() == 'break':
     # Add the user's question to the conversation history
     conversation_history.append(user)
+    # Keep only the last 2 exchanges (4 turns)
     conversation_history = conversation_history[-5:]
     # Build the full prompt
     current_prompt = prompt + "\n".join(conversation_history)
     # Tokenize the prompt
+    encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids.to(device)
+    # Initialize TextStreamer for real-time token generation
+    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+    # Generate response with TextStreamer
+    response = model.generate(
+        input_ids=encodeds,
+        streamer=text_streamer,
+        max_new_tokens=512,
+        use_cache=True,
+        pad_token_id=151645,
+        eos_token_id=151645,
+        num_return_sequences=1
+    )
+    # Finalize conversation history with the assistant's response
+    conversation_history.append(tokenizer.decode(response[0]).split('<|im_start|>assistant')[-1].split('<|im_end|>')[0].strip() + "<|im_end|>")
 ```