Quantizations of https://huggingface.co/microsoft/Orca-2-13b
Inference Clients/UIs
From original readme
Getting started with Orca 2
Inference with Hugging Face library
import torch
import transformers
if torch.cuda.is_available():
torch.set_default_device("cuda")
else:
torch.set_default_device("cpu")
model = transformers.AutoModelForCausalLM.from_pretrained("microsoft/Orca-2-13b", device_map='auto')
# https://github.com/huggingface/transformers/issues/27132
# please use the slow tokenizer since fast and slow tokenizer produces different tokens
tokenizer = transformers.AutoTokenizer.from_pretrained(
"microsoft/Orca-2-13b",
use_fast=False,
)
system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
user_message = "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
inputs = tokenizer(prompt, return_tensors='pt')
output_ids = model.generate(inputs["input_ids"],)
answer = tokenizer.batch_decode(output_ids)[0]
print(answer)
# This example continues showing how to add a second turn message by the user to the conversation
second_turn_user_message = "Give me a list of the key points of your first answer."
# we set add_special_tokens=False because we dont want to automatically add a bos_token between messages
second_turn_message_in_markup = f"\n<|im_start|>user\n{second_turn_user_message}<|im_end|>\n<|im_start|>assistant"
second_turn_tokens = tokenizer(second_turn_message_in_markup, return_tensors='pt', add_special_tokens=False)
second_turn_input = torch.cat([output_ids, second_turn_tokens['input_ids']], dim=1)
output_ids_2 = model.generate(second_turn_input,)
second_turn_answer = tokenizer.batch_decode(output_ids_2)[0]
print(second_turn_answer)
Safe inference with Azure AI Content Safety
The usage of Azure AI Content Safety on top of model prediction is strongly encouraged and can help prevent content harms. Azure AI Content Safety is a content moderation platform that uses AI to keep your content safe. By integrating Orca 2 with Azure AI Content Safety, we can moderate the model output by scanning it for sexual content, violence, hate, and self-harm with multiple severity levels and multi-lingual detection.
import os
import math
import transformers
import torch
from azure.ai.contentsafety import ContentSafetyClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.ai.contentsafety.models import AnalyzeTextOptions
CONTENT_SAFETY_KEY = os.environ["CONTENT_SAFETY_KEY"]
CONTENT_SAFETY_ENDPOINT = os.environ["CONTENT_SAFETY_ENDPOINT"]
# We use Azure AI Content Safety to filter out any content that reaches "Medium" threshold
# For more information: https://learn.microsoft.com/en-us/azure/ai-services/content-safety/
def should_filter_out(input_text, threshold=4):
# Create an Content Safety client
client = ContentSafetyClient(CONTENT_SAFETY_ENDPOINT, AzureKeyCredential(CONTENT_SAFETY_KEY))
# Construct a request
request = AnalyzeTextOptions(text=input_text)
# Analyze text
try:
response = client.analyze_text(request)
except HttpResponseError as e:
print("Analyze text failed.")
if e.error:
print(f"Error code: {e.error.code}")
print(f"Error message: {e.error.message}")
raise
print(e)
raise
categories = ["hate_result", "self_harm_result", "sexual_result", "violence_result"]
max_score = -math.inf
for category in categories:
max_score = max(max_score, getattr(response, category).severity)
return max_score >= threshold
model_path = 'microsoft/Orca-2-13b'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = transformers.AutoModelForCausalLM.from_pretrained(model_path)
model.to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_path,
model_max_length=4096,
padding_side="right",
use_fast=False,
add_special_tokens=False,
)
system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
user_message = "\" \n :You can't just say, \"\"that's crap\"\" and remove it without gaining a consensus. You already know this, based on your block history. โ/ \" \nIs the comment obscene? \nOptions : Yes, No."
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to(device)
output_ids = model.generate(inputs["input_ids"], max_length=4096, do_sample=False, temperature=0.0, use_cache=True)
sequence_length = inputs["input_ids"].shape[1]
new_output_ids = output_ids[:, sequence_length:]
answers = tokenizer.batch_decode(new_output_ids, skip_special_tokens=True)
final_output = answers[0] if not should_filter_out(answers[0]) else "[Content Filtered]"
print(final_output)
- Downloads last month
- 207
Hardware compatibility
Log In to add your hardware
1-bit
2-bit
3-bit
4-bit
5-bit
6-bit
8-bit