ToPo-ToPo/ai-characters-QA
Viewer • Updated • 765 • 8
How to use ToPo-ToPo/ai-character-suuchi-kai-3.6b with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM
base_model = AutoModelForCausalLM.from_pretrained("ToPo-ToPo/rinna-japanese-gpt-neox-3.6b-lora-sft-v1")
model = PeftModel.from_pretrained(base_model, "ToPo-ToPo/ai-character-suuchi-kai-3.6b")
How to use ToPo-ToPo/ai-character-suuchi-kai-3.6b with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="ToPo-ToPo/ai-character-suuchi-kai-3.6b")
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("ToPo-ToPo/ai-character-suuchi-kai-3.6b")
model = AutoModelForCausalLM.from_pretrained("ToPo-ToPo/ai-character-suuchi-kai-3.6b")
How to use ToPo-ToPo/ai-character-suuchi-kai-3.6b with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "ToPo-ToPo/ai-character-suuchi-kai-3.6b"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "ToPo-ToPo/ai-character-suuchi-kai-3.6b",
"prompt": "Once upon a time,",
"max_tokens": 512,
"temperature": 0.5
}'
# Alternatively, run the model with Docker Model Runner:
docker model run hf.co/ToPo-ToPo/ai-character-suuchi-kai-3.6b
How to use ToPo-ToPo/ai-character-suuchi-kai-3.6b with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "ToPo-ToPo/ai-character-suuchi-kai-3.6b" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "ToPo-ToPo/ai-character-suuchi-kai-3.6b",
"prompt": "Once upon a time,",
"max_tokens": 512,
"temperature": 0.5
}'
# Alternatively, start the SGLang server with Docker:
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "ToPo-ToPo/ai-character-suuchi-kai-3.6b" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "ToPo-ToPo/ai-character-suuchi-kai-3.6b",
"prompt": "Once upon a time,",
"max_tokens": 512,
"temperature": 0.5
}'
How to use ToPo-ToPo/ai-character-suuchi-kai-3.6b with Docker Model Runner:
docker model run hf.co/ToPo-ToPo/ai-character-suuchi-kai-3.6b
AIキャラクターの「数値カイ」のモデルです。キャラクター性をLoRAで学習させています。
CHARACTER_SYSTEM_PROMPTを含めて学習しているため、推論時にも必ずCHARACTER_SYSTEM_PROMPTを入力として与える必要があります。
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
#====================================================================
# Settings
#====================================================================
# Base model (the same one that was used for training)
base_model_name = "ToPo-ToPo/ai-character-suuchi-kai-3.6b"

# Character system prompt. The model card states that training included this
# prompt, so it must always be supplied as input at inference time.
CHARACTER_SYSTEM_PROMPT = """
あなたは「解析カイ」という名前の新人アシスタントです。女の子です。
親しみやすいタメ口で会話します。
"""

#====================================================================
# Model and tokenizer setup
#====================================================================
print("モデルを読み込んでいます...")

# Choose the device at runtime instead of hard-coding "mps", so the script
# also runs on machines without Apple Metal. MPS is checked first so that
# behaviour is unchanged on hardware where the original script worked.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 1. Load the model (float32 weights on the selected device)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map=device,
    torch_dtype=torch.float32,
)

# 2. Load the tokenizer (slow tokenizer implementation)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)
if tokenizer.pad_token is None:
    # generate() is given pad_token_id below, so fall back to EOS when the
    # tokenizer does not define a padding token.
    tokenizer.pad_token = tokenizer.eos_token

print("準備完了。")
#====================================================================
# 推論用関数の定義
#====================================================================
def generate_response(instruction, input_context=None):
    """Generate the model's answer to a single instruction.

    Args:
        instruction: Text placed under the "### 指示:" heading of the prompt.
        input_context: Optional text placed under the "### 入力:" heading.
            When falsy, that section is left out of the prompt entirely.

    Returns:
        The newly generated text only (prompt tokens excluded), with "<NL>"
        markers converted to newlines and surrounding whitespace stripped.
    """
    # 1. Assemble the prompt section by section.
    sections = [f"### 指示:\n{instruction}\n\n"]
    if input_context:
        sections.append(f"### 入力:\n{input_context}\n\n")
    sections.append("### 回答:\n")
    prompt_text = "".join(sections)

    # 2. Tokenize and move the tensors to the model's device.
    encoded = tokenizer(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False,
    )
    encoded = encoded.to(model.device)

    # Remember the prompt length so it can be sliced off the output later.
    prompt_len = encoded.input_ids.shape[1]

    # 3. Sample a continuation without tracking gradients.
    with torch.no_grad():
        sampling_options = {
            "max_new_tokens": 256,
            "do_sample": True,
            "temperature": 0.85,
            "top_p": 0.9,
            "repetition_penalty": 1.1,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        generated = model.generate(**encoded, **sampling_options)

    # 4. Decode only the tokens that follow the prompt.
    new_tokens = generated[0][prompt_len:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Turn the Rinna-specific "<NL>" marker back into real newlines.
    return answer.replace("<NL>", "\n").strip()
#====================================================================
# 実行部分
#====================================================================
if __name__ == "__main__":
    # Sample questions used as a smoke test.
    sample_questions = (
        "AITuberについて教えてください。",
        "日本で一番高い山はどこですか?",
        "美味しいカレーの作り方を教えて。",
        "まどか☆マギカでは誰が一番かわいい?",
    )
    separator = "-" * 50

    print(separator)
    for question in sample_questions:
        print(f"質問: {question}")
        # The character system prompt is passed as the input section on every call.
        answer = generate_response(
            instruction=question,
            input_context=CHARACTER_SYSTEM_PROMPT,
        )
        print(f"回答: {answer}")
        print(separator)
Base model
ToPo-ToPo/rinna-japanese-gpt-neox-3.6b