---
base_model: openai/gpt-oss-120b
tags:
- text-generation-inference
- transformers
- gpt_oss
license: apache-2.0
language:
- en
library_name: transformers
---

# GPT-Z

A fine-tuned version of openai/gpt-oss-120b optimized for chat applications.

## Model Details

- **Base Model:** openai/gpt-oss-120b
- **Fine-tuned by:** Daemontatox
- **Purpose:** Chat/Conversational AI
- **Training:** Experimental dataset and fine-tuning methodology
- **Parameters:** 120B
- **Language:** English

## Training

This model uses an experimental approach to fine-tuning with a custom dataset designed for conversational tasks.

## Inference

### Transformers

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model across available devices; "auto" selects the checkpoint's native dtype
model = AutoModelForCausalLM.from_pretrained(
    "Daemontatox/GPT-Z",
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained("Daemontatox/GPT-Z")

messages = [
    {"role": "user", "content": "Hello, how are you?"}
]

# Render the conversation with the model's chat template
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# Decode only the newly generated tokens, skipping the prompt
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(response)
```
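
For interactive use, tokens can be printed as they are generated with transformers' `TextStreamer`. A minimal sketch, reusing the `model`, `tokenizer`, and `input_ids` from the example above:

```python
from transformers import TextStreamer

# Stream decoded tokens to stdout, hiding the prompt and special tokens
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

model.generate(
    input_ids,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    streamer=streamer,
)
```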

### vLLM

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="Daemontatox/GPT-Z",
    tensor_parallel_size=4,  # adjust based on GPU count
    dtype="auto",
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=512,
)

prompts = ["Hello, how are you?"]
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(output.outputs[0].text)
```
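
Note that `llm.generate` takes raw text, so chat-style prompts should first be rendered with the model's chat template. A minimal sketch using the transformers tokenizer (recent vLLM releases also provide an `llm.chat` helper that applies the template internally):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Daemontatox/GPT-Z")

messages = [{"role": "user", "content": "Hello, how are you?"}]

# Render the conversation to a plain string the engine can consume
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```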

### vLLM OpenAI-Compatible Server

Start the server:

```bash
vllm serve Daemontatox/GPT-Z \
    --tensor-parallel-size 4 \
    --dtype auto \
    --max-model-len 4096
```

Then query it with the OpenAI Python client:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",  # any placeholder works unless the server sets --api-key
)

response = client.chat.completions.create(
    model="Daemontatox/GPT-Z",
    messages=[
        {"role": "user", "content": "Hello, how are you?"}
    ],
    temperature=0.7,
    max_tokens=512,
)

print(response.choices[0].message.content)
```
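
The same endpoint supports streaming. A minimal sketch reusing the `client` from above:

```python
# Request a streamed response and print tokens as they arrive
stream = client.chat.completions.create(
    model="Daemontatox/GPT-Z",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    temperature=0.7,
    max_tokens=512,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```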

### TensorRT-LLM

Convert the checkpoint to TensorRT-LLM format (the `convert_checkpoint.py` script ships in the TensorRT-LLM repository's examples directory for the matching model family):

```bash
python convert_checkpoint.py \
    --model_dir Daemontatox/GPT-Z \
    --output_dir ./trt_ckpt \
    --dtype float16 \
    --tp_size 4
```

Build the TensorRT engine:

```bash
trtllm-build \
    --checkpoint_dir ./trt_ckpt \
    --output_dir ./trt_engine \
    --gemm_plugin float16 \
    --max_batch_size 8 \
    --max_input_len 2048 \
    --max_output_len 512
```

Then run inference with TensorRT-LLM's high-level `LLM` API, which mirrors vLLM's interface:

```python
from tensorrt_llm import LLM, SamplingParams

llm = LLM(model="./trt_engine")

prompts = ["Hello, how are you?"]
outputs = llm.generate(prompts, SamplingParams(max_tokens=512))

for output in outputs:
    print(output.outputs[0].text)
```

### Modular MAX

Serve the model with the MAX engine (flag names vary between MAX releases; check `max serve --help` for your version):

```bash
max serve Daemontatox/GPT-Z \
    --port 8000 \
    --tensor-parallel-size 4
```

`max serve` exposes an OpenAI-compatible endpoint, so the running server can be queried with the same client pattern shown for vLLM above:

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed",  # MAX serve does not require an API key by default
)

response = client.chat.completions.create(
    model="Daemontatox/GPT-Z",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    temperature=0.7,
    max_tokens=512,
)

print(response.choices[0].message.content)
```

MAX also ships a direct Python API (`max.engine`, `max.pipelines`); its interfaces change between releases, so consult the current MAX documentation for the pipeline equivalent of the example above.

## Limitations

As this model uses experimental training methods, results may vary. Test thoroughly before production use.