|
|
--- |
|
|
license: other |
|
|
license_name: nvidia-open-model-license |
|
|
license_link: >- |
|
|
https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/ |
|
|
base_model: nvidia/Nemotron-Cascade-14B-Thinking |
|
|
tags: |
|
|
- text-generation-inference |
|
|
- transformers |
|
|
- qwen3 |
|
|
- nemotron |
|
|
- vllm |
|
|
- cascade |
|
|
- thinking |
|
|
- nvidia |
|
|
- nemotron-cascade |
|
|
- reasoning |
|
|
- general-purpose |
|
|
- SFT |
|
|
- RL |
|
|
- pytorch |
|
|
|
|
|
language: |
|
|
- en |
|
|
library_name: transformers |
|
|
--- |
|
|
|
|
|
# Cascade-Droidz |
|
|
|
|
|
A fine-tuned version of nvidia/Nemotron-Cascade-14B-Thinking optimized for chat applications. |
|
|
|
|
|
|
|
|
|
|
|
## Training |
|
|
|
|
|
This model uses an experimental approach to fine-tuning with a custom dataset designed for conversational tasks. |
|
|
|
|
|
## Inference |
|
|
|
|
|
### Transformers |
|
|
|
|
|
```python |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
"Daemontatox/Cascade-Droidz", |
|
|
device_map="auto", |
|
|
torch_dtype="auto" |
|
|
) |
|
|
tokenizer = AutoTokenizer.from_pretrained("Daemontatox/Cascade-Droidz") |
|
|
|
|
|
messages = [ |
|
|
{"role": "user", "content": "Hello, how are you?"} |
|
|
] |
|
|
|
|
|
input_ids = tokenizer.apply_chat_template( |
|
|
messages, |
|
|
add_generation_prompt=True, |
|
|
return_tensors="pt" |
|
|
).to(model.device) |
|
|
|
|
|
outputs = model.generate( |
|
|
input_ids, |
|
|
max_new_tokens=512, |
|
|
temperature=0.7, |
|
|
top_p=0.9, |
|
|
do_sample=True |
|
|
) |
|
|
|
|
|
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True) |
|
|
print(response) |
|
|
``` |
|
|
|
|
|
### vLLM |
|
|
|
|
|
```python |
|
|
from vllm import LLM, SamplingParams |
|
|
|
|
|
llm = LLM( |
|
|
model="Daemontatox/Cascade-Droidz", |
|
|
tensor_parallel_size=4, # Adjust based on GPU count |
|
|
dtype="auto" |
|
|
) |
|
|
|
|
|
sampling_params = SamplingParams( |
|
|
temperature=0.7, |
|
|
top_p=0.9, |
|
|
max_tokens=512 |
|
|
) |
|
|
|
|
|
prompts = ["Hello, how are you?"] |
|
|
outputs = llm.generate(prompts, sampling_params) |
|
|
|
|
|
for output in outputs: |
|
|
print(output.outputs[0].text) |
|
|
``` |
|
|
|
|
|
### vLLM OpenAI-Compatible Server |
|
|
|
|
|
```bash |
|
|
vllm serve Daemontatox/Cascade-Droidz \ |
|
|
--tensor-parallel-size 4 \ |
|
|
--dtype auto \ |
|
|
--max-model-len 4096 |
|
|
``` |
|
|
|
|
|
```python |
|
|
from openai import OpenAI |
|
|
|
|
|
client = OpenAI( |
|
|
base_url="http://localhost:8000/v1", |
|
|
api_key="token-abc123" |
|
|
) |
|
|
|
|
|
response = client.chat.completions.create( |
|
|
model="Daemontatox/Cascade-Droidz", |
|
|
messages=[ |
|
|
{"role": "user", "content": "Hello, how are you?"} |
|
|
], |
|
|
temperature=0.7, |
|
|
max_tokens=512 |
|
|
) |
|
|
|
|
|
print(response.choices[0].message.content) |
|
|
``` |
|
|
|
|
|
### TensorRT-LLM |
|
|
|
|
|
#### Convert to TensorRT-LLM format |
|
|
|
|
|
```bash |
|
|
python convert_checkpoint.py \
|
|
--model_dir Daemontatox/Cascade-Droidz \ |
|
|
--output_dir ./trt_ckpt \ |
|
|
--dtype float16 \ |
|
|
--tp_size 4 |
|
|
``` |
|
|
|
|
|
#### Build TensorRT engine |
|
|
|
|
|
```bash |
|
|
trtllm-build \ |
|
|
--checkpoint_dir ./trt_ckpt \ |
|
|
--output_dir ./trt_engine \ |
|
|
--gemm_plugin float16 \ |
|
|
--max_batch_size 8 \ |
|
|
--max_input_len 2048 \ |
|
|
--max_output_len 512 |
|
|
``` |
|
|
|
|
|
```python |
|
|
from tensorrt_llm import LLM |
|
|
|
|
|
llm = LLM(model="./trt_engine") |
|
|
|
|
|
prompts = ["Hello, how are you?"] |
|
|
outputs = llm.generate(prompts, max_new_tokens=512) |
|
|
|
|
|
for output in outputs: |
|
|
print(output.text) |
|
|
``` |
|
|
|
|
|
### Modular MAX |
|
|
|
|
|
```bash |
|
|
# Serve with MAX Engine |
|
|
max serve Daemontatox/Cascade-Droidz \ |
|
|
--port 8000 \ |
|
|
--tensor-parallel-size 4 |
|
|
``` |
|
|
|
|
|
```python |
|
|
from max import engine |
|
|
|
|
|
# Load model with MAX |
|
|
model = engine.InferenceSession( |
|
|
"Daemontatox/Cascade-Droidz", |
|
|
device="cuda", |
|
|
tensor_parallel=4 |
|
|
) |
|
|
|
|
|
# Run inference |
|
|
prompt = "Hello, how are you?" |
|
|
output = model.generate( |
|
|
prompt, |
|
|
max_tokens=512, |
|
|
temperature=0.7, |
|
|
top_p=0.9 |
|
|
) |
|
|
|
|
|
print(output.text) |
|
|
``` |
|
|
|
|
|
```python |
|
|
# Using MAX with Python API |
|
|
from max.serve import serve |
|
|
from max.pipelines import pipeline |
|
|
|
|
|
# Create pipeline |
|
|
pipe = pipeline( |
|
|
"text-generation", |
|
|
model="Daemontatox/Cascade-Droidz", |
|
|
device="cuda", |
|
|
tensor_parallel=4 |
|
|
) |
|
|
|
|
|
# Generate |
|
|
result = pipe( |
|
|
"Hello, how are you?", |
|
|
max_new_tokens=512, |
|
|
temperature=0.7, |
|
|
top_p=0.9 |
|
|
) |
|
|
|
|
|
print(result[0]["generated_text"]) |
|
|
``` |
|
|
|
|
|
## Limitations |
|
|
|
|
|
As this uses experimental training methods, results may vary. Test thoroughly before production use. |
|
|
``` |