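Instructions to use itriedcoding/Sage with libraries, inference providers, notebooks, and local apps. Follow these links to get started. Note: the conversion script at the end of this page states that the GGUF file uses a custom 'transformer_lm' architecture and will not load in standard llama.cpp/llama-cpp-python without added architecture support, so the commands below may not work as-is.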

Libraries

How to use itriedcoding/Sage with llama-cpp-python:

# !pip install llama-cpp-python

from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="itriedcoding/Sage",
	filename="sage-f16.gguf",
)

output = llm(
	"Once upon a time,",
	max_tokens=512,
	echo=True
)
print(output)

Notebooks
Google Colab
Kaggle
Local Apps Settings

llama.cpp

How to use itriedcoding/Sage with llama.cpp:

Install (macOS, Linux)

curl -LsSf https://llama.app/install.sh | sh
# Start a local OpenAI-compatible server with a web UI:
llama serve -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
llama cli -hf itriedcoding/Sage:F16

Install from WinGet (Windows)

winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama serve -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
llama cli -hf itriedcoding/Sage:F16

Use pre-built binary

# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
./llama-cli -hf itriedcoding/Sage:F16

Build from source code

git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
./build/bin/llama-cli -hf itriedcoding/Sage:F16

Use Docker

docker model run hf.co/itriedcoding/Sage:F16

LM Studio
Jan
Ollama
How to use itriedcoding/Sage with Ollama:
```
ollama run hf.co/itriedcoding/Sage:F16
```

Unsloth Studio

How to use itriedcoding/Sage with Unsloth Studio:

Install Unsloth Studio (macOS, Linux, WSL)

curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for itriedcoding/Sage to start chatting

Install Unsloth Studio (Windows)

irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for itriedcoding/Sage to start chatting

Using HuggingFace Spaces for Unsloth

# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for itriedcoding/Sage to start chatting

Atomic Chat (new)
Docker Model Runner
How to use itriedcoding/Sage with Docker Model Runner:
```
docker model run hf.co/itriedcoding/Sage:F16
```

Lemonade

How to use itriedcoding/Sage with Lemonade:

Pull the model

# Download Lemonade from https://lemonade-server.ai/
lemonade pull itriedcoding/Sage:F16

Run and chat with the model

lemonade run user.Sage-F16

List all available models

lemonade list

Sage

File size: 4,583 Bytes

f62675d

import torch
import gguf
import numpy as np
import os
import sys
import pickle

# Character tokenizer class for loading the checkpoint
class CharacterTokenizer:
    """Character-level tokenizer that maps each character to an integer id.

    Ids 0 and 1 are reserved for the ``<PAD>`` and ``<UNK>`` entries; every
    other character receives an id from 2 upwards, in sorted order, when
    :meth:`fit` is called.
    """

    def __init__(self):
        # Both lookup tables stay empty until fit() is called.
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0
        self.pad_token_id = 0
        self.unk_token_id = 1

    def fit(self, texts):
        """Build the vocabulary from an iterable of texts.

        Each item is converted with ``str()`` and split into characters.
        The vocabulary is rebuilt from scratch on every call, so fitting a
        second time replaces the previous vocabulary instead of mixing stale
        characters (and colliding ids) into the new one.
        """
        chars = set()
        for text in texts:
            chars.update(str(text))

        # Start from a fresh dict: reusing self.char_to_idx would keep
        # characters from an earlier fit() whose ids may now be reassigned.
        char_to_idx = {'<PAD>': 0, '<UNK>': 1}
        for idx, char in enumerate(sorted(chars), start=2):
            char_to_idx[char] = idx

        self.char_to_idx = char_to_idx
        self.idx_to_char = {idx: char for char, idx in char_to_idx.items()}
        self.vocab_size = len(char_to_idx)

    def encode(self, text, max_length=None, padding=False, truncation=False, return_tensors=None):
        """Convert a string, or a list of strings, into lists of token ids.

        Args:
            text: A single string or an iterable of items; a single string is
                wrapped so the result is always a batch.
            max_length: Target length used by ``padding`` and ``truncation``.
                Both are skipped when it is ``None`` or 0.
            padding: Right-pad with ``pad_token_id`` up to ``max_length``.
                Sequences already longer than ``max_length`` are left as is.
            truncation: Cut sequences down to ``max_length`` ids.
            return_tensors: ``'pt'`` returns a ``torch.long`` tensor; any
                other value returns a list of lists.

        Returns:
            A list with one id list per input, or a tensor when
            ``return_tensors == 'pt'``.
        """
        if isinstance(text, str):
            text = [text]
        encoded = []
        for t in text:
            # Characters missing from the vocabulary map to the <UNK> id.
            tokens = [self.char_to_idx.get(c, self.unk_token_id) for c in str(t)]
            if truncation and max_length:
                tokens = tokens[:max_length]
            if padding and max_length:
                # A negative repeat count yields an empty list, so over-long
                # sequences are not padded.
                tokens = tokens + [self.pad_token_id] * (max_length - len(tokens))
            encoded.append(tokens)
        if return_tensors == 'pt':
            return torch.tensor(encoded, dtype=torch.long)
        return encoded

    def decode(self, token_ids):
        """Turn a flat sequence (or tensor) of token ids back into a string.

        Ids missing from the vocabulary are rendered as the literal text
        ``<UNK>``; the reserved ids render as their ``<PAD>``/``<UNK>`` names.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()
        return ''.join(self.idx_to_char.get(idx, '<UNK>') for idx in token_ids)

def _sage_tensor_name_map(block_count):
    """Return the mapping from Sage state-dict keys to GGUF tensor names."""
    tensor_map = {
        # Embedding and output layers
        'embedding.weight': 'token_embd.weight',
        'pos_embedding.weight': 'position_embd.weight',
        'output_layer.weight': 'output.weight',
        'output_layer.bias': 'output.bias',
    }

    # (state-dict suffix, GGUF suffix) pairs repeated for every layer.
    # NOTE(review): 'in_proj_*' is stored under the 'attn_q' name and
    # 'linear1' under 'ffn_gate'. If in_proj is a fused Q/K/V projection and
    # linear1 a plain up-projection, these names are misleading. They are
    # kept unchanged because consumers of the file may rely on them; confirm.
    per_layer = (
        ('self_attn.in_proj_weight', 'attn_q.weight'),
        ('self_attn.in_proj_bias', 'attn_q.bias'),
        ('self_attn.out_proj.weight', 'attn_output.weight'),
        ('self_attn.out_proj.bias', 'attn_output.bias'),
        ('linear1.weight', 'ffn_gate.weight'),
        ('linear1.bias', 'ffn_gate.bias'),
        ('linear2.weight', 'ffn_down.weight'),
        ('linear2.bias', 'ffn_down.bias'),
        ('norm1.weight', 'attn_norm.weight'),
        ('norm1.bias', 'attn_norm.bias'),
        ('norm2.weight', 'ffn_norm.weight'),
        ('norm2.bias', 'ffn_norm.bias'),
    )
    for i in range(block_count):
        for src_suffix, dst_suffix in per_layer:
            tensor_map[f'transformer_encoder.layers.{i}.{src_suffix}'] = f'blk.{i}.{dst_suffix}'
    return tensor_map


def convert_sage_to_gguf(model_path, output_path):
    """Convert a Sage PyTorch checkpoint into a GGUF file.

    Loads the checkpoint at ``model_path``, writes fixed model metadata plus
    the vocabulary size from ``checkpoint['model_config']``, renames known
    tensors to GGUF-style names, and stores every state-dict tensor as
    float32 in ``output_path``. Tensors without a mapping keep their name.

    Args:
        model_path: Path to a checkpoint containing ``'model_state_dict'``
            and ``'model_config'`` entries.
        output_path: Destination path of the GGUF file.

    Raises:
        KeyError: If the checkpoint lacks ``'model_state_dict'`` or
            ``model_config['vocab_size']``.
    """
    # SECURITY: weights_only=False unpickles arbitrary Python objects, so
    # only run this on checkpoints from a trusted source.
    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
    state_dict = checkpoint['model_state_dict']

    # Single source of truth for the layer count, used by both the metadata
    # and the tensor-name mapping so the two cannot drift apart.
    block_count = 4

    gguf_writer = gguf.GGUFWriter(output_path, "transformer_lm")
    try:
        # Add metadata
        gguf_writer.add_context_length(64)
        gguf_writer.add_embedding_length(256)
        gguf_writer.add_block_count(block_count)
        gguf_writer.add_feed_forward_length(1024)
        gguf_writer.add_head_count(8)
        gguf_writer.add_head_count_kv(8)
        gguf_writer.add_vocab_size(checkpoint['model_config']['vocab_size'])
        # NOTE(review): the mapped norm layers carry bias tensors, which
        # suggests LayerNorm rather than RMSNorm; confirm the intended key.
        gguf_writer.add_layer_norm_rms_eps(1e-5)
        gguf_writer.add_name("Sage")
        gguf_writer.add_license("MIT")

        # Map Sage's tensor names to GGUF format
        tensor_map = _sage_tensor_name_map(block_count)

        # Write tensors
        for orig_name, tensor in state_dict.items():
            mapped_name = tensor_map.get(orig_name, orig_name)
            # detach() lets tensors that still require grad be converted;
            # everything is stored as float32.
            arr = tensor.detach().cpu().numpy().astype(np.float32)
            gguf_writer.add_tensor(mapped_name, arr)

        gguf_writer.write_header_to_file()
        gguf_writer.write_kv_data_to_file()
        gguf_writer.write_tensors_to_file()
    finally:
        # Release the writer even when a step above raises.
        gguf_writer.close()

    print(f"GGUF file created: {output_path}")
    print(f"Total tensors written: {len(state_dict)}")
    print("NOTE: This GGUF file uses a custom architecture 'transformer_lm'")
    print("      and will NOT load in standard llama.cpp/llama-cpp-python")
    print("      without adding custom architecture support.")

# The checkpoint is expected next to this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
pytorch_bin = os.path.join(script_dir, "pytorch_model.bin")

# Only convert when run as a script, so importing this module (for example to
# reuse CharacterTokenizer) does not start a conversion as a side effect.
if __name__ == "__main__":
    if os.path.exists(pytorch_bin):
        # NOTE(review): the output name says "f16", but convert_sage_to_gguf
        # casts every tensor to float32; the name is kept unchanged here.
        convert_sage_to_gguf(pytorch_bin, "sage-f16.gguf")
    else:
        print(f"Model file {pytorch_bin} not found")
        # Exit non-zero so callers and CI can detect that nothing was converted.
        sys.exit(1)