Instructions to use itriedcoding/Sage with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use itriedcoding/Sage with llama-cpp-python:

# !pip install llama-cpp-python

from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="itriedcoding/Sage",
	filename="sage-f16.gguf",
)

output = llm(
	"Once upon a time,",
	max_tokens=512,
	echo=True
)
print(output)

Notebooks
Google Colab
Kaggle
Local Apps Settings

llama.cpp

How to use itriedcoding/Sage with llama.cpp:

Install (macOS, Linux)

curl -LsSf https://llama.app/install.sh | sh
# Start a local OpenAI-compatible server with a web UI:
llama serve -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
llama cli -hf itriedcoding/Sage:F16

Install from WinGet (Windows)

winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama serve -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
llama cli -hf itriedcoding/Sage:F16

Use pre-built binary

# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
./llama-cli -hf itriedcoding/Sage:F16

Build from source code

git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf itriedcoding/Sage:F16
# Run inference directly in the terminal:
./build/bin/llama-cli -hf itriedcoding/Sage:F16

Use Docker

docker model run hf.co/itriedcoding/Sage:F16

LM Studio
Jan
Ollama
How to use itriedcoding/Sage with Ollama:
```
ollama run hf.co/itriedcoding/Sage:F16
```

Unsloth Studio

How to use itriedcoding/Sage with Unsloth Studio:

Install Unsloth Studio (macOS, Linux, WSL)

curl -fsSL https://unsloth.ai/install.sh | sh
# Run Unsloth Studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for itriedcoding/Sage to start chatting

Install Unsloth Studio (Windows)

irm https://unsloth.ai/install.ps1 | iex
# Run Unsloth Studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for itriedcoding/Sage to start chatting

Using HuggingFace Spaces for Unsloth

# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for itriedcoding/Sage to start chatting

Atomic Chat new
Docker Model Runner
How to use itriedcoding/Sage with Docker Model Runner:
```
docker model run hf.co/itriedcoding/Sage:F16
```

Lemonade

How to use itriedcoding/Sage with Lemonade:

Pull the model

# Download Lemonade from https://lemonade-server.ai/
lemonade pull itriedcoding/Sage:F16

Run and chat with the model

lemonade run user.Sage-F16

List all available models

lemonade list

Sage / gguf_convert.py

itriedcoding

Upload folder using huggingface_hub

f62675d verified 28 days ago

Raw

History Blame Contribute Delete

4.58 kB

	import torch
	import gguf
	import numpy as np
	import os
	import sys
	import pickle

	# Character tokenizer class for loading the checkpoint
	class CharacterTokenizer:
	    """Character-level tokenizer kept here so the checkpoint can be loaded.

	    Each distinct character seen by ``fit`` receives its own integer id.
	    Ids 0 and 1 are reserved for the ``<PAD>`` and ``<UNK>`` markers.
	    """

	    def __init__(self):
	        # Lookup tables and vocabulary size stay empty until fit() runs.
	        self.char_to_idx = {}
	        self.idx_to_char = {}
	        self.vocab_size = 0
	        # Reserved ids: padding, and characters missing from the vocabulary.
	        self.pad_token_id = 0
	        self.unk_token_id = 1

	    def fit(self, texts):
	        """Build the vocabulary from every character found in ``texts``.

	        Each item is converted with ``str()`` first. Characters are numbered
	        in sorted order starting at 2, right after the two reserved markers.
	        """
	        alphabet = set()
	        for sample in texts:
	            alphabet.update(str(sample))
	        self.char_to_idx['<PAD>'] = 0
	        self.char_to_idx['<UNK>'] = 1
	        self.char_to_idx.update(
	            (symbol, position)
	            for position, symbol in enumerate(sorted(alphabet), start=2)
	        )
	        self.idx_to_char = {index: symbol for symbol, index in self.char_to_idx.items()}
	        self.vocab_size = len(self.char_to_idx)

	    def encode(self, text, max_length=None, padding=False, truncation=False, return_tensors=None):
	        """Turn one string or an iterable of items into lists of token ids.

	        A single string is treated as a batch of one. Unknown characters map
	        to ``unk_token_id``. Truncation and padding only take effect when
	        ``max_length`` is truthy. With ``return_tensors='pt'`` the batch is
	        returned as a ``torch.long`` tensor, otherwise as a list of lists.
	        """
	        batch = [text] if isinstance(text, str) else text
	        lookup = self.char_to_idx.get
	        encoded = []
	        for item in batch:
	            ids = [lookup(ch, self.unk_token_id) for ch in str(item)]
	            if max_length:
	                if truncation:
	                    ids = ids[:max_length]
	                if padding:
	                    # A non-positive repeat count yields no padding at all.
	                    ids = ids + [self.pad_token_id] * (max_length - len(ids))
	            encoded.append(ids)
	        return torch.tensor(encoded, dtype=torch.long) if return_tensors == 'pt' else encoded

	    def decode(self, token_ids):
	        """Join the characters for ``token_ids``; unknown ids become ``<UNK>``."""
	        ids = token_ids.tolist() if isinstance(token_ids, torch.Tensor) else token_ids
	        return ''.join(self.idx_to_char.get(token, '<UNK>') for token in ids)

	def _build_sage_tensor_map(block_count):
	    """Return the checkpoint-name -> GGUF-name mapping for ``block_count`` layers."""
	    # Embedding and output layers
	    tensor_map = {
	        'embedding.weight': 'token_embd.weight',
	        'pos_embedding.weight': 'position_embd.weight',
	        'output_layer.weight': 'output.weight',
	        'output_layer.bias': 'output.bias',
	    }

	    # (checkpoint suffix, GGUF suffix) pairs repeated for every layer.
	    # NOTE(review): in_proj_weight/in_proj_bias look like a fused Q/K/V
	    # projection but are stored under attn_q, and linear1 is stored under
	    # ffn_gate -- confirm the consumer of this file expects these names.
	    per_layer = (
	        ('self_attn.in_proj_weight', 'attn_q.weight'),
	        ('self_attn.in_proj_bias', 'attn_q.bias'),
	        ('self_attn.out_proj.weight', 'attn_output.weight'),
	        ('self_attn.out_proj.bias', 'attn_output.bias'),
	        ('linear1.weight', 'ffn_gate.weight'),
	        ('linear1.bias', 'ffn_gate.bias'),
	        ('linear2.weight', 'ffn_down.weight'),
	        ('linear2.bias', 'ffn_down.bias'),
	        ('norm1.weight', 'attn_norm.weight'),
	        ('norm1.bias', 'attn_norm.bias'),
	        ('norm2.weight', 'ffn_norm.weight'),
	        ('norm2.bias', 'ffn_norm.bias'),
	    )
	    for i in range(block_count):
	        prefix = f'transformer_encoder.layers.{i}'
	        for source_suffix, gguf_suffix in per_layer:
	            tensor_map[f'{prefix}.{source_suffix}'] = f'blk.{i}.{gguf_suffix}'
	    return tensor_map


	def convert_sage_to_gguf(model_path, output_path):
	    """Convert a Sage PyTorch checkpoint into a GGUF file.

	    The checkpoint is loaded on CPU, fixed model metadata plus the vocabulary
	    size from the checkpoint are recorded, and every entry of the state dict
	    is written as a float32 tensor. Names without an entry in the mapping are
	    written under their original name.

	    Args:
	        model_path: Path of the checkpoint read with ``torch.load``. It must
	            contain ``'model_state_dict'`` and ``'model_config'['vocab_size']``.
	        output_path: Destination path of the GGUF file.

	    Raises:
	        KeyError: If the checkpoint lacks one of the expected entries.
	    """
	    # SECURITY: weights_only=False unpickles arbitrary Python objects, which
	    # can execute code. Only run this on checkpoints from a trusted source.
	    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)

	    # Read everything needed from the checkpoint before creating the writer,
	    # so a malformed checkpoint fails before the output is touched.
	    state_dict = checkpoint['model_state_dict']
	    vocab_size = checkpoint['model_config']['vocab_size']

	    # One constant drives both the metadata and the per-layer name mapping.
	    block_count = 4
	    tensor_map = _build_sage_tensor_map(block_count)

	    gguf_writer = gguf.GGUFWriter(output_path, "transformer_lm")
	    try:
	        # Add metadata
	        gguf_writer.add_context_length(64)
	        gguf_writer.add_embedding_length(256)
	        gguf_writer.add_block_count(block_count)
	        gguf_writer.add_feed_forward_length(1024)
	        gguf_writer.add_head_count(8)
	        gguf_writer.add_head_count_kv(8)
	        gguf_writer.add_vocab_size(vocab_size)
	        # NOTE(review): the norm layers carry bias tensors, which suggests
	        # LayerNorm and not RMSNorm -- confirm the RMS epsilon key is intended.
	        gguf_writer.add_layer_norm_rms_eps(1e-5)
	        gguf_writer.add_name("Sage")
	        gguf_writer.add_license("MIT")

	        # Write tensors
	        for orig_name, tensor in state_dict.items():
	            mapped_name = tensor_map.get(orig_name, orig_name)
	            # Cast inside torch: Tensor.numpy() rejects tensors that require
	            # grad and dtypes numpy does not have (for example bfloat16).
	            arr = tensor.detach().cpu().to(torch.float32).numpy()
	            gguf_writer.add_tensor(mapped_name, arr)

	        gguf_writer.write_header_to_file()
	        gguf_writer.write_kv_data_to_file()
	        gguf_writer.write_tensors_to_file()
	    finally:
	        # Release the output file even when a step above raised.
	        gguf_writer.close()

	    print(f"GGUF file created: {output_path}")
	    print(f"Total tensors written: {len(state_dict)}")
	    print("NOTE: This GGUF file uses a custom architecture 'transformer_lm'")
	    print(" and will NOT load in standard llama.cpp/llama-cpp-python")
	    print(" without adding custom architecture support.")

	# Look for the checkpoint next to this script and convert it when present.
	script_dir = os.path.dirname(os.path.abspath(__file__))
	pytorch_bin = os.path.join(script_dir, "pytorch_model.bin")
	if not os.path.exists(pytorch_bin):
	    print(f"Model file {pytorch_bin} not found")
	else:
	    # The output name is a relative path, so it resolves against the
	    # current working directory and not against script_dir.
	    convert_sage_to_gguf(pytorch_bin, "sage-f16.gguf")