Text Generation
Transformers
Safetensors
PyTorch
Indonesian
deeplm
bitnet
Mixture of Experts
mla
mtp
indonesian
Instructions to use samcheng0/deeplm-108m with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use samcheng0/deeplm-108m with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="samcheng0/deeplm-108m")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("samcheng0/deeplm-108m", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use samcheng0/deeplm-108m with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "samcheng0/deeplm-108m"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "samcheng0/deeplm-108m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/samcheng0/deeplm-108m
- SGLang
How to use samcheng0/deeplm-108m with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "samcheng0/deeplm-108m" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "samcheng0/deeplm-108m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
  --model-path "samcheng0/deeplm-108m" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "samcheng0/deeplm-108m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use samcheng0/deeplm-108m with Docker Model Runner:
docker model run hf.co/samcheng0/deeplm-108m
| """ | |
| Initialize Deeplm model with config and BitNet quantization, save to safetensors. | |
| """ | |
| import sys | |
| import os | |
| import json | |
| import torch | |
| # Add deeplm to path | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "deeplm")) | |
| from deeplm.config import DeeplmConfig | |
| from deeplm.model.deeplm import DeeplmModel | |
| from deeplm.quantization.bitnet_quantize import apply_bitnet_quantization | |
# Hyperparameters shared by the in-memory DeeplmConfig and the exported
# config.json. Each value is defined once so the two cannot drift apart.
_VOCAB_SIZE = 32000
_MAX_SEQ_LENGTH = 4096
_NUM_LAYERS = 10
_HIDDEN_SIZE = 512
_INTERMEDIATE_SIZE = 2048
_NUM_ATTENTION_HEADS = 8
_NUM_KEY_VALUE_HEADS = 1
_HEAD_DIM = 128
_V_HEAD_DIM = 128
_ROPE_HEAD_DIM = 64
_NOPE_HEAD_DIM = 64
_ROPE_THETA = 50000.0
_Q_LORA_RANK = 192
_KV_LORA_RANK = 64
_NUM_ROUTED_EXPERTS = 4
_NUM_SHARED_EXPERTS = 1
_EXPERT_TOP_K = 2
_NUM_MTP_LAYERS = 2
_MTP_DEPTH = 2
_BITNET_SCALE = "absmean"


def _write_json(path, payload):
    """Serialize *payload* to *path* as 2-space-indented JSON."""
    # Explicit UTF-8 keeps the output independent of the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _build_config():
    """Return a DeeplmConfig populated with this checkpoint's hyperparameters."""
    config = DeeplmConfig(
        vocab_size=_VOCAB_SIZE,
        max_seq_length=_MAX_SEQ_LENGTH,
        dtype="float32",
    )

    config.architecture.num_layers = _NUM_LAYERS
    config.architecture.hidden_size = _HIDDEN_SIZE
    config.architecture.intermediate_size = _INTERMEDIATE_SIZE
    config.architecture.num_attention_heads = _NUM_ATTENTION_HEADS
    config.architecture.num_key_value_heads = _NUM_KEY_VALUE_HEADS
    config.architecture.head_dim = _HEAD_DIM
    config.architecture.rope_head_dim = _ROPE_HEAD_DIM
    config.architecture.nope_head_dim = _NOPE_HEAD_DIM
    config.architecture.max_seq_length = _MAX_SEQ_LENGTH
    config.architecture.rope_theta = _ROPE_THETA

    config.mla.q_lora_rank = _Q_LORA_RANK
    config.mla.kv_lora_rank = _KV_LORA_RANK
    config.mla.qk_rope_head_dim = _ROPE_HEAD_DIM
    config.mla.qk_nope_head_dim = _NOPE_HEAD_DIM
    config.mla.v_head_dim = _V_HEAD_DIM
    config.mla.num_heads = _NUM_ATTENTION_HEADS
    config.mla.kv_heads = _NUM_KEY_VALUE_HEADS

    config.moe.num_routed_experts = _NUM_ROUTED_EXPERTS
    config.moe.num_shared_experts = _NUM_SHARED_EXPERTS
    config.moe.top_k = _EXPERT_TOP_K

    config.mtp.num_mtp_layers = _NUM_MTP_LAYERS
    config.mtp.mtp_depth = _MTP_DEPTH
    # The MTP hidden size is set to the same value as the main hidden size.
    config.mtp.mtp_hidden_size = _HIDDEN_SIZE

    config.output_heads.lm_head.type = "tied"
    config.output_heads.lm_head.bias = False
    return config


def _hf_config_dict():
    """Return the dict written to config.json, mirroring the values in _build_config()."""
    return {
        "architectures": ["DeeplmModel"],
        "model_type": "deeplm",
        "vocab_size": _VOCAB_SIZE,
        "hidden_size": _HIDDEN_SIZE,
        "intermediate_size": _INTERMEDIATE_SIZE,
        "num_hidden_layers": _NUM_LAYERS,
        "num_attention_heads": _NUM_ATTENTION_HEADS,
        "num_key_value_heads": _NUM_KEY_VALUE_HEADS,
        "max_position_embeddings": _MAX_SEQ_LENGTH,
        "rms_norm_eps": 1e-06,
        "rope_theta": _ROPE_THETA,
        "rope_dim": _ROPE_HEAD_DIM,
        "tie_word_embeddings": True,
        "num_routed_experts": _NUM_ROUTED_EXPERTS,
        "num_shared_experts": _NUM_SHARED_EXPERTS,
        "expert_topk": _EXPERT_TOP_K,
        "q_lora_rank": _Q_LORA_RANK,
        "kv_lora_rank": _KV_LORA_RANK,
        "qk_rope_head_dim": _ROPE_HEAD_DIM,
        "qk_nope_head_dim": _NOPE_HEAD_DIM,
        "v_head_dim": _V_HEAD_DIM,
        "mtp_depth": _MTP_DEPTH,
        "mtp_num_layers": _NUM_MTP_LAYERS,
        "bitnet_quantized": True,
        "bitnet_scale": _BITNET_SCALE,
    }


def main():
    """Build, quantize and export a freshly initialized Deeplm model.

    Steps, in order:
      1. Build the DeeplmConfig and instantiate DeeplmModel from it.
      2. Apply BitNet quantization to the model via apply_bitnet_quantization.
      3. Write the state dict to ``model.safetensors``.
      4. Write ``config.json`` and ``generation_config.json``.

    All three files are written with relative paths, i.e. into the current
    working directory. Progress is reported on stdout.
    """
    # Imported up front so a missing safetensors install fails before the
    # model is built and quantized rather than after.
    from safetensors.torch import save_file

    print("Building DeeplmConfig...")
    config = _build_config()

    print("Creating DeeplmModel...")
    model = DeeplmModel(config)
    total_params = model.num_parameters()
    print(f"Total parameters: {total_params:,}")

    print(f"Applying BitNet b1.58 ternary quantization ({_BITNET_SCALE})...")
    stats = apply_bitnet_quantization(model, scale=_BITNET_SCALE, verbose=True)
    print(f"Quantized {stats['quantized']}/{stats['total_linear']} linear layers")

    print("Saving to model.safetensors...")
    # NOTE(review): save_file refuses tensors that share memory. If the tied
    # LM head shows up in state_dict() as an alias of the embedding weight,
    # safetensors.torch.save_model may be required instead - confirm.
    save_file(model.state_dict(), "model.safetensors")

    # Save config.json
    _write_json("config.json", _hf_config_dict())
    print("Saved config.json")

    # Save generation_config.json
    gen_config = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": 0,
        "eos_token_id": 2,
        "bos_token_id": 1,
    }
    _write_json("generation_config.json", gen_config)
    print("Saved generation_config.json")

    print("Done!")


if __name__ == "__main__":
    main()