CPU-LLM-Inference

Running

File size: 5,942 Bytes

9a0f889

# ------------------------------
# Torch-Compatible Model Definitions with Adjusted Descriptions
# ------------------------------
# MODELS maps a model's display name to a metadata dict with three keys:
#   "repo_id"     - "<owner>/<name>" identifier of the model repository
#                   (presumably a Hugging Face Hub repo id; verify against the loader)
#   "description" - free-form, human-readable summary of the model
#   "params_b"    - parameter count expressed in billions (e.g. 0.27 == 270M)
# Entries are grouped by size, largest first; the "# <size>" comments mark each group.
MODELS = {

    # 1.5B
    "Nemotron-Research-Reasoning-Qwen-1.5B": {
        "repo_id": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
        "description": "Nemotron-Research-Reasoning-Qwen-1.5B",
        "params_b": 1.5
    },
    "Falcon-H1-1.5B-Instruct": {
        "repo_id": "tiiuae/Falcon-H1-1.5B-Instruct",
        "description": "Falcon‑H1 model with 1.5 B parameters, instruction‑tuned",
        "params_b": 1.5
    },
    "Qwen2.5-Taiwan-1.5B-Instruct": {
        "repo_id": "benchang1110/Qwen2.5-Taiwan-1.5B-Instruct",
        "description": "Qwen2.5-Taiwan-1.5B-Instruct",
        "params_b": 1.5
    },

    # 1.2B
    "LFM2-1.2B": {
        "repo_id": "LiquidAI/LFM2-1.2B",
        "description": "A 1.2B parameter hybrid language model from Liquid AI, designed for efficient on-device and edge AI deployment, outperforming larger models like Llama-2-7b-hf in specific tasks.",
        "params_b": 1.2
    },

    # 1.1B
    "Taiwan-ELM-1_1B-Instruct": {
        "repo_id": "liswei/Taiwan-ELM-1_1B-Instruct",
        "description": "Taiwan-ELM-1_1B-Instruct",
        "params_b": 1.1
    },

    # 1B
    "Llama-3.2-Taiwan-1B": {
        "repo_id": "lianghsun/Llama-3.2-Taiwan-1B",
        "description": "Llama-3.2-Taiwan base model with 1 B parameters",
        "params_b": 1.0
    },

    # 700M
    "LFM2-700M": {
        "repo_id": "LiquidAI/LFM2-700M",
        "description": "A 700M parameter model from the LFM2 family, designed for high efficiency on edge devices with a hybrid architecture of multiplicative gates and short convolutions.",
        "params_b": 0.7
    },

    # 600M
    "Qwen3-0.6B": {
        "repo_id": "Qwen/Qwen3-0.6B",
        "description": "Dense causal language model with 0.6 B total parameters (0.44 B non-embedding), 28 transformer layers, 16 query heads & 8 KV heads, native 32 768-token context window, dual-mode generation, full multilingual & agentic capabilities.",
        "params_b": 0.6
    },
    "Qwen3-0.6B-Taiwan": {
        "repo_id": "ShengweiPeng/Qwen3-0.6B-Taiwan",
        "description": "Qwen3-Taiwan model with 0.6 B parameters",
        "params_b": 0.6
    },

    # 500M
    "Qwen2.5-0.5B-Taiwan-Instruct": {
        "repo_id": "ShengweiPeng/Qwen2.5-0.5B-Taiwan-Instruct",
        "description": "Qwen2.5-Taiwan model with 0.5 B parameters, instruction-tuned",
        "params_b": 0.5
    },

    # 360M
    "SmolLM2-360M-Instruct": {
        "repo_id": "HuggingFaceTB/SmolLM2-360M-Instruct",
        "description": "Original SmolLM2‑360M Instruct",
        "params_b": 0.36
    },
    "SmolLM2-360M-Instruct-TaiwanChat": {
        "repo_id": "Luigi/SmolLM2-360M-Instruct-TaiwanChat",
        "description": "SmolLM2‑360M Instruct fine-tuned on TaiwanChat",
        "params_b": 0.36
    },

    # 350M
    "LFM2-350M": {
        "repo_id": "LiquidAI/LFM2-350M",
        "description": "A compact 350M parameter hybrid model optimized for edge and on-device applications, offering significantly faster training and inference speeds compared to models like Qwen3.",
        "params_b": 0.35
    },

    # 270M
    "parser_model_ner_gemma_v0.1": {
        "repo_id": "myfi/parser_model_ner_gemma_v0.1",
        "description": "A lightweight named‑entity‑like (NER) parser fine‑tuned from Google’s **Gemma‑3‑270M** model. The base Gemma‑3‑270M is a 270 M‑parameter, hyper‑efficient LLM designed for on‑device inference, supporting >140 languages, a 128 k‑token context window, and instruction‑following capabilities [2][7]. This variant is further trained on standard NER corpora (e.g., CoNLL‑2003, OntoNotes) to extract PERSON, ORG, LOC, and MISC entities with high precision while keeping the memory footprint low (≈240 MB VRAM in BF16 quantized form) [1]. It is released under the Apache‑2.0 license and can be used for fast, cost‑effective entity extraction in low‑resource environments.",
        "params_b": 0.27
    },
    "Gemma-3-Taiwan-270M-it": {
        "repo_id": "lianghsun/Gemma-3-Taiwan-270M-it",
        "description": "google/gemma-3-270m-it fine-tuned on Taiwan Chinese dataset",
        "params_b": 0.27
    },
    # NOTE: the "-it" suffix in Gemma model names means "instruction-tuned",
    # not "Italian"; the description below reflects that.
    "gemma-3-270m-it": {
        "repo_id": "google/gemma-3-270m-it",
        "description": "Gemma‑3‑270M‑IT is a compact, 270‑million‑parameter instruction‑tuned language model, offering fast and efficient on‑device text generation and comprehension.",
        "params_b": 0.27
    },
    "Taiwan-ELM-270M-Instruct": {
        "repo_id": "liswei/Taiwan-ELM-270M-Instruct",
        "description": "Taiwan-ELM-270M-Instruct",
        "params_b": 0.27
    },

    # 135M
    "SmolLM2-135M-multilingual-base": {
        "repo_id": "agentlans/SmolLM2-135M-multilingual-base",
        "description": "SmolLM2-135M-multilingual-base",
        "params_b": 0.135
    },
    "SmolLM-135M-Taiwan-Instruct-v1.0": {
        "repo_id": "benchang1110/SmolLM-135M-Taiwan-Instruct-v1.0",
        "description": "135-million-parameter F32 safetensors instruction-finetuned variant of SmolLM-135M-Taiwan, trained on the 416 k-example ChatTaiwan dataset for Traditional Chinese conversational and instruction-following tasks",
        "params_b": 0.135
    },
    "SmolLM2_135M_Grpo_Gsm8k": {
        "repo_id": "prithivMLmods/SmolLM2_135M_Grpo_Gsm8k",
        "description": "SmolLM2_135M_Grpo_Gsm8k",
        "params_b": 0.135
    },
    "SmolLM2-135M-Instruct": {
        "repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
        "description": "Original SmolLM2‑135M Instruct",
        "params_b": 0.135
    },
    "SmolLM2-135M-Instruct-TaiwanChat": {
        "repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat",
        "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat",
        "params_b": 0.135
    },
}