import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Quantized GGUF build of Qwen2.5-3B-Instruct (Q5_K_M quant).
REPO_ID = "bartowski/Qwen2.5-3B-Instruct-GGUF"
FILENAME = "Qwen2.5-3B-Instruct-Q5_K_M.gguf"

# Writable download directory (/tmp is writable on a Hugging Face Space).
CACHE_DIR = "/tmp/hf/models"
os.makedirs(CACHE_DIR, exist_ok=True)

# Module-level handle; load_model() fills it once and reuses it afterwards.
llm = None

def load_model(): |
    global llm

    # Reuse the already-loaded instance on subsequent calls.
    if llm is not None:
        return llm

    print(f"Downloading/locating model {REPO_ID}/{FILENAME} ... (first run ~5–15 min on HF CPU)")

    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        cache_dir=CACHE_DIR,
        local_dir=CACHE_DIR,  # place a real copy of the file under CACHE_DIR
    )

    print(f"Model path: {model_path}")

    llm = Llama(
        model_path=model_path,
        n_ctx=8192,       # context window, in tokens
        n_threads=None,   # let llama-cpp-python choose a thread count automatically
        n_gpu_layers=0,   # CPU-only: no layers offloaded to a GPU
        n_batch=512,      # prompt-processing batch size
        verbose=True,
    )

    print("Model Qwen2.5-3B-Instruct loaded successfully in CPU mode")
    return llm
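

# Minimal usage sketch (an illustrative addition, not part of the original):
# running this file directly loads the model once and issues a single chat
# request via llama-cpp-python's create_chat_completion API. The prompt and
# sampling parameters below are assumptions chosen for a quick CPU smoke test.
if __name__ == "__main__":
    model = load_model()
    result = model.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "In one sentence, what is a GGUF file?"},
        ],
        max_tokens=128,   # keep the demo fast on CPU hardware
        temperature=0.7,
    )
    print(result["choices"][0]["message"]["content"])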