import os
import subprocess
import sys

# Ensure llama-cpp-python is importable; if missing, install the pre-built
# CPU wheel from the official extra index (avoids compiling from source).
try:
    import llama_cpp
except ImportError:
    print("Installing pre-built llama-cpp-python...")
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "llama-cpp-python",
        "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cpu",
    ])

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Model configuration
model_id = "yuna1126/Tema_Q-R-0.4B-GGUF"
model_file = "Tema_Q-R-0.4B-f16.gguf"

# Maximum number of characters accepted per user message
MAX_INPUT_CHARS = 700

print("Downloading model...")
model_path = hf_hub_download(repo_id=model_id, filename=model_file)

print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=2,
    use_mmap=False,
    n_batch=128,
)
print("Model loaded.")


def chat_response(message, history):
    """Stream a model reply for *message*.

    Generator used as the ``fn`` of ``gr.ChatInterface``: yields the
    progressively accumulated response text so the UI updates token by token.

    Args:
        message: The user's input text.
        history: Prior chat turns supplied by Gradio (unused here; the
            prompt is built from the current message only).

    Yields:
        str: The full response so far, or a Japanese error/limit message.
    """
    # Reject over-long input up front so we never exceed the small n_ctx.
    if len(message) > MAX_INPUT_CHARS:
        yield f"入力が長すぎます。{MAX_INPUT_CHARS}文字以内で入力してください。"
        return

    # ChatML-style prompt matching the model's chat template:
    #   <|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n
    prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    try:
        output = llm(
            prompt,
            max_tokens=512,
            # Stop tokens matching the ChatML template markers.
            stop=["<|im_end|>", "<|im_start|>"],
            stream=True,
        )
        response = ""
        for chunk in output:
            response += chunk["choices"][0]["text"]
            yield response
    except Exception as e:
        # Surface inference failures to the user instead of crashing the UI.
        yield f"エラーが発生しました: {str(e)}"


# Build the UI.
# FIX: the original comment claimed the input limit was noted in the
# description field, but no description was passed — add it so users see
# the limit before hitting it.
demo = gr.ChatInterface(
    fn=chat_response,
    title="Tema_Q-R-0.4B Chat",
    description=f"入力は{MAX_INPUT_CHARS}文字以内でお願いします。",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)