Spaces:

m1b2lover
/

llamate

Paused

App Files Files Community

m1b2lover committed on May 21, 2025

Commit

f99ed48

verified ·

1 Parent(s): 1f4f173

Upload 8 files

Browse files

Files changed (8) hide show

Dockerfile +9 -24
Dockerfile copy +44 -0
Dockerfile3 +25 -0
app.py +147 -0
hfdoc +43 -0
main.py +160 -0
memo +34 -0
model_loader.py +181 -0

Dockerfile CHANGED Viewed

@@ -1,7 +1,6 @@
-# ARG CUDA_IMAGE="12.1.0-devel-ubuntu22.04"
-# FROM nvidia/cuda:${CUDA_IMAGE}
 ARG CUDA_IMAGE="12.5.0-devel-ubuntu22.04"
 FROM nvidia/cuda:${CUDA_IMAGE}
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
@@ -9,37 +8,23 @@ RUN apt-get update && apt-get upgrade -y \
     && apt-get install -y git build-essential \
     python3 python3-pip gcc wget \
     ocl-icd-opencl-dev opencl-headers clinfo \
-    libclblast-dev libopenblas-dev curl cmake \
     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
-# ちゃんと書き込みできるユーザー設定しないとエラーが出るっぽい。
-RUN useradd -m -u 1000 gee
-USER gee
-ENV HOME=/home/gee \
-    PATH=/home/gee/.local/bin:$PATH
-ENV HF_HOME=$HOME/app/.cache/huggingface
-WORKDIR $HOME/app
-COPY --chown=gee . $HOME/app
 # setting build related env vars
 ENV CUDA_DOCKER_ARCH=all
 ENV GGML_CUDA=1
 # Install dependencies
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context huggingface_hub hf_xet
-# Install llama-cpp-python (build with cuda)
-RUN pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125
 # Install llama-cpp-python (build with cuda)
-# RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
-# # Run the server
-# CMD python3 -m llama_cpp.server
-# python -m llama_cpp.server --model /path/to/your/model.gguf --host 0.0.0.0 --port 8000
-# CMD ["python3","-m", "llama_cpp.server", "--hf_model_repo_id", "unsloth/Qwen3-30B-A3B-GGUF" ,"--model", "*Q4_0.gguf","--n_gpu_layers", "32" ,"--host", "0.0.0.0", "--port", "8000"]
-CMD ["python3","llm.py"]

 ARG CUDA_IMAGE="12.5.0-devel-ubuntu22.04"
 FROM nvidia/cuda:${CUDA_IMAGE}
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
     && apt-get install -y git build-essential \
     python3 python3-pip gcc wget \
     ocl-icd-opencl-dev opencl-headers clinfo \
+    libclblast-dev libopenblas-dev \
     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+COPY . .
+RUN nvidia-smi
 # setting build related env vars
 ENV CUDA_DOCKER_ARCH=all
 ENV GGML_CUDA=1
 # Install dependencies
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
 # Install llama-cpp-python (build with cuda)
+RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
+# --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Run the server
+CMD python3 -m llama_cpp.server

Dockerfile copy ADDED Viewed

	@@ -0,0 +1,44 @@

+# Dockerfile
+FROM python:3.11-slim
+ENV PYTHONUNBUFFERED=1
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+ARG MODEL_ID="Qwen/Qwen3-8B"
+ENV MODEL_ID=${MODEL_ID}
+# Default quantization settings (can be overridden through the Space's environment)
+ENV LOAD_IN_4BIT="true"
+ENV LOAD_IN_8BIT="false"
+# Install dependencies before copying the source so this layer stays cached
+# when only application code changes
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Apparently errors occur unless a user with write permission is set up properly.
+RUN useradd -m -u 1000 gee
+USER gee
+ENV HOME=/home/gee \
+    PATH=/home/gee/.local/bin:$PATH
+ENV HF_HOME=$HOME/app/.cache/huggingface
+WORKDIR $HOME/app
+# Copy the application code (main.py, model_loader.py, schemas.py, ...) once,
+# owned by the runtime user. The earlier per-file copies into "/" were redundant:
+# the CMD below runs from this WORKDIR and never read them.
+COPY --chown=gee . $HOME/app
+# Port the Uvicorn server listens on.
+# Hugging Face Spaces usually defaults to 7860, but 8000 is fine for an API server.
+# Keep this in sync with app_port in README.md.
+EXPOSE 8000
+# Application start command
+# CMD uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000} --workers 1
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]

Dockerfile3 ADDED Viewed

	@@ -0,0 +1,25 @@

+# Base image: CUDA 11.8
+FROM docker.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04@sha256:8f9dd0d09d3ad3900357a1cf7f887888b5b74056636cd6ef03c160c3cd4b1d95
+# Install basic tools such as Python and pip
+RUN apt-get update && apt-get install -y \
+    python3 \
+    python3-pip \
+    git \
+    # May be needed to build llama-cpp-python (e.g. when no prebuilt wheel is found)
+    # build-essential cmake \
+    && rm -rf /var/lib/apt/lists/*
+# (Recommended) Create the model download directory and set its permissions
+RUN mkdir /models && chmod 777 /models
+VOLUME /models
+RUN pip install llama-cpp-python[server] \
+    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu118
+RUN pip install huggingface-hub
+ENV MODEL_ID="unsloth/Qwen3-30B-A3B-GGUF"
+COPY . .
+RUN ln -s /usr/bin/python3 /usr/bin/python
+# python -m llama_cpp.server --model /path/to/your/model.gguf --host 0.0.0.0 --port 8000
+CMD ["python", "-m", "llama_cpp.server", "--hf_model_repo_id", "unsloth/Qwen3-30B-A3B-GGUF" ,"--model", "*Q4_0.gguf", "--host", "0.0.0.0", "--port", "8000"]

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# app.py
+import gradio as gr
+import subprocess
+import os
+import time # timeモジュールは直接使っていませんが、コメントアウトされた部分で使われる可能性あり
+# --- JGLUE評価タスク実行関数 ---
+def run_jglue_evaluation_task(model_name, other_param, max_samples, progress: gr.Progress):
+    log_output = ""
+    # 評価スクリプトのパス (app.py と同じディレクトリにあると仮定)
+    script_path = "jglue_script.py"
+    task_name_dummy = "marc_ja" # UIからタスク名も入力できるようにするのが理想
+    try:
+        # コマンドリストの作成
+        command = [
+            "python", script_path,
+            "--model_name_or_path", str(model_name),
+            "--task_name", str(task_name_dummy), # ここはUIから取得するように変更推奨
+            # "--dataset_path", str(dataset_path), # JGLUEスクリプトがHubからロードするなら不要な場合も
+            "--output_dir", "./evaluation_results", # Space内に結果保存用ディレクトリ
+            "--eval_batch_size", "8", # 例: UIから変更可能にしても良い
+            "--max_seq_length", "128", # 例: UIから変更可能にしても良い
+            # "--other_param_for_script", str(other_param), # スクリプト側で受け取る引数名に合わせる
+        ]
+        if max_samples is not None and int(max_samples) > 0:
+            command.extend(["--max_eval_samples", str(int(max_samples))])
+        log_output += f"実行コマンド: {' '.join(command)}\n\n"
+        # progress(0, desc="評価スクリプト準備中...")
+        yield log_output # 初期ログをすぐに表示
+        # subprocessの実行
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT, # 標準エラーも標準出力にマージ
+            text=True,
+            bufsize=1, # 1行ずつのバッファリング
+            universal_newlines=True,
+            encoding='utf-8' # 明示的にエンコーディング指定
+        )
+        # リアルタイムでログを読み取り、進捗を更新
+        line_count = 0
+        # max_expected_lines = 200 # これはあくまで目安なので、より動的な進捗更新が望ましい
+        # iterの第二引数を空文字列にすることで、プロセスが終了するまで読み続ける
+        for line in iter(process.stdout.readline, ''):
+            if not line: # 空行が連続する場合の対策 (あまりないはずだが)
+                # プロセスがまだ生きているか確認 (オプション)
+                # if process.poll() is not None: break
+                continue
+            print(line, end='', flush=True) # Dockerのログにもリアルタイムで出力
+            log_output += line
+            line_count += 1
+            # 進捗バーの更新 (ここでは単純に1行ごとに更新するが、より意味のある更新が望ましい)
+            # 例えば、スクリプト側で "PROGRESS: 25%" のような文字列を出力し、それをパースするなど
+            progress(min(0.01 * line_count, 0.95), desc=f"評価実行中... (ログ {line_count}行目)") # 0.95で止めておき、完了時に1.0にする
+            yield log_output # ストリーミング出力でリアルタイムにUI更新
+        process.stdout.close()
+        return_code = process.wait() # プロセスの終了を待つ
+        if return_code == 0:
+            log_output += "\n\n評価が正常に完了しました。"
+            progress(1.0, desc="評価完了！")
+        else:
+            log_output += f"\n\n評価スクリプトがエラーコード {return_code} で終了しました。"
+            progress(1.0, desc="評価エラー")
+        yield log_output # 最終ログを送信
+    except FileNotFoundError:
+        log_output += f"\n\nエラー: 評価スクリプト '{script_path}' が見つかりません。"
+        progress(1.0, desc="スクリプトエラー")
+        yield log_output
+    except Exception as e:
+        log_output += f"\n\n予期せぬエラーが発生しました: {e}"
+        progress(1.0, desc="致命的エラー")
+        import traceback
+        log_output += "\n\n--- Traceback ---\n" + traceback.format_exc()
+        yield log_output
+# --- Gradio interface ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# JGLUE 評価プラットフォーム")
+    gr.Markdown("Dockerコンテナ上でJGLUE評価スクリプトを実行します。")
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 評価設定")
+            model_name_input = gr.Textbox(
+                label="評価するモデル名またはパス",
+                placeholder="例: cl-tohoku/bert-base-japanese-whole-word-masking",
+                value="cl-tohoku/bert-base-japanese-whole-word-masking"
+            )
+            task_name_input = gr.Dropdown(
+                label="JGLUE タスク名",
+                choices=["marc_ja", "jsts", "jnli", "jcommonsense_qa"],
+                value="marc_ja"
+            )
+            other_param_input = gr.Slider(
+                label="その他のパラメータ (スクリプト側で解釈)",
+                minimum=1, maximum=10, value=5, step=1
+            )
+            max_samples_input = gr.Number(
+                label="評価サンプル数上限 (0または空で全件)",
+                value=100,
+                minimum=0, step=10, precision=0
+            )
+            submit_button = gr.Button("評価開始", variant="primary", icon="▶️")
+        with gr.Column(scale=2):
+            gr.Markdown("### 実行ログと結果")
+            # The Progress object is only created here in the UI; it is not included in click's inputs
+            progress_component = gr.Progress()
+            output_log = gr.Textbox(
+                label="ログ出力エリア",
+                lines=20,
+                interactive=False,
+                max_lines=200,
+                show_copy_button=True
+            )
+    # When the button is clicked, run the evaluation function and stream its output
+    submit_button.click(
+        fn=run_jglue_evaluation_task,
+        # progress_component is intentionally left out of inputs
+        inputs=[model_name_input, task_name_input, other_param_input, max_samples_input],
+        outputs=[output_log]
+        # NOTE(review): the author expected Gradio to supply the progress argument automatically
+        # and to tie it to progress_component above — confirm against the Gradio Progress docs
+    )
+# --- Launch the Gradio app ---
+if __name__ == "__main__":
+    print("DEBUG: app.py - Inside __main__ block. Attempting to launch Gradio app...", flush=True)
+    try:
+        # .queue() makes it easier to handle multiple requests and long-running tasks
+        demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
+        # debug=True produces more detailed Gradio/Uvicorn logs (handy during development)
+    except Exception as e:
+        print(f"DEBUG: app.py - Error during demo.launch(): {e}", flush=True)
+        import traceback
+        traceback.print_exc()

hfdoc ADDED Viewed

	@@ -0,0 +1,43 @@

+# ARG CUDA_IMAGE="12.1.0-devel-ubuntu22.04"
+# FROM nvidia/cuda:${CUDA_IMAGE}
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+# We need to set the host to 0.0.0.0 to allow outside access
+ENV HOST=0.0.0.0
+# Install build tools and OpenCL/BLAS libraries, register the NVIDIA OpenCL ICD,
+# and drop the apt package lists in the same layer so they do not bloat the image
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y git build-essential \
+    python3 python3-pip gcc wget \
+    ocl-icd-opencl-dev opencl-headers clinfo \
+    libclblast-dev libopenblas-dev \
+    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd \
+    && rm -rf /var/lib/apt/lists/*
+# Apparently errors occur unless a user with write permission is set up properly.
+RUN useradd -m -u 1000 gee
+USER gee
+ENV HOME=/home/gee \
+    PATH=/home/gee/.local/bin:$PATH
+ENV HF_HOME=$HOME/app/.cache/huggingface
+WORKDIR $HOME/app
+COPY --chown=gee . $HOME/app
+# setting build related env vars
+ENV CUDA_DOCKER_ARCH=all
+ENV GGML_CUDA=1
+# Install dependencies
+RUN python3 -m pip install --no-cache-dir --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context huggingface_hub hf_xet
+# Install llama-cpp-python (prebuilt CUDA 12.1 wheel from the extra index)
+RUN pip install --no-cache-dir llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# # Run the server
+# CMD python3 -m llama_cpp.server
+# python -m llama_cpp.server --model /path/to/your/model.gguf --host 0.0.0.0 --port 8000
+CMD ["python3", "-W","ignore","-m", "llama_cpp.server", "--hf_model_repo_id", "unsloth/Qwen3-30B-A3B-GGUF" ,"--model", "*Q4_0.gguf", "--host", "0.0.0.0", "--port", "8000"]

main.py ADDED Viewed

	@@ -0,0 +1,160 @@

+# main.py
+from fastapi import FastAPI, HTTPException, Request as FastAPIRequest
+from fastapi.responses import JSONResponse
+import uvicorn
+import os
+import uuid  # id生成用 (schemasに移動しても良い)
+import time
+# ローカルモジュールからのインポート
+from model_loader import (
+    load_model,
+    generate_text,
+    MODEL_ID as LOADED_MODEL_ID,
+)  # MODEL_IDもインポート
+import model_loader
+from schemas import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionMessage,
+    ChatCompletionResponseMessage,
+    ChatCompletionChoice,
+    # Usage,
+)
+app = FastAPI(
+    title="OpenAI Compatible LLM API",
+    description=f"Provides an OpenAI-compatible API endpoint for the model: {os.environ.get('MODEL_ID', 'default_model_id_from_env')}",
+    version="0.1.0",
+)
+# --- Event handlers ---
+@app.on_event("startup")
+async def startup_event():
+    """
+    Load the model when the application starts.
+    """
+    print("Application startup: Loading model...")
+    try:
+        load_model()  # calls the function from model_loader.py
+        print(f"Model '{LOADED_MODEL_ID}' should be loaded now.")
+    except RuntimeError as e:
+        print(f"Fatal Error during application startup: {e}")
+        # Either abort the app here, or have the health-check endpoint report the error
+        # Note: Uvicorn itself may still start successfully
+# --- API endpoints ---
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def create_chat_completion(request: ChatCompletionRequest):
+    """
+    OpenAI-compatible chat completion endpoint.
+    """
+    print(
+        f"Received request for model: {request.model} (Actual model: {LOADED_MODEL_ID})"
+    )
+    print(f"Messages: {request.messages}")
+    if (
+        model_loader.model is None or model_loader.tokenizer is None
+    ):  # check model_loader's module-level globals
+        raise HTTPException(
+            status_code=503,
+            detail=f"Model '{LOADED_MODEL_ID}' is not available. Check server logs.",
+        )
+    # Use the last user message as the prompt (handling of richer conversation history is still to be decided)
+    # OpenAI's messages field is a list, so whether to take the last user message or
+    # to join everything into a single prompt depends on the format the model expects.
+    # This is the simple variant that uses the content of the last user message.
+    user_prompt = ""
+    if request.messages and request.messages[-1].role == "user":
+        user_prompt = request.messages[-1].content
+    elif (
+        request.messages
+    ):  # even when the last message is not from the user, still collect some text
+        user_prompt = "\n".join(
+            [msg.content for msg in request.messages if msg.content]
+        )
+    if not user_prompt:
+        raise HTTPException(status_code=400, detail="No user prompt found in messages.")
+    try:
+        # Call the inference function from model_loader.py
+        generated_content = generate_text(
+            prompt=user_prompt,
+            max_new_tokens=request.max_tokens
+            if request.max_tokens is not None
+            else 1024,  # Hugging Face calls this max_new_tokens
+            temperature=request.temperature if request.temperature is not None else 0.7,
+            top_p=request.top_p if request.top_p is not None else 0.9,
+            # other parameters such as repetition_penalty could be passed through as well
+        )
+        # Build the OpenAI-compatible response
+        response_message = ChatCompletionResponseMessage(
+            role="assistant", content=generated_content
+        )
+        choice = ChatCompletionChoice(
+            index=0, message=response_message, finish_reason="stop"
+        )
+        # usage is a placeholder (accurate token counts would have to be computed separately)
+        # usage = Usage(prompt_tokens=0, completion_tokens=0, total_tokens=0)
+        return ChatCompletionResponse(
+            id="chatcmpl-" + uuid.uuid4().hex,  # generate a unique ID
+            object="chat.completion",
+            created=int(time.time()),
+            model=LOADED_MODEL_ID,  # ID of the model actually used
+            choices=[choice],
+            # usage=usage,
+        )
+    except RuntimeError as e:
+        print(f"RuntimeError during generation: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+    except Exception as e:
+        print(f"Unexpected error during generation: {e}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail="An unexpected error occurred.")
+@app.get("/health")
+async def health_check():
+    """
+    Health-check endpoint. Returns OK when the model is loaded.
+    """
+    if model_loader.model is not None and model_loader.tokenizer is not None:
+        return {"status": "ok", "model_loaded": LOADED_MODEL_ID}
+    else:
+        return JSONResponse(
+            status_code=503,
+            content={
+                "status": "error",
+                "message": f"Model {LOADED_MODEL_ID} not loaded or failed to load.",
+            },
+        )
+@app.get("/")
+async def root():
+    return {
+        "message": f"OpenAI Compatible API for model: {LOADED_MODEL_ID}. Use POST /v1/chat/completions."
+    }
+# --- Run with Uvicorn (for local testing; overridden by the Dockerfile CMD) ---
+if __name__ == "__main__":
+    # Read the model ID from an environment variable (set it for local testing)
+    # e.g.: export MODEL_ID="google/gemma-2b-it"
+    #     python main.py
+    port = int(os.environ.get("PORT", 8000))  # keep in sync with the Dockerfile EXPOSE/CMD
+    print(
+        f"Starting Uvicorn server on port {port} for model '{os.environ.get('MODEL_ID', 'default_model_id_from_env')}'"
+    )
+    uvicorn.run(app, host="0.0.0.0", port=port)

memo ADDED Viewed

	@@ -0,0 +1,34 @@

+GGUF いろんな量子化がある Q4,Q5, ..
+GPTQ 4bit 量子化が多い
+AWQ 主に 4bit
+32B は GGUF でどちらもある
+https://huggingface.co/BlackBeenie/Qwen3-32B-Q4_K_M-GGUF
+https://huggingface.co/kaitchup/Qwen3-32B-autoround-4bit-gptq
+https://huggingface.co/BenevolenceMessiah/Qwen3-32B-Q8_0-GGUF
+https://huggingface.co/charlesthefool/Qwen3-30B-A3B-Q4_K_M-GGUF
+https://huggingface.co/BenevolenceMessiah/Qwen3-30B-A3B-Q8_0-GGUF
+https://huggingface.co/mlx-community/Qwen3-32B-8bit
+https://huggingface.co/unsloth/Qwen3-32B-bnb-4bit
+https://huggingface.co/mlx-community/Qwen3-30B-A3B-8bit/blob/main/config.json
+https://huggingface.co/unsloth/Qwen3-30B-A3B-bnb-4bit/tree/main
+ちゃんとしたモデルをCUDA使って動かそうとしたらDockerのベースイメージをちゃんと選ぶ必要ある。
+Dockerfileの作成
+curl -X POST "https://yheye43-Eval-Qwen3-30B-A3B-GPTQ-Int4.hf.space/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+  "model": "stabilityai/japanese-stablelm-instruct-gamma-7b",
+  "messages": [
+    {"role": "user", "content": "日本の首都はどこですか？ /nothink"}
+  ],
+  "max_tokens": 50,
+  "temperature": 0.7
+}'
+{"id":"chatcmpl-6b73cd9660694171aa1064b33a14e8d9","object":"chat.completion","created":1747841643,"model":"Qwen/Qwen3-8B","choices":[{"index":0,"message":{"role":"ass

model_loader.py ADDED Viewed

	@@ -0,0 +1,181 @@

+# model_loader.py
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import torch
+import os
+# --- Globals (populated when the application starts) ---
+model = None
+tokenizer = None
+MODEL_ID = os.environ.get(
+    "MODEL_ID", "Qwen/Qwen3-30B-A3B"
+)  # read the model ID from the environment, falling back to a default
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+LOAD_IN_4BIT = os.environ.get("LOAD_IN_4BIT", "false").lower() == "true"
+LOAD_IN_8BIT = os.environ.get("LOAD_IN_8BIT", "false").lower() == "true"
+# Prevent 4-bit and 8-bit from both being True (either one, or neither)
+if LOAD_IN_4BIT and LOAD_IN_8BIT:
+    print(
+        "Warning: Both LOAD_IN_4BIT and LOAD_IN_8BIT are set to true. Prioritizing 4-bit."
+    )
+    LOAD_IN_8BIT = False
+elif not LOAD_IN_4BIT and not LOAD_IN_8BIT:
+    print(
+        "Info: No explicit quantization (4-bit/8-bit) requested via environment variables. Loading in default precision (e.g., bfloat16 on GPU)."
+    )
+def load_model():
+    """
+    アプリケーション起動時にモデルとトークナイザーをロードする。
+    """
+    global model, tokenizer
+    if model is None or tokenizer is None:
+        quantization_info = "No Quantization"
+        if LOAD_IN_4BIT:
+            quantization_info = "4-bit Quantization"
+        elif LOAD_IN_8BIT:
+            quantization_info = "8-bit Quantization"
+        print(
+            f"Loading model: {MODEL_ID} on device: {DEVICE} with {quantization_info}..."
+        )
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+            model_kwargs = {
+                "trust_remote_code": True
+            }  # 基本的にTrueにしておくことが多い
+            quantization_config = None
+            if DEVICE == "cuda":
+                model_kwargs["device_map"] = "auto"
+                if LOAD_IN_4BIT:
+                    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+                    model_kwargs["torch_dtype"] = "auto"  # 4bitと併用する計算時の型
+                    # bnb_4bit_compute_dtype など、より詳細なbitsandbytes設定も環境変数で制御可能
+                elif LOAD_IN_8BIT:
+                    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+                    # 8bitの場合、torch_dtypeは自動で設定されることが多いが、明示も可
+                else:  # 量子化なしGPU
+                    model_kwargs["torch_dtype"] = torch.bfloat16
+                # model = AutoModelForCausalLM.from_pretrained(
+                #     MODEL_ID,
+                #     torch_dtype=torch.bfloat16,  # または torch.float16
+                #     load_in_4bit=True,  # 4ビット量子化でロード (bitsandbytesが必要)
+                #     # load_in_8bit=True, # 8ビット量子化の場合
+                #     device_map="auto",  # 自動でGPUに割り当て
+                #     trust_remote_code=True,  # モデルによっては必要
+                # )
+            else:  # CPUの場合 (量子化はGPU推奨だが、一応対応)
+                # CPUでのbitsandbytes量子化は限定的、または非推奨
+                if LOAD_IN_4BIT or LOAD_IN_8BIT:
+                    print(
+                        "Warning: bitsandbytes quantization (4-bit/8-bit) is primarily for GPU. Attempting on CPU may be slow or unstable."
+                    )
+                # model_kwargs["device_map"] = {"": "cpu"} # 明示的にCPUを指定
+                pass  # .to(DEVICE) で対応
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID, **model_kwargs, quantization_config=quantization_config
+            )
+            if DEVICE == "cpu" and not (
+                LOAD_IN_4BIT or LOAD_IN_8BIT
+            ):  # CPUで量子化なしの場合
+                model = model.to(DEVICE)
+            model.eval()  # 評価モード
+            print(f"Model {MODEL_ID} loaded successfully.")
+        except Exception as e:
+            print(f"Error loading model {MODEL_ID}: {e}")
+            # エラー発生時は model と tokenizer が None のままになる
+            # アプリケーションのヘルスチェックなどでこれを確認できるようにするのも良い
+            raise RuntimeError(f"Failed to load model: {e}")
+def generate_text(
+    prompt: str,
+    max_new_tokens: int = 100,
+    temperature: float = 0.3,
+    top_p: float = 0.9,
+    repetition_penalty: float = 1.0,
+) -> str:
+    """
+    ロードされたモデルを使ってテキストを生成する。
+    """
+    if model is None or tokenizer is None:
+        raise RuntimeError("Model not loaded. Cannot generate text.")
+    try:
+        # プロンプトの形式はモデルによって調整が必要
+        # 例: Instructモデルの場合、特定のテンプレートがあることが多い
+        # こ��では単純にユーザープロンプトのみを使用
+        # inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+        # より一般的なチャット形式のプロンプト適用 (モデルに合わせて調整)
+        # StableLM Instruct Gamma のプロンプト形式例 (あくまで一例)
+        # 参考: https://huggingface.co/stabilityai/japanese-stablelm-instruct-gamma-7b
+        messages = [{"role": "user", "content": prompt}]
+        # モデルによっては tokenizer.apply_chat_template が使える
+        try:
+            # 多くのモデルではtokenizer.apply_chat_templateが使える
+            prompt_formatted = tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                # Thinking Modeの切り替えここでできる
+                # enable_thinking=False,
+            )
+        except Exception:
+            # 古いモデルや特殊なモデルでapply_chat_templateがない場合の手動フォーマット例
+            # これはモデルのドキュメントを確認して適切な形式にする
+            print(
+                f"Warning: tokenizer.apply_chat_template failed for {MODEL_ID}. Using raw prompt or basic formatting."
+            )
+            if (
+                "stablelm-instruct" in MODEL_ID.lower() or "elyza" in MODEL_ID.lower()
+            ):  # ELYZAやStableLMの例
+                prompt_formatted = f"ユーザー: {prompt}\nシステム: "
+            elif (
+                "qwen" in MODEL_ID.lower() and "chat" in MODEL_ID.lower()
+            ):  # Qwen-Chatの例
+                prompt_formatted = (
+                    f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+                )
+            else:  # デフォルトはそのまま
+                prompt_formatted = prompt
+        inputs = tokenizer(
+            prompt_formatted, return_tensors="pt", add_special_tokens=False
+        ).to(DEVICE)  # add_special_tokensはテンプレートによる
+        # テキスト生成
+        # pad_token_id はeos_token_idと同じに設定することが多い (警告抑制)
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        generation_kwargs = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "repetition_penalty": repetition_penalty,
+            "do_sample": True
+            if temperature > 0
+            else False,  # temperatureが0超ならサンプリング
+            "pad_token_id": tokenizer.pad_token_id,
+        }
+        outputs = model.generate(**inputs, **generation_kwargs)
+        # 生成されたテキストのみをデコード (入力プロンプト部分を除く)
+        # inputs.input_ids.shape[1] は入力トークンの長さ
+        output_text = tokenizer.decode(
+            outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+        )
+        return output_text.strip()
+    except Exception as e:
+        print(f"Error during text generation: {e}")
+        # traceback.print_exc() # 詳細なエラー表示
+        raise RuntimeError(f"Text generation failed: {e}")