Add FastAPI word tree server
Browse files- app.py +129 -0
- package/__pycache__/ai.cpython-310.pyc +0 -0
- package/__pycache__/config.cpython-310.pyc +0 -0
- package/__pycache__/path_manager.cpython-310.pyc +0 -0
- package/__pycache__/rust_adapter.cpython-310.pyc +0 -0
- package/__pycache__/word_counter.cpython-310.pyc +0 -0
- package/__pycache__/word_processor.cpython-310.pyc +0 -0
- package/ai.py +156 -0
- package/config.py +157 -0
- package/path_manager.py +151 -0
- package/rust_adapter.py +72 -0
- package/word_counter.py +203 -0
- package/word_processor.py +519 -0
- requirements.txt +23 -0
app.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
FastAPI 版 LLMView Word Tree サーバー
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import threading
|
| 7 |
+
from typing import List, Dict, Any, Optional
|
| 8 |
+
|
| 9 |
+
from fastapi import FastAPI, HTTPException
|
| 10 |
+
from pydantic import BaseModel, Field
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
from package.path_manager import get_path_manager
|
| 14 |
+
except ImportError:
|
| 15 |
+
from path_manager import get_path_manager # type: ignore
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
path_manager = get_path_manager()
|
| 19 |
+
path_manager.setup_sys_path()
|
| 20 |
+
|
| 21 |
+
adapter = None
|
| 22 |
+
status_message = "モデル初期化中..."
|
| 23 |
+
status_lock = threading.Lock()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class WordTreeRequest(BaseModel):
    """Request body for POST /build_word_tree.

    Field descriptions are runtime strings surfaced in the OpenAPI schema,
    so they are intentionally left in Japanese.
    """

    # Prompt used for generation (required).
    prompt_text: str = Field(..., description="生成に使用するプロンプト")
    # Optional root text to grow the tree from.
    root_text: str = Field("", description="任意のルートテキスト")
    # Number of candidates to fetch per step (1-50).
    top_k: int = Field(5, ge=1, le=50, description="取得する候補数")
    # Maximum search depth (1-50).
    max_depth: int = Field(10, ge=1, le=50, description="探索深さ")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class WordTreeResponse(BaseModel):
    """One completed word-tree candidate: its text and probability."""

    text: str
    probability: float
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _set_status(message: str) -> None:
    """Atomically replace the global status string.

    Guarded by ``status_lock`` so readers (e.g. /health) never observe a
    torn update from the background initializer thread.
    """
    global status_message
    status_lock.acquire()
    try:
        status_message = message
    finally:
        status_lock.release()
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def initialize_model() -> None:
    """Initialize the RustAdapter singleton and its underlying model.

    Runs on a background thread at startup. Progress and failures are
    reported through the shared status message (via ``_set_status``) rather
    than raised, so the server can serve /health while loading.
    """
    global adapter
    try:
        _set_status("モデルを読み込み中です...")
        # Imported lazily so the heavy model stack is not pulled in at
        # module import time.
        from package.rust_adapter import RustAdapter

        model_path = path_manager.get_model_path()
        adapter = RustAdapter.get_instance(model_path)
        _set_status("モデル準備完了")
    except Exception as exc:  # pragma: no cover
        # Record the failure for /health instead of crashing the process.
        _set_status(f"モデル初期化に失敗しました: {exc}")
        import traceback

        traceback.print_exc()
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Space 起動時にバックグラウンドで初期化
|
| 62 |
+
threading.Thread(target=initialize_model, daemon=True).start()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
app = FastAPI(
|
| 66 |
+
title="LLMView Word Tree API",
|
| 67 |
+
description="LLMView の単語ツリー構築 API。/build_word_tree にPOSTしてください。",
|
| 68 |
+
version="1.0.0",
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@app.get("/")
|
| 73 |
+
def root() -> Dict[str, str]:
|
| 74 |
+
"""簡易案内"""
|
| 75 |
+
return {
|
| 76 |
+
"message": "LLMView Word Tree API",
|
| 77 |
+
"status_endpoint": "/health",
|
| 78 |
+
"build_endpoint": "/build_word_tree",
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@app.get("/health")
|
| 83 |
+
def health() -> Dict[str, Any]:
|
| 84 |
+
"""状態確認"""
|
| 85 |
+
with status_lock:
|
| 86 |
+
current_status = status_message
|
| 87 |
+
return {
|
| 88 |
+
"model_loaded": adapter is not None,
|
| 89 |
+
"status": current_status,
|
| 90 |
+
"model_path": path_manager.get_model_path(),
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@app.post("/build_word_tree", response_model=List[WordTreeResponse])
|
| 95 |
+
def build_word_tree(payload: WordTreeRequest) -> List[WordTreeResponse]:
|
| 96 |
+
"""単語ツリーを構築"""
|
| 97 |
+
if not payload.prompt_text.strip():
|
| 98 |
+
raise HTTPException(status_code=400, detail="prompt_text を入力してください。")
|
| 99 |
+
|
| 100 |
+
if adapter is None:
|
| 101 |
+
raise HTTPException(
|
| 102 |
+
status_code=503, detail=f"モデル準備中です: {status_message}"
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
try:
|
| 106 |
+
results = adapter.build_word_tree(
|
| 107 |
+
prompt_text=payload.prompt_text,
|
| 108 |
+
root_text=payload.root_text,
|
| 109 |
+
top_k=payload.top_k,
|
| 110 |
+
max_depth=payload.max_depth,
|
| 111 |
+
)
|
| 112 |
+
if not results:
|
| 113 |
+
raise HTTPException(status_code=500, detail="候補を生成できませんでした。")
|
| 114 |
+
return results
|
| 115 |
+
except HTTPException:
|
| 116 |
+
raise
|
| 117 |
+
except Exception as exc:
|
| 118 |
+
import traceback
|
| 119 |
+
|
| 120 |
+
traceback.print_exc()
|
| 121 |
+
raise HTTPException(status_code=500, detail=f"内部エラー: {exc}") from exc
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
import uvicorn
|
| 126 |
+
|
| 127 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 128 |
+
|
| 129 |
+
|
package/__pycache__/ai.cpython-310.pyc
ADDED
|
Binary file (5.28 kB). View file
|
|
|
package/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (4.93 kB). View file
|
|
|
package/__pycache__/path_manager.cpython-310.pyc
ADDED
|
Binary file (4.76 kB). View file
|
|
|
package/__pycache__/rust_adapter.cpython-310.pyc
ADDED
|
Binary file (3.2 kB). View file
|
|
|
package/__pycache__/word_counter.cpython-310.pyc
ADDED
|
Binary file (5.42 kB). View file
|
|
|
package/__pycache__/word_processor.cpython-310.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
package/ai.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Any, Optional
|
| 2 |
+
import os
|
| 3 |
+
from config import Config
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class AI:
    """Resident next-token probability provider backed by llama-cpp-python.

    Instances are cached per model path, so constructing ``AI`` repeatedly
    with the same path returns the same loaded model (resident singleton).
    """

    # Cache of instances keyed by model path (kept resident).
    _instances = {}

    def __new__(cls, model_path: str = None):
        """Return the cached instance for the resolved model path."""
        path = model_path or Config.get_default_model_path()

        if path not in cls._instances:
            cls._instances[path] = super().__new__(cls)
            cls._instances[path]._initialized = False

        return cls._instances[path]

    def __init__(self, model_path: str = None):
        """Load the model and initialize (runs only once per path).

        Args:
            model_path: path to the model file (None uses the default path).

        Raises:
            ValueError: when the model could not be loaded.
        """
        if hasattr(self, '_initialized') and self._initialized:
            return

        self.model_path = model_path or Config.get_default_model_path()
        self.model = self._load_model(self.model_path)
        self._initialized = True

        if self.model is None:
            raise ValueError(f"モデルのロードに失敗しました: {self.model_path}")

    @classmethod
    def get_model(cls, model_path: str = None) -> 'AI':
        """Return a model instance from the resident cache."""
        return cls(model_path)

    @classmethod
    def clear_cache(cls):
        """Clear the instance cache (development/testing only)."""
        cls._instances.clear()

    def _load_model(self, model_path: str) -> Optional[Any]:
        """Load the GGUF model via llama-cpp-python; return None on failure."""
        try:
            if not model_path or not os.path.exists(model_path):
                return None

            # Load via llama-cpp-python.
            try:
                from llama_cpp import Llama
                llm = Llama(
                    model_path=model_path,
                    n_ctx=2048,
                    logits_all=True,
                    n_gpu_layers=-1,  # offload all layers to GPU when available
                    verbose=False,
                )
                return llm
            except Exception as e:
                print(f"llama-cpp-pythonでのロードに失敗: {e}")
                return None

        except Exception as e:
            print(f"モデルロードエラー: {e}")
            return None

    def get_token_probabilities(self, text: str, k: int = 5) -> List[Tuple[str, float]]:
        """Return the top-k next tokens and their probabilities for ``text``.

        Args:
            text: input text.
            k: number of tokens to return.

        Returns:
            List[Tuple[str, float]]: (token, probability) pairs sorted by
            probability descending and renormalized to sum to 1. Empty list
            when the model is unavailable or any step fails.
        """
        if self.model is None:
            return []

        try:
            # Use llama-cpp-python's create_completion with logprobs.
            if hasattr(self.model, "create_completion"):
                resp = self.model.create_completion(
                    prompt=text,
                    max_tokens=1,
                    logprobs=k,
                    temperature=0.0,
                    echo=False,
                )

                # Extract candidate tokens and their logprobs from the response.
                items: List[Tuple[str, float]] = []
                choice = resp.get("choices", [{}])[0]
                lp = choice.get("logprobs", {})
                top = lp.get("top_logprobs", [])

                if top and isinstance(top[0], dict):
                    cand_dict = top[0]
                    tokens = list(cand_dict.keys())
                    logprobs = [cand_dict[t] for t in tokens]

                    # Convert logprobs to probabilities.
                    probs = self._softmax_from_logprobs(logprobs)

                    for token, prob in zip(tokens, probs):
                        items.append((token, float(prob)))

                # Keep the top-k by probability.
                items = sorted(items, key=lambda x: x[1], reverse=True)[:k]

                # Renormalize so the returned probabilities sum to 1.
                if items:
                    total_prob = sum(prob for _, prob in items)
                    if total_prob > 0:
                        normalized_items: List[Tuple[str, float]] = []
                        for token, prob in items:
                            normalized_prob = prob / total_prob
                            normalized_items.append((token, normalized_prob))
                        return normalized_items

                return items
            else:
                # BUGFIX: the original message contained mojibake
                # ("create_completion���ソッド"); restored "メソッド".
                print("モデルがcreate_completionメソッドをサポートしていません")
                return []

        except Exception as e:
            print(f"トークン確率取得エラー: {e}")
            return []

    def _softmax_from_logprobs(self, logprobs: List[float]) -> List[float]:
        """Convert logprobs to probabilities with a numerically stable softmax."""
        if not logprobs:
            return []

        # Subtract the max for numerical stability.
        max_logprob = max(logprobs)
        exp_logprobs = [exp(logprob - max_logprob) for logprob in logprobs]
        sum_exp = sum(exp_logprobs)

        if sum_exp == 0:
            return [0.0] * len(logprobs)

        return [exp_logprob / sum_exp for exp_logprob in exp_logprobs]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def exp(x: float) -> float:
    """Return e**x via math.exp.

    Thin wrapper kept for this module's existing callers. BUGFIX: the
    original docstring falsely described this as an "approximation"; it
    delegates exactly to math.exp. The import is local because this module
    does not import ``math`` at the top level.
    """
    import math
    return math.exp(x)
|
package/config.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
設定ファイル - パス設定専用(Tauriアプリ用)
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
from typing import Dict, Any
|
| 8 |
+
from path_manager import get_path_manager
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Config:
    """Configuration manager (Tauri app): MeCab/Sudachi/model path settings."""

    # Runtime-environment detection.
    @classmethod
    def _get_base_path(cls) -> str:
        """Return the base path (PyInstaller-aware)."""
        if getattr(sys, 'frozen', False):
            # Built with PyInstaller: paths are relative to the executable.
            return os.path.dirname(sys.executable)
        else:
            # Development environment: project root is two levels up.
            return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # MeCab settings (legacy; used when needed). Homebrew defaults — macOS only.
    MECAB_CONFIG_PATH = "/opt/homebrew/etc/mecabrc"
    MECAB_DICT_PATH = "/opt/homebrew/lib/mecab/dic/ipadic"

    # AI model settings (Tauri app).
    # Defaults to the user's Documents folder when no env var overrides it.
    DEFAULT_MODEL_PATH = os.path.expanduser("~/Documents/models/llama-3.2-3b-instruct-q4_k_m.gguf")

    # fugashi settings (MeCab/IPA). Sudachi is used directly via SudachiPy;
    # the sudachidict_core path below is informational only (status/debug).
    try:
        import importlib.util
        spec = importlib.util.find_spec("sudachidict_core")
        if spec and spec.origin:
            # spec.origin is the package's __init__.py; the dict dir is its parent.
            import os as _os
            _pkg_dir = _os.path.dirname(spec.origin)
            SUDACHI_DICT_PATH = _pkg_dir
        else:
            SUDACHI_DICT_PATH = ""
    except Exception:
        SUDACHI_DICT_PATH = ""

    # fugashi always uses the MeCab config (IPA dictionary).
    FUGASHI_ARGS = f"-r {MECAB_CONFIG_PATH}"

    @classmethod
    def get_mecab_config_path(cls) -> str:
        """Return the MeCab config file path."""
        return cls.MECAB_CONFIG_PATH

    @classmethod
    def get_mecab_dict_path(cls) -> str:
        """Return the MeCab dictionary path."""
        return cls.MECAB_DICT_PATH

    @classmethod
    def get_fugashi_args(cls) -> str:
        """Return the argument string passed to fugashi."""
        return cls.FUGASHI_ARGS

    @classmethod
    def get_default_model_path(cls) -> str:
        """Return the default model path (delegates to PathManager)."""
        return get_path_manager().get_model_path()

    @classmethod
    def get_package_path(cls) -> str:
        """Return the package path (delegates to PathManager)."""
        return get_path_manager().get_package_path()

    @classmethod
    def validate_paths(cls) -> Dict[str, bool]:
        """Report whether each configured path exists on disk."""
        return {
            "mecab_config": os.path.exists(cls.MECAB_CONFIG_PATH),
            "mecab_dict": os.path.exists(cls.MECAB_DICT_PATH),
            "default_model": os.path.exists(cls.DEFAULT_MODEL_PATH)
        }

    @classmethod
    def print_status(cls):
        """Print the current settings followed by a path-existence report."""
        print("=== 設定状況 ===")
        print(f"MeCab設定ファイル: {cls.MECAB_CONFIG_PATH}")
        print(f"MeCab辞書: {cls.MECAB_DICT_PATH}")
        print(f"デフォルトモデル: {cls.DEFAULT_MODEL_PATH}")
        print(f"fugashi引数: {cls.FUGASHI_ARGS}")
        print(f"Sudachi辞書パス検出: {getattr(cls, 'SUDACHI_DICT_PATH', '')}")

        print("\n=== パス存在確認 ===")
        status = cls.validate_paths()
        for name, exists in status.items():
            status_text = "✓" if exists else "✗"
            print(f"{name}: {status_text}")
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# 環境変数での上書き対応
|
| 103 |
+
def load_config_from_env():
    """Override Config class attributes from environment variables.

    Called once at module import time (below). Unset variables leave the
    existing defaults in place.
    """
    Config.MECAB_CONFIG_PATH = os.getenv("MECAB_CONFIG_PATH", Config.MECAB_CONFIG_PATH)
    Config.MECAB_DICT_PATH = os.getenv("MECAB_DICT_PATH", Config.MECAB_DICT_PATH)
    Config.DEFAULT_MODEL_PATH = os.getenv("DEFAULT_MODEL_PATH", Config.DEFAULT_MODEL_PATH)
    # SUDACHI_DICT_PATH is recorded only (SudachiPy resolves its own dict).
    sudachi_env = os.getenv("SUDACHI_DICT_PATH", getattr(Config, "SUDACHI_DICT_PATH", ""))
    if sudachi_env:
        Config.SUDACHI_DICT_PATH = sudachi_env
    # fugashi always follows the (possibly overridden) MeCab config.
    Config.FUGASHI_ARGS = f"-r {Config.MECAB_CONFIG_PATH}"
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# 環境変数から設定を読み込み
|
| 117 |
+
load_config_from_env()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# テスト関数
|
| 121 |
+
def test_config():
    """Manual smoke test: print config status, then exercise fugashi and AI."""
    print("=== Configテスト ===")

    # Show current settings and path-existence checks.
    Config.print_status()

    # fugashi tokenization check.
    try:
        import fugashi
        print(f"\n=== fugashiテスト ===")
        tagger = fugashi.GenericTagger(Config.get_fugashi_args())
        test_text = "こんにちは世界"
        tokens = tagger(test_text)
        print(f"テストテキスト: '{test_text}'")
        print(f"形態素: {[token.surface for token in tokens]}")
        print("✓ fugashi動作確認完了")
    except Exception as e:
        print(f"✗ fugashiテスト失敗: {e}")

    # AI class check (only when the default model file exists on disk).
    try:
        from ai import AI
        print(f"\n=== AIクラステスト ===")
        if os.path.exists(Config.get_default_model_path()):
            ai = AI(Config.get_default_model_path())
            tokens = ai.get_token_probabilities("こんにちは", k=3)
            print(f"テスト結果: {tokens}")
            print("✓ AIクラス動作確認完了")
        else:
            print("✗ デフォルトモデルが見つかりません")
    except Exception as e:
        print(f"✗ AIクラステスト失敗: {e}")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
if __name__ == "__main__":
|
| 157 |
+
test_config()
|
package/path_manager.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
パス管理モジュール
|
| 4 |
+
PyInstaller対応とサーバー起動の両方に対応
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
class PathManager:
    """Path manager (singleton).

    Resolves base/package/model paths for both PyInstaller builds and the
    development environment.
    """

    _instance: Optional["PathManager"] = None

    def __new__(cls):
        # Classic singleton: create once, mark uninitialized for __init__.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every PathManager() call; guard so setup runs once.
        if self._initialized:
            return

        self._initialized = True
        self._setup_paths()

    def _setup_paths(self):
        """Initialize path settings based on the runtime environment."""
        # Detect PyInstaller vs development environment.
        if getattr(sys, 'frozen', False):
            # PyInstaller build: files are unpacked under sys._MEIPASS.
            self.is_pyinstaller = True
            self.base_path = Path(sys._MEIPASS)
            self.package_path = self.base_path / 'package'
            self.python_executable = sys.executable
        else:
            # Development: project root is this file's grandparent directory.
            self.is_pyinstaller = False
            self.base_path = Path(__file__).parent.parent
            self.package_path = self.base_path / 'package'
            self.python_executable = sys.executable

        # Resolve the model path.
        self.model_path = self._get_model_path()

        # Log the resolved paths for debugging.
        print(f"[PATH] PyInstaller: {self.is_pyinstaller}")
        print(f"[PATH] Base path: {self.base_path}")
        print(f"[PATH] Package path: {self.package_path}")
        print(f"[PATH] Model path: {self.model_path}")

    def _get_model_path(self) -> str:
        """Resolve the model file path (env var > bundle layout > defaults)."""
        # Environment variable takes precedence (only when the file exists).
        env_model_path = os.getenv('LLM_MODEL_PATH')
        if env_model_path and os.path.exists(env_model_path):
            return env_model_path

        # Prefer locations relative to the distributed bundle.
        try:
            # Executable directory (PyInstaller: where the built server binary lives).
            executable_dir = Path(os.path.dirname(sys.executable)) if getattr(sys, 'frozen', False) else Path(__file__).parent.parent
            candidate_relative_paths = [
                # Distribution layout: <root>/models/<file> (exe assumed in <root>/python/).
                (executable_dir.parent / 'models' / 'llama-3.2-3b-instruct-q4_k_m.gguf'),
                # models/ directory next to the executable.
                (executable_dir / 'models' / 'llama-3.2-3b-instruct-q4_k_m.gguf'),
                # Under the MEIPASS temporary extraction directory.
                (Path(getattr(sys, '_MEIPASS', str(self.base_path))) / 'models' / 'llama-3.2-3b-instruct-q4_k_m.gguf'),
            ]
            for rel in candidate_relative_paths:
                rel_str = str(rel)
                if os.path.exists(rel_str):
                    return rel_str
        except Exception:
            pass

        # Well-known default locations, checked in order.
        default_paths = [
            os.path.expanduser("~/Documents/GitHub/LLMV_app_frontend/src-tauri/python/models/llama-3.2-3b-instruct-q4_k_m.gguf"),
            os.path.expanduser("~/Documents/models/llama-3.2-3b-instruct-q4_k_m.gguf"),
            os.path.expanduser("~/models/llama-3.2-3b-instruct-q4_k_m.gguf"),
            "/opt/models/llama-3.2-3b-instruct-q4_k_m.gguf"
        ]

        for path in default_paths:
            if os.path.exists(path):
                return path

        # Nothing found: return the first default (may not exist on disk).
        return default_paths[0]

    def get_package_path(self) -> str:
        """Return the package directory as a string."""
        return str(self.package_path)

    def get_model_path(self) -> str:
        """Return the resolved model path."""
        return self.model_path

    def get_python_executable(self) -> str:
        """Return the Python executable path."""
        return self.python_executable

    def setup_sys_path(self):
        """Prepend the package path (and a dev-only local path) to sys.path."""
        package_path_str = self.get_package_path()

        # Add the package path.
        if package_path_str not in sys.path:
            sys.path.insert(0, package_path_str)

        # Development only: hard-coded local checkout path.
        # NOTE(review): machine-specific path; consider making configurable.
        if not self.is_pyinstaller:
            conda_package_path = '/Users/wataru/Documents/AppProject/LLMView_Server/package'
            if os.path.exists(conda_package_path) and conda_package_path not in sys.path:
                sys.path.insert(0, conda_package_path)

        print(f"[PATH] sys.path configured: {len(sys.path)} paths")

    def get_server_config(self) -> dict:
        """Return server settings (Flask-style keyword dict)."""
        return {
            'host': '127.0.0.1',
            'port': 5000,
            'debug': False,
            'use_reloader': False,
            'threaded': True
        }

    def get_pyinstaller_spec_path(self) -> str:
        """Return the PyInstaller spec file path."""
        return str(self.base_path / 'tauri_python_server.spec')

    def get_dist_path(self) -> str:
        """Return the build output directory path."""
        return str(self.base_path / 'dist')

    def get_executable_name(self) -> str:
        """Return the built executable's name."""
        return 'tauri_python_server'
|
| 145 |
+
|
| 146 |
+
# グローバルインスタンス
|
| 147 |
+
path_manager = PathManager()
|
| 148 |
+
|
| 149 |
+
def get_path_manager() -> PathManager:
    """Return the module-level PathManager singleton instance."""
    return path_manager
|
package/rust_adapter.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional
|
| 2 |
+
from threading import Lock
|
| 3 |
+
|
| 4 |
+
from word_processor import WordDeterminer, WordPiece
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class RustAdapter:
    """
    Adapter intended to be called from Rust.
    - Expensive components (WordDeterminer, the AI model) are created once
      and kept resident.
    - Methods expose the build operations.
    - Return values are plain dict/list structures for easy serialization.
    """

    _instance: Optional["RustAdapter"] = None
    _lock: Lock = Lock()

    def __init__(self, model_path: Optional[str] = None):
        # WordDeterminer (internally uses the Sudachi mode-C WordCounter).
        self.determiner = WordDeterminer()

        # The AI model comes from the shared cache;
        # model_path=None selects the default model.
        from ai import AI

        self.model = AI.get_model(model_path)

    @classmethod
    def get_instance(cls, model_path: Optional[str] = None) -> "RustAdapter":
        """Return the singleton; model_path is honored on first call only."""
        if cls._instance is not None:
            return cls._instance
        # Re-check under the lock so concurrent first calls create only one.
        with cls._lock:
            if cls._instance is None:
                cls._instance = RustAdapter(model_path)
            return cls._instance

    # ===== Public API =====
    def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
        """
        Build the word tree and return the completed pieces as dicts.
        Each element: { text: str, probability: float }
        """
        pieces: List[WordPiece] = self.determiner.build_word_tree(
            prompt_text=prompt_text,
            root_text=root_text,
            model=self.model,
            top_k=top_k,
            max_depth=max_depth,
        )
        return [
            {"text": p.get_full_word(), "probability": float(p.probability)}
            for p in pieces
        ]

    def build_chat_prompt(self, user_content: str, system_content: str = "あなたは親切で役に立つAIアシスタントです。") -> str:
        """Return the chat prompt string."""
        return self.determiner.build_chat_prompt(user_content, system_content)

    def count_words(self, text: str) -> int:
        """Sudachi(C)-based word count.

        NOTE(review): reaches into the determiner's private ``_count_words``;
        consider exposing a public method on WordDeterminer.
        """
        return self.determiner._count_words(text)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Quick manual smoke test.
if __name__ == "__main__":
    adapter = RustAdapter.get_instance()
    prompt = "電球を作ったのは誰?"
    results = adapter.build_word_tree(prompt_text=prompt, root_text="", top_k=5, max_depth=5)
    print("=== RustAdapter 確認 ===")
    for i, r in enumerate(results, 1):
        print(f"{i}. {r['text']} ({r['probability']:.4f})")
|
package/word_counter.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
import fugashi
|
| 3 |
+
from config import Config
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
# SudachiPy があれば直接利用してモードCを使用
|
| 7 |
+
from sudachipy import dictionary as sudachi_dictionary
|
| 8 |
+
from sudachipy import tokenizer as sudachi_tokenizer
|
| 9 |
+
_SUDACHI_AVAILABLE = True
|
| 10 |
+
except Exception:
|
| 11 |
+
_SUDACHI_AVAILABLE = False
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class WordCounter:
    """Count words in Japanese text.

    Backend selection at construction time: an explicitly supplied
    tokenizer, else SudachiPy (SplitMode.C) when importable, else fugashi
    (MeCab/IPA) as a fallback.
    """

    def __init__(self, tokenizer: Any = None):
        """
        Initialize the counter.

        Args:
            tokenizer: callable fugashi-style tokenizer; when None, SudachiPy
                (SplitMode.C) is preferred, falling back to fugashi.
        """
        # Priority: explicit tokenizer > SudachiPy > fugashi (GenericTagger).
        self._use_sudachi = False
        self._sudachi_mode = None
        if tokenizer is not None:
            self.tokenizer = tokenizer
        elif _SUDACHI_AVAILABLE:
            # SudachiPy finds its bundled dictionary (sudachidict_core)
            # automatically; no external config needed. Use SplitMode.C.
            self._use_sudachi = True
            self.tokenizer = sudachi_dictionary.Dictionary().create()
            self._sudachi_mode = sudachi_tokenizer.Tokenizer.SplitMode.C
        else:
            # fugashi (MeCab) fallback.
            self.tokenizer = fugashi.GenericTagger(Config.get_fugashi_args())

    def count_words(self, text: str) -> int:
        """
        Count the words in ``text``.

        Args:
            text: text to count.

        Returns:
            int: morpheme count (whitespace-token count on tokenizer failure).
        """
        if not text:
            return 0

        try:
            # Tokenize with whichever backend was selected at construction.
            if self._use_sudachi:
                tokens = self.tokenizer.tokenize(text, self._sudachi_mode)
            else:
                tokens = self.tokenizer(text)
            return len(tokens)
        except Exception as e:
            # BUGFIX: message no longer claims "fugashi" — either backend
            # (SudachiPy or fugashi) may have raised here.
            print(f"単語数カウントエラー: {e}")
            # Fallback: split on whitespace.
            return len(text.split())

    def is_word_boundary(self, text: str, position: int) -> bool:
        """
        Decide whether ``position`` falls on a word boundary.

        Args:
            text: text to inspect.
            position: index (negative counts from the end; -1 is the last char).

        Returns:
            bool: True when the position is on a token boundary.
        """
        if not text:
            return True

        # Normalize negative indices.
        if position < 0:
            position = len(text) + position

        if position >= len(text):
            return True

        try:
            # Tokenize and collect surface forms from the active backend.
            if self._use_sudachi:
                tokens = self.tokenizer.tokenize(text, self._sudachi_mode)
                surfaces = [m.surface() for m in tokens]
            else:
                tokens = self.tokenizer(text)
                surfaces = [m.surface for m in tokens]

            # Walk token spans; a position strictly inside a token is not a
            # boundary, a position exactly at a token's end is.
            current_pos = 0
            for surface in surfaces:
                token_length = len(surface)
                if current_pos <= position < current_pos + token_length:
                    return False
                if position == current_pos + token_length:
                    return True
                current_pos += token_length

            return True

        except Exception as e:
            # BUGFIX: message no longer claims "fugashi" (see count_words).
            print(f"境界判定エラー: {e}")
            # Fallback: treat whitespace characters as boundaries.
            return position < len(text) and text[position].isspace()
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Manual test routine (runs only when this module is executed directly).
def test_word_counter():
    """Smoke test for WordCounter: word counting, boundary detection, and a
    Sudachi(SplitMode.C) vs fugashi(IPA) segmentation comparison.

    Prints results to stdout; requires fugashi (and optionally SudachiPy).
    """
    print("=== WordCounterテスト ===")

    try:
        counter = WordCounter()

        # Basic word-count test on incrementally growing strings.
        print("単語数カウントテスト:")
        test_texts = [
            "私はエ",
            "私はエジ",
            "私はエジソ",
            "私はエジソン",
            "私はエジソンで"
        ]

        for text in test_texts:
            word_count = counter.count_words(text)
            print(f"  '{text}' → {word_count}語")

        # Word-boundary test over every position of a sample sentence.
        print("\n単語境界テスト:")
        test_text = "私はエジソンで"
        print(f"  '{test_text}' の境界判定:")

        for i in range(len(test_text) + 1):
            is_boundary = counter.is_word_boundary(test_text, i)
            print(f"    位置{i}: {is_boundary}")

        # Negative-index test (positions counted from the end of the string).
        print("\n負のインデックステスト:")
        print(f"  '{test_text}' の負のインデックス境界判定:")
        for i in range(-len(test_text), 1):
            is_boundary = counter.is_word_boundary(test_text, i)
            print(f"    位置{i}: {is_boundary}")

        # Segmentation comparison: Sudachi(C) vs fugashi(IPA).
        print("\n分割比較: Sudachi(C) vs fugashi(IPA)")
        compare_texts = [
            "私はエジソンで有名な科学者です。",
            "電球を作ったのは誰?",
            "自然言語処理は面白い。",
            "電球を"
        ]

        # Set up each tokenizer independently of WordCounter's choice.
        sudachi_ok = False
        sudachi_tok = None
        sudachi_mode = None
        try:
            if _SUDACHI_AVAILABLE:
                sudachi_tok = sudachi_dictionary.Dictionary().create()
                sudachi_mode = sudachi_tokenizer.Tokenizer.SplitMode.C
                sudachi_ok = True
        except Exception:
            sudachi_ok = False

        ipa_tagger = fugashi.GenericTagger(Config.get_fugashi_args())

        for text in compare_texts:
            print(f"\n--- テキスト: {text}")
            # fugashi (IPA dictionary)
            try:
                ipa_tokens = ipa_tagger(text)
                ipa_surfaces = [t.surface for t in ipa_tokens]
                print(f"fugashi(IPA) {len(ipa_surfaces)}語: {' | '.join(ipa_surfaces)}")
            except Exception as e:
                print(f"fugashi(IPA) 解析失敗: {e}")

            # SudachiPy (SplitMode.C)
            if sudachi_ok:
                try:
                    s_tokens = sudachi_tok.tokenize(text, sudachi_mode)
                    s_surfaces = [m.surface() for m in s_tokens]
                    print(f"Sudachi(C) {len(s_surfaces)}語: {' | '.join(s_surfaces)}")
                except Exception as e:
                    print(f"Sudachi(C) 解析失敗: {e}")
            else:
                print("Sudachi(C) は利用不可(モジュール未検出)")

        print("\nテスト完了")

    except ImportError:
        print("fugashiがインストールされていません。pip install fugashi でインストールしてください。")
    except Exception as e:
        print(f"テストエラー: {e}")


if __name__ == "__main__":
    test_word_counter()
|
package/word_processor.py
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Any, Optional
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from enum import Enum
|
| 4 |
+
import os
|
| 5 |
+
import math
|
| 6 |
+
from word_counter import WordCounter
|
| 7 |
+
from config import Config
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class WordState(Enum):
    """Lifecycle state of a candidate word during streaming determination."""
    INCOMPLETE = "incomplete"  # word is still being extended
    COMPLETE = "complete"      # word boundary confirmed; candidate is done
    TRIGGER = "trigger"        # trigger (start of the next word)
|
| 15 |
+
|
| 16 |
+
class KList:
    """Bounded best-k list ordered by descending ``probability``.

    Holds at most ``num`` items. Items are any objects exposing a
    ``probability`` attribute and a ``get_full_text()`` method; candidates
    with identical full text are merged by summing their probabilities.
    """

    def __init__(self, num: int):
        # Maximum number of candidates retained after every mutation.
        self.num = num
        self.list: List[Any] = []

    def check_k(self) -> None:
        """Re-sort by probability (descending) and truncate to the top ``num``.

        The original code branched on the list length, but both branches
        sorted and truncation is a no-op for a short list, so a single
        unconditional path is equivalent.
        """
        self.list.sort(key=lambda item: item.probability, reverse=True)
        self.list = self.list[:self.num]

    def add(self, piece_word: Any) -> None:
        """Insert a candidate, merging probability into a duplicate if present."""
        new_text = piece_word.get_full_text()
        for existing_piece in self.list:
            if existing_piece.get_full_text() == new_text:
                # Same surface text: accumulate probability instead of
                # keeping two copies, then restore the order invariant.
                existing_piece.probability += piece_word.probability
                self.check_k()
                return

        # No duplicate: append and restore the order/size invariant.
        self.list.append(piece_word)
        self.check_k()

    def pop(self) -> Any:
        """Remove and return the highest-probability candidate.

        Raises:
            IndexError: if the list is empty.
        """
        if self.list:
            return self.list.pop(0)
        raise IndexError("List is empty")

    def empty(self) -> bool:
        """Return True when no candidates remain."""
        return len(self.list) == 0
|
| 50 |
+
|
| 51 |
+
@dataclass
class WordPiece:
    """A piece (fragment) of a word in the candidate tree.

    Pieces form a tree: the root holds the prompt/prefix text, and each
    child appends one token. Probabilities are cumulative along the path.
    """
    text: str                                              # surface text of this piece
    probability: float                                     # cumulative path probability
    next_tokens: Optional[List[Tuple[str, float]]] = None  # cached next-token candidates
    parent: Optional['WordPiece'] = None                   # parent piece (None for root)
    children: Optional[List['WordPiece']] = None           # child pieces (filled lazily)

    def __post_init__(self):
        # Avoid the shared-mutable-default pitfall: each instance gets its
        # own list. (Annotation fixed to Optional — the default is None.)
        if self.children is None:
            self.children = []

    def _texts_to_root(self) -> List[str]:
        """Collect non-empty piece texts from this node up to the root (leaf first)."""
        texts = []
        node = self
        while node is not None:
            if node.text:
                texts.append(node.text)
            node = node.parent
        return texts

    def get_full_text(self) -> str:
        """Return the concatenated text from the root down to this piece."""
        return "".join(reversed(self._texts_to_root()))

    def get_full_word(self) -> str:
        """Return the word from the root's children down to this piece.

        The root-most non-empty text (normally the root's prompt prefix)
        is excluded from the concatenation.
        """
        return "".join(reversed(self._texts_to_root()[:-1]))

    def add_child(self, text: str, probability: float, next_tokens: Optional[List[Tuple[str, float]]] = None) -> 'WordPiece':
        """Create a child piece, attach it to this node, and return it."""
        child = WordPiece(
            text=text,
            probability=probability,
            next_tokens=next_tokens,
            parent=self,
        )
        self.children.append(child)
        return child

    def is_leaf(self) -> bool:
        """Return True when this piece has no children."""
        return len(self.children) == 0

    def get_depth(self) -> int:
        """Return the number of edges between this piece and the root."""
        depth = 0
        node = self.parent
        while node is not None:
            depth += 1
            node = node.parent
        return depth
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class WordDeterminer:
    """Word determination system (real-time algorithm for streaming output).

    Decides, token by token, whether the text generated so far forms a
    complete word, by probing the model's next-token distribution.
    """

    def __init__(self, word_counter: WordCounter = None):
        """
        Initialize.

        Args:
            word_counter: WordCounter instance (a default one is created
                when None).
        """
        self.word_counter = word_counter or WordCounter()

    def is_boundary_char(self, char: str) -> bool:
        """Return True if *char* is a boundary character (whitespace or punctuation)."""
        if not char:
            return False

        # Whitespace is always a boundary.
        if char.isspace():
            return True

        # ASCII and Japanese punctuation treated as boundaries.
        punctuation = ",,..。!?!?:;;、\n\t"
        return char in punctuation

    def is_word_boundary(self, text: str, position: int) -> bool:
        """
        Delegate word-boundary detection to the WordCounter.

        Args:
            text: Text to inspect.
            position: Index to test (negative values count from the end).

        Returns:
            bool: True when *position* falls on a word boundary.
        """
        return self.word_counter.is_word_boundary(text, position)

    def check_word_completion(self, piece: WordPiece, root_count: int, model: Any = None) -> Tuple[WordState, Optional[Any]]:
        """
        Real-time word-determination check for streaming output.

        Strategy: weight each candidate next token by its probability and
        complete the word only when a large majority of the probability
        mass indicates a boundary (the word count would jump).

        Note: the original implementation also computed an entropy score
        and a low-ratio branch, but both fed an unconditional INCOMPLETE
        return — that dead code has been removed without behavior change.

        Args:
            piece: Piece to check.
            root_count: Word count of the root text.
            model: LLM model, used to fetch next tokens when not cached.

        Returns:
            Tuple[WordState, Optional[Any]]: (state, payload); payload is
            always None in the current implementation.
        """
        full_text = piece.get_full_text()

        # Lazily obtain next-token candidates from the model.
        if not piece.next_tokens:
            if model:
                piece.next_tokens = self._get_next_tokens_from_model(model, full_text)
            else:
                return (WordState.COMPLETE, None)

        if not piece.next_tokens:
            return (WordState.COMPLETE, None)

        # Sort by probability, defensively.
        sorted_tokens = sorted(piece.next_tokens, key=lambda x: x[1], reverse=True)

        # Bracket handling: an opening bracket keeps the word open, a closing
        # bracket closes it immediately. Guarded against an empty word, which
        # would previously have raised IndexError on [-1].
        word = piece.get_full_word()
        if word:
            if word[-1] in ["(","「","(","【","〈","《","[","{","⦅","《","[","{","⦅","《","[","{","⦅","《","[","{","⦅","《","[","{","⦅"]:
                return (WordState.INCOMPLETE, None)
            if word[-1] in [")","]","}","》","〉","》","]","}","⦆","》","]","}","⦆","》","]","}","⦆","》","]","}","⦆","》","]","}","⦆"]:
                return (WordState.COMPLETE, None)

        # Inspect the behaviour of every candidate token.
        # (The original `max(1, len(...))` slice was a no-op: the empty case
        # returned above, so all tokens are always inspected.)
        tokens = sorted_tokens

        boundary_prob = 0.0  # probability mass of boundary-indicating tokens
        total = sum(prob for _, prob in tokens)

        for token, prob in tokens:
            test_text = full_text + token
            # Count as a boundary only when appending the token adds MORE
            # than one word (raises cohesion of the produced chunks).
            if self._count_words(test_text) > root_count + 1:
                boundary_prob += prob

        # Decision: complete only when boundary mass clearly dominates.
        if total > 0 and boundary_prob / total > 0.85:
            return (WordState.COMPLETE, None)

        # Default: keep extending the word.
        return (WordState.INCOMPLETE, None)

    def _count_words(self, text: str) -> int:
        """
        Count the words in *text* via the WordCounter.

        Args:
            text: Text to count.

        Returns:
            int: Word (morpheme) count.
        """
        return self.word_counter.count_words(text)

    def _get_next_tokens_from_model(self, model: Any, text: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """
        Fetch next-token candidates from the resident AI model.

        Args:
            model: LLM model (a path string or a model object).
            text: Input text.
            top_k: Number of candidates to fetch.

        Returns:
            List[Tuple[str, float]]: (token, probability) pairs; empty list
            on any failure.
        """
        try:
            # Imported lazily to avoid a hard dependency at module load time.
            from ai import AI

            # Resolve the model path.
            if isinstance(model, str):
                model_path = model
            elif hasattr(model, 'model_path'):
                model_path = model.model_path
            else:
                # Fall back to the default model.
                model_path = None

            # Use the resident AI model.
            ai_model = AI.get_model(model_path)
            return ai_model.get_token_probabilities(text, top_k)

        except Exception as e:
            print(f"モデルからのトークン取得に失敗: {e}")

        return []

    def expand_piece(self, piece: WordPiece, model: Any = None) -> List[WordPiece]:
        """
        Expand a piece into child pieces, one per candidate next token.

        Cached ``next_tokens`` are preferred; otherwise the model is queried
        once and the result is cached on the piece.

        Args:
            piece: Piece to expand.
            model: LLM model (required when the piece has no cached tokens).

        Returns:
            List[WordPiece]: newly created children (may be empty).
        """
        full_text = piece.get_full_text()

        if piece.next_tokens:
            tokens = piece.next_tokens
        elif model:
            tokens = self._get_next_tokens_from_model(model, full_text)
            if tokens:
                piece.next_tokens = tokens
        else:
            print(f"[WORD_PROCESSOR_STREAMING] No model provided for expansion")
            tokens = []

        children = []
        for token, prob in tokens:
            # Skip empty-string tokens: they would never advance the text.
            if not token:
                continue
            children.append(piece.add_child(token, piece.probability * prob))
        return children

    def build_word_tree(self, prompt_text: str, root_text: str, model: Any, top_k: int = 5, max_depth: int = 10) -> List[WordPiece]:
        """
        Build the word tree via best-first search over token continuations.

        Args:
            prompt_text: User prompt (formatted into a chat prompt).
            root_text: Text already generated (appended to the root).
            model: LLM model.
            top_k: Number of completed candidates to return.
            max_depth: Maximum depth. NOTE(review): currently NOT enforced —
                the depth check was disabled upstream; parameter kept for
                interface compatibility.

        Returns:
            List[WordPiece]: completed word pieces with probabilities
            normalized to sum to 1.
        """
        # Root piece holds the full chat prompt plus the generated prefix.
        root = WordPiece(text=self.build_chat_prompt(prompt_text) + root_text, probability=1.0)

        # Bounded candidate list ordered by probability.
        candidates = KList(2 * top_k)
        completed = []
        iteration = 0
        max_iterations = 1000  # hard cap to guarantee termination

        for child in self.expand_piece(root, model):
            candidates.add(child)

        # Loop-invariant: the root text never changes, so count it once
        # (previously recomputed every iteration).
        root_count = self._count_words(root.get_full_text())

        while not candidates.empty() and iteration < max_iterations and len(completed) < top_k:
            iteration += 1

            # Take the highest-probability candidate.
            current = candidates.pop()

            # Check whether the candidate forms a complete word.
            state, payload = self.check_word_completion(current, root_count, model)

            if state == WordState.COMPLETE:
                completed.append(current)
            elif state == WordState.INCOMPLETE:
                children = self.expand_piece(current, model)
                if len(children) == 0:
                    # No children could be generated: treat as complete to
                    # avoid an infinite loop.
                    print(f"[WORD_PROCESSOR_STREAMING] No children generated for '{current.get_full_text()}', marking as COMPLETE")
                    completed.append(current)
                else:
                    for child in children:
                        candidates.add(child)

        # Normalize probabilities over the completed set.
        total_prob = sum(p.probability for p in completed)
        if total_prob > 0:
            for piece in completed:
                piece.probability = piece.probability / total_prob

        return completed[:top_k]

    def build_chat_prompt(self, user_content: str,
                          system_content: str = "あなたは親切で役に立つAIアシスタントです。簡潔な回答をしてください") -> str:
        """
        Build a Llama-3-style chat prompt.

        If *user_content* already contains chat headers (pre-formatted on
        the Rust side), it is returned unchanged. Otherwise it is wrapped
        in the legacy single-turn format for backward compatibility.
        """
        # Already formatted upstream: pass through untouched.
        if "<|start_header_id|>" in user_content or "<|eot_id|>" in user_content:
            return user_content

        # Legacy path: wrap a single user message.
        prompt_text = (
            f"<|begin_of_text|>"
            f"<|start_header_id|>system<|end_header_id|>\n"
            f"{system_content}\n<|eot_id|>"
            f"<|start_header_id|>user<|end_header_id|>\n"
            f"{user_content}\n<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n"
        )

        # Strip every leading BOS: llama-cpp adds one automatically, and a
        # duplicated <|begin_of_text|> would corrupt tokenization.
        BOS = "<|begin_of_text|>"
        s = prompt_text.lstrip()
        while s.startswith(BOS):
            s = s[len(BOS):]
        return s
|
| 420 |
+
|
| 421 |
+
if __name__ == "__main__":
    """WordDeterminerのテスト(ストリーミング版)"""
    # Manual smoke test for WordDeterminer (streaming version).
    print("=== WordDeterminerテスト(ストリーミング版) ===")

    try:
        # Initialize WordDeterminer (builds a WordCounter internally, which
        # requires SudachiPy or fugashi to be installed).
        determiner = WordDeterminer()

        # Prompt setup.
        prompt_text = "電球を作ったのは誰?"
        root_text = ""

        print(f"プロンプト: '{prompt_text}'")
        print(f"ルートテキスト: '{root_text}'")

        print("\nAIモデルテスト:")
        prompt_text = "電球を作ったのは誰?"
        root_text = "電球を作ったのは候補1:トマス"
        try:
            from ai import AI

            # Obtain the resident model.
            model = AI.get_model()
            print(f"モデル取得成功: {type(model)}")

            # Token-probability retrieval test.
            test_text = prompt_text
            tokens = model.get_token_probabilities(test_text, k=5)
            print(f"トークン確率 ({test_text}): {tokens}")

            # Word-tree construction test.
            print("\n単語ツリー構築テスト:")
            completed_pieces = determiner.build_word_tree(
                prompt_text=prompt_text,
                root_text=root_text,
                model=model,
                top_k=3,
                max_depth=5
            )

            print(f"完成したピース数: {len(completed_pieces)}")
            for i, piece in enumerate(completed_pieces):
                full_text = piece.get_full_text()
                print(f"  ピース{i+1}: '{full_text}' (確率: {piece.probability:.4f})")

        except Exception as e:
            print(f"AIモデルテスト失敗: {e}")

        # Word-count test.
        # NOTE(review): 'model' below is only bound if the try block above
        # succeeded; if it failed, this raises NameError, which is swallowed
        # by the outer except — confirm this is intended.
        print("\n単語数カウントテスト:")
        test_texts = [
            "電球",
            "電球を作った",
            "電球を作ったのは",
            "電球を作ったのは誰",
            "電球を作ったのは誰?"
        ]

        for text in test_texts:
            word_count = determiner._count_words(text)
            tokens = determiner._get_next_tokens_from_model(model, text)
            print(f"  '{text}' → {word_count}語: {tokens}")

        # Word-finalization test: a word is reported as "finalized" when
        # appending characters increases the morpheme count.
        print("\n単語確定テスト:")
        test_sequence = ["電球", "電球を", "電球を作", "電球を作った", "電球を作ったの", "電球を作ったのは"]
        prev_count = 0

        for text in test_sequence:
            current_count = determiner._count_words(text)
            if current_count > prev_count:
                print(f"  '{text}' → {current_count}語 (確定!)")
                prev_count = current_count
            else:
                print(f"  '{text}' → {current_count}語 (継続)")

        # Boundary-character test.
        print("\n境界文字テスト:")
        test_chars = [" ", "?", "、", "。", "a", "1"]
        for char in test_chars:
            is_boundary = determiner.is_boundary_char(char)
            print(f"  '{char}': {is_boundary}")

        # Piece-construction test (pure data-structure check, no model).
        print("\nピース作成テスト:")
        root = WordPiece(text="電球", probability=1.0)
        child1 = root.add_child("を", 0.6)
        child2 = root.add_child("の", 0.3)

        print(f"ルートテキスト: {root.get_full_text()}")
        print(f"子1テキスト: {child1.get_full_text()}")
        print(f"子2テキスト: {child2.get_full_text()}")

        print("\nテスト完了")

    except ImportError as e:
        print(f"必要なライブラリがインストールされていません: {e}")
    except Exception as e:
        print(f"テストエラー: {e}")
|
requirements.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Tauriアプリ用Python依存関係
|
| 2 |
+
|
| 3 |
+
# Web API
|
| 4 |
+
flask>=2.0.0
|
| 5 |
+
flask-cors>=3.0.0
|
| 6 |
+
|
| 7 |
+
# 形態素解析
|
| 8 |
+
fugashi>=1.3.0
|
| 9 |
+
sudachipy>=0.6.7
|
| 10 |
+
sudachidict-core>=20240125
|
| 11 |
+
|
| 12 |
+
# AI/LLM
|
| 13 |
+
llama-cpp-python>=0.2.0
|
| 14 |
+
|
| 15 |
+
# UI
|
| 16 |
+
gradio>=4.38.0
|
| 17 |
+
|
| 18 |
+
# API
|
| 19 |
+
fastapi>=0.111.0
|
| 20 |
+
uvicorn>=0.30.0
|
| 21 |
+
|
| 22 |
+
# その他
|
| 23 |
+
typing-extensions>=4.0.0
|