Spaces:

WatNeru
/

LLMView_multi_model

Sleeping

File size: 3,986 Bytes

0447f30

from typing import Any
import fugashi
from .config import Config

try:
    # Optional dependency: when SudachiPy imports successfully, WordCounter
    # uses it directly with split mode C.
    from sudachipy import dictionary as sudachi_dictionary
    from sudachipy import tokenizer as sudachi_tokenizer
    _SUDACHI_AVAILABLE = True
except Exception:
    # Any failure while importing (not only ImportError) disables the
    # Sudachi path; WordCounter then falls back to fugashi.
    _SUDACHI_AVAILABLE = False


class WordCounter:
    """Count words and test word boundaries with a morphological tokenizer.

    Backend priority: an explicitly supplied tokenizer, then SudachiPy with
    ``SplitMode.C`` (when it imported successfully), then fugashi's
    ``GenericTagger``.
    """

    def __init__(self, tokenizer: Any = None):
        """Select the tokenizer backend.

        Args:
            tokenizer: Optional custom tokenizer. It is invoked as
                ``tokenizer(text)`` (the fugashi calling convention) and must
                return a sized iterable of objects exposing a ``surface``
                attribute. When ``None``, a default backend is created.
        """
        # Priority: explicit tokenizer > SudachiPy > fugashi (GenericTagger).
        self._use_sudachi = False
        self._sudachi_mode = None
        if tokenizer is not None:
            self.tokenizer = tokenizer
        elif _SUDACHI_AVAILABLE:
            # Dictionary() is created without arguments, relying on
            # SudachiPy's default dictionary lookup (per the original author,
            # the bundled sudachidict_core), so no external configuration is
            # needed. Split mode C is used for every call.
            self._use_sudachi = True
            self.tokenizer = sudachi_dictionary.Dictionary().create()
            self._sudachi_mode = sudachi_tokenizer.Tokenizer.SplitMode.C
        else:
            # fugashi (MeCab) fallback.
            fugashi_args = Config.get_fugashi_args()
            if fugashi_args:
                self.tokenizer = fugashi.GenericTagger(fugashi_args)
            else:
                # No configured arguments: use fugashi's default settings.
                self.tokenizer = fugashi.GenericTagger()

    def _tokenize(self, text: str) -> Any:
        """Run the active backend on *text* and return its token sequence."""
        if self._use_sudachi:
            return self.tokenizer.tokenize(text, self._sudachi_mode)
        return self.tokenizer(text)

    def _surfaces(self, text: str) -> list:
        """Return the surface string of every token produced for *text*."""
        tokens = self._tokenize(text)
        if self._use_sudachi:
            # SudachiPy morphemes expose the surface through a method ...
            return [m.surface() for m in tokens]
        # ... while fugashi-style nodes expose it as an attribute.
        return [m.surface for m in tokens]

    def count_words(self, text: str) -> int:
        """Return the number of tokens the backend produces for *text*.

        Args:
            text: Text to count.

        Returns:
            The token count; 0 for empty text. If the tokenizer raises, the
            error is printed and the count of whitespace-separated chunks is
            returned instead.
        """
        if not text:
            return 0

        try:
            return len(self._tokenize(text))
        except Exception as e:
            print(f"単語数カウントエラー: {e}")
            # Best-effort fallback: split on whitespace.
            return len(text.split())

    def is_word_boundary(self, text: str, position: int) -> bool:
        """Tell whether *position* sits on a boundary between tokens.

        A position is a boundary when it is the index right after a token
        ends (equivalently, the start of the following token), when it falls
        on a character the tokenizer skipped (such as whitespace dropped by
        MeCab), or when it lies outside the text. Index 0 is not reported as
        a boundary when a token starts there.

        Args:
            text: Text to analyse.
            position: Character index into *text*. Negative values count
                from the end, so -1 refers to the last character.

        Returns:
            True if the position is a word boundary, False if it lies inside
            a token. If the tokenizer raises, the error is printed and the
            result is whether the character at *position* is whitespace.
        """
        if not text:
            return True

        # Normalise a negative index to its positive equivalent.
        if position < 0:
            position += len(text)

        # Anything outside the text counts as a boundary. Rejecting leftover
        # negative values here also keeps the fallback below from indexing
        # out of range.
        if position < 0 or position >= len(text):
            return True

        try:
            cursor = 0
            for surface in self._surfaces(text):
                # Locate the token in the original text instead of summing
                # surface lengths: backends that drop whitespace would
                # otherwise make every later offset drift.
                start = text.find(surface, cursor)
                if start < 0:
                    # Surface is not a verbatim substring; assume it is
                    # contiguous with the previous token.
                    start = cursor
                end = start + len(surface)

                if position < start:
                    # Inside a gap the tokenizer skipped.
                    return True
                if position < end:
                    # Inside this token. Only its first character counts as
                    # a boundary, and only when something precedes it.
                    return position == start and start > 0
                if position == end:
                    return True
                cursor = end

            # Past the last token (e.g. trailing skipped characters).
            return True

        except Exception as e:
            print(f"境界判定エラー: {e}")
            # Best-effort fallback: treat whitespace as the boundary.
            return text[position].isspace()