Spaces:

WatNeru
/

LLMView_multi_model

Sleeping

App Files Files Community

LLMView_multi_model / package /word_counter.py

WatNeru

first commit

0447f30 4 months ago

raw

history blame contribute delete

3.99 kB

	from typing import Any
	import fugashi
	from .config import Config

	try:
	# Use SudachiPy directly (split mode C) when it is installed
	from sudachipy import dictionary as sudachi_dictionary
	from sudachipy import tokenizer as sudachi_tokenizer
	_SUDACHI_AVAILABLE = True
	except Exception:
	_SUDACHI_AVAILABLE = False


	class WordCounter:
	    """Count words and test word boundaries using a morphological tokenizer.

	    Backend priority: an explicitly supplied ``tokenizer`` argument, then
	    SudachiPy in split mode C when its import succeeded, then fugashi's
	    ``GenericTagger``.
	    """

	    def __init__(self, tokenizer: Any = None):
	        """Select the tokenizer backend.

	        Args:
	            tokenizer: Optional fugashi-style tagger, i.e. a callable that
	                takes the text and returns a sized sequence of nodes with a
	                ``surface`` attribute. When ``None``, SudachiPy is used if
	                available, otherwise a fugashi ``GenericTagger`` is built.
	        """
	        self._use_sudachi = False
	        self._sudachi_mode = None
	        if tokenizer is not None:
	            self.tokenizer = tokenizer
	        elif _SUDACHI_AVAILABLE:
	            # Dictionary() is created without arguments; per the original
	            # author's note it picks up the bundled sudachidict_core, so no
	            # external configuration is needed. Split mode C is used.
	            self._use_sudachi = True
	            self.tokenizer = sudachi_dictionary.Dictionary().create()
	            self._sudachi_mode = sudachi_tokenizer.Tokenizer.SplitMode.C
	        else:
	            # fugashi (MeCab) fallback; with no configured arguments the
	            # tagger is built with its defaults.
	            fugashi_args = Config.get_fugashi_args()
	            if fugashi_args:
	                self.tokenizer = fugashi.GenericTagger(fugashi_args)
	            else:
	                self.tokenizer = fugashi.GenericTagger()

	    def _tokenize(self, text: str) -> Any:
	        """Run the active backend on ``text`` and return its token sequence."""
	        if self._use_sudachi:
	            return self.tokenizer.tokenize(text, self._sudachi_mode)
	        return self.tokenizer(text)

	    def _surfaces(self, text: str) -> list:
	        """Return the surface string of every token of ``text``, in order."""
	        tokens = self._tokenize(text)
	        if self._use_sudachi:
	            # SudachiPy morphemes expose the surface as a method.
	            return [m.surface() for m in tokens]
	        # fugashi nodes expose the surface as an attribute.
	        return [m.surface for m in tokens]

	    @staticmethod
	    def _token_spans(text: str, surfaces: list) -> list:
	        """Map each surface to its ``(start, end)`` character span in ``text``.

	        fugashi/MeCab surfaces do not contain the whitespace that separates
	        tokens, so summing surface lengths drifts away from the real offsets
	        as soon as the text contains a space. Whitespace is skipped here
	        until the surface actually matches; a surface that is itself
	        whitespace (as SudachiPy emits) matches immediately and is not
	        skipped.
	        """
	        spans = []
	        cursor = 0
	        size = len(text)
	        for surface in surfaces:
	            start = cursor
	            while (
	                start < size
	                and text[start].isspace()
	                and not text.startswith(surface, start)
	            ):
	                start += 1
	            end = start + len(surface)
	            spans.append((start, end))
	            cursor = end
	        return spans

	    def count_words(self, text: str) -> int:
	        """Return the number of tokens the active backend produces for ``text``.

	        Args:
	            text: Text to count.

	        Returns:
	            int: Token count; ``0`` for empty text. If the tokenizer raises,
	            the error is printed and the count of whitespace-separated
	            chunks is returned instead.
	        """
	        if not text:
	            return 0

	        try:
	            return len(self._tokenize(text))
	        except Exception as e:
	            # Deliberate best-effort: never let a tokenizer failure propagate.
	            print(f"単語数カウントエラー: {e}")
	            return len(text.split())

	    def is_word_boundary(self, text: str, position: int) -> bool:
	        """Tell whether ``position`` falls on a word boundary of ``text``.

	        A position is a boundary when a token ends exactly there, when a
	        token starts there right after whitespace the tokenizer skipped, or
	        when it lies in such skipped whitespace. The start of a token that
	        begins at offset 0 and any position strictly inside a token are not
	        boundaries.

	        Args:
	            text: Text to inspect.
	            position: Character index; negative values count from the end
	                (``-1`` is the last character).

	        Returns:
	            bool: ``True`` on a boundary. Empty text and positions outside
	            the text always yield ``True``. If the tokenizer raises, the
	            error is printed and the result is whether the character at
	            ``position`` is whitespace.
	        """
	        if not text:
	            return True

	        # Normalise a negative index to its positive equivalent.
	        if position < 0:
	            position += len(text)

	        # Outside the text on either side. Handling the still-negative case
	        # here keeps the fallback below from indexing out of range.
	        if position < 0 or position >= len(text):
	            return True

	        try:
	            spans = self._token_spans(text, self._surfaces(text))
	        except Exception as e:
	            print(f"境界判定エラー: {e}")
	            # Fallback: decide by whitespace only.
	            return text[position].isspace()

	        prev_end = 0
	        for start, end in spans:
	            if position < start:
	                # Inside whitespace that the tokenizer dropped.
	                return True
	            if position == start and start > prev_end:
	                # First character of a token that follows dropped whitespace.
	                return True
	            if position < end:
	                # At or inside the token (start <= position < end).
	                return False
	            if position == end:
	                return True
	            prev_end = end

	        # Past the last token (e.g. trailing whitespace).
	        return True