Spaces:

leilaghomashchi
/

chunking_test

Build error

App Files Files Community

chunking_test / modules /utils.py

leilaghomashchi

Upload 5 files

dfbf6c3 verified about 2 months ago

raw

history blame contribute delete

6.55 kB

	"""
	🔧 توابع کمکی عمومی
	Utility functions for text processing
	"""

	import re
	import logging
	from typing import List

	logger = logging.getLogger(__name__)


	def count_tokens(text: str, method: str = 'simple') -> int:
	    """
	    Estimate the number of tokens in ``text``.

	    Args:
	        text: Input text. Empty or whitespace-only text counts as 0 tokens.
	        method: ``'simple'`` estimates one token per 4 characters (at
	            least 1 for non-blank text). ``'accurate'`` encodes the text
	            with tiktoken's ``cl100k_base`` encoding and falls back to the
	            simple estimate when tiktoken is not installed or fails.

	    Returns:
	        The (estimated) number of tokens.

	    Raises:
	        ValueError: If ``method`` is neither ``'simple'`` nor
	            ``'accurate'``. Blank text returns 0 before the method is
	            checked.

	    Examples:
	        >>> count_tokens("این یک متن تست است")
	        4
	    """
	    if not text or not text.strip():
	        return 0

	    if method not in ('simple', 'accurate'):
	        raise ValueError(f"Invalid method: {method}. Use 'simple' or 'accurate'")

	    # Character-based heuristic (1 token ~ 4 characters); it is both the
	    # 'simple' result and the fallback for every failure of 'accurate'.
	    estimate = max(1, len(text) // 4)
	    if method == 'simple':
	        return estimate

	    try:
	        import tiktoken
	    except ImportError:
	        logger.warning("⚠️ tiktoken not installed, falling back to simple method")
	        return estimate

	    try:
	        encoding = tiktoken.get_encoding("cl100k_base")
	        # By default encode() raises ValueError when the text contains a
	        # special-token string such as "<|endoftext|>". We only need a
	        # count, so encode such strings as ordinary text instead of
	        # silently dropping to the crude estimate.
	        return len(encoding.encode(text, disallowed_special=()))
	    except Exception as e:
	        # Best effort: counting must never break the caller.
	        logger.error(f"❌ Error in tiktoken: {e}")
	        return estimate


	def should_use_chunking(text: str, threshold: int = 6000) -> bool:
	    """
	    Decide whether ``text`` is long enough to require chunking.

	    Args:
	        text: Input text.
	        threshold: Token limit; defaults to 6000.

	    Returns:
	        True when the token count reported by ``count_tokens`` (default
	        method) is strictly greater than ``threshold``. False otherwise,
	        including for empty or whitespace-only text, which is rejected
	        before any counting or logging happens.

	    Examples:
	        >>> should_use_chunking("متن کوتاه", threshold=100)
	        False
	    """
	    is_blank = not text or not text.strip()
	    if is_blank:
	        return False

	    estimated = count_tokens(text)
	    needs_chunking = estimated > threshold

	    # Log the decision either way so the chosen path is traceable.
	    if needs_chunking:
	        logger.info(
	            f"📊 متن بلند تشخیص داده شد: {estimated} tokens > {threshold} "
	            f"→ استفاده از chunking"
	        )
	    else:
	        logger.info(
	            f"📊 متن کوتاه: {estimated} tokens ≤ {threshold} "
	            f"→ بدون chunking"
	        )
	    return needs_chunking


	# A run of sentence terminators (. ! ? or the Arabic-script question mark ؟)
	# followed by whitespace or by the end of the text.
	_SENTENCE_END_RE = re.compile(r'[.!?؟]+(?:\s+|$)')


	def split_sentences(text: str) -> List[str]:
	    """
	    Split text into sentences.

	    A sentence ends at one or more terminators (``.``, ``!``, ``?``, ``؟``)
	    followed by whitespace or by the end of the text. Terminators are
	    removed from the returned sentences, including the one closing the
	    final sentence. A terminator followed directly by another character
	    (for example the dot in ``3.14``) does not end a sentence.

	    Args:
	        text: Input text.

	    Returns:
	        The stripped, non-empty sentences in their original order; an
	        empty list for empty or whitespace-only input.

	    Examples:
	        >>> split_sentences("جمله اول. جمله دوم؟ جمله سوم!")
	        ['جمله اول', 'جمله دوم', 'جمله سوم']
	    """
	    if not text or not text.strip():
	        return []

	    pieces = _SENTENCE_END_RE.split(text)

	    # Splitting at a terminator that ends the text leaves a trailing empty
	    # piece; drop it together with any other blank fragments.
	    stripped = (piece.strip() for piece in pieces)
	    return [sentence for sentence in stripped if sentence]


	def get_last_n_tokens(text: str, n: int) -> str:
	    """
	    Return the tail of ``text`` covering roughly ``n`` tokens.

	    Intended for building the overlap between consecutive chunks. Tokens
	    are approximated as 4 characters each, so the result is simply the
	    last ``n * 4`` characters and may start in the middle of a word.

	    Args:
	        text: Input text.
	        n: Desired number of tokens.

	    Returns:
	        The last ``n * 4`` characters of ``text``, the whole text when it
	        is not longer than that, or ``""`` when ``text`` is empty or
	        ``n`` is not positive.

	    Examples:
	        >>> get_last_n_tokens("این یک متن بلند است", 2)
	        'بلند است'
	    """
	    if not text or n <= 0:
	        return ""

	    # Approximation: one token ~ 4 characters
	    tail_len = n * 4
	    return text if len(text) <= tail_len else text[-tail_len:]


	def get_first_n_tokens(text: str, n: int) -> str:
	    """
	    Return the head of ``text`` covering roughly ``n`` tokens.

	    Tokens are approximated as 4 characters each, so the result is simply
	    the first ``n * 4`` characters and may end in the middle of a word.

	    Args:
	        text: Input text.
	        n: Desired number of tokens.

	    Returns:
	        The first ``n * 4`` characters of ``text``, the whole text when it
	        is not longer than that, or ``""`` when ``text`` is empty or
	        ``n`` is not positive.
	    """
	    if not text or n <= 0:
	        return ""

	    # Approximation: one token ~ 4 characters
	    head_len = n * 4
	    return text if len(text) <= head_len else text[:head_len]


	def clean_text(text: str) -> str:
	    """
	    Apply basic whitespace normalisation to ``text``.

	    Args:
	        text: Input text.

	    Returns:
	        ``text`` with every run of whitespace collapsed to a single space
	        and leading/trailing whitespace removed; ``""`` for empty input.
	    """
	    if text:
	        # Collapse whitespace runs first, then trim both ends.
	        collapsed = re.sub(r'\s+', ' ', text)
	        return collapsed.strip()
	    return ""


	def _run_smoke_tests() -> None:
	    """Print a quick manual demonstration of this module's helpers."""
	    banner = "=" * 60
	    print(banner)
	    print("🧪 Testing Utils Module")
	    print(banner)

	    # Test 1: token counting
	    test_text = "این یک متن تست برای بررسی تعداد توکن‌ها است."
	    tokens = count_tokens(test_text)
	    print("\n📊 Test 1: Token Counting")
	    print(f" Text: {test_text}")
	    print(f" Tokens: {tokens}")

	    # Test 2: chunking decision
	    short_text = "متن کوتاه"
	    # 9 characters * 3000 = 27,000 characters, i.e. about 6,750 estimated
	    # tokens, which is above the default threshold of 6,000. The previous
	    # repeat count of 1000 gave only about 2,250 tokens, so the "long"
	    # case never triggered chunking.
	    long_text = "متن بلند " * 3000

	    print("\n📊 Test 2: Chunking Decision")
	    print(f" Short text ({count_tokens(short_text)} tokens): {should_use_chunking(short_text)}")
	    print(f" Long text ({count_tokens(long_text)} tokens): {should_use_chunking(long_text)}")

	    # Test 3: sentence splitting
	    multi_sentence = "جمله اول. جمله دوم؟ جمله سوم! چطور هستید؟"
	    sentences = split_sentences(multi_sentence)
	    print("\n📊 Test 3: Sentence Splitting")
	    print(f" Input: {multi_sentence}")
	    print(f" Sentences: {sentences}")
	    print(f" Count: {len(sentences)}")

	    # Test 4: last n tokens
	    sample_text = "این یک متن نمونه برای تست است که می‌خواهیم بخش آخر آن را بگیریم"
	    last_part = get_last_n_tokens(sample_text, 5)
	    print("\n📊 Test 4: Get Last N Tokens")
	    print(f" Original: {sample_text}")
	    print(f" Last 5 tokens (~20 chars): {last_part}")

	    print("\n" + banner)
	    print("✅ All tests completed!")
	    print(banner)


	# ✅ Quick smoke tests, run only when the module is executed as a script
	if __name__ == "__main__":
	    _run_smoke_tests()