Spaces:

leilaghomashchi
/

chunking_test

Build error

File size: 6,552 Bytes

dfbf6c3

"""
🔧 توابع کمکی عمومی
Utility functions for text processing
"""

import re
import logging
from typing import List

logger = logging.getLogger(__name__)


def count_tokens(text: str, method: str = 'simple') -> int:
    """
    Estimate the number of tokens in ``text``.

    Args:
        text: Input text.
        method: Counting strategy. ``'simple'`` applies a length-based
            heuristic (one token per four characters, never less than one).
            ``'accurate'`` encodes the text with tiktoken's ``cl100k_base``
            encoding and falls back to the heuristic when tiktoken cannot
            be imported or raises while encoding.

    Returns:
        The token count, or 0 when ``text`` is empty or whitespace-only.

    Raises:
        ValueError: If ``method`` is neither ``'simple'`` nor ``'accurate'``.
            Blank text returns 0 before the method is inspected.

    Examples:
        >>> count_tokens("این یک متن تست است")
        4
        >>> count_tokens("   ")
        0
    """
    if not text or not text.strip():
        return 0

    if method not in ('simple', 'accurate'):
        raise ValueError(f"Invalid method: {method}. Use 'simple' or 'accurate'")

    # Rough heuristic: 1 token ~ 4 characters, with a floor of 1 for any
    # non-blank text. It also serves as the fallback for 'accurate'.
    # NOTE(review): the original comment states this ratio works for both
    # Persian and English; that has not been verified here.
    estimate = max(1, len(text) // 4)

    if method == 'simple':
        return estimate

    try:
        # Imported lazily so tiktoken stays an optional dependency.
        import tiktoken
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except ImportError:
        logger.warning("⚠️ tiktoken not installed, falling back to simple method")
    except Exception as e:
        # Deliberate best-effort: any tokenizer failure degrades to the estimate.
        logger.error(f"❌ Error in tiktoken: {e}")

    return estimate


def should_use_chunking(text: str, threshold: int = 6000) -> bool:
    """
    Decide whether ``text`` is long enough to require chunking.

    Args:
        text: Input text.
        threshold: Token limit; defaults to 6000.

    Returns:
        True when the token count reported by ``count_tokens`` (default
        method) is strictly greater than ``threshold``; False otherwise,
        including for empty or whitespace-only text.

    Examples:
        >>> should_use_chunking("متن کوتاه", threshold=100)
        False
    """
    # Blank input never needs chunking; return before counting or logging.
    if not (text and text.strip()):
        return False

    token_count = count_tokens(text)
    over_limit = token_count > threshold

    # Build the outcome-specific message first, then log exactly once.
    if over_limit:
        message = (
            f"📊 متن بلند تشخیص داده شد: {token_count} tokens > {threshold} "
            f"→ استفاده از chunking"
        )
    else:
        message = (
            f"📊 متن کوتاه: {token_count} tokens ≤ {threshold} "
            f"→ بدون chunking"
        )
    logger.info(message)

    return True if over_limit else False


def split_sentences(text: str) -> List[str]:
    """
    Split text into sentences on terminal punctuation.

    A sentence ends at one of ``.``, ``!``, ``?`` or ``؟`` when that mark is
    followed by whitespace or is the last character of the text. A mark
    followed directly by a non-space character (for example the dot in
    ``3.14``) does not split. Surrounding whitespace and any trailing
    terminators are removed from every returned sentence, including the
    last one, and pieces that end up empty are dropped.

    Args:
        text: Input text.

    Returns:
        The sentences in their original order; an empty list for empty or
        whitespace-only text.

    Examples:
        >>> split_sentences("جمله اول. جمله دوم؟ جمله سوم!")
        ['جمله اول', 'جمله دوم', 'جمله سوم']
    """
    if not text or not text.strip():
        return []

    terminators = '.!?؟'

    # Split on a terminator followed by whitespace OR by end-of-text, so a
    # final sentence with no trailing space loses its terminator as well.
    pieces = re.split(r'[.!?؟](?:\s+|$)', text)

    # Only the last mark of a run such as "?!" or "..." is consumed by the
    # split; strip whatever is left so all sentences are normalised alike.
    trimmed = (piece.strip().rstrip(terminators).rstrip() for piece in pieces)

    return [sentence for sentence in trimmed if sentence]


def get_last_n_tokens(text: str, n: int) -> str:
    """
    Return the tail of ``text`` covering roughly the last ``n`` tokens.

    Intended for building the overlap between consecutive chunks. The cut
    is character-based (four characters per token), so it may begin in the
    middle of a word.

    Args:
        text: Input text.
        n: Desired number of tokens.

    Returns:
        The last ``n * 4`` characters of ``text``; the whole text when it is
        not longer than that; an empty string when ``text`` is empty or
        ``n`` is not positive.

    Examples:
        >>> get_last_n_tokens("این یک متن بلند است", 2)
        'بلند است'
    """
    if not text or n <= 0:
        return ""

    # Approximation: each token ~ 4 characters.
    approx_chars = n * 4

    # Short inputs are returned untouched rather than sliced.
    return text if len(text) <= approx_chars else text[-approx_chars:]


def get_first_n_tokens(text: str, n: int) -> str:
    """
    Return the head of ``text`` covering roughly the first ``n`` tokens.

    Args:
        text: Input text.
        n: Desired number of tokens.

    Returns:
        The first ``n * 4`` characters of ``text``; the whole text when it
        is not longer than that; an empty string when ``text`` is empty or
        ``n`` is not positive.
    """
    if not text or n <= 0:
        return ""

    # Approximation: each token ~ 4 characters.
    char_budget = n * 4

    # Slice only when the text actually exceeds the budget.
    if len(text) > char_budget:
        return text[:char_budget]
    return text


def clean_text(text: str) -> str:
    """
    Perform basic whitespace normalisation on ``text``.

    Args:
        text: Input text.

    Returns:
        The text with every run of whitespace collapsed to a single space
        and leading/trailing whitespace removed; an empty string for falsy
        input.
    """
    if text:
        # Collapse whitespace runs first, then trim both ends.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    return ""


# ✅ Quick smoke tests (executed only when the module is run as a script)
if __name__ == "__main__":
    banner = "=" * 60

    print(banner)
    print("🧪 Testing Utils Module")
    print(banner)

    # Test 1: token counting
    test_text = "این یک متن تست برای بررسی تعداد توکن‌ها است."
    tokens = count_tokens(test_text)
    print("\n📊 Test 1: Token Counting")
    print(f"  Text: {test_text}")
    print(f"  Tokens: {tokens}")

    # Test 2: chunking decision for a short and a long input
    short_text = "متن کوتاه"
    long_text = "متن بلند " * 1000

    print("\n📊 Test 2: Chunking Decision")
    print(f"  Short text ({count_tokens(short_text)} tokens): {should_use_chunking(short_text)}")
    print(f"  Long text ({count_tokens(long_text)} tokens): {should_use_chunking(long_text)}")

    # Test 3: sentence splitting
    multi_sentence = "جمله اول. جمله دوم؟ جمله سوم! چطور هستید؟"
    sentences = split_sentences(multi_sentence)
    print("\n📊 Test 3: Sentence Splitting")
    print(f"  Input: {multi_sentence}")
    print(f"  Sentences: {sentences}")
    print(f"  Count: {len(sentences)}")

    # Test 4: tail extraction (the overlap helper)
    sample_text = "این یک متن نمونه برای تست است که می‌خواهیم بخش آخر آن را بگیریم"
    last_part = get_last_n_tokens(sample_text, 5)
    print("\n📊 Test 4: Get Last N Tokens")
    print(f"  Original: {sample_text}")
    print(f"  Last 5 tokens (~20 chars): {last_part}")

    print("\n" + banner)
    print("✅ All tests completed!")
    print(banner)