Spaces:

wiizm
/

soyailabs

Running on CPU Upgrade

File size: 6,099 Bytes

730c79f

"""

텍스트 처리 유틸리티 함수

"""

import re
from typing import List, Optional

from app.core.logger import get_logger

# Module-level logger, created through the project's logging helper and keyed
# by this module's import name.
logger = get_logger(__name__)


def clean_text(text: str) -> str:
    """
    Normalize the whitespace of a piece of text.

    Args:
        text: Text to tidy up.

    Returns:
        The text with every run of whitespace collapsed to a single space
        and leading/trailing whitespace removed. Falsy input (empty string
        or None) yields an empty string.
    """
    if text:
        # Collapse whitespace runs first, then trim both ends.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    return ''


def _split_paragraph_into_sentences(paragraph: str) -> List[str]:
    """Split one paragraph into sentences, keeping end punctuation attached.

    A sentence ends at a run of ``.``, ``!`` or ``?`` that is followed by
    whitespace or by the end of the paragraph. A paragraph without any such
    terminator comes back as a single sentence.
    """
    sentences: List[str] = []
    pending = ""

    # The capturing group makes re.split return the punctuation runs as their
    # own items, alternating with the text between them.
    for piece in re.split(r'([.!?]+)(?=\s+|$)', paragraph):
        if not piece.strip():
            continue
        pending += piece
        if re.match(r'^[.!?]+$', piece):
            # A punctuation run closes the sentence collected so far.
            sentences.append(pending.strip())
            pending = ""

    # Trailing text that was never closed by punctuation.
    if pending.strip():
        sentences.append(pending.strip())

    if not sentences and paragraph.strip():
        sentences.append(paragraph.strip())

    return sentences


def _overlap_tail(sentences: List[str], overlap: int) -> List[str]:
    """Return the longest run of trailing sentences whose joined length fits in ``overlap``."""
    tail: List[str] = []
    tail_size = 0  # joined length of `tail` plus one separator per sentence
    for sentence in reversed(sentences):
        if tail_size + len(sentence) > overlap:
            break
        tail.insert(0, sentence)
        tail_size += len(sentence) + 1
    return tail


def _flush_chunk(
    chunks: List[str],
    sentences: List[str],
    carried: int,
    min_chunk_size: int,
) -> None:
    """Store ``sentences`` as a new chunk, or merge them into the previous chunk if too small.

    The first ``carried`` sentences are overlap copied from the end of
    ``chunks[-1]``; they are skipped when merging so the merged chunk does not
    contain the same text twice.
    """
    chunk_text = '\n'.join(sentences)
    if chunks and len(chunk_text.strip()) < min_chunk_size:
        fresh = sentences[carried:]
        if fresh:
            chunks[-1] = chunks[-1] + '\n' + '\n'.join(fresh)
    else:
        # Large enough, or there is no previous chunk to merge into.
        chunks.append(chunk_text)


def split_text_into_chunks(
    text: str,
    min_chunk_size: int = 200,
    max_chunk_size: int = 1000,
    overlap: int = 150
) -> List[str]:
    """
    Split text into chunks along paragraph and sentence boundaries.

    The text is split into paragraphs (on blank lines), each paragraph into
    sentences, and the sentences are then packed greedily into chunks joined
    with newlines. Consecutive chunks share trailing sentences as overlap.

    Args:
        text: Text to split.
        min_chunk_size: Minimum chunk length in characters. A chunk shorter
            than this is merged into the previous chunk when one exists.
        max_chunk_size: Target maximum chunk length in characters. Sentences
            are never cut, so a chunk can still exceed this when a single
            sentence is longer than the limit or after an undersized chunk
            has been merged into its predecessor.
        overlap: Maximum number of characters of trailing sentences from a
            finished chunk that are repeated at the start of the next one.

    Returns:
        List of chunk strings; empty when the text is empty or whitespace only.
    """
    if not text or not text.strip():
        return []

    # Step 1: paragraphs are separated by blank lines.
    paragraphs = [
        block.strip()
        for block in re.split(r'\n\s*\n', text.strip())
        if block.strip()
    ]
    if not paragraphs:
        return []

    # Step 2: break every paragraph into sentences.
    sentences: List[str] = []
    for paragraph in paragraphs:
        sentences.extend(_split_paragraph_into_sentences(paragraph))
    if not sentences:
        return [text]

    # Step 3: pack sentences into chunks.
    chunks: List[str] = []
    current: List[str] = []
    # Invariant: current_size == len('\n'.join(current)) + 1 when `current` is
    # non-empty, so `current_size + len(next)` is the joined length after
    # appending the next sentence.
    current_size = 0
    # Number of leading sentences in `current` that repeat the end of chunks[-1].
    carried = 0

    for sentence in sentences:
        sentence_size = len(sentence)

        if current and current_size + sentence_size > max_chunk_size:
            _flush_chunk(chunks, current, carried, min_chunk_size)

            # Start the next chunk with the overlap from the one just flushed.
            tail = _overlap_tail(current, overlap)
            tail_size = sum(len(s) + 1 for s in tail)
            current = tail + [sentence]
            carried = len(tail)
            # +1 keeps the trailing-separator invariant; without it the next
            # fit check undercounts by one character.
            current_size = tail_size + sentence_size + 1
        else:
            current.append(sentence)
            current_size += sentence_size + 1

    if current:
        _flush_chunk(chunks, current, carried, min_chunk_size)

    # Only the very first chunk can still be undersized at this point (there
    # was nothing to merge it into), so the clean-up is just strip and filter.
    cleaned = [chunk.strip() for chunk in chunks if chunk.strip()]
    return cleaned if cleaned else [text]


# Chapter-heading patterns, tried in this order; the first pattern that matches
# wins. All are compiled case-insensitively, so the single "Chapter" entry also
# covers "CHAPTER" and "chapter".
_CHAPTER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'제\s*(\d+)\s*장',  # 제1장, 제 1 장
        r'제\s*(\d+)\s*화',  # 제1화
        r'Chapter\s*(\d+)',  # Chapter 1 / CHAPTER 1
        r'Ch\.\s*(\d+)',     # Ch. 1
        r'(\d+)\s*장',       # 1장
        r'(\d+)\s*화',       # 1화
        r'chap\.\s*(\d+)',   # chap. 1
        r'ch\s*(\d+)',       # ch 1
        r'(\d+)\s*章',       # 1章
    )
)


def extract_chapter_number(text: Optional[str]) -> Optional[int]:
    """
    Extract a chapter number from the beginning of a text.

    Only the first 500 characters are searched. The patterns are tried in a
    fixed priority order, and the number captured by the first pattern that
    matches is returned.

    Args:
        text: Text to search for a chapter marker. Empty or None input is
            treated as containing no marker.

    Returns:
        The chapter number, or None when no marker is found.
    """
    if not text:
        return None

    # Chapter headings are expected near the top, so limit the search window.
    search_text = text[:500]

    for pattern in _CHAPTER_PATTERNS:
        match = pattern.search(search_text)
        if match is None:
            continue
        try:
            return int(match.group(1))
        except ValueError:
            # Defensive only: a \d+ capture should always parse as an int.
            continue

    return None