Spaces:

wiizm
/

soyailabs

Running on CPU Upgrade

App Files Files Community

soyailabs / app\utils\text_utils.py

wiizm

Upload app\utils\text_utils.py with huggingface_hub

730c79f verified 2 days ago

raw

history blame contribute delete

6.1 kB

	"""
	Text processing utility functions.
	"""

	import re
	from typing import List, Optional

	from app.core.logger import get_logger

	# Module-level logger obtained from the project's get_logger helper, keyed by this module's name.
	logger = get_logger(__name__)


	def clean_text(text: str) -> str:
	    """
	    Normalize the whitespace of a piece of text.

	    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
	    single space, and leading/trailing whitespace is removed.

	    Args:
	        text: The text to clean up.

	    Returns:
	        The cleaned text, or an empty string when ``text`` is empty or falsy.
	    """
	    if text:
	        # Collapse whitespace runs first, then trim both ends.
	        collapsed = re.sub(r'\s+', ' ', text)
	        return collapsed.strip()

	    return ''


	def _split_paragraph_into_sentences(paragraph: str) -> List[str]:
	    """
	    Split one paragraph into sentences.

	    A sentence boundary is a run of whitespace that directly follows ``.``,
	    ``!`` or ``?``. The punctuation stays attached to the sentence it ends,
	    and punctuation that is not followed by whitespace (e.g. "3.14") never
	    splits.

	    Args:
	        paragraph: The paragraph text.

	    Returns:
	        The non-empty sentences, in order. Empty for a blank paragraph.
	    """
	    pieces = re.split(r'(?<=[.!?])\s+', paragraph.strip())
	    return [piece for piece in pieces if piece]


	def _overlap_tail(sentences: List[str], overlap: int) -> List[str]:
	    """
	    Pick the trailing sentences of a chunk to repeat at the start of the next.

	    Sentences are taken from the end for as long as their accumulated size
	    stays within ``overlap``.

	    Args:
	        sentences: The sentences of the chunk that was just closed.
	        overlap: Maximum accumulated size of the repeated sentences.

	    Returns:
	        The selected trailing sentences in their original order.
	    """
	    tail: List[str] = []
	    used = 0
	    for sentence in reversed(sentences):
	        if used + len(sentence) > overlap:
	            break
	        tail.append(sentence)
	        used += len(sentence) + 1  # +1 accounts for the joining newline
	    tail.reverse()
	    return tail


	def _store_chunk(
	    chunks: List[str],
	    sentences: List[str],
	    carried: int,
	    min_chunk_size: int
	) -> None:
	    """
	    Append a finished chunk to ``chunks``, or merge it into the previous one.

	    A chunk shorter than ``min_chunk_size`` is merged into the previous chunk
	    when there is one. Only ``sentences[carried:]`` is merged: the first
	    ``carried`` sentences were copied from the previous chunk as overlap, so
	    appending them again would duplicate text.

	    Args:
	        chunks: The output list, modified in place.
	        sentences: The sentences of the chunk being closed.
	        carried: How many leading sentences are overlap from the previous chunk.
	        min_chunk_size: Minimum length for a chunk to stand on its own.
	    """
	    chunk_text = '\n'.join(sentences)
	    if chunks and len(chunk_text) < min_chunk_size:
	        chunks[-1] = '\n'.join([chunks[-1], *sentences[carried:]])
	    else:
	        chunks.append(chunk_text)


	def split_text_into_chunks(
	    text: str,
	    min_chunk_size: int = 200,
	    max_chunk_size: int = 1000,
	    overlap: int = 150
	) -> List[str]:
	    """
	    Split text into chunks along paragraph and sentence boundaries.

	    The text is split into paragraphs (on blank lines), each paragraph into
	    sentences, and the sentences are then packed into chunks joined with
	    newlines. Consecutive chunks share trailing sentences as overlap.

	    Sentences are never cut, so a single sentence longer than
	    ``max_chunk_size`` yields an oversized chunk. Merging an undersized chunk
	    into its predecessor may also push that predecessor past
	    ``max_chunk_size``. ``overlap`` is expected to be smaller than
	    ``max_chunk_size``.

	    Args:
	        text: The text to split.
	        min_chunk_size: Chunks shorter than this are merged into the previous
	            chunk when one exists.
	        max_chunk_size: A chunk is closed when adding the next sentence would
	            make it longer than this.
	        overlap: Maximum accumulated size of the trailing sentences repeated
	            at the start of the next chunk.

	    Returns:
	        The list of chunks; empty when ``text`` is empty or whitespace only.
	    """
	    if not text or not text.strip():
	        return []

	    # Step 1 + 2: paragraphs (blank-line separated) -> flat list of sentences.
	    paragraphs = re.split(r'\n\s*\n', text.strip())
	    sentences = [
	        sentence
	        for paragraph in paragraphs
	        for sentence in _split_paragraph_into_sentences(paragraph)
	    ]

	    # Step 3: pack sentences into chunks.
	    chunks: List[str] = []
	    current: List[str] = []
	    current_len = 0  # always equals len('\n'.join(current))
	    carried = 0      # leading sentences of `current` repeated from the previous chunk

	    for sentence in sentences:
	        if current:
	            projected_len = current_len + 1 + len(sentence)
	        else:
	            projected_len = len(sentence)

	        if current and projected_len > max_chunk_size:
	            _store_chunk(chunks, current, carried, min_chunk_size)
	            # Start the next chunk with the overlap followed by the new sentence.
	            current = _overlap_tail(current, overlap)
	            carried = len(current)
	            current.append(sentence)
	            current_len = len('\n'.join(current))
	        else:
	            current.append(sentence)
	            current_len = projected_len

	    if current:
	        _store_chunk(chunks, current, carried, min_chunk_size)

	    return chunks


	# Chapter heading patterns, tried in priority order. All are matched
	# case-insensitively, so a single "chapter" entry covers "Chapter"/"CHAPTER".
	_CHAPTER_PATTERNS = tuple(
	    re.compile(pattern, re.IGNORECASE)
	    for pattern in (
	        r'제\s*(\d+)\s*장',   # 제1장, 제 1 장
	        r'제\s*(\d+)\s*화',   # 제1화, 제 1 화
	        r'Chapter\s*(\d+)',  # Chapter 1, CHAPTER 1
	        r'Ch\.\s*(\d+)',     # Ch. 1
	        r'(\d+)\s*장',       # 1장
	        r'(\d+)\s*화',       # 1화
	        r'chap\.\s*(\d+)',   # chap. 1
	        r'ch\s*(\d+)',       # ch 1
	        r'(\d+)\s*章',       # 1章
	    )
	)


	def extract_chapter_number(text: str) -> Optional[int]:
	    """
	    Extract a chapter number from the beginning of a text.

	    Only the first 500 characters are searched. The patterns are tried in
	    priority order and the first one that matches anywhere in that window
	    wins, so an explicit "제N장" / "제N화" heading takes precedence over a bare
	    "N장" that appears elsewhere.

	    Args:
	        text: The text to search for a chapter marker.

	    Returns:
	        The chapter number, or None when no pattern matches or ``text`` is
	        empty/None.
	    """
	    if not text:
	        return None

	    head = text[:500]
	    for pattern in _CHAPTER_PATTERNS:
	        match = pattern.search(head)
	        if match:
	            # group(1) is matched by \d+, so int() always succeeds here.
	            return int(match.group(1))

	    return None