# soyailabs / app\utils\text_utils.py
# Uploaded by wiizm with huggingface_hub (commit 730c79f, verified).
"""
ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜
"""
import re
from typing import List, Optional
from app.core.logger import get_logger
logger = get_logger(__name__)
def clean_text(text: str) -> str:
    """
    Normalize the whitespace of a piece of text.

    Args:
        text: Text to clean up. Falsy values (``None``, ``''``) are accepted.

    Returns:
        The text with every run of whitespace collapsed to a single space
        and leading/trailing whitespace removed; ``''`` for falsy input.
    """
    if text:
        # Collapse whitespace runs first, then trim what is left at the edges.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ''
def _split_paragraph_into_sentences(paragraph: str) -> List[str]:
    """Split one paragraph into stripped sentences, keeping end punctuation."""
    sentences: List[str] = []
    pending = ""
    # The capture group makes re.split return the punctuation runs as well,
    # so they can be glued back onto the text that precedes them.
    for part in re.split(r'([.!?]+)(?=\s+|$)', paragraph):
        if not part.strip():
            continue
        pending += part
        if re.fullmatch(r'[.!?]+', part):
            # A punctuation run closes the sentence collected so far.
            sentences.append(pending.strip())
            pending = ""
    # Trailing text without closing punctuation is still a sentence.
    if pending.strip():
        sentences.append(pending.strip())
    return sentences


def _trailing_overlap(sentences: List[str], overlap: int) -> List[str]:
    """Return the longest run of trailing sentences whose joined length fits in ``overlap``."""
    kept: List[str] = []
    size = 0
    for sentence in reversed(sentences):
        if size + len(sentence) > overlap:
            break
        kept.append(sentence)
        size += len(sentence) + 1  # +1 for the '\n' separator
    kept.reverse()
    return kept


def _store_chunk(
    chunks: List[str],
    sentences: List[str],
    carried: int,
    min_chunk_size: int
) -> None:
    """Append ``sentences`` as a new chunk, or merge them into the last chunk if undersized."""
    chunk_text = '\n'.join(sentences)
    if len(chunk_text) >= min_chunk_size or not chunks:
        chunks.append(chunk_text)
    else:
        # The first ``carried`` sentences are overlap copied from the previous
        # chunk; skip them so the merged chunk does not repeat its own text.
        chunks[-1] = chunks[-1] + '\n' + '\n'.join(sentences[carried:])


def split_text_into_chunks(
    text: str,
    min_chunk_size: int = 200,
    max_chunk_size: int = 1000,
    overlap: int = 150
) -> List[str]:
    """
    Semantic text chunking that respects paragraph and sentence boundaries.

    The text is split into paragraphs (on blank lines), each paragraph into
    sentences (on ``.``, ``!`` or ``?`` followed by whitespace or the end),
    and the sentences are then packed into chunks joined with ``'\\n'``.

    Args:
        text: Text to split.
        min_chunk_size: Minimum chunk length in characters. A chunk shorter
            than this is merged into the previous chunk when one exists.
        max_chunk_size: Target maximum chunk length in characters. Sentences
            are never cut, so a single sentence longer than this, or a chunk
            that absorbed an undersized neighbour, may exceed it.
        overlap: Maximum number of characters of trailing sentences repeated
            at the start of the next chunk.

    Returns:
        The list of chunks; an empty list when ``text`` is empty or blank.
    """
    if not text or not text.strip():
        return []

    # Step 1: split into paragraphs on blank lines.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text.strip()) if p.strip()]

    # Step 2: split every paragraph into sentences.
    sentences: List[str] = []
    for paragraph in paragraphs:
        sentences.extend(_split_paragraph_into_sentences(paragraph) or [paragraph])
    if not sentences:
        return [text]

    # Step 3: pack sentences into chunks.
    chunks: List[str] = []
    current: List[str] = []
    current_size = 0  # joined length of ``current`` plus one trailing separator
    carried = 0       # leading sentences of ``current`` repeated from the previous chunk

    for sentence in sentences:
        sentence_size = len(sentence)
        if current and current_size + sentence_size > max_chunk_size:
            _store_chunk(chunks, current, carried, min_chunk_size)
            overlap_sentences = _trailing_overlap(current, overlap)
            carried = len(overlap_sentences)
            current = overlap_sentences + [sentence]
            # Count a separator after every sentence, exactly as the
            # non-flush branch does, so the size check stays accurate.
            current_size = sum(len(s) + 1 for s in current)
        else:
            current.append(sentence)
            current_size += sentence_size + 1

    if current:
        _store_chunk(chunks, current, carried, min_chunk_size)

    return chunks if chunks else [text]
def extract_chapter_number(text: str) -> Optional[int]:
    """
    Extract a chapter number from the beginning of a text.

    Only the first 500 characters are inspected. Patterns are tried in
    priority order (not by position in the text) and matched
    case-insensitively; the first pattern that matches wins.

    Args:
        text: Text to search for a chapter marker.

    Returns:
        The chapter number, or ``None`` when no marker is found or ``text``
        is empty/``None``.
    """
    if not text:
        return None

    # Non-ASCII markers are written as \uXXXX escapes (understood by ``re``)
    # so the patterns survive any re-encoding of this source file:
    #   \uc81c = Korean "je" (ordinal prefix), \uc7a5 = Korean "jang" (chapter),
    #   \ud654 = Korean "hwa" (episode),       \u7ae0 = CJK "zhang" (chapter).
    patterns = (
        r'\uc81c\s*(\d+)\s*\uc7a5',     # "je 1 jang"  (Korean: chapter 1)
        r'\uc81c\s*(\d+)\s*\ud654',     # "je 1 hwa"   (Korean: episode 1)
        r'Chapter\s*(\d+)',             # Chapter 1 / CHAPTER 1
        r'(?<![A-Za-z])Ch\.\s*(\d+)',   # Ch. 1 (not the tail of a word like "much.")
        r'(\d+)\s*\uc7a5',              # "1 jang"
        r'(\d+)\s*\ud654',              # "1 hwa"
        r'chap\.\s*(\d+)',              # chap. 1
        r'(?<![A-Za-z])ch\s*(\d+)',     # ch 1 (not the tail of a word like "March")
        r'(\d+)\s*\u7ae0',              # "1 zhang"
    )

    # Chapter markers are expected near the top, so limit the search window.
    search_text = text[:500]
    for pattern in patterns:
        match = re.search(pattern, search_text, re.IGNORECASE)
        if match:
            return int(match.group(1))
    return None