|
|
"""
|
|
|
ํ
์คํธ ์ฒ๋ฆฌ ์ ํธ๋ฆฌํฐ ํจ์
|
|
|
"""
|
|
|
|
|
|
import re
|
|
|
from typing import List, Optional
|
|
|
|
|
|
from app.core.logger import get_logger
|
|
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """
    Clean up text by normalizing its whitespace.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space, and leading/trailing whitespace is
    removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text, or an empty string when ``text`` is falsy.
    """
    # Falsy input (empty string, None) short-circuits to an empty string.
    return re.sub(r'\s+', ' ', text).strip() if text else ''
|
|
|
|
|
|
|
|
|
def _split_paragraph_into_sentences(paragraph: str) -> List[str]:
    """Split one paragraph into sentences, keeping end punctuation attached."""
    # The capturing group makes re.split return each punctuation run as its
    # own item, so it can be re-attached to the text that precedes it.
    parts = re.split(r'([.!?]+)(?=\s+|$)', paragraph)

    sentences: List[str] = []
    pending = ""
    for part in parts:
        if not part.strip():
            continue
        pending += part
        if re.match(r'^[.!?]+$', part):
            # A punctuation run closes the sentence accumulated so far.
            sentences.append(pending.strip())
            pending = ""

    # Trailing text without closing punctuation is still a sentence.
    if pending.strip():
        sentences.append(pending.strip())

    # Defensive fallback: never lose a non-blank paragraph.
    if not sentences and paragraph.strip():
        sentences.append(paragraph.strip())
    return sentences


def _select_overlap_sentences(sentences: List[str], overlap: int) -> List[str]:
    """Pick trailing sentences, newest first, while they fit in ``overlap``."""
    selected: List[str] = []
    used = 0
    for sentence in reversed(sentences):
        if used + len(sentence) > overlap:
            break
        selected.append(sentence)
        used += len(sentence) + 1  # +1 accounts for the joining newline
    selected.reverse()
    return selected


def _emit_chunk(
    chunks: List[str],
    sentences: List[str],
    carried: int,
    min_chunk_size: int,
) -> None:
    """Append ``sentences`` as a chunk, or fold them into the previous chunk."""
    chunk_text = '\n'.join(sentences)
    if chunks and len(chunk_text) < min_chunk_size:
        # Undersized chunk: merge it into the previous one. The first
        # ``carried`` sentences are overlap copied from the end of that
        # previous chunk, so they are skipped to avoid duplicating text.
        chunks[-1] = chunks[-1] + '\n' + '\n'.join(sentences[carried:])
    else:
        chunks.append(chunk_text)


def split_text_into_chunks(
    text: str,
    min_chunk_size: int = 200,
    max_chunk_size: int = 1000,
    overlap: int = 150
) -> List[str]:
    """
    Split text into chunks along paragraph and sentence boundaries.

    The text is split into paragraphs (separated by blank lines) and then
    into sentences (ending in ``.``, ``!`` or ``?``). Sentences are packed
    into chunks joined by newlines. When adding a sentence would push the
    running size past ``max_chunk_size``, the current chunk is emitted and
    the next one starts with trailing sentences of the emitted chunk that
    fit within ``overlap`` characters. A chunk shorter than
    ``min_chunk_size`` is merged into the previous chunk (without repeating
    the overlap sentences it shares with it); only the very first chunk may
    stay undersized.

    Sentences are never cut, so a chunk can exceed ``max_chunk_size`` when a
    single sentence is longer than the limit or after an undersized chunk
    has been merged into it.

    Args:
        text: The text to split.
        min_chunk_size: Minimum chunk size in characters.
        max_chunk_size: Maximum chunk size in characters.
        overlap: Maximum size in characters of the overlap carried from one
            chunk into the next.

    Returns:
        The list of chunks; empty when ``text`` is empty or blank.
    """
    if not text or not text.strip():
        return []

    # Paragraphs are separated by one or more blank lines.
    paragraphs = [
        p.strip() for p in re.split(r'\n\s*\n', text.strip()) if p.strip()
    ]
    if not paragraphs:
        return []

    all_sentences: List[str] = []
    for paragraph in paragraphs:
        all_sentences.extend(_split_paragraph_into_sentences(paragraph))
    if not all_sentences:
        return [text]

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_size = 0
    # Number of leading sentences in current_chunk that were copied from the
    # end of chunks[-1] as overlap.
    carried = 0

    for sentence in all_sentences:
        sentence_size = len(sentence)

        if current_chunk and current_size + sentence_size > max_chunk_size:
            _emit_chunk(chunks, current_chunk, carried, min_chunk_size)

            # Seed the next chunk with the overlap taken from the sentences
            # that were just emitted.
            overlap_sentences = _select_overlap_sentences(current_chunk, overlap)
            carried = len(overlap_sentences)
            overlap_size = sum(len(s) + 1 for s in overlap_sentences)
            current_chunk = overlap_sentences + [sentence]
            current_size = overlap_size + sentence_size
        else:
            current_chunk.append(sentence)
            current_size += sentence_size + 1

    if current_chunk:
        _emit_chunk(chunks, current_chunk, carried, min_chunk_size)

    return chunks
|
|
|
|
|
|
|
|
|
def extract_chapter_number(text: str) -> Optional[int]:
    """
    Extract a chapter number from the beginning of a text.

    Only the first 500 characters are searched. Patterns are tried in
    priority order and matched case-insensitively; the first pattern that
    matches anywhere in that window determines the result.

    Args:
        text: The text to extract the chapter number from.

    Returns:
        The chapter number, or None when ``text`` is empty or no pattern
        matches.
    """
    if not text:
        return None

    # Ordered from most to least specific. Matching is case-insensitive, so
    # a single 'Chapter' pattern also covers 'CHAPTER'.
    patterns = (
        r'제\s*(\d+)\s*장',      # Korean: "제 3 장"
        r'제\s*(\d+)\s*화',      # Korean: "제 3 화"
        r'Chapter\s*(\d+)',
        r'Ch\.\s*(\d+)',
        r'(\d+)\s*장',           # Korean: "3장"
        r'(\d+)\s*화',           # Korean: "3화"
        r'chap\.\s*(\d+)',
        # Word boundary keeps ordinary words ending in "ch" (e.g. "March 15")
        # from being read as a chapter marker.
        r'\bch\s*(\d+)',
        r'(\d+)\s*章',           # Chinese/Japanese: "3章"
    )

    # Chapter headings are expected near the start of the text.
    search_text = text[:500]

    for pattern in patterns:
        match = re.search(pattern, search_text, re.IGNORECASE)
        if match:
            return int(match.group(1))

    return None
|
|
|
|
|
|
|
|
|
|
|
|
|