SOY NV AI
๋ฉํ๋ฐ์ดํฐ ์์ฑ ๊ธฐ๋ฅ ๊ฐ์ : ๊ธฐ์กด ๋ฉํ๋ฐ์ดํฐ ๋ณํฉ ๋ฐ ํ์ฐจ ์ ๋ณด ์ ์ง
d234e06
| """ | |
| ํ ์คํธ ์ฒ๋ฆฌ ์ ํธ๋ฆฌํฐ ํจ์ | |
| """ | |
| import re | |
| from typing import List, Optional | |
| from app.core.logger import get_logger | |
| logger = get_logger(__name__) | |
def clean_text(text: str) -> str:
    """
    Normalize whitespace in *text*.

    Collapses every run of whitespace (spaces, tabs, newlines) into a
    single space and trims leading/trailing whitespace.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text; '' for empty/falsy input.
    """
    if not text:
        return ''
    # One regex pass collapses internal runs; strip() trims the ends.
    return re.sub(r'\s+', ' ', text).strip()
def split_text_into_chunks(
    text: str,
    min_chunk_size: int = 200,
    max_chunk_size: int = 1000,
    overlap: int = 150
) -> List[str]:
    """
    Split text into chunks, respecting sentence and paragraph boundaries.

    The text is first split into paragraphs (blank-line separated), then
    into sentences (terminated by runs of ``.!?``), and finally sentences
    are packed into chunks of at most ``max_chunk_size`` characters.
    Undersized chunks are merged into a neighbour, and up to ``overlap``
    characters of trailing context are carried into the next chunk.

    Args:
        text: Text to split.
        min_chunk_size: Minimum chunk size; smaller pieces get merged.
        max_chunk_size: Maximum chunk size.
        overlap: Approximate number of overlap characters between chunks.

    Returns:
        List of chunks; ``[text]`` as a fallback, ``[]`` for empty input.
    """
    if not text or not text.strip():
        return []

    # Step 1: split into paragraphs on blank lines.
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text.strip()) if p.strip()]
    if not paragraphs:
        return []

    # Step 2: split each paragraph into sentences, keeping the
    # terminating punctuation attached to its sentence.
    terminator = r'([.!?]+)(?=\s+|$)'
    sentences: List[str] = []
    for paragraph in paragraphs:
        pieces = re.split(terminator, paragraph)
        collected: List[str] = []
        pending = ""
        for piece in pieces:
            if not piece.strip():
                continue
            if re.match(r'^[.!?]+$', piece):
                # Punctuation run: attach it and close off the sentence.
                pending += piece
                if pending.strip():
                    collected.append(pending.strip())
                    pending = ""
            else:
                # Plain text: accumulate into the pending sentence.
                pending += piece
        # Flush a trailing sentence with no terminator.
        if pending.strip():
            collected.append(pending.strip())
        # Paragraph yielded no sentences at all: keep it whole.
        if not collected and paragraph.strip():
            collected.append(paragraph.strip())
        sentences.extend(collected)

    if not sentences:
        return [text] if text.strip() else []

    # Step 3: pack sentences into chunks up to max_chunk_size.
    chunks: List[str] = []
    buffer: List[str] = []
    buffer_size = 0
    for sentence in sentences:
        size = len(sentence)
        if buffer and buffer_size + size > max_chunk_size:
            # Adding this sentence would overflow: emit the buffer.
            joined = '\n'.join(buffer)
            if len(joined.strip()) >= min_chunk_size:
                chunks.append(joined)
            elif chunks:
                # Too small on its own: merge into the previous chunk.
                chunks[-1] = chunks[-1] + '\n' + joined
            else:
                chunks.append(joined)
            # Carry trailing sentences (up to `overlap` chars) for context.
            carried: List[str] = []
            carried_size = 0
            for prev in reversed(buffer):
                if carried_size + len(prev) > overlap:
                    break
                carried.insert(0, prev)
                carried_size += len(prev) + 1
            buffer = carried + [sentence]
            buffer_size = carried_size + size
        else:
            buffer.append(sentence)
            buffer_size += size + 1

    # Flush whatever is left in the buffer.
    if buffer:
        joined = '\n'.join(buffer)
        if chunks and len(joined.strip()) < min_chunk_size:
            chunks[-1] = chunks[-1] + '\n' + joined
        else:
            chunks.append(joined)

    # Final pass: drop empties and fold undersized chunks into a neighbour.
    result: List[str] = []
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        if len(chunk) >= min_chunk_size or not result:
            result.append(chunk)
        else:
            result[-1] = result[-1] + '\n' + chunk
    return result if result else [text] if text.strip() else []
def extract_chapter_number(text: str) -> Optional[int]:
    """
    Extract a chapter number from text.

    Only the first 500 characters are inspected, and patterns are tried
    in priority order, so an explicit marker such as "제 1 장" or
    "Chapter 1" wins over a bare "1장".

    Args:
        text: Text to extract a chapter number from.

    Returns:
        The chapter number, or None if no marker is found.
    """
    # NOTE(review): Korean/CJK literals reconstructed from mojibake in the
    # original source — confirm against the upstream file encoding.
    patterns = [
        r'제\s*(\d+)\s*장',    # 제1장, 제 1 장
        r'제\s*(\d+)\s*화',    # 제1화
        r'chapter\s*(\d+)',    # Chapter 1 / CHAPTER 1 (IGNORECASE covers both)
        r'ch\.\s*(\d+)',       # Ch. 1
        r'(\d+)\s*장',         # 1장
        r'(\d+)\s*화',         # 1화
        r'chap\.\s*(\d+)',     # chap. 1
        r'\bch\s*(\d+)',       # ch 1 — \b prevents false hits like "march 3"
        r'(\d+)\s*章',         # 1章
    ]
    # Only inspect the first 500 characters.
    search_text = text[:500]
    for pattern in patterns:
        match = re.search(pattern, search_text, re.IGNORECASE)
        if match:
            try:
                return int(match.group(1))
            except ValueError:
                # Extremely defensive: the group is digits-only, but keep
                # scanning the remaining patterns rather than failing.
                continue
    return None