| import re |
| from typing import List, Dict, Tuple |
|
|
def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks, preferring sentence or word boundaries.

    Each chunk spans at most ``chunk_size`` characters before surrounding
    whitespace is stripped. When a chunk would end mid-text, the cut is moved
    back to the last sentence end (``'. '``) or, failing that, the last space,
    provided that boundary lies in the second half of the window and beyond
    the overlap region. The next chunk starts ``overlap`` characters before
    the previous cut.

    Args:
        text: The text to split
        chunk_size: Maximum size of each chunk in characters (must be > 0)
        overlap: Number of characters to overlap between chunks
            (must satisfy ``0 <= overlap < chunk_size``)

    Returns:
        List of text chunks. Text that already fits in one chunk is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    # A cut is only accepted past this many characters into the window.
    # Requiring it to exceed `overlap` as well guarantees that
    # `end - overlap` is always strictly greater than `start`, so the loop
    # cannot stall or step backwards.
    min_cut_offset = max(chunk_size // 2, overlap)

    chunks = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            earliest_cut = start + min_cut_offset

            # Prefer ending on a sentence; keep the period, drop the space.
            sentence_end = text.rfind('. ', start, end)
            if sentence_end > earliest_cut:
                end = sentence_end + 1
            else:
                # Otherwise fall back to the last word boundary.
                word_end = text.rfind(' ', start, end)
                if word_end > earliest_cut:
                    end = word_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break
        start = end - overlap

    return chunks
|
|
def _split_on_heading(text: str, level: int) -> Tuple[str, List[Tuple[str, str]]]:
    """Split *text* on markdown headings made of exactly *level* '#' characters.

    Returns the text preceding the first such heading, and a list of
    ``(heading_title, body)`` pairs in document order. When no heading of
    that level is found, the whole text is returned as the preamble.
    """
    marker = "#" * level
    pattern = rf'^{marker} (.*?)\n(.*?)(?=^{marker} |\Z)'
    matches = list(re.finditer(pattern, text, re.MULTILINE | re.DOTALL))
    if not matches:
        return text, []
    preamble = text[:matches[0].start()]
    return preamble, [(m.group(1), m.group(2)) for m in matches]


def _make_entry(content: str, *headings: str) -> Dict[str, str]:
    """Build one result record from its heading path (chapter[, section[, subsection]])."""
    chapter, section, subsection = (headings + ("", ""))[:3]
    return {
        "chapter": chapter,
        "section": section,
        "subsection": subsection,
        "title": " - ".join(headings),
        "content": content.strip(),
    }


def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
    """
    Extract chapters, sections and subsections from the book content

    The text is split on ``# `` (chapter), ``## `` (section) and ``### ``
    (subsection) headings. One entry is emitted per leaf, in document order.
    Introductory text that appears before the first section of a chapter, or
    before the first subsection of a section, gets its own entry under the
    enclosing heading so that no content is lost. Chapters whose title starts
    with "Chatbot Knowledge Base" are skipped, and text before the first
    chapter heading is ignored.

    NOTE: headings are recognised purely by their line-start ``#`` markers, so
    a line starting with ``# `` inside a fenced code block is also treated as
    a heading.

    Args:
        text: The book content in markdown format

    Returns:
        List of dictionaries with the keys "chapter", "section",
        "subsection", "title" and "content"
    """
    result = []

    _, chapters = _split_on_heading(text, 1)
    for chapter_title, chapter_content in chapters:
        if chapter_title.startswith("Chatbot Knowledge Base"):
            continue

        chapter_intro, sections = _split_on_heading(chapter_content, 2)
        if not sections:
            # Chapter without sections: the whole body is one entry.
            result.append(_make_entry(chapter_content, chapter_title))
            continue

        # Text between the chapter heading and its first section.
        if chapter_intro.strip():
            result.append(_make_entry(chapter_intro, chapter_title))

        for section_title, section_content in sections:
            section_intro, subsections = _split_on_heading(section_content, 3)
            if not subsections:
                result.append(_make_entry(section_content, chapter_title, section_title))
                continue

            # Text between the section heading and its first subsection is
            # emitted first so entries stay in document order.
            if section_intro.strip():
                result.append(_make_entry(section_intro, chapter_title, section_title))

            # The last subsection always runs to the end of the section, so
            # there is never trailing content left over after this loop.
            for subsection_title, subsection_content in subsections:
                result.append(
                    _make_entry(subsection_content, chapter_title, section_title, subsection_title)
                )

    return result
|
|
def clean_markdown(text: str) -> str:
    """
    Clean markdown formatting from text

    Removes fenced code blocks entirely and strips heading markers, list
    markers, link/image syntax (keeping the link text or alt text), emphasis
    markers and inline-code backticks. Runs of three or more newlines are
    collapsed to a single blank line and the result is trimmed.

    Inline code spans are protected while emphasis is stripped, so
    identifiers such as ``__init__`` or ``*args`` inside backticks survive.
    Underscores inside words (``snake_case``) are not treated as emphasis.

    Args:
        text: Markdown text to clean

    Returns:
        Cleaned text without markdown formatting
    """
    # Fenced code blocks are dropped first so their contents are never
    # mistaken for headings, lists or emphasis.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Stash inline code behind NUL-delimited placeholders; the emphasis rules
    # below must not touch characters inside code spans.
    code_spans: List[str] = []

    def _stash(match):
        code_spans.append(match.group(1))
        return f"\x00{len(code_spans) - 1}\x00"

    def _restore(match):
        index = int(match.group(1))
        # Leave anything that is not one of our placeholders untouched.
        return code_spans[index] if index < len(code_spans) else match.group(0)

    text = re.sub(r'`([^`]+)`', _stash, text)

    # Line-level markers. `[ \t]` is used instead of `\s` so a pattern
    # anchored on a blank line cannot swallow the newline and merge lines.
    text = re.sub(r'^#+[ \t]*', '', text, flags=re.MULTILINE)
    # List markers go before emphasis; otherwise a "* " bullet pairs up with
    # the opening "*" of an emphasised word later on the same line.
    text = re.sub(r'^[ \t]*[*\-+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Links and images: keep the visible text / alt text, drop the target.
    text = re.sub(r'!?\[(.*?)\]\(.*?\)', r'\1', text)

    # Emphasis. Delimiters must hug non-space text, and underscore delimiters
    # must not sit inside a word.
    text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'\1', text)
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', text)

    # Put inline code back, without its backticks.
    text = re.sub(r'\x00(\d+)\x00', _restore, text)

    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()