import re
from typing import List, Dict, Tuple


def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into chunks of a maximum size, with overlap between neighbours.

    Each chunk preferably ends at a sentence end ('. '), otherwise at a word
    boundary (' '), provided that break point lies in the second half of the
    window; otherwise the window is cut at exactly chunk_size characters.
    Chunks are stripped of surrounding whitespace and empty chunks are dropped.

    Args:
        text: The text to split
        chunk_size: Maximum size of each chunk in characters (must be > 0)
        overlap: Number of characters to overlap between chunks. When the
            overlap is so large that the next chunk would not start after the
            current one, the overlap is dropped for that step so the loop
            always advances.

    Returns:
        List of text chunks. Text that already fits in one chunk is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    # A break point is only accepted in the second half of the window so that
    # chunks never become much shorter than chunk_size.
    min_break_offset = chunk_size // 2

    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            earliest_break = start + min_break_offset
            sentence_end = text.rfind('. ', start, end)
            if sentence_end > earliest_break:
                # Keep the period, drop the following space
                end = sentence_end + 1
            else:
                word_end = text.rfind(' ', start, end)
                if word_end > earliest_break:
                    end = word_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break

        # Step back by `overlap`, but never to (or before) the current start:
        # that would repeat the same window forever.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


def _split_on_heading(text: str, level: int) -> Tuple[str, List[Tuple[str, str]], str]:
    """Split markdown on headings made of exactly `level` '#' characters.

    Returns a tuple (leading, parts, trailing): `leading` is the stripped text
    before the first matched heading (the whole stripped text when nothing
    matches), `parts` is a list of (heading title, raw body) pairs, and
    `trailing` is the stripped text left after the last match.
    """
    marker = '#' * level
    pattern = rf'^{marker} (.*?)\n(.*?)(?=^{marker} |\Z)'
    matches = list(re.finditer(pattern, text, re.MULTILINE | re.DOTALL))
    if not matches:
        return text.strip(), [], ''

    leading = text[:matches[0].start()].strip()
    trailing = text[matches[-1].end():].strip()
    parts = [(match.group(1), match.group(2)) for match in matches]
    return leading, parts, trailing


def _make_entry(chapter: str, section: str, subsection: str, title: str,
                content: str) -> Dict[str, str]:
    """Build one result record for extract_chapters_and_sections."""
    return {
        "chapter": chapter,
        "section": section,
        "subsection": subsection,
        "title": title,
        "content": content,
    }


def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
    """
    Extract chapters, sections and subsections from the book content.

    Chapters are introduced by '# ', sections by '## ' and subsections by
    '### '. Chapters whose title starts with "Chatbot Knowledge Base" are
    skipped. Text before the first chapter heading is ignored.

    Args:
        text: The book content in markdown format

    Returns:
        List of dictionaries with the keys "chapter", "section", "subsection",
        "title" and "content". For a chapter with sections, any text before
        its first section is emitted first as a chapter-level entry. For a
        section with subsections, the subsections come first, followed by the
        section's own leading text (if any).
    """
    result = []
    _, chapters, _ = _split_on_heading(text, 1)

    for chapter_title, chapter_content in chapters:
        # Skip the introductory content
        if chapter_title.startswith("Chatbot Knowledge Base"):
            continue

        chapter_intro, sections, _ = _split_on_heading(chapter_content, 2)

        if not sections:
            # No sections: the whole chapter is a single entry
            result.append(_make_entry(
                chapter_title, "", "", chapter_title, chapter_content.strip()))
            continue

        if chapter_intro:
            # Text between the chapter heading and its first section would
            # otherwise be lost.
            result.append(_make_entry(
                chapter_title, "", "", chapter_title, chapter_intro))

        for section_title, section_content in sections:
            section_label = f"{chapter_title} - {section_title}"
            section_intro, subsections, section_tail = _split_on_heading(section_content, 3)

            if not subsections:
                # No subsections: the whole section is a single entry
                result.append(_make_entry(
                    chapter_title, section_title, "", section_label,
                    section_content.strip()))
                continue

            for subsection_title, subsection_content in subsections:
                result.append(_make_entry(
                    chapter_title, section_title, subsection_title,
                    f"{section_label} - {subsection_title}",
                    subsection_content.strip()))

            # Text between the section heading and its first subsection.
            # Emitted after the subsections to keep the established order.
            if section_intro:
                result.append(_make_entry(
                    chapter_title, section_title, "", section_label, section_intro))

            # Only non-empty when the section ends with a '### ' heading line
            # that has no trailing newline (the heading pattern needs one);
            # that raw line is then kept as "Additional Content".
            if section_tail:
                result.append(_make_entry(
                    chapter_title, section_title, "Additional Content",
                    f"{section_label} - Additional Content", section_tail))

    return result


def clean_markdown(text: str) -> str:
    """
    Clean markdown formatting from text.

    Removes fenced code blocks, header markers, list markers, link/image
    syntax (keeping the link text or alt text), bold/italic markers and
    inline-code backticks, then collapses runs of blank lines.

    Args:
        text: Markdown text to clean

    Returns:
        Cleaned text without markdown formatting
    """
    # Remove fenced code blocks first so their contents are never treated as markup
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove headers
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # Remove list markers before emphasis: otherwise the '*' of a bullet is
    # paired with a later '*' and parsed as italic
    text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove links and images but keep the text; done before emphasis so that
    # underscores or asterisks inside URLs cannot pair with surrounding text
    text = re.sub(r'!?\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove bold and italic
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Underscore emphasis must not start or end inside a word, so identifiers
    # such as snake_case_name keep their underscores
    text = re.sub(r'(?<!\w)__(.*?)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(.*?)_(?!\w)', r'\1', text)

    # Unwrap inline code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove extra whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()