File size: 6,786 Bytes
efb660b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import re
from typing import List, Dict, Tuple

def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks, preferring sentence or word boundaries.

    Each chunk covers at most ``chunk_size`` characters of ``text``. When a
    chunk would end in the middle of the text, the cut is moved back to the
    last ``'. '`` (sentence end) or, failing that, the last space, but only if
    that boundary lies in the second half of the window so chunks do not
    become too small. Chunks are stripped of surrounding whitespace and empty
    chunks are discarded.

    Args:
        text: The text to split
        chunk_size: Maximum size of each chunk in characters (must be positive
            whenever the text is longer than ``chunk_size``)
        overlap: Number of characters to overlap between chunks. If the
            overlap is so large that the next chunk would not start after the
            current one, the overlap is skipped for that step so the loop
            always advances.

    Returns:
        List of text chunks. Text that already fits in one chunk is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If the text needs splitting but ``chunk_size`` is not positive.
    """
    text_length = len(text)
    if text_length <= chunk_size:
        return [text]
    if chunk_size <= 0:
        # A non-positive window can never consume any text.
        raise ValueError("chunk_size must be a positive integer")

    chunks: List[str] = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            # Only accept a boundary in the second half of the window.
            # start and chunk_size are non-negative here, so a failed
            # rfind (-1) can never pass the comparison.
            min_break = start + chunk_size // 2

            sentence_end = text.rfind('. ', start, end)
            if sentence_end > min_break:
                # Keep the period, drop the following space.
                end = sentence_end + 1
            else:
                word_end = text.rfind(' ', start, end)
                if word_end > min_break:
                    end = word_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break

        # Step back by the overlap, but never to (or before) the current
        # start: that would repeat the same window forever.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks

def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
    """
    Extract chapters, sections and subsections from markdown book content.

    Headings are recognised by their line prefix: ``# `` starts a chapter,
    ``## `` a section and ``### `` a subsection. Chapters whose title starts
    with "Chatbot Knowledge Base" are skipped. Text before the first ``# ``
    heading is ignored.

    Records are produced as follows:

    * chapter without sections: one record holding the whole chapter body;
    * chapter with sections: one record for any intro text between the
      chapter heading and the first section (so it is not lost), then the
      records of each section;
    * section without subsections: one record holding the section body;
    * section with subsections: one record per subsection, then one record
      for any text that precedes the first subsection, then an
      "Additional Content" record for any text left after the last parsed
      subsection (this appears to happen only when the section ends with a
      ``### `` heading line that has no trailing newline).

    Args:
        text: The book content in markdown format

    Returns:
        List of dictionaries with the keys "chapter", "section",
        "subsection", "title" and "content" (content is whitespace-stripped)
    """
    flags = re.MULTILINE | re.DOTALL
    chapter_pattern = r'^# (.*?)\n(.*?)(?=^# |\Z)'
    section_pattern = r'^## (.*?)\n(.*?)(?=^## |\Z)'
    subsection_pattern = r'^### (.*?)\n(.*?)(?=^### |\Z)'

    result: List[Dict[str, str]] = []

    def add(chapter: str, section: str, subsection: str, title: str, content: str) -> None:
        result.append({
            "chapter": chapter,
            "section": section,
            "subsection": subsection,
            "title": title,
            "content": content,
        })

    for chapter_title, chapter_content in re.findall(chapter_pattern, text, flags):
        # Skip the introductory content
        if chapter_title.startswith("Chatbot Knowledge Base"):
            continue

        section_matches = list(re.finditer(section_pattern, chapter_content, flags))
        if not section_matches:
            # No sections: treat the whole chapter as one record
            add(chapter_title, "", "", chapter_title, chapter_content.strip())
            continue

        # Intro text between the chapter heading and its first section
        chapter_intro = chapter_content[:section_matches[0].start()].strip()
        if chapter_intro:
            add(chapter_title, "", "", chapter_title, chapter_intro)

        for section_match in section_matches:
            section_title, section_content = section_match.groups()
            section_heading = f"{chapter_title} - {section_title}"

            subsection_matches = list(re.finditer(subsection_pattern, section_content, flags))
            if not subsection_matches:
                # No subsections: the section body is one record
                add(chapter_title, section_title, "", section_heading, section_content.strip())
                continue

            for subsection_match in subsection_matches:
                subsection_title, subsection_content = subsection_match.groups()
                add(
                    chapter_title,
                    section_title,
                    subsection_title,
                    f"{section_heading} - {subsection_title}",
                    subsection_content.strip(),
                )

            # Section text that precedes the first subsection heading
            before_content = section_content[:subsection_matches[0].start()].strip()
            if before_content:
                add(chapter_title, section_title, "", section_heading, before_content)

            # Anything the subsection pattern could not consume at the end
            after_content = section_content[subsection_matches[-1].end():].strip()
            if after_content:
                add(
                    chapter_title,
                    section_title,
                    "Additional Content",
                    f"{section_heading} - Additional Content",
                    after_content,
                )

    return result

def clean_markdown(text: str) -> str:
    """
    Clean markdown formatting from text

    This is a regex-based heuristic, not a full markdown parser. The passes
    run in a deliberate order:

    1. fenced code blocks are removed entirely;
    2. heading markers (``#``) are stripped;
    3. list markers are stripped *before* emphasis, so a ``* `` bullet is not
       mistaken for the opening of an ``*emphasis*`` span;
    4. links are reduced to their text before emphasis, so characters in the
       URL cannot pair with emphasis delimiters in the prose;
    5. bold/italic delimiters are removed. A delimiter only counts when it
       hugs the text (no whitespace just inside it), and underscore
       delimiters must not touch a word character on the outside, so
       ``2 * 3 * 4`` and ``snake_case_name`` are left intact;
    6. inline code backticks are removed (the code text is kept);
    7. runs of three or more newlines collapse to one blank line and the
       result is stripped.

    Args:
        text: Markdown text to clean

    Returns:
        Cleaned text without markdown formatting
    """
    # Remove fenced code blocks
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove headers
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # Remove list markers. Indentation is limited to spaces/tabs so the
    # pattern cannot swallow the blank line that precedes a list.
    text = re.sub(r'^[ \t]*[\*\-\+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Remove links but keep the text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove bold and italic (bold first so '**' is not read as two '*')
    text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'\1', text)
    text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', text)

    # Remove inline code markers, keeping the code text
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove extra whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()