# File size: 6,786 Bytes
# efb660b
import re
from typing import List, Dict, Tuple
def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into chunks of specified size with overlap.

    Each chunk is at most ``chunk_size`` characters. When a chunk would end
    in the middle of the text, the cut is moved back to the last sentence end
    (``'. '``) or, failing that, the last space, provided that break lies in
    the second half of the window. Chunks are whitespace-stripped and empty
    chunks are skipped.

    Args:
        text: The text to split
        chunk_size: Maximum size of each chunk in characters (must be > 0)
        overlap: Number of characters to overlap between chunks. If the
            overlap is so large that the next chunk would not start after the
            current one, the overlap is dropped for that step so the loop
            always terminates.

    Returns:
        List of text chunks. Text that already fits in ``chunk_size``
        (including the empty string) is returned unchanged as a single
        element.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive window can never consume any text.
        raise ValueError("chunk_size must be a positive integer")

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            # Only accept a break point in the second half of the window so
            # chunks do not become too small. rfind() returns -1 on a miss,
            # which is always below min_break, so no separate -1 check is
            # needed.
            min_break = start + chunk_size // 2
            sentence_end = text.rfind('. ', start, end)
            if sentence_end > min_break:
                end = sentence_end + 1  # keep the period, drop the space
            else:
                word_end = text.rfind(' ', start, end)
                if word_end > min_break:
                    end = word_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break

        # Step back by `overlap`, but never to or before the current start:
        # that would re-emit the same chunk forever.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks
def _split_by_heading(text: str, level: int) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Split markdown text on headings of exactly ``level`` '#' characters.

    Returns:
        A ``(preamble, parts)`` tuple. ``parts`` is a list of
        ``(title, body)`` pairs, one per heading, where ``body`` runs up to
        the next heading of the same level or the end of the text.
        ``preamble`` is the stripped text before the first heading; it is
        ``""`` when no heading is found.
    """
    marker = "#" * level
    pattern = rf'^{marker} (.*?)\n(.*?)(?=^{marker} |\Z)'
    matches = list(re.finditer(pattern, text, re.MULTILINE | re.DOTALL))
    if not matches:
        return "", []
    preamble = text[:matches[0].start()].strip()
    return preamble, [(match.group(1), match.group(2)) for match in matches]


def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
    """
    Extract chapters and sections from the book content.

    Chapters are introduced by ``# ``, sections by ``## `` and subsections by
    ``### ``. Each returned entry describes the deepest heading level that
    holds the content, with unused levels set to ``""``.

    Entries are emitted chapter by chapter:
      * a chapter without sections yields one entry with the whole chapter;
      * otherwise any text before the chapter's first section yields a
        chapter-level entry, followed by the entries of each section;
      * a section without subsections yields one entry with the whole
        section;
      * otherwise each subsection yields an entry, followed by an entry for
        any text that precedes the section's first subsection.

    Chapters whose title starts with "Chatbot Knowledge Base" are skipped,
    and text before the first chapter heading is ignored.

    Args:
        text: The book content in markdown format

    Returns:
        List of dictionaries with the keys ``chapter``, ``section``,
        ``subsection``, ``title`` and ``content``
    """
    result = []
    _, chapters = _split_by_heading(text, 1)

    for chapter_title, chapter_content in chapters:
        # Skip the introductory content
        if chapter_title.startswith("Chatbot Knowledge Base"):
            continue

        chapter_intro, sections = _split_by_heading(chapter_content, 2)

        if not sections:
            # No sections: treat the whole chapter as one entry
            result.append({
                "chapter": chapter_title,
                "section": "",
                "subsection": "",
                "title": chapter_title,
                "content": chapter_content.strip(),
            })
            continue

        if chapter_intro:
            # Text between the chapter heading and its first section would
            # otherwise be lost.
            result.append({
                "chapter": chapter_title,
                "section": "",
                "subsection": "",
                "title": chapter_title,
                "content": chapter_intro,
            })

        for section_title, section_content in sections:
            section_intro, subsections = _split_by_heading(section_content, 3)

            if not subsections:
                # No subsections: treat the section content as is
                result.append({
                    "chapter": chapter_title,
                    "section": section_title,
                    "subsection": "",
                    "title": f"{chapter_title} - {section_title}",
                    "content": section_content.strip(),
                })
                continue

            for subsection_title, subsection_content in subsections:
                result.append({
                    "chapter": chapter_title,
                    "section": section_title,
                    "subsection": subsection_title,
                    "title": f"{chapter_title} - {section_title} - {subsection_title}",
                    "content": subsection_content.strip(),
                })

            # Text before the first subsection. It is emitted after the
            # subsections to keep the established output order. There is
            # never text "after the last subsection": the last subsection
            # body always extends to the end of the section.
            if section_intro:
                result.append({
                    "chapter": chapter_title,
                    "section": section_title,
                    "subsection": "",
                    "title": f"{chapter_title} - {section_title}",
                    "content": section_intro,
                })

    return result
def clean_markdown(text: str) -> str:
    """
    Clean markdown formatting from text.

    Processing order matters and is chosen so that one kind of markup is not
    mistaken for another:
      1. fenced code blocks are removed entirely;
      2. inline code spans are set aside so their contents are kept verbatim;
      3. header markers and list markers are stripped (before emphasis, so a
         ``*`` bullet is not paired with a later ``*`` on the same line);
      4. links are reduced to their text;
      5. bold/italic markers are removed. ``*`` delimiters must hug
         non-space text and ``_`` delimiters must additionally not sit inside
         a word, so ``2 * 3 * 4`` and ``snake_case_name`` are left alone;
      6. inline code contents are put back without their backticks;
      7. runs of three or more newlines are collapsed to two and the result
         is stripped.

    Args:
        text: Markdown text to clean

    Returns:
        Cleaned text without markdown formatting
    """
    # Remove fenced code blocks
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Set inline code aside behind NUL-delimited placeholders so the rules
    # below cannot alter identifiers such as `__init__` or `*args`.
    code_spans: List[str] = []

    def _stash(match):
        code_spans.append(match.group(1))
        return f'\x00{len(code_spans) - 1}\x00'

    def _restore(match):
        index = int(match.group(1))
        # Leave anything that is not one of our placeholders untouched.
        return code_spans[index] if index < len(code_spans) else match.group(0)

    text = re.sub(r'`([^`]+)`', _stash, text)

    # Remove headers
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # Remove list markers
    text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove links but keep the text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove bold and italic
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(?=\S)(.*?)(?<=\S)\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(?=\S)(.*?)(?<=\S)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(?=\S)(.*?)(?<=\S)_(?!\w)', r'\1', text)

    # Put inline code contents back (without the backticks)
    text = re.sub(r'\x00(\d+)\x00', _restore, text)

    # Remove extra whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()