# chatbot/app/utils/text_processing.py
# Tahasaif3's picture
# 'changes'
# efb660b
import re
from typing import List, Dict, Tuple
def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks, preferring sentence or word boundaries.

    Each chunk spans at most ``chunk_size`` characters. When a chunk does not
    reach the end of the text, the cut is moved back to the last ``'. '`` (or,
    failing that, the last space) provided that break lies in the second half
    of the window. Chunks are whitespace-stripped and empty chunks are dropped.

    Args:
        text: The text to split
        chunk_size: Maximum size of each chunk in characters (must be > 0)
        overlap: Number of characters to overlap between chunks (must be >= 0)
    Returns:
        List of text chunks. Text no longer than ``chunk_size`` is returned
        unchanged as a single-element list.
    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        if end < text_length:
            # Only accept a break point in the second half of the window so
            # chunks do not become disproportionately short.
            min_break = start + chunk_size // 2
            sentence_end = text.rfind('. ', start, end)
            if sentence_end > min_break:
                # Keep the period with the chunk it terminates.
                end = sentence_end + 1
            else:
                word_end = text.rfind(' ', start, end)
                if word_end > min_break:
                    end = word_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break

        # Step back by `overlap`, but never to or before the current start:
        # a large overlap would otherwise stall or reverse the scan and the
        # loop would never terminate. In that case continue without overlap.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
def _split_by_heading(text: str, level: int) -> List[Tuple[str, str]]:
    """Return (title, body) pairs for each markdown heading with exactly `level` hashes."""
    marker = "#" * level
    pattern = rf'^{marker} (.*?)\n(.*?)(?=^{marker} |\Z)'
    return re.findall(pattern, text, re.MULTILINE | re.DOTALL)


def _text_before_first_heading(text: str, level: int) -> str:
    """Return the stripped text that precedes the first heading of `level` hashes."""
    first_heading = re.search(rf'^{"#" * level} ', text, re.MULTILINE)
    if first_heading is None:
        return text.strip()
    return text[:first_heading.start()].strip()


def _make_entry(chapter: str, section: str, subsection: str, title: str, content: str) -> Dict[str, str]:
    """Build one result record; the content is whitespace-stripped."""
    return {
        "chapter": chapter,
        "section": section,
        "subsection": subsection,
        "title": title,
        "content": content.strip()
    }


def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
    """
    Extract chapters, sections and subsections from the book content

    Headings are recognised as lines starting with ``# `` (chapter), ``## ``
    (section) and ``### `` (subsection). Chapters whose title starts with
    "Chatbot Knowledge Base" are skipped. For each chapter:

    * no sections: one entry holding the whole chapter body;
    * otherwise: an entry for any text between the chapter heading and its
      first section (previously this text was dropped), followed by one entry
      per section, or one per subsection when the section has subsections.
      Text between a section heading and its first subsection is emitted as a
      section-level entry after that section's subsection entries, which
      preserves the existing output order.

    A subsection body always extends to the next ``### `` or to the end of
    its section, so there is never leftover text after the last subsection.

    Args:
        text: The book content in markdown format
    Returns:
        List of dictionaries with the keys "chapter", "section", "subsection",
        "title" and "content"
    """
    result: List[Dict[str, str]] = []

    for chapter_title, chapter_content in _split_by_heading(text, 1):
        # Skip the introductory content
        if chapter_title.startswith("Chatbot Knowledge Base"):
            continue

        sections = _split_by_heading(chapter_content, 2)
        if not sections:
            # If no sections, treat the whole chapter as one section
            result.append(_make_entry(chapter_title, "", "", chapter_title, chapter_content))
            continue

        # Keep chapter text that appears before the first section heading.
        chapter_intro = _text_before_first_heading(chapter_content, 2)
        if chapter_intro:
            result.append(_make_entry(chapter_title, "", "", chapter_title, chapter_intro))

        for section_title, section_content in sections:
            section_label = f"{chapter_title} - {section_title}"
            subsections = _split_by_heading(section_content, 3)
            if not subsections:
                # If no subsections, treat the section content as is
                result.append(_make_entry(chapter_title, section_title, "", section_label, section_content))
                continue

            for subsection_title, subsection_content in subsections:
                result.append(_make_entry(
                    chapter_title,
                    section_title,
                    subsection_title,
                    f"{chapter_title} - {section_title} - {subsection_title}",
                    subsection_content,
                ))

            # Section text that precedes the first subsection heading.
            section_intro = _text_before_first_heading(section_content, 3)
            if section_intro:
                result.append(_make_entry(chapter_title, section_title, "", section_label, section_intro))

    return result
def clean_markdown(text: str) -> str:
    """
    Clean markdown formatting from text

    Fenced code blocks are removed entirely. Header markers, list markers,
    link/image syntax, emphasis markers and inline-code backticks are removed
    while the text they wrap is kept. Runs of three or more newlines are
    collapsed to two and the result is stripped.

    The passes are ordered so that they do not corrupt each other:
    code is handled before emphasis (so identifiers such as ``__init__``
    inside backticks survive), list markers before emphasis (so a leading
    ``* `` bullet is not read as an italic opener), and links before emphasis
    (so underscores inside URLs cannot pair with underscores in the text).

    Args:
        text: Markdown text to clean
    Returns:
        Cleaned text without markdown formatting
    """
    # Remove fenced code blocks
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Park inline-code contents behind NUL-delimited placeholders so the
    # emphasis patterns below cannot touch them; restored further down.
    code_spans: List[str] = []

    def stash_code(match):
        code_spans.append(match.group(1))
        return f'\x00{len(code_spans) - 1}\x00'

    text = re.sub(r'`([^`]+)`', stash_code, text)

    # Remove headers
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
    # Remove lists
    text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
    # Remove links and images but keep the link text / alt text
    text = re.sub(r'!?\[(.*?)\]\(.*?\)', r'\1', text)
    # Remove bold and italic. A single `*` only counts as emphasis when it
    # hugs its content (so "2 * 3 * 4" is left alone); underscores only count
    # at word boundaries (so snake_case identifiers are left alone).
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(?!\s)(.+?)(?<!\s)\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    def restore_code(match):
        index = int(match.group(1))
        # Leave anything that merely looks like a placeholder untouched.
        return code_spans[index] if index < len(code_spans) else match.group(0)

    text = re.sub(r'\x00(\d+)\x00', restore_code, text)

    # Remove extra whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()