Spaces:

Tahasaif3
/

chatbot

Runtime error

App Files Files Community

chatbot / app /utils /text_processing.py

Tahasaif3

'changes'

efb660b 4 months ago

raw

history blame contribute delete

6.79 kB

	import re
	from typing import List, Dict, Tuple

	def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
	    """
	    Split text into overlapping chunks, preferring sentence or word boundaries.

	    Each chunk is cut from a window of at most ``chunk_size`` characters. When
	    the window does not reach the end of the text, the cut is moved back to
	    just after the last ". " found in the second half of the window, or
	    failing that to the last space in the second half; otherwise the window
	    is cut at its hard limit. Chunks are stripped of surrounding whitespace
	    and empty chunks are dropped.

	    Args:
	        text: The text to split
	        chunk_size: Maximum size of each chunk in characters (must be > 0)
	        overlap: Number of characters to overlap between chunks (must be >= 0)

	    Returns:
	        List of text chunks. Text no longer than ``chunk_size`` is returned
	        unchanged as a single-element list.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if overlap < 0:
	        raise ValueError("overlap must not be negative")

	    text_len = len(text)
	    if text_len <= chunk_size:
	        return [text]

	    chunks = []
	    half_window = chunk_size // 2
	    start = 0

	    while start < text_len:
	        end = min(start + chunk_size, text_len)

	        if end < text_len:
	            # Only accept a break point in the second half of the window so
	            # that chunks do not become very short.
	            min_break = start + half_window
	            sentence_end = text.rfind('. ', start, end)
	            if sentence_end > min_break:
	                # Keep the period, drop the following space.
	                end = sentence_end + 1
	            else:
	                word_end = text.rfind(' ', start, end)
	                if word_end > min_break:
	                    end = word_end

	        chunk = text[start:end].strip()
	        if chunk:
	            chunks.append(chunk)

	        if end >= text_len:
	            break

	        # Step back by the overlap, but always advance by at least one
	        # character: with a large overlap `end - overlap` can be <= start,
	        # which would otherwise loop forever.
	        start = max(end - overlap, start + 1)

	    return chunks

	def _build_section_entry(chapter: str, section: str, subsection: str, title: str, content: str) -> Dict[str, str]:
	    """Build one result record; the content is stripped of surrounding whitespace."""
	    return {
	        "chapter": chapter,
	        "section": section,
	        "subsection": subsection,
	        "title": title,
	        "content": content.strip()
	    }


	def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
	    """
	    Extract chapters, sections and subsections from markdown book content

	    Chapters are lines starting with "# ", sections with "## " and
	    subsections with "### ". Chapters whose title starts with
	    "Chatbot Knowledge Base" are skipped. A chapter without sections yields
	    one entry for the whole chapter; a section without subsections yields one
	    entry for the whole section. For a section with subsections, one entry is
	    produced per subsection, followed by an entry for any text that precedes
	    the first subsection and an "Additional Content" entry for any text left
	    after the last matched subsection.

	    Args:
	        text: The book content in markdown format

	    Returns:
	        List of dictionaries with the keys "chapter", "section",
	        "subsection", "title" and "content"
	    """
	    # DOTALL lets the lazy body group span lines; MULTILINE anchors ^ at
	    # every line start so the lookahead stops at the next heading of the
	    # same level (or at the end of the string).
	    flags = re.MULTILINE | re.DOTALL
	    chapter_pattern = r'^# (.*?)\n(.*?)(?=^# |\Z)'
	    section_pattern = r'^## (.*?)\n(.*?)(?=^## |\Z)'
	    subsection_pattern = r'^### (.*?)\n(.*?)(?=^### |\Z)'

	    result = []

	    for chapter_title, chapter_content in re.findall(chapter_pattern, text, flags):
	        # Skip the introductory content
	        if chapter_title.startswith("Chatbot Knowledge Base"):
	            continue

	        sections = re.findall(section_pattern, chapter_content, flags)

	        if not sections:
	            # No sections: treat the whole chapter as one section
	            result.append(_build_section_entry(
	                chapter_title, "", "", chapter_title, chapter_content))
	            continue

	        for section_title, section_content in sections:
	            section_label = f"{chapter_title} - {section_title}"
	            subsection_matches = list(re.finditer(subsection_pattern, section_content, flags))

	            if not subsection_matches:
	                # No subsections: keep the section content as is
	                result.append(_build_section_entry(
	                    chapter_title, section_title, "", section_label, section_content))
	                continue

	            for match in subsection_matches:
	                subsection_title, subsection_content = match.groups()
	                result.append(_build_section_entry(
	                    chapter_title, section_title, subsection_title,
	                    f"{section_label} - {subsection_title}", subsection_content))

	            # Text between the section heading and the first subsection.
	            # It is appended after the subsections, which keeps the
	            # established output order.
	            before_content = section_content[:subsection_matches[0].start()].strip()
	            if before_content:
	                result.append(_build_section_entry(
	                    chapter_title, section_title, "", section_label, before_content))

	            # Text after the last matched subsection. The last match normally
	            # runs to the end of the section, so this is only non-empty when
	            # a trailing "### " line could not be matched (no newline after it).
	            after_content = section_content[subsection_matches[-1].end():].strip()
	            if after_content:
	                result.append(_build_section_entry(
	                    chapter_title, section_title, "Additional Content",
	                    f"{section_label} - Additional Content", after_content))

	    return result

	def clean_markdown(text: str) -> str:
	    """
	    Clean markdown formatting from text

	    Processing order matters: fenced code blocks are dropped first so their
	    contents are never interpreted as markdown, and list markers are removed
	    before emphasis so that a "* " bullet is not paired with a later "*" on
	    the same line.

	    Args:
	        text: Markdown text to clean

	    Returns:
	        Text with fenced code blocks removed; header markers, list markers,
	        emphasis markers, link targets and inline-code backticks stripped;
	        runs of three or more newlines collapsed to two; and surrounding
	        whitespace trimmed
	    """
	    # Remove fenced code blocks (including their contents)
	    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

	    # Remove header markers
	    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

	    # Remove list markers. Only spaces/tabs are allowed around the marker so
	    # the pattern cannot swallow the blank line that precedes a list.
	    text = re.sub(r'^[ \t]*[*+-][ \t]+', '', text, flags=re.MULTILINE)
	    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

	    # Remove links but keep the text
	    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

	    # Remove bold and italic
	    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
	    text = re.sub(r'\*(.*?)\*', r'\1', text)
	    # Underscore emphasis must not be flanked by word characters, otherwise
	    # identifiers such as snake_case_name would lose their underscores.
	    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)
	    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

	    # Unwrap inline code
	    text = re.sub(r'`([^`]+)`', r'\1', text)

	    # Remove extra whitespace
	    text = re.sub(r'\n{3,}', '\n\n', text)
	    return text.strip()