# Spaces: Tahasaif3/chatbot
# Status shown on the captured page: Runtime error
# File size: 6,786 bytes
# Ref: efb660b

import re
from typing import List, Dict, Tuple

def split_text_into_chunks(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks, preferring sentence or word boundaries

    Each chunk spans at most chunk_size characters. When a chunk would end in
    the middle of the text, the cut is moved back to the last sentence end
    ('. ') or, failing that, the last space, provided that boundary lies in
    the second half of the window (so chunks never shrink below roughly half
    of chunk_size). Chunks are stripped of surrounding whitespace and empty
    chunks are discarded.

    Args:
        text: The text to split
        chunk_size: Maximum size of each chunk in characters (must be > 0)
        overlap: Number of characters to overlap between chunks. If the
            overlap is so large that the next chunk would not start after
            the current one, the overlap is dropped for that step so the
            split always terminates.

    Returns:
        List of text chunks. Text that already fits in one chunk is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text_length = len(text)
    if text_length <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)

        if end < text_length:
            # Only accept a boundary in the second half of the window.
            min_break = start + chunk_size // 2

            # rfind returns -1 when nothing is found, which never exceeds
            # min_break because start is never negative.
            sentence_end = text.rfind('. ', start, end)
            if sentence_end > min_break:
                # Keep the period, drop the following space.
                end = sentence_end + 1
            else:
                word_end = text.rfind(' ', start, end)
                if word_end > min_break:
                    end = word_end

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= text_length:
            break

        # Step back by the overlap, but never to (or before) the current
        # start: that would repeat the same window forever.
        next_start = end - overlap
        if next_start <= start:
            next_start = end
        start = next_start

    return chunks

def _split_on_heading(text: str, level: int) -> Tuple[str, List[Tuple[str, str]], str]:
    """
    Split markdown text on headings written with exactly `level` '#' characters

    Args:
        text: Markdown text to split
        level: Number of '#' characters that introduce the heading

    Returns:
        Tuple (intro, parts, tail) where intro is the text before the first
        heading, parts is a list of (heading title, body) pairs in document
        order, and tail is any text left after the last matched heading
        block. When no heading matches, intro is the whole text and parts
        is empty.
    """
    marker = "#" * level
    # A heading must be followed by a newline; its body runs up to the next
    # heading of the same level or the end of the text.
    pattern = rf'^{marker} (.*?)\n(.*?)(?=^{marker} |\Z)'
    matches = list(re.finditer(pattern, text, re.MULTILINE | re.DOTALL))
    if not matches:
        return text, [], ""

    intro = text[:matches[0].start()]
    tail = text[matches[-1].end():]
    parts = [(match.group(1), match.group(2)) for match in matches]
    return intro, parts, tail


def _make_entry(path: Tuple[str, ...], content: str) -> Dict[str, str]:
    """
    Build one result record from a heading path (chapter[, section[, subsection]])
    """
    # Pad missing levels with empty strings.
    chapter, section, subsection = (tuple(path) + ("", ""))[:3]
    return {
        "chapter": chapter,
        "section": section,
        "subsection": subsection,
        "title": " - ".join(path),
        "content": content,
    }


def extract_chapters_and_sections(text: str) -> List[Dict[str, str]]:
    """
    Extract chapters and sections from the book content

    Chapters are '# ' headings, sections '## ' and subsections '### '.
    Entries are returned in document order. Text that sits between a heading
    and its first child heading is kept as its own entry attached to the
    parent (empty "section"/"subsection" as appropriate). Chapters whose
    title starts with "Chatbot Knowledge Base" are skipped, and text before
    the first chapter heading is ignored.

    Args:
        text: The book content in markdown format

    Returns:
        List of dictionaries with the keys "chapter", "section",
        "subsection", "title" and "content"
    """
    result = []

    _, chapters, _ = _split_on_heading(text, 1)
    for chapter_title, chapter_content in chapters:
        # Skip the introductory content
        if chapter_title.startswith("Chatbot Knowledge Base"):
            continue

        chapter_intro, sections, _ = _split_on_heading(chapter_content, 2)
        if not sections:
            # No sections: the whole chapter is one entry
            result.append(_make_entry((chapter_title,), chapter_content.strip()))
            continue

        # Text between the chapter heading and its first section
        chapter_intro = chapter_intro.strip()
        if chapter_intro:
            result.append(_make_entry((chapter_title,), chapter_intro))

        for section_title, section_content in sections:
            section_intro, subsections, section_tail = _split_on_heading(section_content, 3)
            if not subsections:
                # No subsections: the whole section is one entry
                result.append(_make_entry((chapter_title, section_title), section_content.strip()))
                continue

            # Text between the section heading and its first subsection
            section_intro = section_intro.strip()
            if section_intro:
                result.append(_make_entry((chapter_title, section_title), section_intro))

            for subsection_title, subsection_content in subsections:
                result.append(_make_entry(
                    (chapter_title, section_title, subsection_title),
                    subsection_content.strip(),
                ))

            # Leftover text after the last matched subsection. This only
            # occurs when the section ends with a '### ' heading line that
            # has no trailing newline.
            section_tail = section_tail.strip()
            if section_tail:
                result.append(_make_entry(
                    (chapter_title, section_title, "Additional Content"),
                    section_tail,
                ))

    return result

def clean_markdown(text: str) -> str:
    """
    Clean markdown formatting from text

    Removes heading markers, bold/italic markers, image and link syntax
    (keeping the alt/link text), fenced code blocks (including their
    contents), inline-code backticks and list markers, then collapses runs
    of blank lines and strips the result.

    Emphasis markers are only removed when they wrap text directly:
    asterisks with whitespace just inside them (e.g. "2 * 3 * 4" or a "* "
    list bullet) and underscores inside words (e.g. "snake_case_name") are
    left alone.

    Args:
        text: Markdown text to clean

    Returns:
        Cleaned text without markdown formatting
    """
    # Remove headers
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # Remove bold and italic. The (?!\s)/(?<!\s) guards require the markers
    # to touch the wrapped text; the \w guards keep intraword underscores.
    text = re.sub(r'\*\*(?!\s)(.*?)(?<!\s)\*\*', r'\1', text)
    text = re.sub(r'\*(?!\s)(.*?)(?<!\s)\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(?!\s)(.*?)(?<!\s)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(?!\s)(.*?)(?<!\s)_(?!\w)', r'\1', text)

    # Remove images but keep the alt text. This must run before the link
    # rule, which would otherwise leave the leading '!' behind.
    text = re.sub(r'!\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove links but keep the text
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)

    # Remove code blocks (fenced blocks are dropped, inline code is unwrapped)
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove list markers (bulleted, then numbered)
    text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove extra whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = text.strip()

    return text