import streamlit as st
import re

def sent_tokenize(text):
    """Simple sentence tokenizer using regex (simpler alternative to NLTK)."""
    # Split on sentence-ending punctuation followed by whitespace and a capital letter
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    # Handle edge cases and clean up
    result = []
    for sentence in sentences:
        # Further split on newlines that might indicate sentence breaks
        sub_sentences = sentence.split('\n')
        for sub in sub_sentences:
            sub = sub.strip()
            if len(sub) > 10:  # Drop very short fragments (10 characters or fewer)
                result.append(sub)
    return result

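# Illustrative behavior of the tokenizer above (the sample string is hypothetical,
# not part of the app). Note a quirk of the regex approach: an abbreviation like
# "Dr." is split off as its own fragment, which the 10-character filter then
# silently drops:
#
#   sent_tokenize("Dr. Smith arrived. He sat down. Then he spoke!")
#   # -> ['Smith arrived.', 'He sat down.', 'Then he spoke!']
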
def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
    """
    Creates text chunks of up to max_words, using sentences as indivisible units.
    Each chunk can overlap with the next one by overlap_sentences.
    Chunks smaller than min_words are merged with the next chunk.
    """
    sentences = sent_tokenize(text)
    word_counts = [len(sentence.split()) for sentence in sentences]

    chunks = []
    i = 0
    while i < len(sentences):
        chunk_sentences = []
        word_count = 0
        chunk_start = i

        # Build a chunk by appending whole sentences until max_words is reached
        while i < len(sentences):
            if word_count + word_counts[i] > max_words and chunk_sentences:
                break
            chunk_sentences.append(sentences[i])
            word_count += word_counts[i]
            i += 1

        if chunk_sentences:
            chunks.append(" ".join(chunk_sentences))

        # Step back by up to overlap_sentences so consecutive chunks share
        # context, while always moving past chunk_start to guarantee progress
        if i < len(sentences):
            chunk_size = len(chunk_sentences)
            overlap = min(overlap_sentences, chunk_size - 1)
            i = max(i - overlap, chunk_start + 1)

    # Merge chunks smaller than min_words with the following chunk
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_words = len(current_chunk.split())

        # If the current chunk is too small and there is a next chunk, merge them
        if current_words < min_words and i + 1 < len(chunks):
            next_chunk = chunks[i + 1]
            next_words = len(next_chunk.split())
            # Only merge if the combined size stays within max_words
            if current_words + next_words <= max_words:
                merged_chunks.append(current_chunk + " " + next_chunk)
                i += 2  # Skip the next chunk since it was merged
            else:
                # Keep the small chunk as-is if merging would be too large
                merged_chunks.append(current_chunk)
                i += 1
        else:
            merged_chunks.append(current_chunk)
            i += 1

    # Remove chunks that are still too long (likely data blocks or other
    # malformed content that the tokenizer could not split into sentences)
    final_chunks = [chunk for chunk in merged_chunks if len(chunk.split()) <= 1000]
    return final_chunks

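# Worked example of the chunk-building loop above (hypothetical numbers): with
# 6-word sentences, max_words=20, overlap_sentences=2, and min_words small
# enough that no merging occurs, each chunk holds three sentences (18 words;
# a fourth would exceed 20), and the index then steps back two sentences, so
# consecutive chunks share two sentences:
#
#   chunks[0] = s0 s1 s2
#   chunks[1] = s1 s2 s3
#   ...
# (The min_words merge and the 1000-word cap then post-process this list.)
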
def chapters_chunking(chapters, max_words=500, min_words=300, overlap_sentences=5):
    """
    Chunk the chapters into smaller parts based on word count and overlap.
    :param chapters: List of chapter dictionaries.
    :param max_words: Maximum number of words per chunk.
    :param min_words: Minimum number of words per chunk.
    :param overlap_sentences: Number of sentences to overlap between chunks.
    Stores a list of dictionaries with chapter information and their respective
    chunks in st.session_state['chapters_chunked']; returns nothing.
    """
    st.session_state['chapters_chunked'] = [
        {
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'chunks': text_chunking(chapter['content'], max_words, min_words, overlap_sentences)
        }
        for chapter in chapters
    ]
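

if __name__ == "__main__":
    # Minimal smoke test, assuming the file is run as a plain script (the
    # sample text and parameters are illustrative, not part of the app).
    # chapters_chunking is not exercised here because it writes to
    # st.session_state, which only exists inside a running Streamlit session.
    sample = " ".join(f"This is sample sentence number {n}." for n in range(1, 61))
    for k, chunk in enumerate(text_chunking(sample, max_words=100, min_words=50, overlap_sentences=2)):
        print(f"chunk {k}: {len(chunk.split())} words")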