import streamlit as st
import os
import ssl
import re

def sent_tokenize(text):
    """Simple sentence tokenizer using regex (simpler alternative to NLTK)"""
    # Split on sentence endings followed by whitespace and a capital letter
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    # Handle edge cases and clean up
    result = []
    for sentence in sentences:
        # Further split on newlines that might indicate sentence breaks
        sub_sentences = sentence.split('\n')
        for sub in sub_sentences:
            sub = sub.strip()
            if len(sub) > 10:  # Filter out very short fragments
                result.append(sub)
    return result
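
# Illustrative usage (an added example, not part of the original app):
#   sent_tokenize("First sentence here. Second sentence follows! This is the third one?")
#   -> ['First sentence here.', 'Second sentence follows!', 'This is the third one?']
# Note that abbreviations such as "Dr." can cause over-splitting, since the regex
# treats any period followed by whitespace and a capital letter as a sentence boundary.
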
def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
    """
    Creates text chunks of up to max_words, treating sentences as indivisible units.
    Each chunk can overlap with the next one by overlap_sentences sentences.
    Chunks smaller than min_words are merged with the next chunk when possible.
    """
    sentences = sent_tokenize(text)
    word_counts = [len(sentence.split()) for sentence in sentences]
    chunks = []
    i = 0
    while i < len(sentences):
        chunk_sentences = []
        word_count = 0
        chunk_start = i
        # Build the chunk sentence by sentence until the word budget is reached
        while i < len(sentences):
            if word_count + word_counts[i] > max_words and chunk_sentences:
                break
            chunk_sentences.append(sentences[i])
            word_count += word_counts[i]
            i += 1
        if chunk_sentences:
            chunks.append(" ".join(chunk_sentences))
        # Step back so the next chunk overlaps with the end of this one
        if i < len(sentences):
            chunk_size = len(chunk_sentences)
            overlap = min(overlap_sentences, chunk_size - 1)
            i = max(i - overlap, chunk_start + 1)
    # Merge small chunks with the next chunk
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_words = len(current_chunk.split())
        # If the current chunk is too small and there's a next chunk, merge them
        if current_words < min_words and i + 1 < len(chunks):
            next_chunk = chunks[i + 1]
            next_words = len(next_chunk.split())
            # Only merge if the combined size won't exceed max_words
            if current_words + next_words <= max_words:
                merged_chunk = current_chunk + " " + next_chunk
                merged_chunks.append(merged_chunk)
                i += 2  # Skip the next chunk since it was merged
            else:
                # Keep the small chunk as-is if merging would make it too large
                merged_chunks.append(current_chunk)
                i += 1
        else:
            merged_chunks.append(current_chunk)
            i += 1
    # Remove chunks that are too long (likely data blocks or malformed content)
    final_chunks = []
    for chunk in merged_chunks:
        if len(chunk.split()) <= 1000:
            final_chunks.append(chunk)
    return final_chunks
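
# Example of the merge step above (illustrative numbers, not output from the original app):
# with min_words=400 and max_words=750, a 250-word chunk followed by a 450-word chunk is
# merged into one 700-word chunk, while a 250-word chunk followed by a 600-word chunk is
# kept as-is because the combined 850 words would exceed max_words.
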
def chapters_chunking(chapters, max_words=500, min_words=300, overlap_sentences=5):
    """
    Chunk the chapters into smaller parts based on word count and overlap.

    :param chapters: List of chapter dictionaries.
    :param max_words: Maximum number of words per chunk.
    :param min_words: Minimum number of words per chunk.
    :param overlap_sentences: Number of sentences to overlap between chunks.
    :return: None; the list of dictionaries with chapter information and their
             respective chunks is stored in st.session_state['chapters_chunked'].
    """
    st.session_state['chapters_chunked'] = [
        {
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'chunks': text_chunking(chapter['content'], max_words, min_words, overlap_sentences)
        }
        for chapter in chapters
    ]
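
# A minimal, self-contained sketch of how text_chunking can be exercised outside
# Streamlit (an added example, not part of the original app). chapters_chunking is
# omitted here because it writes to st.session_state and therefore needs a running
# Streamlit session. The sample text and the small parameter values are assumptions
# chosen so the chunking and overlap are visible on a short input.
if __name__ == "__main__":
    sample_text = (
        "Text chunking splits long documents into overlapping pieces. "
        "Each piece stays under a configurable word budget. "
        "Overlap keeps neighbouring chunks from losing shared context. "
        "Very small trailing pieces are merged back into their neighbours."
    )
    demo_chunks = text_chunking(sample_text, max_words=20, min_words=5, overlap_sentences=1)
    for idx, chunk in enumerate(demo_chunks, start=1):
        print(f"Chunk {idx} ({len(chunk.split())} words): {chunk}")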