# test2text / app/backend/text_processing.py
# Provenance: davidepanza — "Update app/backend/text_processing.py" (commit ec294c1, verified)
import streamlit as st
import os
import ssl
import re
def sent_tokenize(text):
    """Split *text* into sentences with a lightweight regex heuristic.

    A sentence boundary is a '.', '!' or '?' followed by whitespace and a
    capital letter (a simpler alternative to NLTK's tokenizer).  Newlines
    inside a piece are treated as additional breaks, and fragments of 10
    characters or fewer are discarded as noise.
    """
    # Boundary: terminal punctuation, then whitespace, then an uppercase letter.
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [
        fragment
        for piece in pieces
        for fragment in (part.strip() for part in piece.split('\n'))
        if len(fragment) > 10
    ]
def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5,
                  drop_above_words=1000):
    """
    Split *text* into overlapping chunks of whole sentences.

    Sentences (from ``sent_tokenize``) are treated as indivisible units.
    Chunks are built greedily up to ``max_words``, consecutive chunks share
    ``overlap_sentences`` trailing sentences, undersized chunks are merged
    forward, and overlong chunks are discarded.

    :param text: Raw text to split.
    :param max_words: Target upper bound on words per chunk; a single
        sentence longer than this still forms its own chunk.
    :param min_words: Chunks below this size are merged with the following
        chunk when the combined size stays within ``max_words``.
    :param overlap_sentences: Number of trailing sentences repeated at the
        start of the next chunk.
    :param drop_above_words: Chunks longer than this are dropped as likely
        data blocks or malformed content (was a hard-coded 1000).
    :return: List of chunk strings.
    """
    sentences = sent_tokenize(text)
    word_counts = [len(sentence.split()) for sentence in sentences]

    chunks = []
    i = 0
    while i < len(sentences):
        chunk_sentences = []
        word_count = 0
        chunk_start = i
        # Greedily add sentences until the next one would exceed max_words.
        # The `and chunk_sentences` guard lets an oversized single sentence
        # still become a chunk instead of stalling the loop.
        while i < len(sentences):
            if word_count + word_counts[i] > max_words and chunk_sentences:
                break
            chunk_sentences.append(sentences[i])
            word_count += word_counts[i]
            i += 1
        if chunk_sentences:
            chunks.append(" ".join(chunk_sentences))
        # Step back to create the sentence overlap, but always advance at
        # least one sentence past chunk_start to guarantee forward progress.
        if i < len(sentences):
            overlap = min(overlap_sentences, len(chunk_sentences) - 1)
            i = max(i - overlap, chunk_start + 1)

    # Merge undersized chunks with their successor when the result still fits.
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_words = len(current_chunk.split())
        if current_words < min_words and i + 1 < len(chunks):
            next_chunk = chunks[i + 1]
            if current_words + len(next_chunk.split()) <= max_words:
                merged_chunks.append(current_chunk + " " + next_chunk)
                i += 2  # the next chunk was consumed by the merge
                continue
            # Merging would overshoot max_words: keep the small chunk as-is.
        merged_chunks.append(current_chunk)
        i += 1

    # Discard suspiciously long chunks (tables, data dumps, broken markup).
    return [chunk for chunk in merged_chunks
            if len(chunk.split()) <= drop_above_words]
def chapters_chunking(chapters, max_words=500, min_words=300, overlap_sentences=5):
    """
    Chunk each chapter's content and store the result in session state.

    :param chapters: List of chapter dictionaries.
    :param max_words: Maximum number of words per chunk.
    :param min_words: Minimum number of words per chunk.
    :param overlap_sentences: Number of sentences to overlap between chunks.
    :return: None; writes the chunked chapters to
        ``st.session_state['chapters_chunked']``.
    """
    chunked_chapters = []
    for chapter in chapters:
        chunked_chapters.append({
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'chunks': text_chunking(chapter['content'], max_words, min_words, overlap_sentences),
        })
    st.session_state['chapters_chunked'] = chunked_chapters