# test2text / app/backend/text_processing.py
# Provenance: davidepanza — "Update app/backend/text_processing.py" (commit ec294c1, verified)
import streamlit as st
import os
import ssl
import re
def sent_tokenize(text):
    """Split *text* into sentences with a lightweight regex heuristic.

    A sentence boundary is a '.', '!' or '?' followed by whitespace and a
    capital letter (a simpler alternative to NLTK's tokenizer).  Newlines
    inside a piece are treated as additional breaks, and fragments of 10
    characters or fewer are discarded as noise.
    """
    # Boundary: terminal punctuation, then whitespace, then an uppercase letter.
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    return [
        fragment
        for piece in pieces
        for fragment in (part.strip() for part in piece.split('\n'))
        if len(fragment) > 10
    ]
def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5,
                  drop_above_words=1000):
    """
    Split *text* into overlapping chunks of whole sentences.

    Sentences (from ``sent_tokenize``) are treated as indivisible units.
    Chunks are built greedily up to ``max_words``, consecutive chunks share
    ``overlap_sentences`` trailing sentences, undersized chunks are merged
    forward, and overlong chunks are discarded.

    :param text: Raw text to split.
    :param max_words: Target upper bound on words per chunk; a single
        sentence longer than this still forms its own chunk.
    :param min_words: Chunks below this size are merged with the following
        chunk when the combined size stays within ``max_words``.
    :param overlap_sentences: Number of trailing sentences repeated at the
        start of the next chunk.
    :param drop_above_words: Chunks longer than this are dropped as likely
        data blocks or malformed content (was a hard-coded 1000).
    :return: List of chunk strings.
    """
    sentences = sent_tokenize(text)
    word_counts = [len(sentence.split()) for sentence in sentences]

    chunks = []
    i = 0
    while i < len(sentences):
        chunk_sentences = []
        word_count = 0
        chunk_start = i
        # Greedily add sentences until the next one would exceed max_words.
        # The `and chunk_sentences` guard lets an oversized single sentence
        # still become a chunk instead of stalling the loop.
        while i < len(sentences):
            if word_count + word_counts[i] > max_words and chunk_sentences:
                break
            chunk_sentences.append(sentences[i])
            word_count += word_counts[i]
            i += 1
        if chunk_sentences:
            chunks.append(" ".join(chunk_sentences))
        # Step back to create the sentence overlap, but always advance at
        # least one sentence past chunk_start to guarantee forward progress.
        if i < len(sentences):
            overlap = min(overlap_sentences, len(chunk_sentences) - 1)
            i = max(i - overlap, chunk_start + 1)

    # Merge undersized chunks with their successor when the result still fits.
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_words = len(current_chunk.split())
        if current_words < min_words and i + 1 < len(chunks):
            next_chunk = chunks[i + 1]
            if current_words + len(next_chunk.split()) <= max_words:
                merged_chunks.append(current_chunk + " " + next_chunk)
                i += 2  # the next chunk was consumed by the merge
                continue
            # Merging would overshoot max_words: keep the small chunk as-is.
        merged_chunks.append(current_chunk)
        i += 1

    # Discard suspiciously long chunks (tables, data dumps, broken markup).
    return [chunk for chunk in merged_chunks
            if len(chunk.split()) <= drop_above_words]
def chapters_chunking(chapters, max_words=500, min_words=300, overlap_sentences=5):
    """
    Chunk each chapter's content and store the result in session state.

    :param chapters: List of chapter dictionaries.
    :param max_words: Maximum number of words per chunk.
    :param min_words: Minimum number of words per chunk.
    :param overlap_sentences: Number of sentences to overlap between chunks.
    :return: None; writes the chunked chapters to
        ``st.session_state['chapters_chunked']``.
    """
    chunked_chapters = []
    for chapter in chapters:
        chunked_chapters.append({
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'chunks': text_chunking(chapter['content'], max_words, min_words, overlap_sentences),
        })
    st.session_state['chapters_chunked'] = chunked_chapters