# Mistral-RAG / src/text_processing.py
import os

import nltk
from nltk.tokenize import sent_tokenize
# Set NLTK data path for HF Spaces
home_dir = os.path.expanduser("~")
nltk_data_dir = os.path.join(home_dir, 'nltk_data')
# Ensure directory exists and is in NLTK path
os.makedirs(nltk_data_dir, exist_ok=True)
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)
# Download NLTK data if not present
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    try:
        print("Downloading NLTK data...")
        nltk.download('punkt_tab', download_dir=nltk_data_dir, quiet=True)
        print("NLTK data downloaded successfully")
    except Exception as e:
        print(f"Warning: Could not download NLTK data: {e}")
def paragraphs_chunking(text, max_words=200):
    """
    Split text into structured chunks, preserving paragraph integrity and
    avoiding unnatural breaks.

    - Splits on blank-line paragraph boundaries first.
    - Splits paragraphs longer than max_words at sentence boundaries.
    """
    # Split text into paragraphs first
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks = []

    for para in paragraphs:
        words = para.split()

        # If the paragraph is within the limit, keep it as a single chunk
        if len(words) <= max_words:
            chunks.append(para)
            continue

        # Sentence-based chunking for long paragraphs
        sentences = sent_tokenize(para)
        chunk, chunk_word_count = [], 0

        for sentence in sentences:
            sentence_word_count = len(sentence.split())

            # If adding this sentence keeps the chunk within the word limit, add it
            if chunk_word_count + sentence_word_count <= max_words:
                chunk.append(sentence)
                chunk_word_count += sentence_word_count
            else:
                # Finalize the current chunk and start a new one; the guard
                # avoids appending an empty chunk when a single sentence
                # alone exceeds max_words
                if chunk:
                    chunks.append(" ".join(chunk))
                chunk = [sentence]
                chunk_word_count = sentence_word_count

        # Append any remaining sentences as the final chunk
        if chunk:
            chunks.append(" ".join(chunk))

    return chunks
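
# Example usage (illustrative; the input text is invented). A long paragraph
# is split at sentence boundaries so that no chunk exceeds max_words:
#
#   >>> doc = "Intro paragraph.\n\n" + "A sentence of five words. " * 60
#   >>> lens = [len(c.split()) for c in paragraphs_chunking(doc, max_words=50)]
#   >>> all(n <= 50 for n in lens)
#   True
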
def lines_chunking(text, max_words=200):
    """
    Split line-oriented text into structured chunks.

    - Groups consecutive non-empty lines into paragraphs (a blank line ends a
      paragraph), which also handles hard-wrapped text.
    - Splits paragraphs longer than max_words at sentence boundaries.
    """
    # Split text into lines
    lines = text.splitlines()

    # Group consecutive non-empty lines into paragraphs
    paragraphs = []
    current_paragraph = []
    for line in lines:
        if line.strip():
            current_paragraph.append(line.strip())
        else:  # An empty line marks the end of a paragraph
            if current_paragraph:
                paragraphs.append(" ".join(current_paragraph))
                current_paragraph = []
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    # Chunk each paragraph, splitting long ones at sentence boundaries
    chunks = []
    for para in paragraphs:
        words = para.split()
        if len(words) <= max_words:
            chunks.append(para)
        else:
            sentences = sent_tokenize(para)
            chunk, chunk_word_count = [], 0
            for sentence in sentences:
                sentence_word_count = len(sentence.split())
                if chunk_word_count + sentence_word_count <= max_words:
                    chunk.append(sentence)
                    chunk_word_count += sentence_word_count
                else:
                    # Guard against appending an empty chunk when a single
                    # sentence alone exceeds max_words
                    if chunk:
                        chunks.append(" ".join(chunk))
                    chunk = [sentence]
                    chunk_word_count = sentence_word_count
            if chunk:
                chunks.append(" ".join(chunk))

    return chunks
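

if __name__ == "__main__":
    # Quick smoke test; the sample text below is invented for illustration.
    sample = (
        "This paragraph was hard-wrapped\n"
        "across two source lines.\n"
        "\n"
        "A second, separate paragraph.\n"
    )
    # lines_chunking joins the wrapped lines into one paragraph before chunking
    print(lines_chunking(sample, max_words=200))
    # paragraphs_chunking splits on the blank line but keeps the raw line break
    print(paragraphs_chunking(sample, max_words=200))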