import os

import nltk
from nltk.tokenize import sent_tokenize

# Set NLTK data path for HF Spaces
home_dir = os.path.expanduser("~")
nltk_data_dir = os.path.join(home_dir, "nltk_data")

# Ensure directory exists and is in NLTK path
os.makedirs(nltk_data_dir, exist_ok=True)
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download NLTK data if not present
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    try:
        print("Downloading NLTK data...")
        nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
        print("NLTK data downloaded successfully")
    except Exception as e:
        print(f"Warning: Could not download NLTK data: {e}")


def paragraphs_chunking(text, max_words=200):
    """
    Split text into structured chunks, preserving paragraph integrity and
    avoiding unnatural breaks.

    - Splits on blank-line paragraph breaks first, keeping each paragraph
      intact when it fits within max_words.
    - Splits oversized paragraphs into smaller chunks on sentence boundaries.
    """
    # Split text into paragraphs first
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks = []

    for para in paragraphs:
        words = para.split()

        # If the paragraph is within the limit, keep it as a single chunk
        if len(words) <= max_words:
            chunks.append(para)
            continue

        # Sentence-based chunking for large paragraphs
        sentences = sent_tokenize(para)
        chunk, chunk_word_count = [], 0

        for sentence in sentences:
            sentence_word_count = len(sentence.split())

            # If adding this sentence keeps the chunk within the word limit, add it
            if chunk_word_count + sentence_word_count <= max_words:
                chunk.append(sentence)
                chunk_word_count += sentence_word_count
            else:
                # Finalize the current chunk and start a new one. The guard
                # avoids emitting an empty chunk when a single sentence
                # exceeds max_words on its own.
                if chunk:
                    chunks.append(" ".join(chunk))
                chunk = [sentence]
                chunk_word_count = sentence_word_count

        # Append any remaining chunk
        if chunk:
            chunks.append(" ".join(chunk))

    return chunks


def lines_chunking(text, max_words=200):
    """
    Split line-oriented text into structured chunks, preserving paragraph
    integrity and avoiding unnatural breaks.

    - Groups consecutive non-blank lines into paragraphs (a blank line ends
      the current paragraph).
    - Splits oversized paragraphs into smaller chunks on sentence boundaries.
    """
    # Split text into lines
    lines = text.splitlines()

    # Group lines into paragraphs
    paragraphs = []
    current_paragraph = []
    for line in lines:
        if line.strip():
            current_paragraph.append(line.strip())
        else:
            # Empty line indicates end of paragraph
            if current_paragraph:
                paragraphs.append(" ".join(current_paragraph))
                current_paragraph = []
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    # Process paragraphs with the same sentence-based fallback as above
    chunks = []
    for para in paragraphs:
        words = para.split()
        if len(words) <= max_words:
            chunks.append(para)
        else:
            sentences = sent_tokenize(para)
            chunk, chunk_word_count = [], 0
            for sentence in sentences:
                sentence_word_count = len(sentence.split())
                if chunk_word_count + sentence_word_count <= max_words:
                    chunk.append(sentence)
                    chunk_word_count += sentence_word_count
                else:
                    # Same guard as above: never emit an empty chunk
                    if chunk:
                        chunks.append(" ".join(chunk))
                    chunk = [sentence]
                    chunk_word_count = sentence_word_count
            if chunk:
                chunks.append(" ".join(chunk))

    return chunks
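
# --- Usage sketch ---
# A minimal, illustrative example of both chunkers on the same input. The
# sample text and the max_words value below are assumptions chosen so the
# first paragraph is forced onto the sentence-splitting path; they are not
# part of the original module.
if __name__ == "__main__":
    sample = (
        "Retrieval pipelines often work best with chunks of bounded size. "
        "This paragraph is short enough to survive as two sentence chunks "
        "under a tight limit.\n\n"
        "A second paragraph follows after a blank line, so both functions "
        "treat it as a separate unit."
    )

    # With max_words=20, the first paragraph (25 words) is split at the
    # sentence boundary, while the second (17 words) stays whole.
    for i, chunk in enumerate(paragraphs_chunking(sample, max_words=20)):
        print(f"[paragraphs_chunking] chunk {i}: {chunk}")

    # lines_chunking groups the same text line by line, so the output here
    # matches paragraphs_chunking for this blank-line-separated input.
    for i, chunk in enumerate(lines_chunking(sample, max_words=20)):
        print(f"[lines_chunking] chunk {i}: {chunk}")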