Spaces:

Kalpokoch
/

ChatbotDemo

Running

Kalpokoch committed on Aug 20

Commit

85b9cad

verified ·

1 Parent(s): 81c6ad0

Update create_granular_chunks.py

Files changed (1) hide show

create_granular_chunks.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import json
 import re
@@ -6,6 +7,7 @@ import nltk
 # Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
 nltk.download('punkt')
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
@@ -112,7 +114,8 @@ def split_text_into_chunks(text: str, max_char_length: int = 1500, overlap: int
     if len(text) <= max_char_length:
         return [text]
-    sentences = nltk.sent_tokenize(text)
     chunks = []
     current_chunk = ""

+# create_granular_chunks.py
 import os
 import json
 import re
 # Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
 nltk.download('punkt')
+nltk.download('punkt_tab')  # Also download punkt_tab to avoid LookupError
 # --- Configuration ---
 INPUT_FILE = "combined_context.jsonl"
     if len(text) <= max_char_length:
         return [text]
+    # Explicitly specify language to avoid punkt_tab error
+    sentences = nltk.tokenize.sent_tokenize(text, language='english')
     chunks = []
     current_chunk = ""