Kalpokoch committed on
Commit
85b9cad
·
verified ·
1 Parent(s): 81c6ad0

Update create_granular_chunks.py

Browse files
Files changed (1) hide show
  1. create_granular_chunks.py +4 -1
create_granular_chunks.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import json
3
  import re
@@ -6,6 +7,7 @@ import nltk
6
 
7
  # Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
8
  nltk.download('punkt')
 
9
 
10
  # --- Configuration ---
11
  INPUT_FILE = "combined_context.jsonl"
@@ -112,7 +114,8 @@ def split_text_into_chunks(text: str, max_char_length: int = 1500, overlap: int
112
  if len(text) <= max_char_length:
113
  return [text]
114
 
115
- sentences = nltk.sent_tokenize(text)
 
116
  chunks = []
117
  current_chunk = ""
118
 
 
1
+ # create_granular_chunks.py
2
  import os
3
  import json
4
  import re
 
7
 
8
  # Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
9
  nltk.download('punkt')
10
+ nltk.download('punkt_tab') # Also download punkt_tab to avoid LookupError
11
 
12
  # --- Configuration ---
13
  INPUT_FILE = "combined_context.jsonl"
 
114
  if len(text) <= max_char_length:
115
  return [text]
116
 
117
+ # Explicitly specify language to avoid punkt_tab error
118
+ sentences = nltk.tokenize.sent_tokenize(text, language='english')
119
  chunks = []
120
  current_chunk = ""
121