Spaces:
Running
Running
Update create_granular_chunks.py
Browse files
create_granular_chunks.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import re
|
|
@@ -6,6 +7,7 @@ import nltk
|
|
| 6 |
|
| 7 |
# Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
|
| 8 |
nltk.download('punkt')
|
|
|
|
| 9 |
|
| 10 |
# --- Configuration ---
|
| 11 |
INPUT_FILE = "combined_context.jsonl"
|
|
@@ -112,7 +114,8 @@ def split_text_into_chunks(text: str, max_char_length: int = 1500, overlap: int
|
|
| 112 |
if len(text) <= max_char_length:
|
| 113 |
return [text]
|
| 114 |
|
| 115 |
-
|
|
|
|
| 116 |
chunks = []
|
| 117 |
current_chunk = ""
|
| 118 |
|
|
|
|
| 1 |
+
# create_granular_chunks.py
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
import re
|
|
|
|
| 7 |
|
| 8 |
# Download punkt tokenizer if not already done (Ensure this runs once in your environment setup)
|
| 9 |
nltk.download('punkt')
|
| 10 |
+
nltk.download('punkt_tab') # Also download punkt_tab to avoid LookupError
|
| 11 |
|
| 12 |
# --- Configuration ---
|
| 13 |
INPUT_FILE = "combined_context.jsonl"
|
|
|
|
| 114 |
if len(text) <= max_char_length:
|
| 115 |
return [text]
|
| 116 |
|
| 117 |
+
# Pass the language explicitly; the English punkt_tab data downloaded above must be available, otherwise sent_tokenize raises LookupError
|
| 118 |
+
sentences = nltk.tokenize.sent_tokenize(text, language='english')
|
| 119 |
chunks = []
|
| 120 |
current_chunk = ""
|
| 121 |
|