Upload preprocess.py
Browse files- preprocess.py +10 -0
preprocess.py
CHANGED
|
@@ -20,6 +20,16 @@ try:
|
|
| 20 |
except Exception as e:
|
| 21 |
logger.warning(f"NLTK data download failed: {e}")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def get_tokenizer_wrapper():
|
| 24 |
try:
|
| 25 |
tokenizer = get_tokenizer("bert-base-uncased")
|
|
|
|
| 20 |
except Exception as e:
|
| 21 |
logger.warning(f"NLTK data download failed: {e}")
|
| 22 |
|
| 23 |
+
# Guarded NLTK downloads
|
| 24 |
+
if hasattr(nltk, "download"):
|
| 25 |
+
try:
|
| 26 |
+
nltk.download('punkt', quiet=True)
|
| 27 |
+
nltk.download('averaged_perceptron_tagger', quiet=True)
|
| 28 |
+
except Exception as e:
|
| 29 |
+
logger.warning(f"NLTK download failed: {e}")
|
| 30 |
+
else:
|
| 31 |
+
logger.warning("NLTK.download not available; skipping corpus downloads")
|
| 32 |
+
|
| 33 |
def get_tokenizer_wrapper():
|
| 34 |
try:
|
| 35 |
tokenizer = get_tokenizer("bert-base-uncased")
|