Spaces:

davidepanza
/

test2text

Sleeping

Davide Panza committed on May 31, 2025

Commit

0ef6552

verified ·

1 Parent(s): 2867943

Update app/backend/text_processing.py

Files changed (1) hide show

app/backend/text_processing.py CHANGED Viewed

@@ -2,12 +2,40 @@
 import streamlit as st
 import os
 import nltk
 # Tell NLTK where to look for the punkt tokenizer
 nltk_path = os.path.join(os.getcwd(), "nltk_data")
 nltk.data.path.append(nltk_path)
 from nltk.tokenize import sent_tokenize
 def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
     """

 import streamlit as st
 import os
 import nltk
+import ssl
+# Work around SSL certificate errors by disabling HTTPS certificate verification (insecure), so the NLTK data downloads below can succeed
+try:
+    _create_unverified_https_context = ssl._create_unverified_context
+except AttributeError:
+    pass
+else:
+    ssl._create_default_https_context = _create_unverified_https_context
+# Download NLTK data if not present
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    print("Downloading NLTK punkt data...")
+    nltk.download('punkt', quiet=True)
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    print("Downloading NLTK punkt_tab data...")
+    nltk.download('punkt_tab', quiet=True)
+# Now import the tokenizer
+from nltk.tokenize import sent_tokenize, word_tokenize
+"""
 # Tell NLTK where to look for the punkt tokenizer
 nltk_path = os.path.join(os.getcwd(), "nltk_data")
 nltk.data.path.append(nltk_path)
 from nltk.tokenize import sent_tokenize
+"""
 def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
     """