Spaces:

davidepanza
/

test2text

Sleeping

Davide Panza committed on May 31, 2025

Commit

60ca4cd

verified ·

1 Parent(s): a811010

Update app/backend/text_processing.py

Files changed (1) hide show

app/backend/text_processing.py CHANGED Viewed

@@ -1,50 +1,26 @@
 import streamlit as st
 import os
-import nltk
 import ssl
-# Handle SSL certificate issues
-try:
-    _create_unverified_https_context = ssl._create_unverified_context
-except AttributeError:
-    pass
-else:
-    ssl._create_default_https_context = _create_unverified_https_context
-# Set NLTK data path to a writable directory (override environment)
-nltk_data_dir = '/app/nltk_data'
-if not os.path.exists(nltk_data_dir):
-    os.makedirs(nltk_data_dir, exist_ok=True)
-# Clear existing NLTK data paths and add our custom one
-nltk.data.path.clear()
-nltk.data.path.append(nltk_data_dir)
-# Download NLTK data to the specific directory
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    print("Downloading NLTK punkt data...")
-    nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
-try:
-    nltk.data.find('tokenizers/punkt_tab')
-except LookupError:
-    print("Downloading NLTK punkt_tab data...")
-    nltk.download('punkt_tab', download_dir=nltk_data_dir, quiet=True)
-# Now import the tokenizer
-from nltk.tokenize import sent_tokenize, word_tokenize
-"""
-# Tell NLTK where to look for the punkt tokenizer
-nltk_path = os.path.join(os.getcwd(), "nltk_data")
-nltk.data.path.append(nltk_path)
-from nltk.tokenize import sent_tokenize
-"""
 def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):

 import streamlit as st
 import os
 import ssl
+import re
def sent_tokenize(text, min_length=10):
    """Split *text* into sentences with a regex heuristic (no NLTK needed).

    A sentence boundary is a run of whitespace that follows ``.``, ``!`` or
    ``?`` and is immediately followed by an uppercase letter. Each resulting
    piece is then split on newlines, stripped of surrounding whitespace, and
    kept only if it is strictly longer than *min_length* characters.

    Args:
        text: The string to tokenize.
        min_length: Fragments whose stripped length is not strictly greater
            than this value are discarded. Defaults to 10.

    Returns:
        A list of stripped sentence strings, in their original order.
    """
    # Uppercase is checked with str.isupper() instead of a literal [A-Z]
    # class so that sentences opening with a non-ASCII capital letter
    # (e.g. "È", "Ä", "Я") are recognised as boundaries as well.
    pieces = []
    start = 0
    for gap in re.finditer(r'(?<=[.!?])\s+(?=\S)', text):
        # The (?=\S) lookahead guarantees a character exists at gap.end().
        if text[gap.end()].isupper():
            pieces.append(text[start:gap.start()])
            start = gap.end()
    pieces.append(text[start:])

    # Newlines inside a piece may also indicate sentence breaks.
    stripped = (line.strip() for piece in pieces for line in piece.split('\n'))
    return [sentence for sentence in stripped if len(sentence) > min_length]
 def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):