Davide Panza committed on
Commit
60ca4cd
·
verified ·
1 Parent(s): a811010

Update app/backend/text_processing.py

Browse files
Files changed (1) hide show
  1. app/backend/text_processing.py +17 -41
app/backend/text_processing.py CHANGED
@@ -1,50 +1,26 @@
1
 
2
  import streamlit as st
3
  import os
4
- import nltk
5
  import ssl
 
6
 
7
 
8
- # Handle SSL certificate issues
9
- try:
10
- _create_unverified_https_context = ssl._create_unverified_context
11
- except AttributeError:
12
- pass
13
- else:
14
- ssl._create_default_https_context = _create_unverified_https_context
15
-
16
- # Set NLTK data path to a writable directory (override environment)
17
- nltk_data_dir = '/app/nltk_data'
18
- if not os.path.exists(nltk_data_dir):
19
- os.makedirs(nltk_data_dir, exist_ok=True)
20
-
21
- # Clear existing NLTK data paths and add our custom one
22
- nltk.data.path.clear()
23
- nltk.data.path.append(nltk_data_dir)
24
-
25
- # Download NLTK data to the specific directory
26
- try:
27
- nltk.data.find('tokenizers/punkt')
28
- except LookupError:
29
- print("Downloading NLTK punkt data...")
30
- nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
31
-
32
- try:
33
- nltk.data.find('tokenizers/punkt_tab')
34
- except LookupError:
35
- print("Downloading NLTK punkt_tab data...")
36
- nltk.download('punkt_tab', download_dir=nltk_data_dir, quiet=True)
37
-
38
- # Now import the tokenizer
39
- from nltk.tokenize import sent_tokenize, word_tokenize
40
-
41
-
42
- """
43
- # Tell NLTK where to look for the punkt tokenizer
44
- nltk_path = os.path.join(os.getcwd(), "nltk_data")
45
- nltk.data.path.append(nltk_path)
46
- from nltk.tokenize import sent_tokenize
47
- """
48
 
49
 
50
  def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
 
1
 
2
  import streamlit as st
3
  import os
 
4
  import ssl
5
+ import re
6
 
7
 
8
# Whitespace run that directly follows sentence-ending punctuation (. ! ?).
_SENTENCE_GAP = re.compile(r'(?<=[.!?])\s+')


def sent_tokenize(text, *, min_length=10):
    """Split text into sentences with a regex heuristic (no NLTK needed).

    A sentence boundary is a run of whitespace that follows '.', '!' or '?'
    and is itself followed by an uppercase letter. The uppercase test uses
    ``str.isupper()``, so non-ASCII capitals (e.g. 'É', 'Ü') also start a
    new sentence. Each piece is then split on newlines, stripped, and kept
    only if it is longer than ``min_length`` characters.

    Args:
        text: The string to split. ``None`` or an empty string yields ``[]``.
        min_length: Fragments with this many characters or fewer are
            dropped. Keyword-only; defaults to 10.

    Returns:
        A list of stripped sentence strings in their original order.
    """
    if not text:
        return []

    # Cut the text at every gap that is followed by an uppercase letter.
    pieces = []
    start = 0
    for gap in _SENTENCE_GAP.finditer(text):
        following = gap.end()
        if following < len(text) and text[following].isupper():
            pieces.append(text[start:gap.start()])
            start = following
    pieces.append(text[start:])

    result = []
    for piece in pieces:
        # Newlines inside a piece are treated as additional sentence breaks.
        for fragment in piece.split('\n'):
            fragment = fragment.strip()
            if len(fragment) > min_length:  # Filter very short sentences
                result.append(fragment)

    return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
 
26
  def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):