Davide Panza committed on
Commit
0ef6552
·
verified ·
1 Parent(s): 2867943

Update app/backend/text_processing.py

Browse files
Files changed (1) hide show
  1. app/backend/text_processing.py +29 -1
app/backend/text_processing.py CHANGED
@@ -2,12 +2,40 @@
2
  import streamlit as st
3
  import os
4
  import nltk
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  # Tell NLTK where to look for the punkt tokenizer
7
  nltk_path = os.path.join(os.getcwd(), "nltk_data")
8
  nltk.data.path.append(nltk_path)
9
-
10
  from nltk.tokenize import sent_tokenize
 
 
11
 
12
  def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
13
  """
 
2
  import streamlit as st
3
  import os
4
  import nltk
5
+ import ssl
6
 
7
+ # Handle SSL certificate issues and download NLTK data
8
+ try:
9
+ _create_unverified_https_context = ssl._create_unverified_context
10
+ except AttributeError:
11
+ pass
12
+ else:
13
+ ssl._create_default_https_context = _create_unverified_https_context
14
+
15
+ # Download NLTK data if not present
16
+ try:
17
+ nltk.data.find('tokenizers/punkt')
18
+ except LookupError:
19
+ print("Downloading NLTK punkt data...")
20
+ nltk.download('punkt', quiet=True)
21
+
22
+ try:
23
+ nltk.data.find('tokenizers/punkt_tab')
24
+ except LookupError:
25
+ print("Downloading NLTK punkt_tab data...")
26
+ nltk.download('punkt_tab', quiet=True)
27
+
28
+ # Now import the tokenizer
29
+ from nltk.tokenize import sent_tokenize, word_tokenize
30
+
31
+
32
+ """
33
  # Tell NLTK where to look for the punkt tokenizer
34
  nltk_path = os.path.join(os.getcwd(), "nltk_data")
35
  nltk.data.path.append(nltk_path)
 
36
  from nltk.tokenize import sent_tokenize
37
+ """
38
+
39
 
40
  def text_chunking(text, max_words=750, min_words=400, overlap_sentences=5):
41
  """