bluewhale2025 committed on
Commit
454f21d
·
1 Parent(s): 23e4091

Fix NLTK data paths and tokenizer loading

Browse files
Files changed (1) hide show
  1. summarizer.py +22 -5
summarizer.py CHANGED
@@ -6,11 +6,27 @@ import heapq
6
 
7
  class DocumentSummarizer:
8
  def __init__(self):
9
- # NLTK 다운로드
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  try:
11
- nltk.download('punkt', download_dir='/app/nltk_data')
12
- nltk.download('stopwords', download_dir='/app/nltk_data')
13
- nltk.data.path.append('/app/nltk_data')
 
14
  except Exception as e:
15
  print(f"Warning: NLTK data download failed: {str(e)}")
16
 
@@ -19,7 +35,8 @@ class DocumentSummarizer:
19
  try:
20
  self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
21
  except Exception as e:
22
- print(f"Warning: Failed to load tokenizer: {str(e)}")
 
23
  self.tokenizer = nltk.tokenize.sent_tokenize
24
 
25
  def summarize_text(self, text: str) -> Dict:
 
6
 
7
  class DocumentSummarizer:
8
  def __init__(self):
9
+ # Set NLTK data path
10
+ nltk_data_paths = [
11
+ '/usr/local/share/nltk_data',
12
+ '/usr/share/nltk_data',
13
+ '/usr/local/nltk_data',
14
+ '/usr/local/lib/nltk_data',
15
+ '/usr/lib/nltk_data',
16
+ '/root/nltk_data',
17
+ '/home/user/nltk_data',
18
+ '/app/nltk_data'
19
+ ]
20
+
21
+ # Prepend the candidate directories to NLTK's search path, dropping duplicates while preserving order
22
+ nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
23
+
24
+ # Request the required NLTK packages; a failure here is only reported as a warning
25
  try:
26
+ nltk.download('punkt')
27
+ nltk.download('stopwords')
28
+ nltk.download('wordnet')
29
+ nltk.download('averaged_perceptron_tagger')
30
  except Exception as e:
31
  print(f"Warning: NLTK data download failed: {str(e)}")
32
 
 
35
  try:
36
  self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
37
  except Exception as e:
38
+ print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
39
+ # Fallback to default sent_tokenize
40
  self.tokenizer = nltk.tokenize.sent_tokenize
41
 
42
  def summarize_text(self, text: str) -> Dict: