Spaces:

bluewhale2025
/

parseai-document-processor

Build error

bluewhale2025 committed on May 23, 2025

Commit

454f21d

1 Parent(s): 23e4091

Fix NLTK data paths and tokenizer loading

Files changed (1) hide show

summarizer.py CHANGED Viewed

@@ -6,11 +6,27 @@ import heapq
 class DocumentSummarizer:
     def __init__(self):
-        # NLTK 다운로드
         try:
-            nltk.download('punkt', download_dir='/app/nltk_data')
-            nltk.download('stopwords', download_dir='/app/nltk_data')
-            nltk.data.path.append('/app/nltk_data')
         except Exception as e:
             print(f"Warning: NLTK data download failed: {str(e)}")
@@ -19,7 +35,8 @@ class DocumentSummarizer:
         try:
             self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
         except Exception as e:
-            print(f"Warning: Failed to load tokenizer: {str(e)}")
             self.tokenizer = nltk.tokenize.sent_tokenize
     def summarize_text(self, text: str) -> Dict:

 class DocumentSummarizer:
     def __init__(self):
+        # Set NLTK data path
+        nltk_data_paths = [
+            '/usr/local/share/nltk_data',
+            '/usr/share/nltk_data',
+            '/usr/local/nltk_data',
+            '/usr/local/lib/nltk_data',
+            '/usr/lib/nltk_data',
+            '/root/nltk_data',
+            '/home/user/nltk_data',
+            '/app/nltk_data'
+        ]
+        # Add all possible NLTK data paths
+        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
+        # Download NLTK data if not found
         try:
+            nltk.download('punkt')
+            nltk.download('stopwords')
+            nltk.download('wordnet')
+            nltk.download('averaged_perceptron_tagger')
         except Exception as e:
             print(f"Warning: NLTK data download failed: {str(e)}")
         try:
             self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
         except Exception as e:
+            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
+            # Fallback to default sent_tokenize
             self.tokenizer = nltk.tokenize.sent_tokenize
     def summarize_text(self, text: str) -> Dict: