bluewhale2025 committed on
Commit
6bd8ed8
·
1 Parent(s): 29ad632

Improve text splitting and error handling in summarizer

Browse files
Files changed (1) hide show
  1. summarizer.py +27 -6
summarizer.py CHANGED
@@ -114,24 +114,45 @@ class DocumentSummarizer:
114
  def _split_text(self, text: str) -> List[str]:
115
  """ํ…์ŠคํŠธ๋ฅผ ์ ์ ˆํ•œ ํฌ๊ธฐ๋กœ ๋ถ„ํ• """
116
  try:
117
- sentences = nltk.sent_tokenize(text)
 
 
 
 
 
 
 
 
 
 
 
118
  chunks = []
119
  current_chunk = ""
120
 
121
  for sentence in sentences:
122
- if len(current_chunk) + len(sentence) <= self.chunk_size:
123
- current_chunk += " " + sentence
124
  else:
125
- chunks.append(current_chunk.strip())
 
126
  current_chunk = sentence
127
 
 
128
  if current_chunk:
129
  chunks.append(current_chunk.strip())
130
 
131
- return chunks
132
 
 
 
 
 
 
 
133
  except Exception as e:
134
- raise Exception(f"ํ…์ŠคํŠธ ๋ถ„ํ•  ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
 
 
135
 
136
# Module-level singleton instance shared by all importers of this module.
document_summarizer = DocumentSummarizer()
 
114
  def _split_text(self, text: str) -> List[str]:
115
  """ํ…์ŠคํŠธ๋ฅผ ์ ์ ˆํ•œ ํฌ๊ธฐ๋กœ ๋ถ„ํ• """
116
  try:
117
+ # Use the configured tokenizer (either punkt or sent_tokenize)
118
+ if hasattr(self, 'tokenizer') and callable(self.tokenizer):
119
+ if self.tokenizer == nltk.tokenize.sent_tokenize:
120
+ sentences = nltk.tokenize.sent_tokenize(text)
121
+ else:
122
+ # Handle the case where tokenizer is a PunktSentenceTokenizer instance
123
+ sentences = self.tokenizer.tokenize(text)
124
+ else:
125
+ # Fallback to default sentence tokenizer
126
+ nltk.download('punkt')
127
+ sentences = nltk.tokenize.sent_tokenize(text)
128
+
129
  chunks = []
130
  current_chunk = ""
131
 
132
  for sentence in sentences:
133
+ if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
134
+ current_chunk = f"{current_chunk} {sentence}".strip()
135
  else:
136
+ if current_chunk: # Only add non-empty chunks
137
+ chunks.append(current_chunk)
138
  current_chunk = sentence
139
 
140
+ # Add the last chunk if it's not empty
141
  if current_chunk:
142
  chunks.append(current_chunk.strip())
143
 
144
+ return chunks if chunks else [text] # Return at least one chunk
145
 
146
+ except LookupError as e:
147
+ # If punkt data is missing, try to download it
148
+ print(f"NLTK data missing, attempting to download: {e}")
149
+ nltk.download('punkt')
150
+ # Retry with the default tokenizer
151
+ return self._split_text(text)
152
  except Exception as e:
153
+ print(f"Error in _split_text: {str(e)}")
154
+ # If all else fails, return the original text as a single chunk
155
+ return [text]
156
 
157
# Module-level singleton instance shared by all importers of this module.
document_summarizer = DocumentSummarizer()