Spaces:
Build error
Build error
Commit
·
6bd8ed8
1
Parent(s):
29ad632
Improve text splitting and error handling in summarizer
Browse files- summarizer.py +27 -6
summarizer.py
CHANGED
|
@@ -114,24 +114,45 @@ class DocumentSummarizer:
|
|
| 114 |
def _split_text(self, text: str) -> List[str]:
|
| 115 |
"""텍스트를 적절한 크기로 분할"""
|
| 116 |
try:
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
chunks = []
|
| 119 |
current_chunk = ""
|
| 120 |
|
| 121 |
for sentence in sentences:
|
| 122 |
-
if len(current_chunk) + len(sentence) <= self.chunk_size:
|
| 123 |
-
current_chunk
|
| 124 |
else:
|
| 125 |
-
chunks
|
|
|
|
| 126 |
current_chunk = sentence
|
| 127 |
|
|
|
|
| 128 |
if current_chunk:
|
| 129 |
chunks.append(current_chunk.strip())
|
| 130 |
|
| 131 |
-
return chunks
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
except Exception as e:
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# 싱글톤 인스턴스 생성
|
| 137 |
document_summarizer = DocumentSummarizer()
|
|
|
|
| 114 |
def _split_text(self, text: str, _retry: bool = True) -> List[str]:
    """Split *text* into chunks of at most ``self.chunk_size`` words.

    Sentences come from ``self.tokenizer`` when one is configured — either
    ``nltk.tokenize.sent_tokenize`` itself or an object exposing a
    ``tokenize`` method (e.g. a ``PunktSentenceTokenizer`` instance).
    Otherwise the default NLTK sentence tokenizer is used, downloading the
    ``punkt`` model on demand.

    Args:
        text: The text to split into chunks.
        _retry: Internal guard. ``True`` on the first call; set to ``False``
            on the single retry after a ``LookupError`` so that a failed
            ``punkt`` download cannot cause unbounded recursion.

    Returns:
        A non-empty list of chunks. On unrecoverable errors the original
        text is returned as a single chunk.
    """
    try:
        # Use the configured tokenizer (either punkt or sent_tokenize).
        if hasattr(self, 'tokenizer') and callable(self.tokenizer):
            # Identity check: sent_tokenize is a function singleton,
            # so `is` is the correct comparison (was `==`).
            if self.tokenizer is nltk.tokenize.sent_tokenize:
                sentences = nltk.tokenize.sent_tokenize(text)
            else:
                # Tokenizer is a PunktSentenceTokenizer-like instance.
                sentences = self.tokenizer.tokenize(text)
        else:
            # Fallback to the default sentence tokenizer.
            nltk.download('punkt')
            sentences = nltk.tokenize.sent_tokenize(text)

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # chunk_size is measured in whitespace-separated words,
            # not characters.
            if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                current_chunk = f"{current_chunk} {sentence}".strip()
            else:
                if current_chunk:  # Only add non-empty chunks
                    chunks.append(current_chunk)
                current_chunk = sentence

        # Add the last chunk if it's not empty
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks if chunks else [text]  # Return at least one chunk

    except LookupError as e:
        # punkt data is missing: download it and retry exactly once.
        # The _retry guard fixes the original unbounded recursion: if the
        # download fails (e.g. no network) the retry would raise
        # LookupError again and recurse forever.
        if _retry:
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            return self._split_text(text, _retry=False)
        print(f"NLTK data still unavailable after download attempt: {e}")
        return [text]

    except Exception as e:
        print(f"Error in _split_text: {str(e)}")
        # If all else fails, return the original text as a single chunk
        return [text]
|
| 156 |
|
| 157 |
# Module-level singleton instance shared by importers of this module
# (original comment, repaired from mojibake: "싱글톤 인스턴스 생성" —
# "create singleton instance").
document_summarizer = DocumentSummarizer()
|