bluewhale2025 committed on
Commit
6bd8ed8
·
1 Parent(s): 29ad632

Improve text splitting and error handling in summarizer

Browse files
Files changed (1) hide show
  1. summarizer.py +27 -6
summarizer.py CHANGED
@@ -114,24 +114,45 @@ class DocumentSummarizer:
114
  def _split_text(self, text: str) -> List[str]:
115
  """ํ…์ŠคํŠธ๋ฅผ ์ ์ ˆํ•œ ํฌ๊ธฐ๋กœ ๋ถ„ํ• """
116
  try:
117
- sentences = nltk.sent_tokenize(text)
 
 
 
 
 
 
 
 
 
 
 
118
  chunks = []
119
  current_chunk = ""
120
 
121
  for sentence in sentences:
122
- if len(current_chunk) + len(sentence) <= self.chunk_size:
123
- current_chunk += " " + sentence
124
  else:
125
- chunks.append(current_chunk.strip())
 
126
  current_chunk = sentence
127
 
 
128
  if current_chunk:
129
  chunks.append(current_chunk.strip())
130
 
131
- return chunks
132
 
 
 
 
 
 
 
133
  except Exception as e:
134
- raise Exception(f"ํ…์ŠคํŠธ ๋ถ„ํ•  ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
 
 
135
 
136
# Module-level singleton instance shared by all importers of this module.
document_summarizer = DocumentSummarizer()
 
114
  def _split_text(self, text: str) -> List[str]:
115
  """ํ…์ŠคํŠธ๋ฅผ ์ ์ ˆํ•œ ํฌ๊ธฐ๋กœ ๋ถ„ํ• """
116
  try:
117
+ # Use the configured tokenizer (either punkt or sent_tokenize)
118
+ if hasattr(self, 'tokenizer') and callable(self.tokenizer):
119
+ if self.tokenizer == nltk.tokenize.sent_tokenize:
120
+ sentences = nltk.tokenize.sent_tokenize(text)
121
+ else:
122
+ # Handle the case where tokenizer is a PunktSentenceTokenizer instance
123
+ sentences = self.tokenizer.tokenize(text)
124
+ else:
125
+ # Fallback to default sentence tokenizer
126
+ nltk.download('punkt')
127
+ sentences = nltk.tokenize.sent_tokenize(text)
128
+
129
  chunks = []
130
  current_chunk = ""
131
 
132
  for sentence in sentences:
133
+ if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
134
+ current_chunk = f"{current_chunk} {sentence}".strip()
135
  else:
136
+ if current_chunk: # Only add non-empty chunks
137
+ chunks.append(current_chunk)
138
  current_chunk = sentence
139
 
140
+ # Add the last chunk if it's not empty
141
  if current_chunk:
142
  chunks.append(current_chunk.strip())
143
 
144
+ return chunks if chunks else [text] # Return at least one chunk
145
 
146
+ except LookupError as e:
147
+ # If punkt data is missing, try to download it
148
+ print(f"NLTK data missing, attempting to download: {e}")
149
+ nltk.download('punkt')
150
+ # Retry with the default tokenizer
151
+ return self._split_text(text)
152
  except Exception as e:
153
+ print(f"Error in _split_text: {str(e)}")
154
+ # If all else fails, return the original text as a single chunk
155
+ return [text]
156
 
157
# Module-level singleton instance shared by all importers of this module.
document_summarizer = DocumentSummarizer()