import nltk
from typing import Dict, List
import json
from datetime import datetime
import heapq


class DocumentSummarizer:
    """Frequency-based extractive summarizer built on NLTK."""

    def __init__(self):
        # Candidate NLTK data locations
        nltk_data_paths = [
            '/usr/local/share/nltk_data',
            '/usr/share/nltk_data',
            '/usr/local/nltk_data',
            '/usr/local/lib/nltk_data',
            '/usr/lib/nltk_data',
            '/root/nltk_data',
            '/home/user/nltk_data',
            '/app/nltk_data'
        ]
        # Prepend all candidate NLTK data paths, preserving order and removing duplicates
        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))

        # Download NLTK data if not already present
        try:
            nltk.download('punkt')
            nltk.download('stopwords')
            nltk.download('wordnet')
            nltk.download('averaged_perceptron_tagger')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")

        # Chunk size for splitting text, measured in whitespace-separated words
        self.chunk_size = 1000

        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
            # Fall back to the default sent_tokenize function
            self.tokenizer = nltk.tokenize.sent_tokenize

    def summarize_text(self, text: str) -> Dict:
        """Summarize the given text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)

            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk."""
        try:
            # Preprocess the text
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]

            # Compute word frequencies
            word_frequencies = {}
            for word in words:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

            # Guard against chunks that contain no scorable words or sentences
            if not word_frequencies or not sentences:
                return ""

            # Find the maximum frequency
            max_frequency = max(word_frequencies.values())

            # Normalize frequencies
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency

            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        if sentence not in sentence_scores:
                            sentence_scores[sentence] = freq
                        else:
                            sentence_scores[sentence] += freq

            # Select the top 30% of sentences (at least one)
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )

            # Build the summary
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""

    def _split_text(self, text: str) -> List[str]:
        """Split the text into appropriately sized chunks."""
        try:
            # Use the configured tokenizer (punkt tokenizer instance or sent_tokenize)
            if hasattr(self, 'tokenizer'):
                if hasattr(self.tokenizer, 'tokenize'):
                    # PunktSentenceTokenizer instance loaded in __init__
                    sentences = self.tokenizer.tokenize(text)
                else:
                    # Plain sent_tokenize function fallback
                    sentences = self.tokenizer(text)
            else:
                # Fallback to the default sentence tokenizer
                nltk.download('punkt')
                sentences = nltk.tokenize.sent_tokenize(text)

            chunks = []
            current_chunk = ""

            for sentence in sentences:
                if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                    current_chunk = f"{current_chunk} {sentence}".strip()
                else:
                    if current_chunk:  # Only add non-empty chunks
                        chunks.append(current_chunk)
                    current_chunk = sentence

            # Add the last chunk if it's not empty
            if current_chunk:
                chunks.append(current_chunk.strip())

            return chunks if chunks else [text]  # Return at least one chunk
        except LookupError as e:
            # If punkt data is missing, try to download it
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            # Retry with the default tokenizer
            return self._split_text(text)
        except Exception as e:
            print(f"Error in _split_text: {str(e)}")
            # If all else fails, return the original text as a single chunk
            return [text]


# Create a singleton instance
document_summarizer = DocumentSummarizer()
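# Minimal usage sketch (illustrative only): run the module directly to see the
# summarizer in action. The sample text below is an arbitrary placeholder and
# not part of the original module; any English text works the same way.
if __name__ == "__main__":
    sample_text = (
        "Natural language processing enables computers to analyze human language. "
        "Extractive summarization selects the most informative sentences from a document. "
        "Frequency-based scoring is a simple but effective baseline for this task. "
        "Longer documents are split into chunks so each part can be summarized separately."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(json.dumps(result, ensure_ascii=False, indent=2))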