import nltk
from typing import Dict, List
import json
from datetime import datetime
import heapq

class DocumentSummarizer:
    def __init__(self):
        # Set NLTK data path
        nltk_data_paths = [
            '/usr/local/share/nltk_data',
            '/usr/share/nltk_data',
            '/usr/local/nltk_data',
            '/usr/local/lib/nltk_data',
            '/usr/lib/nltk_data',
            '/root/nltk_data',
            '/home/user/nltk_data',
            '/app/nltk_data'
        ]
        
        # Add all possible NLTK data paths
        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
        
        # Download NLTK data if not found
        try:
            nltk.download('punkt')
            nltk.download('stopwords')
            nltk.download('wordnet')
            nltk.download('averaged_perceptron_tagger')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")
            
        # Maximum chunk size used when splitting text
        self.chunk_size = 1000  # measured in whitespace-delimited tokens
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
            # Fallback to default sent_tokenize
            self.tokenizer = nltk.tokenize.sent_tokenize
        
    def summarize_text(self, text: str) -> Dict:
        """Summarize the given text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)
            
            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
            
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk."""
        try:
            # Preprocess the text
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            
            # Count word frequencies
            word_frequencies = {}
            for word in words:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

            # Guard against chunks that contain only stopwords or punctuation
            if not word_frequencies:
                return ""

            # Find the maximum frequency
            max_frequency = max(word_frequencies.values())

            # Normalize frequencies to the range (0, 1]
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency
            
            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        if sentence not in sentence_scores:
                            sentence_scores[sentence] = freq
                        else:
                            sentence_scores[sentence] += freq
            
            # ์ƒ์œ„ 30%์˜ ๋ฌธ์žฅ ์„ ํƒ
            summary_sentences = heapq.nlargest(
                int(len(sentences) * 0.3),
                sentence_scores,
                key=sentence_scores.get
            )
            
            # ์š”์•ฝ ์ƒ์„ฑ
            return " ".join(summary_sentences)
            
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""
    
    def _split_text(self, text: str) -> List[str]:
        """Split the text into appropriately sized chunks."""
        try:
            # Use the configured tokenizer (either punkt or sent_tokenize)
            if hasattr(self, 'tokenizer') and callable(self.tokenizer):
                if self.tokenizer == nltk.tokenize.sent_tokenize:
                    sentences = nltk.tokenize.sent_tokenize(text)
                else:
                    # Handle the case where tokenizer is a PunktSentenceTokenizer instance
                    sentences = self.tokenizer.tokenize(text)
            else:
                # Fallback to default sentence tokenizer
                nltk.download('punkt')
                sentences = nltk.tokenize.sent_tokenize(text)
            
            chunks = []
            current_chunk = ""
            
            for sentence in sentences:
                if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                    current_chunk = f"{current_chunk} {sentence}".strip()
                else:
                    if current_chunk:  # Only add non-empty chunks
                        chunks.append(current_chunk)
                    current_chunk = sentence
            
            # Add the last chunk if it's not empty
            if current_chunk:
                chunks.append(current_chunk.strip())
            
            return chunks if chunks else [text]  # Return at least one chunk
            
        except LookupError as e:
            # If punkt data is missing, try to download it
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            # Retry once with the default sentence tokenizer instead of
            # reloading the missing punkt pickle
            self.tokenizer = nltk.tokenize.sent_tokenize
            return self._split_text(text)
        except Exception as e:
            print(f"Error in _split_text: {str(e)}")
            # If all else fails, return the original text as a single chunk
            return [text]

# Create a module-level singleton instance
document_summarizer = DocumentSummarizer()
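
# Minimal usage sketch: running the module directly summarizes a short passage
# through the singleton above (the sample text is illustrative only).
if __name__ == "__main__":
    sample_text = (
        "Natural language processing lets computers analyze large volumes of text. "
        "Extractive summarization selects the most informative sentences from a document. "
        "Frequency-based sentence scoring is a simple but useful baseline for this task. "
        "This module wraps that approach in a small, reusable class."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(result["full_summary"])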