import nltk
from typing import Dict, List
import json
from datetime import datetime
import heapq

class DocumentSummarizer:
    def __init__(self):
        # Set NLTK data path
        nltk_data_paths = [
            '/usr/local/share/nltk_data',
            '/usr/share/nltk_data',
            '/usr/local/nltk_data',
            '/usr/local/lib/nltk_data',
            '/usr/lib/nltk_data',
            '/root/nltk_data',
            '/home/user/nltk_data',
            '/app/nltk_data'
        ]
        
        # Add all possible NLTK data paths
        nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
        
        # Download NLTK data if not found
        try:
            nltk.download('punkt')
            nltk.download('stopwords')
            nltk.download('wordnet')
            nltk.download('averaged_perceptron_tagger')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")
            
        # Maximum chunk size used when splitting text
        self.chunk_size = 1000  # measured in whitespace-delimited tokens
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load punkt tokenizer: {str(e)}")
            # Fallback to default sent_tokenize
            self.tokenizer = nltk.tokenize.sent_tokenize
        
    def summarize_text(self, text: str) -> Dict:
        """Summarize the given text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)
            
            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
            
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk."""
        try:
            # Preprocess the text
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            
            # Count word frequencies
            word_frequencies = {}
            for word in words:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

            # Guard against chunks that contain only stopwords or punctuation
            if not word_frequencies:
                return ""

            # Find the maximum frequency
            max_frequency = max(word_frequencies.values())

            # Normalize frequencies to the range (0, 1]
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency
            
            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        if sentence not in sentence_scores:
                            sentence_scores[sentence] = freq
                        else:
                            sentence_scores[sentence] += freq
            
            # ์ƒ์œ„ 30%์˜ ๋ฌธ์žฅ ์„ ํƒ
            summary_sentences = heapq.nlargest(
                int(len(sentences) * 0.3),
                sentence_scores,
                key=sentence_scores.get
            )
            
            # ์š”์•ฝ ์ƒ์„ฑ
            return " ".join(summary_sentences)
            
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""
    
    def _split_text(self, text: str) -> List[str]:
        """Split the text into appropriately sized chunks."""
        try:
            # Use the configured tokenizer (either punkt or sent_tokenize)
            if hasattr(self, 'tokenizer') and callable(self.tokenizer):
                if self.tokenizer == nltk.tokenize.sent_tokenize:
                    sentences = nltk.tokenize.sent_tokenize(text)
                else:
                    # Handle the case where tokenizer is a PunktSentenceTokenizer instance
                    sentences = self.tokenizer.tokenize(text)
            else:
                # Fallback to default sentence tokenizer
                nltk.download('punkt')
                sentences = nltk.tokenize.sent_tokenize(text)
            
            chunks = []
            current_chunk = ""
            
            for sentence in sentences:
                if len(current_chunk.split()) + len(sentence.split()) <= self.chunk_size:
                    current_chunk = f"{current_chunk} {sentence}".strip()
                else:
                    if current_chunk:  # Only add non-empty chunks
                        chunks.append(current_chunk)
                    current_chunk = sentence
            
            # Add the last chunk if it's not empty
            if current_chunk:
                chunks.append(current_chunk.strip())
            
            return chunks if chunks else [text]  # Return at least one chunk
            
        except LookupError as e:
            # If punkt data is missing, try to download it
            print(f"NLTK data missing, attempting to download: {e}")
            nltk.download('punkt')
            # Retry once with the default sentence tokenizer instead of
            # reloading the missing punkt pickle
            self.tokenizer = nltk.tokenize.sent_tokenize
            return self._split_text(text)
        except Exception as e:
            print(f"Error in _split_text: {str(e)}")
            # If all else fails, return the original text as a single chunk
            return [text]

# Create a module-level singleton instance
document_summarizer = DocumentSummarizer()
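
# Minimal usage sketch: running the module directly summarizes a short passage
# through the singleton above (the sample text is illustrative only).
if __name__ == "__main__":
    sample_text = (
        "Natural language processing lets computers analyze large volumes of text. "
        "Extractive summarization selects the most informative sentences from a document. "
        "Frequency-based sentence scoring is a simple but useful baseline for this task. "
        "This module wraps that approach in a small, reusable class."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(result["full_summary"])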