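"""Frequency-based extractive summarizer for the ParseAI document processor.

Splits input text into character-bounded chunks along sentence boundaries,
scores each sentence by the normalized frequency of its non-stopword tokens,
and keeps the top-scoring sentences of every chunk as its summary.
"""
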
import heapq
from datetime import datetime
from typing import Dict, List

import nltk


class DocumentSummarizer:
    def __init__(self):
        # Download the required NLTK data to a writable location
        try:
            nltk.download('punkt', download_dir='/app/nltk_data')
            nltk.download('stopwords', download_dir='/app/nltk_data')
            nltk.data.path.append('/app/nltk_data')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")

        # Chunk size used by _split_text (measured in characters, not tokens)
        self.chunk_size = 1000

        # Load the Punkt sentence tokenizer, falling back to sent_tokenize;
        # the methods below call nltk.sent_tokenize directly
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load tokenizer: {str(e)}")
            self.tokenizer = nltk.tokenize.sent_tokenize

    def summarize_text(self, text: str) -> Dict:
        """Summarize a text by splitting it into chunks and summarizing each one."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)

            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual chunk with frequency-based extractive scoring."""
        try:
            # Preprocess the text
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            if not words:
                return ""

            # Count word frequencies
            word_frequencies = {}
            for word in words:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

            # Normalize frequencies by the maximum frequency
            max_frequency = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency

            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq

            # Select the top 30% of sentences (at least one), in descending score order
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )

            # Join the selected sentences into the summary
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""

    def _split_text(self, text: str) -> List[str]:
        """Split text into chunks of roughly self.chunk_size characters along sentence boundaries."""
        try:
            sentences = nltk.sent_tokenize(text)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk) + len(sentence) <= self.chunk_size:
                    current_chunk += " " + sentence
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks
        except Exception as e:
            raise Exception(f"Error while splitting text: {str(e)}")

# Singleton instance shared by the application
document_summarizer = DocumentSummarizer()
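

# A minimal usage sketch (illustrative, not part of the original app wiring):
# the sample text below is made up; any multi-sentence English string works.
if __name__ == "__main__":
    sample_text = (
        "Natural language processing is a field of computer science. "
        "It studies the interaction between computers and human language. "
        "Extractive summarization selects the most informative sentences from a text. "
        "Frequency-based scoring is one simple way to rank those sentences. "
        "This module implements that approach with NLTK."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(result["timestamp"])
    print(result["full_summary"])
    print(f"{len(result['chunk_summaries'])} chunk summary(ies) generated")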