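"""Frequency-based extractive summarizer for the ParseAI document processor.

Splits input text into character-bounded chunks along sentence boundaries,
scores each sentence by the normalized frequency of its non-stopword tokens,
and keeps the top-scoring sentences of every chunk as its summary.
"""
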
import heapq
from datetime import datetime
from typing import Dict, List

import nltk


class DocumentSummarizer:
    def __init__(self):
        # Download the required NLTK data to a writable location
        try:
            nltk.download('punkt', download_dir='/app/nltk_data')
            nltk.download('stopwords', download_dir='/app/nltk_data')
            nltk.data.path.append('/app/nltk_data')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")

        # Chunk size used by _split_text (measured in characters, not tokens)
        self.chunk_size = 1000

        # Load the Punkt sentence tokenizer, falling back to sent_tokenize;
        # the methods below call nltk.sent_tokenize directly
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load tokenizer: {str(e)}")
            self.tokenizer = nltk.tokenize.sent_tokenize

    def summarize_text(self, text: str) -> Dict:
        """Summarize a text by splitting it into chunks and summarizing each one."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)

            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual chunk with frequency-based extractive scoring."""
        try:
            # Preprocess the text
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            if not words:
                return ""

            # Count word frequencies
            word_frequencies = {}
            for word in words:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

            # Normalize frequencies by the maximum frequency
            max_frequency = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency

            # Score each sentence by the normalized frequencies of the words it contains
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq

            # Select the top 30% of sentences (at least one), in descending score order
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )

            # Join the selected sentences into the summary
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""

    def _split_text(self, text: str) -> List[str]:
        """Split text into chunks of roughly self.chunk_size characters along sentence boundaries."""
        try:
            sentences = nltk.sent_tokenize(text)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk) + len(sentence) <= self.chunk_size:
                    current_chunk += " " + sentence
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks
        except Exception as e:
            raise Exception(f"Error while splitting text: {str(e)}")

# Singleton instance shared by the application
document_summarizer = DocumentSummarizer()
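

# A minimal usage sketch (illustrative, not part of the original app wiring):
# the sample text below is made up; any multi-sentence English string works.
if __name__ == "__main__":
    sample_text = (
        "Natural language processing is a field of computer science. "
        "It studies the interaction between computers and human language. "
        "Extractive summarization selects the most informative sentences from a text. "
        "Frequency-based scoring is one simple way to rank those sentences. "
        "This module implements that approach with NLTK."
    )
    result = document_summarizer.summarize_text(sample_text)
    print(result["timestamp"])
    print(result["full_summary"])
    print(f"{len(result['chunk_summaries'])} chunk summary(ies) generated")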