```python
import heapq
from datetime import datetime
from typing import Dict, List

import nltk


class DocumentSummarizer:
    def __init__(self):
        # Download NLTK data into a container-local directory
        try:
            nltk.download('punkt', download_dir='/app/nltk_data')
            nltk.download('stopwords', download_dir='/app/nltk_data')
            nltk.data.path.append('/app/nltk_data')
        except Exception as e:
            print(f"Warning: NLTK data download failed: {str(e)}")

        # Chunk size for text splitting (measured in characters, not tokens)
        self.chunk_size = 1000

        # Prefer the pretrained punkt sentence tokenizer; fall back to the
        # generic sent_tokenize function if it cannot be loaded
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except Exception as e:
            print(f"Warning: Failed to load tokenizer: {str(e)}")
            self.tokenizer = nltk.tokenize.sent_tokenize

    def summarize_text(self, text: str) -> Dict:
        """Summarize a text."""
        try:
            # Split the text into chunks
            chunks = self._split_text(text)

            # Generate a summary for each chunk
            summaries = []
            for chunk in chunks:
                summary = self._summarize_chunk(chunk)
                if summary:
                    summaries.append(summary)

            return {
                "timestamp": datetime.now().isoformat(),
                "full_summary": " ".join(summaries),
                "chunk_summaries": summaries
            }
        except Exception as e:
            raise Exception(f"Error while generating summary: {str(e)}")

    def _summarize_chunk(self, text: str) -> str:
        """Summarize an individual text chunk."""
        try:
            # Preprocess: lowercase word tokens plus sentence segmentation
            words = nltk.word_tokenize(text.lower())
            sentences = nltk.sent_tokenize(text)

            # Remove stopwords and non-alphanumeric tokens
            stop_words = set(nltk.corpus.stopwords.words('english'))
            words = [word for word in words if word.isalnum() and word not in stop_words]
            if not words:
                return ""

            # Count word frequencies
            word_frequencies = {}
            for word in words:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

            # Normalize frequencies by the maximum frequency
            max_frequency = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] = word_frequencies[word] / max_frequency

            # Score each sentence by the normalized frequencies of the
            # words that appear in it
            sentence_scores = {}
            for sentence in sentences:
                for word, freq in word_frequencies.items():
                    if word in sentence.lower():
                        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + freq

            # Select the top 30% of sentences (at least one, so short
            # chunks still produce a summary)
            summary_sentences = heapq.nlargest(
                max(1, int(len(sentences) * 0.3)),
                sentence_scores,
                key=sentence_scores.get
            )

            # Join the selected sentences in their original order
            summary_sentences.sort(key=sentences.index)
            return " ".join(summary_sentences)
        except Exception as e:
            print(f"Chunk summarization error: {str(e)}")
            return ""

    def _split_text(self, text: str) -> List[str]:
        """Split the text into appropriately sized chunks."""
        try:
            sentences = nltk.sent_tokenize(text)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                # Grow the current chunk until adding the next sentence
                # would exceed the chunk size
                if len(current_chunk) + len(sentence) <= self.chunk_size:
                    current_chunk += " " + sentence
                else:
                    chunks.append(current_chunk.strip())
                    current_chunk = sentence
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks
        except Exception as e:
            raise Exception(f"Error while splitting text: {str(e)}")


# Create a module-level singleton instance
document_summarizer = DocumentSummarizer()
```
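
For reference, here is a minimal usage sketch. The sample text is an illustrative stand-in rather than anything from the original Space, and it assumes the NLTK `punkt` and `stopwords` data downloaded in `__init__` are available:

```python
# Minimal usage sketch; the sample text below is a hypothetical stand-in.
if __name__ == "__main__":
    sample = (
        "NLTK is a leading platform for building Python programs to work "
        "with human language data. It provides easy-to-use interfaces to "
        "many corpora and lexical resources. It also includes libraries "
        "for classification, tokenization, stemming, and tagging."
    )
    result = document_summarizer.summarize_text(sample)
    print(result["full_summary"])          # extractive summary across chunks
    print(len(result["chunk_summaries"]))  # number of chunk-level summaries
```

Because the scoring is purely word-frequency based, the summarizer is extractive: it returns the highest-scoring original sentences rather than generating new text, so output quality tracks how strongly the input repeats its key terms.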