Spaces:
Sleeping
Sleeping
| import os | |
| from typing import List | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EncoderDecoderModel | |
| from dotenv import load_dotenv | |
| # Load variables from a .env file into the process environment so the os.getenv() lookup of SUMMARIZATION_MODEL below can see them | |
| load_dotenv() | |
class DocumentSummarizer:
    """Abstractive document summarizer backed by a HuggingFace seq2seq model.

    The model name is read from the ``SUMMARIZATION_MODEL`` environment
    variable and defaults to ``csebuetnlp/mT5_multilingual_XLSum``. A failure
    to load the model is printed and recorded in ``is_loaded`` instead of
    being raised, so constructing the object never fails.
    """

    # Inputs whose stripped length is below this are returned unchanged.
    _MIN_INPUT_CHARS = 50
    # Encoder token budget; longer inputs are truncated by the tokenizer.
    _MAX_INPUT_TOKENS = 512

    def __init__(self):
        """Load the tokenizer and model with HuggingFace transformers directly.

        The ``pipeline`` helper is deliberately not used, to avoid the
        "Unknown task summarization" issue seen with recent transformers
        versions.
        """
        # Define every attribute up front so they exist even when loading fails.
        self.tokenizer = None
        self.model = None
        self.is_loaded = False

        model_name = os.getenv("SUMMARIZATION_MODEL", "csebuetnlp/mT5_multilingual_XLSum")
        print(f"Loading summarization model: {model_name}...")
        try:
            # use_fast=False is often recommended for mT5 (SentencePiece) models.
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            self.is_loaded = True
        except Exception as e:
            # Best-effort load: report the problem and leave the instance in a
            # consistent "not loaded" state rather than half-initialised.
            print(f"Error loading model {model_name}: {e}")
            self.tokenizer = None
            self.model = None
            self.is_loaded = False

    def summarize(self, text: str, max_length: int = 150, min_length: int = 40) -> str:
        """Return a summary of ``text``.

        Args:
            text: Document text to summarize.
            max_length: Passed to ``generate`` as ``max_length``.
            min_length: Passed to ``generate`` as ``min_length``.

        Returns:
            The decoded summary. Empty or ``None`` input yields ``""``; text
            shorter than 50 characters (after stripping) is returned
            unchanged. If the model is not loaded or generation fails, a
            human-readable error string is returned instead of raising.
        """
        if not text:
            # Honour the declared ``str`` return type even for ``None``.
            return ""
        if len(text.strip()) < self._MIN_INPUT_CHARS:
            # Too short to be worth summarizing; hand it back as-is.
            return text
        if not self.is_loaded:
            return "Error: Model peringkas tidak dapat dimuat."

        try:
            # Tokenize the input, truncating to the encoder token budget.
            input_ids = self.tokenizer.encode(
                text,
                return_tensors="pt",
                max_length=self._MAX_INPUT_TOKENS,
                truncation=True,
            )
            # Beam search with a trigram repetition block.
            output_ids = self.model.generate(
                input_ids,
                max_length=max_length,
                min_length=min_length,
                no_repeat_ngram_size=3,
                num_beams=4,
                early_stopping=True,
            )
            # Decode the first returned sequence.
            return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        except Exception as e:
            # Deliberate best-effort boundary: callers get a message, not a crash.
            print(f"Summarization error: {e}")
            return "Error generating summary."

    def generate_bullet_points(self, text: str, num_points: int = 3) -> List[str]:
        """Turn ``text`` into at most ``num_points`` bullet-point sentences.

        The text is summarized first, then split into sentences at
        whitespace that follows ``.``, ``!`` or ``?``. Sentences of 10
        characters or fewer are dropped; if none survive, the whole summary
        becomes the single bullet point.

        Args:
            text: Document text to convert.
            num_points: Maximum number of points to return; values <= 0
                yield an empty list.

        Returns:
            A list of stripped sentences. Empty, whitespace-only or ``None``
            input yields ``[]``. Error strings returned by ``summarize`` are
            passed through as a bullet point.
        """
        import re

        summary = self.summarize(text, max_length=200, min_length=50)
        if not summary or not summary.strip():
            # Nothing to split; avoids returning [""] or crashing on None.
            return []
        summary = summary.strip()

        # Split on any whitespace run (spaces, newlines, tabs) after a
        # sentence terminator, not only on spaces.
        sentences = re.split(r"(?<=[.!?])\s+", summary)
        points = [s.strip() for s in sentences if len(s.strip()) > 10]
        if not points:
            # No sentence was long enough: fall back to the summary itself.
            points = [summary]
        # Clamp so a negative num_points cannot silently drop trailing items.
        return points[:max(num_points, 0)]