Spaces:
Sleeping
Sleeping
| """ | |
| Indonesian Court Document Summarization using Pre-trained Models | |
| This module uses pre-trained multilingual models (mT5 or mBART) that already | |
| understand Indonesian language and can generate summaries without extensive training. | |
| """ | |
| import torch | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForSeq2SeqLM, | |
| pipeline | |
| ) | |
class IndonesianCourtSummarizer:
    """
    Summarizer for Indonesian court documents using pre-trained models.

    Supports multiple model backends:
    - google/mt5-small (lightweight, good for quick inference)
    - google/mt5-base (balanced performance and quality)
    - csebuetnlp/mT5_multilingual_XLSum (fine-tuned for summarization)
    - facebook/mbart-large-50 (multilingual BART)
    """

    def __init__(self, model_name="csebuetnlp/mT5_multilingual_XLSum", device=None):
        """
        Initialize the summarizer with a pre-trained model.

        Args:
            model_name: HuggingFace model identifier.
            device: Device to run on ('cuda', 'cpu', or None for auto-detect).
        """
        print(f"Loading model: {model_name}")
        # Auto-detect GPU only when the caller does not pin a device.
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        print(f"Using device: {self.device}")
        # Some sentencepiece-based checkpoints ship without a fast tokenizer;
        # fall back to the slow (pure-Python) implementation when loading fails.
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as e:
            # FIX: original printed an f-string with no placeholder and
            # discarded `e`; surface the actual failure reason instead.
            print(f"Fast tokenizer failed ({e}), trying slow tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()  # inference only: disables dropout etc.
        print("Model loaded successfully!\n")

    def summarize(self,
                  document,
                  max_length=256,
                  min_length=50,
                  num_beams=4,
                  length_penalty=1.0,
                  early_stopping=True,
                  add_prefix=True):
        """
        Generate a summary for an Indonesian court document.

        Args:
            document: The full court document text.
            max_length: Maximum token length of the summary.
            min_length: Minimum token length of the summary.
            num_beams: Number of beams for beam search (higher = better quality, slower).
            length_penalty: Length penalty (>1.0 = longer summaries, <1.0 = shorter).
            early_stopping: Stop when all beams reach EOS.
            add_prefix: Add "summarize: " prefix (used by T5-style checkpoints).

        Returns:
            Generated summary text, stripped of surrounding whitespace.
        """
        # Prepare input; the task prefix is a T5-family convention.
        input_text = f"summarize: {document}" if add_prefix else document
        # Tokenize, truncating long rulings to the encoder's input budget.
        inputs = self.tokenizer(
            input_text,
            max_length=1024,  # max input length
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        # Generate summary without tracking gradients (inference only).
        with torch.no_grad():
            summary_ids = self.model.generate(
                inputs.input_ids,
                # FIX: also forward the attention mask so padded/batched
                # inputs are scored correctly (a no-op for a single,
                # unpadded input, but required for correctness in general).
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=length_penalty,
                early_stopping=early_stopping,
                no_repeat_ngram_size=3,  # avoid repeated phrases
            )
        # Decode the best beam back to text.
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary.strip()

    def summarize_batch(self, documents, **kwargs):
        """
        Summarize multiple documents sequentially.

        Args:
            documents: List of court document texts.
            **kwargs: Additional arguments passed through to summarize().

        Returns:
            List of generated summaries, in input order.
        """
        summaries = []
        total = len(documents)
        for i, doc in enumerate(documents, start=1):
            print(f"Summarizing document {i}/{total}...")
            summaries.append(self.summarize(doc, **kwargs))
        return summaries
def create_summarizer(model_choice="balanced"):
    """
    Factory function to create a summarizer with recommended settings.

    Args:
        model_choice:
            - "fast": Fastest, good for testing (mt5-small)
            - "balanced": Good balance of speed and quality (mT5_multilingual_XLSum)
            - "quality": Best quality, slower (mt5-base)

    Returns:
        IndonesianCourtSummarizer instance
    """
    aliases = {
        "fast": "google/mt5-small",
        "balanced": "csebuetnlp/mT5_multilingual_XLSum",
        "quality": "google/mt5-base",
    }
    # An unknown choice passes through unchanged, so any HuggingFace
    # model identifier can be supplied directly as well.
    resolved = aliases.get(model_choice, model_choice)
    return IndonesianCourtSummarizer(model_name=resolved)
# Convenience function for quick summarization
def quick_summarize(document, model_choice="balanced", **kwargs):
    """
    Quick one-line summarization function.

    Args:
        document: Court document text
        model_choice: Model to use ("fast", "balanced", or "quality")
        **kwargs: Additional arguments for summarize()

    Returns:
        Summary text
    """
    # NOTE: builds a fresh summarizer per call; reuse create_summarizer()
    # directly when summarizing many documents.
    return create_summarizer(model_choice).summarize(document, **kwargs)
def main():
    """Demo: summarize one sample Indonesian court ruling end to end."""
    # Example Indonesian court document (criminal embezzlement verdict).
    court_document = """PUTUSAN
Nomor 245/Pid.B/2024/PN.Jkt.Pst
DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA
Pengadilan Negeri Jakarta Pusat yang mengadili perkara pidana dengan acara pemeriksaan
biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa
BUDI SANTOSO yang didakwa melakukan tindak pidana penggelapan. Terdakwa menerima uang
sejumlah Rp 350.000.000 dari PT Sejahtera Abadi untuk pembelian bahan baku namun
menggunakan uang tersebut untuk keperluan pribadi. Berdasarkan keterangan saksi-saksi
dan barang bukti, Majelis Hakim memutuskan Terdakwa terbukti secara sah dan meyakinkan
bersalah melakukan tindak pidana penggelapan sebagaimana diatur dalam Pasal 372 KUHP.
Terdakwa dijatuhi pidana penjara selama 2 tahun 6 bulan. Barang bukti berupa dokumen
perjanjian kerjasama, buku tabungan, dan bukti transfer dirampas untuk negara. Terdakwa
ditetapkan tetap ditahan dan dibebankan biaya perkara sebesar Rp 5.000."""

    rule = "=" * 70  # visual separator between demo sections
    print(rule)
    print("Indonesian Court Document Summarization Demo")
    print("Using Pre-trained Multilingual Models")
    print(rule)
    print()

    print("Creating summarizer with balanced model...")
    summarizer = create_summarizer("balanced")

    print(rule)
    print("ORIGINAL DOCUMENT:")
    print(rule)
    print(court_document)
    print()

    print(rule)
    print("GENERATING SUMMARY...")
    print(rule)
    summary = summarizer.summarize(
        court_document,
        max_length=150,
        min_length=30,
        num_beams=4,
    )

    print()
    print(rule)
    print("GENERATED SUMMARY:")
    print(rule)
    print(summary)
    print(rule)


if __name__ == "__main__":
    main()