""" Korean Financial Report Extractive Summarization Model 문단에서 대표문장을 추출하고 역할(outlook, event, financial, risk)을 분류하는 모델 - klue/roberta-base 기반 - 문장별 [CLS] 인코딩 + Inter-sentence Transformer - 대표문장 이진 분류 + 역할 다중 분류 (Multi-task) """ import torch import torch.nn as nn from transformers import AutoConfig, AutoModel, AutoTokenizer, PretrainedConfig, PreTrainedModel ROLE_LABELS = ["outlook", "event", "financial", "risk"] NUM_ROLES = len(ROLE_LABELS) ROLE_TO_IDX = {role: idx for idx, role in enumerate(ROLE_LABELS)} IDX_TO_ROLE = {idx: role for idx, role in enumerate(ROLE_LABELS)} class DocumentEncoderConfig(PretrainedConfig): model_type = "document_encoder" def __init__( self, base_model_name: str = "klue/roberta-base", hidden_size: int = 768, num_transformer_layers: int = 2, num_roles: int = NUM_ROLES, max_length: int = 128, max_sentences: int = 30, role_labels: list = None, **kwargs, ): super().__init__(**kwargs) self.base_model_name = base_model_name self.hidden_size = hidden_size self.num_transformer_layers = num_transformer_layers self.num_roles = num_roles self.max_length = max_length self.max_sentences = max_sentences self.role_labels = role_labels or ROLE_LABELS class DocumentEncoderForExtractiveSummarization(PreTrainedModel): config_class = DocumentEncoderConfig def __init__(self, config: DocumentEncoderConfig): super().__init__(config) self.sentence_encoder = AutoModel.from_pretrained(config.base_model_name) encoder_layer = nn.TransformerEncoderLayer( d_model=config.hidden_size, nhead=8, dim_feedforward=2048, dropout=0.1, batch_first=True, ) self.inter_sentence_transformer = nn.TransformerEncoder( encoder_layer, num_layers=config.num_transformer_layers, ) self.classifier = nn.Sequential( nn.Linear(config.hidden_size, 256), nn.ReLU(), nn.Dropout(0.1), nn.Linear(256, 1), nn.Sigmoid(), ) self.role_classifier = nn.Sequential( nn.Linear(config.hidden_size, 256), nn.ReLU(), nn.Dropout(0.1), nn.Linear(256, config.num_roles), ) def encode_sentences(self, input_ids, attention_mask): outputs = self.sentence_encoder(input_ids=input_ids, attention_mask=attention_mask) return outputs.last_hidden_state[:, 0, :] def forward(self, sentences_input_ids, sentences_attention_mask, document_mask=None): """ Args: sentences_input_ids: (batch_size, num_sentences, seq_len) sentences_attention_mask: (batch_size, num_sentences, seq_len) document_mask: (batch_size, num_sentences) Returns: scores: (batch_size, num_sentences) 대표문장 점수 role_logits: (batch_size, num_sentences, num_roles) 역할 로짓 """ batch_size, num_sentences, seq_len = sentences_input_ids.shape flat_ids = sentences_input_ids.view(-1, seq_len) flat_mask = sentences_attention_mask.view(-1, seq_len) embeddings = self.encode_sentences(flat_ids, flat_mask) hidden_size = embeddings.shape[-1] embeddings = embeddings.view(batch_size, num_sentences, hidden_size) src_key_padding_mask = None if document_mask is not None: src_key_padding_mask = ~document_mask.bool() contextualized = self.inter_sentence_transformer( embeddings, src_key_padding_mask=src_key_padding_mask ) scores = self.classifier(contextualized).squeeze(-1) role_logits = self.role_classifier(contextualized) return scores, role_logits # Auto 클래스 등록 AutoConfig.register("document_encoder", DocumentEncoderConfig) AutoModel.register(DocumentEncoderConfig, DocumentEncoderForExtractiveSummarization)