# Source: Hugging Face Space igna7/summarizer (status: Sleeping) - File size: 4,903 Bytes - revision b4c7cb7

"""

Hugging Face Space: Text Summarizer (BERT2BERT)
===============================================

Model: mrm8488/bert2bert_shared-spanish-finetuned-summarization

Input: long text in Spanish
Output: summarized text

"""

import gradio as gr
import torch
from transformers import BertTokenizerFast, EncoderDecoderModel


class SummarizationService:
    """Spanish text summarizer built on a BERT2BERT encoder-decoder checkpoint.

    The tokenizer and the model are loaded once in ``__init__`` and kept on
    the CPU. ``summarize`` splits long inputs into sentence-aligned chunks,
    summarizes each chunk independently and joins the partial summaries.
    """

    # Characters that already terminate a sentence: a chunk ending in one of
    # them must not receive an additional trailing ".".
    _SENTENCE_ENDINGS = (".", "!", "?", "…")

    def __init__(self):
        """Load the tokenizer and the model weights, then switch to eval mode."""
        ckpt = "mrm8488/bert2bert_shared-spanish-finetuned-summarization"
        self.device = torch.device("cpu")

        print(f"Cargando modelo BERT2BERT: {ckpt}...")
        self.tokenizer = BertTokenizerFast.from_pretrained(ckpt)
        self.model = EncoderDecoderModel.from_pretrained(
            ckpt,
            low_cpu_mem_usage=False,
            use_safetensors=False,
            torch_dtype=torch.float32,
        )
        self.model.eval()
        print("Modelo cargado correctamente.")

    def summarize(self, text: str) -> str:
        """Summarize ``text`` chunk by chunk so long inputs can be handled.

        Args:
            text: The text to summarize.

        Returns:
            The non-empty per-chunk summaries joined by single spaces; an
            empty string when ``text`` contains no sentences.
        """
        # Collapse every whitespace run (including "\r\n" and tabs) into one
        # space so the ". " sentence split in _chunk_text sees all boundaries.
        text = " ".join(text.split())

        gen_params = {
            "min_length": 25,
            "max_length": 100,
            "num_beams": 4,
            "length_penalty": 2.0,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }

        summaries = []
        for chunk in self._chunk_text(text, max_tokens=200):
            # The batch holds a single sequence, so "longest" adds no pad
            # tokens; padding every chunk to 512 only wasted CPU time on
            # masked positions.
            inputs = self.tokenizer(
                [chunk],
                padding="longest",
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)

            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **gen_params,
                )

            summary_piece = self.tokenizer.decode(
                output_ids[0], skip_special_tokens=True
            ).strip()
            if summary_piece:
                summaries.append(summary_piece)

        return " ".join(summaries)

    @classmethod
    def _close_chunk(cls, sentences: list[str]) -> str:
        """Join ``sentences`` with ". " and make sure the result ends a sentence."""
        chunk = ". ".join(sentences)
        # The last sentence of the input usually keeps its own punctuation
        # (the split only consumes ". " separators), so only add a period
        # when none is present; otherwise the chunk would end in "..".
        if not chunk.endswith(cls._SENTENCE_ENDINGS):
            chunk += "."
        return chunk

    def _chunk_text(self, text: str, max_tokens: int) -> list[str]:
        """Split ``text`` into sentence-aligned chunks of bounded token count.

        Sentences are obtained by splitting on ". " and are greedily packed
        into chunks whose tokenizer token count does not exceed
        ``max_tokens``. A single sentence longer than ``max_tokens`` becomes
        a chunk of its own.

        Args:
            text: The text to split.
            max_tokens: Maximum number of tokens per packed chunk.

        Returns:
            The list of chunks, each ending in sentence punctuation.
        """
        chunks: list[str] = []
        current_chunk: list[str] = []
        current_length = 0

        for sentence in text.split(". "):
            sentence = sentence.strip()
            if not sentence:
                continue

            sent_len = len(self.tokenizer.tokenize(sentence))

            if sent_len > max_tokens:
                # Oversized sentence: flush what was accumulated so order is
                # preserved, then emit the sentence alone.
                if current_chunk:
                    chunks.append(self._close_chunk(current_chunk))
                    current_chunk = []
                    current_length = 0
                chunks.append(self._close_chunk([sentence]))
                continue

            if current_chunk and current_length + sent_len > max_tokens:
                chunks.append(self._close_chunk(current_chunk))
                current_chunk = []
                current_length = 0

            current_chunk.append(sentence)
            current_length += sent_len

        if current_chunk:
            chunks.append(self._close_chunk(current_chunk))

        return chunks


# Initialize the service.
# The instance is created at import time, so the tokenizer and model are
# loaded once and the same object is reused by the Gradio callback below.
print("Inicializando servicio de resumen...")
service = SummarizationService()
print("Servicio listo.")


def resumir_texto(texto: str) -> str:
    """Gradio callback: summarize ``texto`` with the module-level service.

    Returns a prompt asking for input when ``texto`` is empty or only
    whitespace. Any exception raised while summarizing is reported as text
    in the output box instead of propagating to the UI.
    """
    has_content = bool(texto) and bool(texto.strip())
    if has_content:
        try:
            return service.summarize(texto)
        except Exception as exc:
            return f"Error al resumir: {exc}"
    return "Por favor, introduce un texto para resumir."


# Gradio interface.
# One multi-line textbox in, one textbox out, wired to resumir_texto.
# User-facing strings are intentionally kept in Spanish.
iface = gr.Interface(
    fn=resumir_texto,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Pega aquí tu texto largo en español...",
        label="Texto a Resumir"
    ),
    outputs=gr.Textbox(label="Resumen"),
    title="📝 Resumidor de Texto (BERT2BERT)",
    description="Resume textos largos en español usando el modelo BERT2BERT con técnica de micro-chunking.",
    # A single sample input offered to the user.
    examples=[
        ["La inteligencia artificial es un campo de la informática que se centra en crear sistemas inteligentes. Estos sistemas pueden aprender de la experiencia y realizar tareas como reconocimiento de voz y toma de decisiones. El aprendizaje automático permite a las computadoras mejorar su rendimiento a través de la experiencia."]
    ],
    # NOTE(review): presumably disables the flagging button; confirm against
    # the Gradio version installed in the Space.
    flagging_mode="never",
)

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()