Spaces:

igna7
/

summarizer

Sleeping

App Files Files Community

igna7 committed on Feb 5

Commit

b4c7cb7

verified ·

1 Parent(s): 67fc4b3

add summarizer files

Browse files

Files changed (3) hide show

README.md +45 -14
app.py +144 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,14 +1,45 @@
----
-title: Summarizer
-emoji: 🔥
-colorFrom: pink
-colorTo: purple
-sdk: gradio
-sdk_version: 6.5.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: Crea un resumen del texto recibido
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Resumidor de Texto BERT2BERT
+emoji: 📝
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.1
+app_file: app.py
+pinned: false
+---
+# 📝 Resumidor de Texto (BERT2BERT)
+Resume textos largos en español usando el modelo **BERT2BERT** con técnica de micro-chunking.
+## Modelo
+- **Nombre:** `mrm8488/bert2bert_shared-spanish-finetuned-summarization`
+- **Tipo:** Encoder-Decoder (BERT2BERT)
+- **Idioma:** Español
+## API
+Este espacio expone una API que puede ser usada con Gradio Client o Daggr:
+```python
+from gradio_client import Client
+client = Client("tu-usuario/summarizer")
+result = client.predict(texto="Tu texto largo aquí...", api_name="/predict")
+print(result)
+```
+## Uso con Daggr
+```python
+import gradio as gr
+from daggr import GradioNode
+summarizer = GradioNode(
+    "tu-usuario/summarizer",
+    api_name="/predict",
+    inputs={"texto": gr.Textbox()},
+    outputs={"resumen": gr.Textbox()},
+)
+```

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+Espacio de Hugging Face: Resumidor de Texto (BERT2BERT)
+========================================================
+Modelo: mrm8488/bert2bert_shared-spanish-finetuned-summarization
+Entrada: Texto largo en español
+Salida: Texto resumido
+"""
+import gradio as gr
+import torch
+from transformers import BertTokenizerFast, EncoderDecoderModel
+class SummarizationService:
+    def __init__(self):
+        ckpt = "mrm8488/bert2bert_shared-spanish-finetuned-summarization"
+        self.device = torch.device("cpu")
+        print(f"Cargando modelo BERT2BERT: {ckpt}...")
+        self.tokenizer = BertTokenizerFast.from_pretrained(ckpt)
+        self.model = EncoderDecoderModel.from_pretrained(
+            ckpt,
+            low_cpu_mem_usage=False,
+            use_safetensors=False,
+            torch_dtype=torch.float32,
+        )
+        self.model.eval()
+        print("Modelo cargado correctamente.")
+    def summarize(self, text: str) -> str:
+        """Resume el texto usando micro-chunking para manejar textos largos."""
+        text = text.replace("\n", " ").strip()
+        gen_params = {
+            "min_length": 25,
+            "max_length": 100,
+            "num_beams": 4,
+            "length_penalty": 2.0,
+            "no_repeat_ngram_size": 3,
+            "early_stopping": True
+        }
+        chunks = self._chunk_text(text, max_tokens=200)
+        summaries = []
+        for chunk in chunks:
+            inputs = self.tokenizer(
+                [chunk],
+                padding="max_length",
+                truncation=True,
+                max_length=512,
+                return_tensors="pt"
+            )
+            input_ids = inputs["input_ids"].to(self.device)
+            attention_mask = inputs["attention_mask"].to(self.device)
+            with torch.no_grad():
+                output_ids = self.model.generate(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    **gen_params
+                )
+            summary_piece = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
+            if summary_piece.strip():
+                summaries.append(summary_piece.strip())
+        return " ".join(summaries)
+    def _chunk_text(self, text: str, max_tokens: int) -> list:
+        """Divide el texto en fragmentos manejables para BERT."""
+        sentences = text.split('. ')
+        chunks = []
+        current_chunk = []
+        current_length = 0
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            tokens = self.tokenizer.tokenize(sentence)
+            sent_len = len(tokens)
+            if sent_len > max_tokens:
+                if current_chunk:
+                    chunks.append(". ".join(current_chunk) + ".")
+                    current_chunk = []
+                    current_length = 0
+                chunks.append(sentence + ".")
+                continue
+            if current_length + sent_len > max_tokens:
+                chunks.append(". ".join(current_chunk) + ".")
+                current_chunk = [sentence]
+                current_length = sent_len
+            else:
+                current_chunk.append(sentence)
+                current_length += sent_len
+        if current_chunk:
+            chunks.append(". ".join(current_chunk) + ".")
+        return chunks
+# Initialize the service once at import time; the module-level ``service``
+# instance is the one resumir_texto() calls for every request.
+print("Inicializando servicio de resumen...")
+service = SummarizationService()
+print("Servicio listo.")
+def resumir_texto(texto: str) -> str:
+    """Función principal para Gradio."""
+    if not texto or not texto.strip():
+        return "Por favor, introduce un texto para resumir."
+    try:
+        resumen = service.summarize(texto)
+        return resumen
+    except Exception as e:
+        return f"Error al resumir: {str(e)}"
+# Interfaz Gradio
+iface = gr.Interface(
+    fn=resumir_texto,
+    inputs=gr.Textbox(
+        lines=10,
+        placeholder="Pega aquí tu texto largo en español...",
+        label="Texto a Resumir"
+    ),
+    outputs=gr.Textbox(label="Resumen"),
+    title="📝 Resumidor de Texto (BERT2BERT)",
+    description="Resume textos largos en español usando el modelo BERT2BERT con técnica de micro-chunking.",
+    examples=[
+        ["La inteligencia artificial es un campo de la informática que se centra en crear sistemas inteligentes. Estos sistemas pueden aprender de la experiencia y realizar tareas como reconocimiento de voz y toma de decisiones. El aprendizaje automático permite a las computadoras mejorar su rendimiento a través de la experiencia."]
+    ],
+    flagging_mode="never",
+)
+if __name__ == "__main__":
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio>=4.0.0
+torch
+transformers
+sentencepiece