# app.py
import torch
import torch.nn as nn
import math
import re
import pickle
from huggingface_hub import hf_hub_download
import gradio as gr
# --- Re-implement the classes needed to load the pickled checkpoint ---
class SimpleTokenizer:
    def __init__(self, word2idx, idx2word):
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.vocab_size = len(word2idx)

    def encode(self, text, add_sos=False, add_eos=False):
        # Lowercase, split on word characters, and map unknown words to <unk>
        tokens = re.findall(r'\w+', text.lower())
        ids = [self.word2idx.get(t, self.word2idx["<unk>"]) for t in tokens]
        if add_sos:
            ids = [self.word2idx["<sos>"]] + ids
        if add_eos:
            ids = ids + [self.word2idx["<eos>"]]
        return ids

    def decode(self, ids):
        # Drop control tokens (<pad>, <sos>, <eos>) and join the remaining words
        return " ".join([
            self.idx2word.get(i, "<unk>")
            for i in ids
            if i not in [self.word2idx.get("<pad>", 0), self.word2idx.get("<sos>", 1), self.word2idx.get("<eos>", 2)]
        ])
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(1), :]
        return x
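# Standard sinusoidal encoding from "Attention Is All You Need":
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# In forward(), pe[:seq_len] has shape (seq_len, d_model) and broadcasts over the batch dimension
# of x, which is (batch, seq_len, d_model) because the model uses batch_first=True.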
class SummaraTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=256):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, batch_first=True, dropout=0.1)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, batch_first=True, dropout=0.1)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_mask=None):
        src_emb = self.dropout(self.pos_encoder(self.embedding(src) * math.sqrt(self.d_model)))
        tgt_emb = self.dropout(self.pos_encoder(self.embedding(tgt) * math.sqrt(self.d_model)))
        memory = self.encoder(src_emb, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, memory_key_padding_mask=src_key_padding_mask)
        return self.fc_out(output)
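# Shapes with batch_first=True: src and tgt are (batch, seq_len) id tensors, the embeddings are
# (batch, seq_len, d_model), and the returned logits are (batch, tgt_len, vocab_size).
# Embeddings are scaled by sqrt(d_model) before the positional encoding, as in the original
# Transformer paper.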
# --- Load the model from Hugging Face ---
def load_summara_from_hub():
    # hf_hub_download caches the file locally after the first download
    model_path = hf_hub_download(
        repo_id="teszenofficial/summara",
        filename="summara.pkl"
    )
    with open(model_path, 'rb') as f:
        data = pickle.load(f)
    config = data['config']
    model = SummaraTransformer(
        vocab_size=config['vocab_size'],
        d_model=config['d_model'],
        nhead=config['nhead'],
        num_layers=config['num_layers'],
        dim_feedforward=config['dim_feedforward']
    )
    model.load_state_dict(data['model_state_dict'])
    model.eval()
    tokenizer = data['tokenizer']
    return model, tokenizer
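# As read above, summara.pkl is expected to hold a dict with 'config', 'model_state_dict' and
# 'tokenizer'; unpickling the stored tokenizer requires a compatible SimpleTokenizer class to be
# defined in this module, which is why it is re-implemented here.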
# --- Summarization with adjustable length ---
def generate_square_subsequent_mask(sz):
    # Causal mask: True above the diagonal marks positions the decoder must not attend to
    return torch.triu(torch.ones(sz, sz), diagonal=1).bool()

def create_padding_mask(seq, pad_idx=0):
    # True wherever the sequence contains the padding index
    return (seq == pad_idx)
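# For example, generate_square_subsequent_mask(3) yields
#   [[False,  True,  True],
#    [False, False,  True],
#    [False, False, False]]
# which is the boolean convention nn.Transformer expects for tgt_mask (True = masked).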
def summarize(text, max_words=50):
    try:
        # Note: the model is reloaded on every call; hf_hub_download serves it from the local cache
        model, tokenizer = load_summara_from_hub()
        device = "cpu"
        model.to(device)
        src = torch.tensor([tokenizer.encode(text, add_eos=True)], device=device)
        src_padding_mask = create_padding_mask(src).to(device)
        tgt = torch.tensor([[tokenizer.word2idx["<sos>"]]], device=device)
        words_generated = 0
        max_tokens = max_words + 10  # headroom for non-word (control) tokens
        with torch.no_grad():
            for _ in range(max_tokens):
                tgt_mask = generate_square_subsequent_mask(tgt.size(1)).to(device)
                output = model(src, tgt, src_key_padding_mask=src_padding_mask, tgt_mask=tgt_mask)
                next_token = output.argmax(2)[:, -1].item()
                if next_token == tokenizer.word2idx.get("<eos>", 2):
                    break
                tgt = torch.cat([tgt, torch.tensor([[next_token]], device=device)], dim=1)
                # Count real words only (skip the <pad>/<sos>/<eos> control ids)
                if next_token not in [0, 1, 2]:
                    words_generated += 1
                    if words_generated >= max_words:
                        break
        summary_ids = tgt.squeeze().cpu().tolist()
        if not isinstance(summary_ids, list):
            summary_ids = [summary_ids]
        summary = tokenizer.decode(summary_ids[1:])
        return summary if summary.strip() else "Resumen no disponible."
    except Exception as e:
        return f"Error: {str(e)}"
# --- Gradio interface ---
with gr.Blocks(title="Summara") as demo:
    gr.Markdown("# 🧠 Summara\n### Resumidor de texto con Transformer entrenado desde cero")
    gr.Markdown("Modelo: [teszenofficial/summara](https://huggingface.co/teszenofficial/summara)")
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(
                label="Texto a resumir",
                lines=10,
                placeholder="Pega un artículo, noticia, ensayo o cualquier texto que quieras resumir..."
            )
            with gr.Row():
                length_slider = gr.Slider(
                    minimum=10,
                    maximum=150,
                    value=60,
                    step=5,
                    label="Longitud del resumen (palabras aproximadas)"
                )
            btn = gr.Button("Generar Resumen", variant="primary")
        with gr.Column():
            out = gr.Textbox(
                label="Resumen generado",
                lines=10,
                interactive=False
            )
    btn.click(
        fn=summarize,
        inputs=[inp, length_slider],
        outputs=out
    )
    gr.Examples(
        examples=[
            ["La inteligencia artificial está transformando múltiples industrias. En medicina, permite diagnósticos más precisos. En educación, personaliza el aprendizaje. En transporte, impulsa los vehículos autónomos. A pesar de sus beneficios, también plantea desafíos éticos y de privacidad que la sociedad debe abordar con cuidado."],
            ["El cambio climático es uno de los mayores desafíos del siglo XXI. Sus efectos incluyen el aumento del nivel del mar, fenómenos meteorológicos extremos y pérdida de biodiversidad. Para mitigarlo, es esencial reducir las emisiones de gases de efecto invernadero, invertir en energías renovables y promover políticas ambientales sostenibles a nivel global."]
        ],
        inputs=inp,
        label="Ejemplos para probar"
    )
    gr.Markdown("💡 **Consejo**: Usa textos de al menos 3-4 oraciones para obtener mejores resultados.")

if __name__ == "__main__":
    demo.launch()