juanesbch
/

summarization_model

Model card · Files and versions

juanesbch committed on Apr 10, 2023

Commit

d37aee2

·

1 Parent(s): 8e12542

Delete model.py

Files changed (1) hide show

model.py +0 -81

model.py DELETED Viewed

@@ -1,81 +0,0 @@
-# Fine-tune a pretrained seq2seq summarization checkpoint on the 'cuerpo' column
-# of data_larazon_publico_v2.csv and save the fine-tuned weights to disk.
-# NOTE(review): pd, stopwords, SnowballStemmer, word_tokenize, AutoTokenizer,
-# AutoModelForSeq2SeqLM, torch and trf are used below, but no import for them is
-# visible in this file; they must already be in scope when this runs.
-# Load the dataset
-df = pd.read_csv('data_larazon_publico_v2.csv')
-# Rows with a missing body cannot be normalized (NaN has no .lower()), so drop them.
-df = df.dropna(subset=['cuerpo'])
-# Define stopwords and stemmer for Spanish
-stop_words = set(stopwords.words('spanish'))
-stemmer = SnowballStemmer('spanish')
-def _normalize(text):
-    # Lowercase and tokenize, keep alphabetic tokens that are not stopwords,
-    # stem them, and rejoin everything into one space-separated string.
-    tokens = word_tokenize(str(text).lower(), language='spanish')
-    return ' '.join(stemmer.stem(token) for token in tokens
-                    if token not in stop_words and token.isalpha())
-# Preprocess the text data
-df['cuerpo'] = df['cuerpo'].map(_normalize)
-# Preprocess the data for summarization
-tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
-model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")
-max_input_length = 512
-max_output_length = 128
-# NOTE(review): source and target are both built from 'cuerpo', so the target is
-# just the body truncated to max_output_length tokens - confirm whether a
-# separate summary/headline column was intended as the target.
-texts = df['cuerpo'].tolist()
-# Tokenize each side in a single batched call. Padding and truncation are
-# requested explicitly instead of through the deprecated pad_to_max_length flag,
-# so every row has exactly max_length tokens.
-input_encoded = tokenizer(texts, max_length=max_input_length, padding='max_length',
-                          truncation=True, return_tensors='pt')
-output_encoded = tokenizer(texts, max_length=max_output_length, padding='max_length',
-                           truncation=True, return_tensors='pt')
-input_ids = input_encoded['input_ids']
-attention_masks = input_encoded['attention_mask']
-# Padding positions must not contribute to the loss; -100 is the label value the
-# model's cross-entropy ignores.
-output_ids = output_encoded['input_ids'].masked_fill(output_encoded['attention_mask'] == 0, -100)
-batch_size = 200
-learning_rate = 2e-5
-num_epochs = 1
-# Ceiling division: the training loop also takes an optimizer step on the final
-# partial batch, which floor division left out (yielding 0 steps for datasets
-# smaller than one batch).
-steps_per_epoch = (len(input_ids) + batch_size - 1) // batch_size
-optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-scheduler = trf.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
-                                                num_training_steps=steps_per_epoch * num_epochs)
-# Train the model
-model.train()
-for epoch in range(num_epochs):
-    # i is the offset of the batch's first sample, not a batch counter.
-    for i in range(0, len(input_ids), batch_size):
-        batch_input_ids = input_ids[i:i+batch_size]
-        batch_attention_masks = attention_masks[i:i+batch_size]
-        batch_labels = output_ids[i:i+batch_size]
-        optimizer.zero_grad()
-        # Only labels are passed: the model derives decoder_input_ids itself by
-        # shifting the labels right and prepending its decoder start token.
-        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks,
-                        labels=batch_labels)
-        loss = outputs.loss
-        loss.backward()
-        # Cap the gradient norm at 1.0 before the parameter update.
-        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-        optimizer.step()
-        scheduler.step()
-        if i % 1000 == 0:
-            print(f"Epoch: {epoch+1}, Batch: {i+1}/{len(input_ids)}, Loss: {loss.item()}")
-# Save the trained model
-model.save_pretrained('/Users/Juanes/Downloads/summarization_model')