awngsz
/

baseline_model

@@ -17,6 +17,7 @@ import joblib
 !huggingface-cli login
 import pandas as pd
 import torch
 import torchvision
 from torchvision import transforms, utils
 import torch.nn as nn
@@ -200,46 +201,68 @@ X_test = consistency_checks(X_test, 'title') </pre>
 <pre>
-def get_embeddings(text_all, tokenizer, model, max_len = 128):
-  ''' return: embeddings list '''
-  embeddings = []
-  count = 0
-  print('Start embeddings:')
-  for text in text_all:
-    count += 1
-    if count % (len(text_all) // 10) == 0:
-      print(f'{count / len(text_all) * 100:.1f}% done ...')
-    model_input_token = tokenizer(
-                    text,
-                    add_special_tokens = True,
-                    max_length = max_len,
-                    padding = 'max_length',
-                    truncation = True,
-                    return_tensors = 'pt'
-                    )
-    with torch.no_grad():
-      model_output = model(**model_input_token)
-      cls_embedding = model_output.last_hidden_state[:, 0, :]
-      cls_embedding = cls_embedding.squeeze().numpy()
-      embeddings.append(cls_embedding)
   return embeddings </pre>
-# Load the tokenizer and model from Hugging Face
-<pre>tokenizer_DBERT = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-transformer_model_DBERT = DistilBertModel.from_pretrained('distilbert-base-uncased')
-</pre>
-# Set the model to evaluation mode
-<pre>transformer_model_DBERT.eval() </pre>
-# Get the embeddings for the test data
-<pre>max_len = max(len(text) for text in X_test)
-#this may take awhile to run
-X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_DBERT, transformer_model_DBERT, max_len = max_len)
 prediction = model.predict(X_test_embeddings_DBERT)
 </pre>

 !huggingface-cli login
 import pandas as pd
 import torch
+from transformers import AutoTokenizer, AutoModel
 import torchvision
 from torchvision import transforms, utils
 import torch.nn as nn
 <pre>
+def get_embeddings(text_all, tokenizer, model, device, max_len=128):
+    '''
+    Compute one embedding per input text from the model's first-position
+    (CLS) hidden state.
+    Each text is tokenized on its own, padded/truncated to max_len, moved to
+    `device`, and passed through `model` with gradients disabled. The model
+    itself is not moved here; the caller must already have placed it on
+    `device`.
+    Args:
+    - text_all: Sized iterable of input texts (len() is used for progress)
+    - tokenizer: Tokenizer for the model; its output must support .to(device)
+    - model: Transformer model whose output exposes last_hidden_state
+    - device: torch.device on which the input tensors are placed
+    - max_len: Maximum token length for the input
+    Returns:
+    - embeddings: List with one NumPy array per input text
+    '''
+    embeddings = []
+    total = len(text_all)
+    # Report progress roughly every 10%. Clamp the interval to 1 so that
+    # fewer than 10 texts does not make the modulo below divide by zero.
+    report_every = max(1, total // 10)
+    print('Start embeddings:')
+    for count, text in enumerate(text_all, start=1):
+        if count % report_every == 0:
+            print(f'{count / total * 100:.1f}% done ...')
+        # Tokenize the input text
+        model_input_token = tokenizer(
+            text,
+            add_special_tokens=True,
+            max_length=max_len,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        ).to(device)  # Move input tensors to the target device
+        # Generate embeddings without gradient computation
+        with torch.no_grad():
+            model_output = model(**model_input_token)
+            cls_embedding = model_output.last_hidden_state[:, 0, :]  # Use CLS token embedding
+            cls_embedding = cls_embedding.squeeze().cpu().numpy()  # Move back to CPU for numpy
+            embeddings.append(cls_embedding)
+    return embeddings </pre>
+# Check for GPU availability
+<pre>
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f'Using device: {device}')
+# Load the 'distilbert-base-uncased' tokenizer and model; the model is moved to the selected device
+print("Loading model and tokenizer...")
+tokenizer_news = AutoTokenizer.from_pretrained('distilbert-base-uncased')
+model_news = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
+# Set the model to evaluation mode
+model_news.eval()
+############################################# DBERT UNCASED Embedding #############################################
+# Embed the test texts; this may take a while because texts are processed one at a time
+print("Computing DBERT embeddings for test data...")
+X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
+print("DBERT embeddings for test data computed!")
 prediction = model.predict(X_test_embeddings_DBERT)
 </pre>