awngsz
/

lr_model

Transformers

Joblib

Model card Files Files and versions

xet

Community

seanzxj committed on Dec 17, 2024

Commit

895eef3

verified ·

1 Parent(s): 0d73126

Update README.md

Browse files

Files changed (1) hide show

README.md +15 -64

README.md CHANGED Viewed

@@ -49,6 +49,13 @@ filename='lr_clf_test2.joblib'
 model_file_path=hf_hub_download(repo_id=repo_id, filename=filename)  <br>
 model=joblib.load(model_file_path)
 print(model)
 #Load test dataset (assuming the name is the same as the one in the Ed post) <br>
 test_df = pd.read_csv(file_path)
@@ -215,78 +222,22 @@ X_test = X_test.dropna(subset = ['title'])
 X_test = handle_missing_data(X_test, 'title')
 X_test = consistency_checks(X_test, 'title') </pre>
-# Load the embedding model from Huggingface. Transformer: DistilBERT
-<pre>
-def get_embeddings(text_all, tokenizer, model, device, max_len=128):
-    '''
-    Generate embeddings using a transformer model on GPU if available.
-    Args:
-    - text_all: List of input texts
-    - tokenizer: Tokenizer for the model
-    - model: Transformer model
-    - device: torch.device to run the computations
-    - max_len: Maximum token length for the input
-    Returns:
-    - embeddings: List of embeddings for each input text
-    '''
-    embeddings = []
-    count = 0
-    print('Start embeddings:')
-    for text in text_all:
-        count += 1
-        if count % (len(text_all) // 10) == 0:
-            print(f'{count / len(text_all) * 100:.1f}% done ...')
-        # Tokenize the input text
-        model_input_token = tokenizer(
-            text,
-            add_special_tokens=True,
-            max_length=max_len,
-            padding='max_length',
-            truncation=True,
-            return_tensors='pt'
-        ).to(device)  # Move input tensors to GPU
-        # Generate embeddings without gradient computation
-        with torch.no_grad():
-            model_output = model(**model_input_token)
-            cls_embedding = model_output.last_hidden_state[:, 0, :]  # Use CLS token embedding
-            cls_embedding = cls_embedding.squeeze().cpu().numpy()  # Move back to CPU for numpy
-            embeddings.append(cls_embedding)
-  return embeddings </pre>
-# Check for GPU availability
-<pre>
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-print(f'Using device: {device}')
-# Load the tokenizer and model for 'distilbert-base-uncased'
-print("Loading model and tokenizer...")
-# Load model and tokenizer
-tokenizer_news = AutoTokenizer.from_pretrained('distilbert-base-uncased')
-model_news = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
-# Set the model to evaluation mode
-model_news.eval()
-############################################# DBERT UNCASED Embedding #############################################
 ############################################# Embedding #############################################
-print("Computing DBERT embeddings for training data...")
 y_test = X_test['labels']
 X_test = X_test['title']
-X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
-print("DBERT embeddings for training data computed!")
-prediction = model.predict(X_test_embeddings_DBERT)
 </pre>
 # Accuracy
 <pre>label_map = {'NBC': 0, 'FoxNews': 1}

 model_file_path=hf_hub_download(repo_id=repo_id, filename=filename)  <br>
 model=joblib.load(model_file_path)
 print(model)
+repo_id2='awngsz/tfidf_model'  ############# <--- check tfidf model name
+filename2='embed_tfidf.joblib'
+model_file_path2=hf_hub_download(repo_id=repo_id2, filename=filename2)  <br>
+tfidf_model=joblib.load(model_file_path2)
+print(tfidf_model)
 #Load test dataset (assuming the name is the same as the one in the Ed post) <br>
 test_df = pd.read_csv(file_path)
 X_test = handle_missing_data(X_test, 'title')
 X_test = consistency_checks(X_test, 'title') </pre>
+############################################# TF-IDF Embedding #############################################
 ############################################# Embedding #############################################
+from sklearn.feature_extraction.text import TfidfVectorizer
+print("Computing embeddings ...")
 y_test = X_test['labels']
 X_test = X_test['title']
+X_test_tfidf = tfidf_model.transform(X_test)
+#X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
+print("Embeddings computed!")
+prediction = model.predict(X_test_tfidf)
 </pre>
 # Accuracy
 <pre>label_map = {'NBC': 0, 'FoxNews': 1}