Update README.md
Browse files
README.md
CHANGED
|
@@ -49,6 +49,13 @@ filename='lr_clf_test2.joblib'
|
|
| 49 |
model_file_path=hf_hub_download(repo_id=repo_id, filename=filename) <br>
|
| 50 |
model=joblib.load(model_file_path)
|
| 51 |
print(model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
#Load the test dataset (assuming its file name is the same as the one in the Ed post) <br>
|
| 54 |
test_df = pd.read_csv(file_path)
|
|
@@ -215,78 +222,22 @@ X_test = X_test.dropna(subset = ['title'])
|
|
| 215 |
X_test = handle_missing_data(X_test, 'title')
|
| 216 |
X_test = consistency_checks(X_test, 'title') </pre>
|
| 217 |
|
| 218 |
-
# Load the embedding model from Hugging Face (transformer: DistilBERT)
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
<pre>
|
| 222 |
-
def get_embeddings(text_all, tokenizer, model, device, max_len=128):
|
| 223 |
-
'''
|
| 224 |
-
Generate embeddings using a transformer model on GPU if available.
|
| 225 |
-
Args:
|
| 226 |
-
- text_all: List of input texts
|
| 227 |
-
- tokenizer: Tokenizer for the model
|
| 228 |
-
- model: Transformer model
|
| 229 |
-
- device: torch.device to run the computations
|
| 230 |
-
- max_len: Maximum token length for the input
|
| 231 |
-
Returns:
|
| 232 |
-
- embeddings: List of embeddings for each input text
|
| 233 |
-
'''
|
| 234 |
-
embeddings = []
|
| 235 |
-
|
| 236 |
-
count = 0
|
| 237 |
-
print('Start embeddings:')
|
| 238 |
-
|
| 239 |
-
for text in text_all:
|
| 240 |
-
count += 1
|
| 241 |
-
if count % (len(text_all) // 10) == 0:
|
| 242 |
-
print(f'{count / len(text_all) * 100:.1f}% done ...')
|
| 243 |
-
|
| 244 |
-
# Tokenize the input text
|
| 245 |
-
model_input_token = tokenizer(
|
| 246 |
-
text,
|
| 247 |
-
add_special_tokens=True,
|
| 248 |
-
max_length=max_len,
|
| 249 |
-
padding='max_length',
|
| 250 |
-
truncation=True,
|
| 251 |
-
return_tensors='pt'
|
| 252 |
-
).to(device) # Move input tensors to GPU
|
| 253 |
-
|
| 254 |
-
# Generate embeddings without gradient computation
|
| 255 |
-
with torch.no_grad():
|
| 256 |
-
model_output = model(**model_input_token)
|
| 257 |
-
cls_embedding = model_output.last_hidden_state[:, 0, :] # Use CLS token embedding
|
| 258 |
-
cls_embedding = cls_embedding.squeeze().cpu().numpy() # Move back to CPU for numpy
|
| 259 |
-
embeddings.append(cls_embedding)
|
| 260 |
-
|
| 261 |
-
return embeddings </pre>
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
# Check for GPU availability
|
| 265 |
-
<pre>
|
| 266 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 267 |
-
print(f'Using device: {device}')
|
| 268 |
-
|
| 269 |
-
# Load the tokenizer and model for 'distilbert-base-uncased'
|
| 270 |
-
print("Loading model and tokenizer...")
|
| 271 |
-
# Load model and tokenizer
|
| 272 |
-
tokenizer_news = AutoTokenizer.from_pretrained('distilbert-base-uncased')
|
| 273 |
-
model_news = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
|
| 274 |
|
| 275 |
-
# Set the model to evaluation mode
|
| 276 |
-
model_news.eval()
|
| 277 |
|
| 278 |
-
#############################################
|
| 279 |
############################################# Embedding #############################################
|
| 280 |
-
|
|
|
|
| 281 |
|
| 282 |
y_test = X_test['labels']
|
| 283 |
X_test = X_test['title']
|
| 284 |
-
|
| 285 |
-
X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
|
| 286 |
-
print("DBERT embeddings for training data computed!")
|
| 287 |
|
|
|
|
| 288 |
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
| 290 |
</pre>
|
| 291 |
# Accuracy
|
| 292 |
<pre>label_map = {'NBC': 0, 'FoxNews': 1}
|
|
|
|
| 49 |
model_file_path=hf_hub_download(repo_id=repo_id, filename=filename) <br>
|
| 50 |
model=joblib.load(model_file_path)
|
| 51 |
print(model)
|
| 52 |
+
|
| 53 |
+
repo_id2='awngsz/tfidf_model' ############# <--- check tfidf model name
|
| 54 |
+
filename2='embed_tfidf.joblib'
|
| 55 |
+
|
| 56 |
+
model_file_path2=hf_hub_download(repo_id=repo_id2, filename=filename2) <br>
|
| 57 |
+
tfidf_model=joblib.load(model_file_path2)
|
| 58 |
+
print(tfidf_model)
|
| 59 |
|
| 60 |
#Load the test dataset (assuming its file name is the same as the one in the Ed post) <br>
|
| 61 |
test_df = pd.read_csv(file_path)
|
|
|
|
| 222 |
X_test = handle_missing_data(X_test, 'title')
|
| 223 |
X_test = consistency_checks(X_test, 'title') </pre>
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
|
|
|
|
|
|
| 226 |
|
| 227 |
+
############################################# TF-IDF Embedding #############################################
|
| 228 |
############################################# Embedding #############################################
|
| 229 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 230 |
+
print("Computing embeddings ...")
|
| 231 |
|
| 232 |
y_test = X_test['labels']
|
| 233 |
X_test = X_test['title']
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
+
X_test_tfidf = tfidf_model.transform(X_test)
|
| 236 |
|
| 237 |
+
#X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
|
| 238 |
+
print("Embeddings computed!")
|
| 239 |
+
|
| 240 |
+
prediction = model.predict(X_test_tfidf)
|
| 241 |
</pre>
|
| 242 |
# Accuracy
|
| 243 |
<pre>label_map = {'NBC': 0, 'FoxNews': 1}
|