RideSearchhhhh / create_embeddings.py
Levimichael4's picture
Upload 14 files
d0e7248 verified
raw
history blame contribute delete
848 Bytes
import numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
# Build and save two per-row matrices from the dataset CSV:
#   emb_text.npy - sentence embeddings of the 'text_record' column
#   emb_num.npy  - standardized numeric feature columns
CSV = 'RideSearch_dataset.csv'
NUM = [
    'horsepower',
    'zero_to_100_kmh_s',
    'seats',
    'cargo_liters',
    'price_usd',
    'popularity_score',
    'comfort_score',
    'reliability_score',
    'tech_score',
    'ownership_cost_score',
    'safety_rating',
]

df = pd.read_csv(CSV)

# --- Text embeddings -------------------------------------------------------
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Cast to str first so every entry handed to the encoder is a string.
texts = df['text_record'].astype(str).tolist()
text_embeddings = model.encode(
    texts,
    batch_size=256,
    show_progress_bar=True,
    normalize_embeddings=True,
)
np.save('emb_text.npy', np.asarray(text_embeddings, dtype=np.float32))

# --- Numeric features ------------------------------------------------------
# Flip the sign of the 0-100 km/h time before scaling.
# NOTE(review): presumably so that "larger is better" holds for this column
# like the other features - confirm against how emb_num.npy is consumed.
numeric = df[NUM].assign(
    zero_to_100_kmh_s=lambda frame: -frame['zero_to_100_kmh_s'],
)
scaler = StandardScaler()
scaled = scaler.fit_transform(numeric.to_numpy().astype(np.float32))
np.save('emb_num.npy', scaled.astype(np.float32))

print('Saved emb_text.npy and emb_num.npy')