Spaces:

Duplicated from Levimichael4/RideSearch

Levimichael4
/

RideSearchhhhh

Sleeping

App Files Files Community

RideSearchhhhh / create_embeddings.py

Levimichael4's picture

Upload 14 files

d0e7248 verified 5 months ago

history blame contribute delete

848 Bytes


	import numpy as np, pandas as pd
	from sentence_transformers import SentenceTransformer
	from sklearn.preprocessing import StandardScaler

	# Build two embedding matrices from the dataset CSV and persist them as
	# .npy files: one from the free-text column, one from the numeric columns.
	CSV = 'RideSearch_dataset.csv'
	df = pd.read_csv(CSV)

	# --- Text embeddings -------------------------------------------------
	# Every value of `text_record` is coerced to str before encoding, so a
	# missing cell is encoded as its string form rather than raising.
	model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
	text_records = df['text_record'].astype(str).tolist()
	te = model.encode(
	    text_records,
	    batch_size=256,
	    show_progress_bar=True,
	    normalize_embeddings=True,  # ask the encoder for normalized vectors
	)
	np.save('emb_text.npy', np.asarray(te, dtype=np.float32))

	# --- Numeric features ------------------------------------------------
	NUM = [
	    'horsepower',
	    'zero_to_100_kmh_s',
	    'seats',
	    'cargo_liters',
	    'price_usd',
	    'popularity_score',
	    'comfort_score',
	    'reliability_score',
	    'tech_score',
	    'ownership_cost_score',
	    'safety_rating',
	]
	# Work on a copy so the sign flip below never touches `df` itself.
	X = df[NUM].copy()
	# Flip the sign of the 0-100 km/h time.
	# NOTE(review): presumably so that a larger value means quicker
	# acceleration, matching a "higher is more" reading of the other
	# columns - confirm against the code that consumes emb_num.npy.
	X['zero_to_100_kmh_s'] = -X['zero_to_100_kmh_s']

	# Standardize each column (per-column mean removed, scaled to unit
	# variance) on a float32 view of the table, then store as float32.
	numeric_matrix = X.to_numpy().astype('float32')
	scaler = StandardScaler()
	Xs = scaler.fit_transform(numeric_matrix)
	np.save('emb_num.npy', Xs.astype(np.float32))

	print('Saved emb_text.npy and emb_num.npy')