Spaces:
Sleeping
Sleeping
| import os | |
| import numpy as np | |
| import pandas as pd | |
| # Lazy import to allow CPU-only envs | |
| from sentence_transformers import SentenceTransformer | |
| DATASET_CSV = os.getenv("DATASET_CSV", "cars1200_text_dataset.csv") | |
| EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy") | |
| ID_PATH = os.getenv("ID_PATH", "ids.csv") | |
| MODEL_NAME = os.getenv("MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2") | |
| TEXT_COL = os.getenv("TEXT_COL", "text_record") | |
| def main(): | |
| if not os.path.exists(DATASET_CSV): | |
| raise FileNotFoundError(f"Dataset not found: {DATASET_CSV}") | |
| df = pd.read_csv(DATASET_CSV) | |
| if TEXT_COL not in df.columns: | |
| raise KeyError(f"Column '{TEXT_COL}' not found in {DATASET_CSV}.") | |
| print(f"Loading model: {MODEL_NAME}") | |
| model = SentenceTransformer(MODEL_NAME) | |
| texts = df[TEXT_COL].astype(str).tolist() | |
| print(f"Encoding {len(texts)} records...") | |
| embs = model.encode(texts, batch_size=256, show_progress_bar=True, normalize_embeddings=True) | |
| embs = np.asarray(embs, dtype="float32") | |
| np.save(EMB_PATH, embs) | |
| df[["name","make","model","trim","year"]].to_csv(ID_PATH, index=False) | |
| print(f"Saved embeddings to {EMB_PATH} and ids to {ID_PATH}") | |
| if __name__ == "__main__": | |
| main() | |