# NOTE(extraction): GitHub page chrome (file size, commit hash, line-number
# gutter) was captured with this script; commented out so the file parses.
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
import time
import os
# --- Configuration ----------------------------------------------------------
INPUT_FILE = "chat_1turn.csv"        # source CSV with 'source'/'target' columns
OUTPUT_FILE = "chat_embeddings.pt"   # torch.save destination for the tensors
MODEL_NAME = "Snowflake/snowflake-arctic-embed-l-v2.0"
BATCH_SIZE = 128                     # large batches keep the device saturated
USE_GPU = torch.cuda.is_available()  # auto-detect CUDA at startup
MAX_ROWS = 2000                      # truncate input for quick dev runs; falsy keeps all rows
# Sanity check: raise explicitly instead of `assert`, which is silently
# stripped when Python runs with -O, leaving a confusing pandas error later.
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"File not found: {INPUT_FILE}")

# Load the embedding model, preferring GPU when CUDA is available.
print(f"Loading model: {MODEL_NAME} {'[GPU]' if USE_GPU else '[CPU]'}")
model = SentenceTransformer(MODEL_NAME, device="cuda" if USE_GPU else "cpu")
# Load the CSV and extract the two text columns.
print("Reading CSV...")
df = pd.read_csv(INPUT_FILE)

# Validate schema explicitly (an `assert` would vanish under `python -O`).
missing = {'source', 'target'} - set(df.columns)
if missing:
    raise ValueError(f"Missing column(s) in {INPUT_FILE}: {sorted(missing)}")

# Optional truncation for fast development iterations.
if MAX_ROWS:
    df = df.head(MAX_ROWS)

# NaN cells become empty strings so the encoder never receives floats.
sources = df['source'].fillna("").tolist()
targets = df['target'].fillna("").tolist()
# β±οΈ Embed all at once
def embed_all(texts, label):
print(f"βοΈ Embedding {label} ({len(texts)} items)...")
start = time.time()
embeddings = model.encode(
texts,
batch_size=BATCH_SIZE,
convert_to_tensor=True,
normalize_embeddings=True,
show_progress_bar=True,
device="cuda" if USE_GPU else "cpu",
torch_dtype=torch.int8
)
print(f"β
{label} embedding done in {time.time() - start:.2f}s")
return embeddings
# Embed both columns, then persist the two tensors together in one file.
source_tensor = embed_all(sources, "source")
target_tensor = embed_all(targets, "target")

print(f"Saving to {OUTPUT_FILE}...")
torch.save({"source": source_tensor, "target": target_tensor}, OUTPUT_FILE)
# NOTE: the original print statement was split mid-string by a mangled
# emoji (L50-51), which was a syntax error; reconstructed as one line.
print(f"Saved {len(sources)} embeddings to {OUTPUT_FILE}")
# (end of extracted script; trailing gutter character removed)