# File size: 1,782 Bytes
# 7cd7caf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
import time
import os

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
# Embedding model, referenced by name.
MODEL_NAME: str = "Snowflake/snowflake-arctic-embed-l-v2.0"

# CSV that is read, and the file the embeddings are written to.
INPUT_FILE: str = "chat_1turn.csv"
OUTPUT_FILE: str = "chat_embeddings.pt"

# Larger batches run faster when memory allows.
BATCH_SIZE: int = 128

# True when a CUDA device is visible to torch.
USE_GPU: bool = torch.cuda.is_available()

# Row cap for quick development runs (e.g. 1000).
MAX_ROWS: int = 2000

# 🔧 Sanity checks
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would silently skip the check.
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"❌ File not found: {INPUT_FILE}")

# 🚀 Load model
print(f"🧠 Loading model: {MODEL_NAME} {'[GPU]' if USE_GPU else '[CPU]'}")
model = SentenceTransformer(MODEL_NAME, device="cuda" if USE_GPU else "cpu")

# 📂 Load data
print("📂 Reading CSV...")
# `nrows` stops parsing once MAX_ROWS rows have been read; a falsy MAX_ROWS
# (None or 0) becomes None, which reads the whole file.
df = pd.read_csv(INPUT_FILE, nrows=MAX_ROWS or None)
if 'source' not in df.columns or 'target' not in df.columns:
    raise ValueError("❌ Missing 'source' or 'target' column!")

# Missing cells become empty strings; astype(str) then guarantees that every
# item handed to the encoder is a str even if a column was parsed as numeric.
sources = df['source'].fillna("").astype(str).tolist()
targets = df['target'].fillna("").astype(str).tolist()
# ⏱️ Embed all at once
def embed_all(texts, label):
    """Embed *texts* with the module-level ``model`` and log how long it took.

    Args:
        texts: List of strings to embed.
        label: Short name used in the progress and timing messages.

    Returns:
        The result of ``model.encode`` called with ``convert_to_tensor=True``
        and ``normalize_embeddings=True`` (presumably a single tensor with
        one normalized row per input text; confirm against the installed
        sentence-transformers version).
    """
    print(f"⚙️ Embedding {label} ({len(texts)} items)...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    # NOTE: `torch_dtype` is intentionally not passed here. It is not an
    # argument of SentenceTransformer.encode (it belongs to model loading),
    # so it either raised TypeError or had no effect.
    embeddings = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        convert_to_tensor=True,
        normalize_embeddings=True,
        show_progress_bar=True,
        device="cuda" if USE_GPU else "cpu",
    )
    print(f"✅ {label} embedding done in {time.perf_counter() - start:.2f}s")
    return embeddings

source_tensor = embed_all(sources, "source")
target_tensor = embed_all(targets, "target")

# 💾 Save
print(f"💾 Saving to {OUTPUT_FILE}...")
# Save CPU copies so the file can be loaded on a machine without CUDA;
# .cpu() returns the tensor unchanged when it already lives on the CPU.
torch.save(
    {"source": source_tensor.cpu(), "target": target_tensor.cpu()},
    OUTPUT_FILE,
)
print(f"✅ Saved {len(sources)} embeddings to {OUTPUT_FILE}")