Simrandhiman committed on
Commit
decfa0a
·
verified ·
1 Parent(s): 0a040df

Create embed.py

Browse files
Files changed (1) hide show
  1. embed.py +50 -0
embed.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embed.py
2
+ # Purpose: turn text-based columns (e.g., city, notes) or entire customer row into dense embeddings using
3
+ # a sentence-transformers model from Hugging Face.
4
+
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer
9
+ import joblib
10
+
11
+
12
+ MODEL_NAME = 'all-MiniLM-L6-v2' # lightweight sentence-transformer
13
+
14
+
15
+
16
+
17
def build_text_for_embedding(df: pd.DataFrame, text_cols=None):
    """Combine text columns into one ' | '-separated string per row.

    Args:
        df: Source table; one output string is produced per row, in row order.
        text_cols: A column name, or an iterable of column names, to combine.
            When None, the first three object- or string-typed columns of
            ``df`` are used.

    Returns:
        A list with one string per row of ``df``. Missing values contribute
        an empty string, so a row whose first column is missing starts
        with ' | '.

    Raises:
        ValueError: If there are no columns to combine (none requested, or
            no text-like column found during auto-detection).
        KeyError: If a requested column is not present in ``df`` (raised by
            pandas during column selection).
    """
    if text_cols is None:
        # Inspect the dtypes rather than the values so that both the classic
        # object dtype and the dedicated pandas string dtype are picked up.
        is_obj = pd.api.types.is_object_dtype
        is_text = pd.api.types.is_string_dtype
        candidates = [
            name for name, dtype in df.dtypes.items()
            if is_obj(dtype) or is_text(dtype)
        ]
        text_cols = candidates[:3]  # at most 3
    elif isinstance(text_cols, str):
        # A bare column name would otherwise select a Series, not a frame.
        text_cols = [text_cols]
    else:
        text_cols = list(text_cols)

    if not text_cols:
        raise ValueError(
            'No text columns available to build embedding text from; '
            'pass text_cols explicitly.'
        )

    # Cast to object before blanking missing values: categorical and nullable
    # columns refuse '' as a fill value in their native dtype.
    block = df[text_cols].astype(object)
    block = block.where(block.notna(), '').astype(str)
    return [
        ' | '.join(row)
        for row in block.itertuples(index=False, name=None)
    ]
25
+
26
+
27
+
28
+
29
def embed_texts(texts, model_name=MODEL_NAME, device='cpu'):
    """Encode texts with a sentence-transformers model.

    Args:
        texts: The strings to embed (e.g. the list returned by
            ``build_text_for_embedding``).
        model_name: Name of the sentence-transformers model to load.
        device: Device the model is loaded on (e.g. 'cpu' or 'cuda').

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``.
        NOTE(review): presumably an array with one row per input text, since
        the script saves it with ``np.save`` -- confirm against the library.
    """
    # Forward the requested device; previously the argument was accepted but
    # never used, so the caller's choice was silently ignored.
    model = SentenceTransformer(model_name, device=device)
    return model.encode(texts, show_progress_bar=True)
33
+
34
+
35
+
36
+
37
if __name__ == '__main__':
    # Command-line entry point: read a parquet feature table, build one text
    # string per row, embed the strings and save the result as a .npy file.
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('--features', default='data/features.parquet')
    parser.add_argument('--out_emb', default='data/embeddings.npy')
    parser.add_argument('--text_cols', nargs='*', default=None)
    args = parser.parse_args()

    # np.save appends '.npy' when the name lacks it; normalise up front so
    # the path reported at the end is the file that was actually written.
    out_path = args.out_emb
    if not out_path.endswith('.npy'):
        out_path += '.npy'

    # Create the output directory before the expensive embedding step so an
    # unusable path fails early instead of after the model has run.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.read_parquet(args.features)
    # A bare `--text_cols` yields an empty list; treat it as "not specified"
    # so column auto-detection still applies.
    texts = build_text_for_embedding(df, text_cols=args.text_cols or None)
    embs = embed_texts(texts)
    np.save(out_path, embs)
    print('Saved embeddings to', out_path)