typesdigital committed on
Commit
fe4cd2a
·
1 Parent(s): 1a51fb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build Nomic Atlas maps from inference results stored as JSONL files.

The script reads every ``inference/*.jsonl`` file, concatenates them into one
dataset, derives two columns (``inputs`` and ``trained_on``), and then uploads

1. a text map indexed on ``inputs``, and
2. an embedding map built from the ``embeddings`` column after PCA.
"""
import glob
import os

import numpy as np
from datasets import concatenate_datasets, load_dataset
from nomic import atlas
from sklearn.decomposition import PCA
from tqdm import tqdm

# Worker processes for Dataset.map: at most 64, but never more than the
# machine actually has, to avoid oversubscribing small hosts.
NUM_PROC = min(64, os.cpu_count() or 1)

# max embed dim is 2048 for now
MAX_EMBED_DIM = 2048

# Metadata fields offered for colouring in both maps.
COLORABLE_FIELDS = ["source", "loss", "trained_on"]

# NOTE(review): both maps use the same placeholder name together with
# reset_project_if_exists=True, so the second upload presumably resets the
# project created by the first one -- pick two distinct names before running.
TEXT_PROJECT_NAME = "CHANGE ME!"
EMBEDDING_PROJECT_NAME = "CHANGE ME!"


def add_derived_columns(batch):
    """Return the derived ``inputs`` and ``trained_on`` columns for a batch.

    ``inputs`` is the prompt and the response joined by a newline;
    ``trained_on`` is ``is_train`` converted to ``int``.
    """
    return {
        "inputs": [
            prompt + "\n" + response
            for prompt, response in zip(batch["prompt"], batch["response"])
        ],
        "trained_on": [int(flag) for flag in batch["is_train"]],
    }


files = glob.glob("inference/*.jsonl")
print(files)
if not files:
    # Fail early with an actionable message instead of an error raised from
    # inside concatenate_datasets on an empty list.
    raise FileNotFoundError("no files matched 'inference/*.jsonl'")

df = concatenate_datasets(
    [load_dataset("json", data_files=file, split="train") for file in tqdm(files)]
)

print(len(df))
print(df)

# One pass adds both derived columns and drops the raw ``is_train`` flag.
df = df.map(
    add_derived_columns,
    batched=True,
    num_proc=NUM_PROC,
    remove_columns=["is_train"],
)

# --- text map ---------------------------------------------------------------
text = df.remove_columns(["labels", "input_ids", "embeddings"])
# Iterating a Dataset yields one dict per row.
text_df = list(text)

atlas.map_text(
    text_df,
    indexed_field="inputs",
    name=TEXT_PROJECT_NAME,
    colorable_fields=list(COLORABLE_FIELDS),
    reset_project_if_exists=True,
)

# --- embedding map ----------------------------------------------------------
# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))

# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)

# PCA cannot produce more components than min(n_samples, n_features), so clamp
# the target dimension; a fixed 2048 raises ValueError on smaller inputs.
n_components = min(MAX_EMBED_DIM, *embeddings.shape)
embeddings = PCA(n_components=n_components).fit_transform(embeddings)

data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

atlas.map_embeddings(
    embeddings,
    data=columns,
    id_field="index",
    name=EMBEDDING_PROJECT_NAME,
    colorable_fields=list(COLORABLE_FIELDS),
    build_topic_model=True,
    topic_label_field="inputs",
    reset_project_if_exists=True,
)