Spaces:
Runtime error
Runtime error
Commit
·
fe4cd2a
1
Parent(s):
1a51fb3
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from nomic import atlas
|
| 3 |
+
import glob
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from datasets import load_dataset, concatenate_datasets
|
| 6 |
+
from sklearn.decomposition import PCA
|
| 7 |
+
|
| 8 |
+
# Collect every JSONL inference dump. glob returns matches in arbitrary,
# filesystem-dependent order, so sort to keep the row order of the combined
# dataset (and anything derived from row position) reproducible across runs.
files = sorted(glob.glob("inference/*.jsonl"))
print(files)

# Fail early with a clear message rather than letting the concatenation of an
# empty list error out further down.
if not files:
    raise FileNotFoundError('no files match "inference/*.jsonl"')

# Load each file as its own "train" split, then stack them into one dataset.
df = concatenate_datasets(
    [load_dataset("json", data_files=file, split="train") for file in tqdm(files)]
)

print(len(df))
print(df)
|
| 14 |
+
|
| 15 |
+
def _join_prompt_and_response(batch):
    """Build the "inputs" column: each prompt, a newline, then its response."""
    joined = []
    for prompt, response in zip(batch["prompt"], batch["response"]):
        joined.append(prompt + "\n" + response)
    return {"inputs": joined}


def _is_train_as_int(batch):
    """Build the "trained_on" column by casting every "is_train" value to int."""
    return {"trained_on": list(map(int, batch["is_train"]))}


# Both transforms run over batches of rows, spread across 64 worker processes.
df = df.map(_join_prompt_and_response, batched=True, num_proc=64)
df = df.map(_is_train_as_int, batched=True, num_proc=64)

# "trained_on" now carries this information, so drop the source column.
df = df.remove_columns("is_train")
|
| 24 |
+
|
| 25 |
+
# Text map: drop the token/label/vector columns so only the remaining
# metadata columns and the "inputs" text are uploaded.
text = df.remove_columns(["labels", "input_ids", "embeddings"])

# Iterating the dataset yields one dict per row, the same records that
# text[i] returns, without a separate random-access lookup for every index.
text_df = list(text)

atlas.map_text(
    text_df,
    indexed_field="inputs",
    name="CHANGE ME!",
    colorable_fields=["source", "loss", "trained_on"],
    reset_project_if_exists=True,
)
|
| 34 |
+
|
| 35 |
+
# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))

# max embed dim is 2048 for now
# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)

# PCA cannot produce more components than min(n_samples, n_features), so a
# fixed 2048 raises on small datasets or on embeddings narrower than 2048.
# Clamp to the matrix shape; whenever 2048 components are available the
# result is the same as before.
n_components = min(2048, *embeddings.shape)
embeddings = PCA(n_components=n_components).fit_transform(embeddings)

# The vectors are passed separately, so keep only the metadata columns and
# turn them into one dict per row.
data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

# NOTE(review): this uses the same placeholder project name as the
# atlas.map_text call earlier in the script, and both pass
# reset_project_if_exists=True -- presumably this call resets the text map's
# project. Give the two maps distinct names; confirm against the Atlas docs.
atlas.map_embeddings(
    embeddings,
    data=columns,
    id_field="index",
    name="CHANGE ME!",
    colorable_fields=["source", "loss", "trained_on"],
    build_topic_model=True,
    topic_label_field="inputs",
    reset_project_if_exists=True,
)
|