probejie committed
Commit 02f5e41 · verified · 1 Parent(s): c631a0a

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ grt-t5-xl/merged_triple_processed_new_withID.json filter=lfs diff=lfs merge=lfs -text
grt-t5-xl/document_embeddings.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3e7b075e43cbe0aa57358d9abf71cf2315b5be7f0ebd6a997ca2eb537a8adcc
+ size 15919002752
grt-t5-xl/encode.log ADDED
The diff for this file is too large to render. See raw diff
 
grt-t5-xl/encode.py ADDED
@@ -0,0 +1,73 @@
+ import faiss
+ import numpy as np
+ import json
+ from tqdm import tqdm
+ from multiprocessing import Pool, set_start_method
+ import torch
+ from sentence_transformers import SentenceTransformer
+
+ model_path = 'sentence-transformers/gtr-t5-xl'
+
+ def encode_sentences_on_gpu(params):
+     # Each worker loads its own copy of the model onto a dedicated GPU
+     sentences_chunk, device_id = params
+     device = torch.device(f'cuda:{device_id}')
+     model = SentenceTransformer(model_path, device=device)
+     embeddings = model.encode(
+         sentences_chunk,
+         batch_size=424,
+         show_progress_bar=True,
+         convert_to_numpy=True,
+         normalize_embeddings=True,
+     )
+     return embeddings
+
+ if __name__ == '__main__':
+     # 'spawn' gives each worker process a clean CUDA context
+     set_start_method('spawn', force=True)
+
+     print("Loading data...")
+     with open("merged_triple_processed_new_withID.json", "r") as fi:
+         data = json.load(fi)
+
+     sentences = [_['contents'] for _ in data]
+     print(f"Number of chunks: {len(sentences)}")
+
+     num_gpus = torch.cuda.device_count()
+     print(f"Number of GPUs: {num_gpus}")
+
+     # Split the corpus into one chunk per GPU
+     sentences_chunks = np.array_split(sentences, num_gpus)
+     params = [(sentences_chunks[i].tolist(), i) for i in range(num_gpus)]
+
+     # Overall progress bar across the GPU workers
+     print("Starting encoding process...")
+     with Pool(processes=num_gpus) as pool:
+         embeddings_list = list(tqdm(
+             pool.imap(encode_sentences_on_gpu, params),
+             total=num_gpus,
+             desc='Overall progress'
+         ))
+
+     print("Concatenating embeddings...")
+     sentence_embeddings = np.concatenate(embeddings_list, axis=0)
+
+     # Build a flat inner-product index; with L2-normalized embeddings,
+     # inner product equals cosine similarity
+     print("Creating FAISS index...")
+     dim = sentence_embeddings.shape[1]
+     faiss_index = faiss.IndexFlatIP(dim)
+
+     print("Adding embeddings to FAISS index...")
+     faiss_index.add(sentence_embeddings)
+
+     # Save the index and the embeddings
+     print("Saving FAISS index...")
+     faiss_index_file = 'faiss_index.bin'
+     faiss.write_index(faiss_index, faiss_index_file)
+     print(f"FAISS index saved to {faiss_index_file}")
+
+     print("Saving embeddings...")
+     embeddings_file = 'document_embeddings.npy'
+     np.save(embeddings_file, sentence_embeddings)
+     print(f"Document embeddings saved to {embeddings_file}")
grt-t5-xl/faiss_index.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7964d1a43f2f893a5b5822410e8fa12bed662383aeae016c669abcb02248b9aa
+ size 15919002669
grt-t5-xl/merged_triple_processed_new_withID.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94b1b64e4217e11c1a4eacb788d08e24c1b9fe62efa0870956bf4150105d63f4
+ size 4428096608
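
For reference, encode.py reads each record's 'contents' field from this JSON file, so it is presumably a list of objects shaped roughly like the sketch below. Only 'contents' is actually required by the script; the 'id' key is an assumption based on the "withID" suffix in the file name:

    [
      {"id": "0", "contents": "text of the first chunk ..."},
      {"id": "1", "contents": "text of the second chunk ..."}
    ]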