probejie committed
Commit 02f5e41 · verified · 1 Parent(s): c631a0a

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ grt-t5-xl/merged_triple_processed_new_withID.json filter=lfs diff=lfs merge=lfs -text
grt-t5-xl/document_embeddings.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3e7b075e43cbe0aa57358d9abf71cf2315b5be7f0ebd6a997ca2eb537a8adcc
+ size 15919002752
grt-t5-xl/encode.log ADDED
The diff for this file is too large to render. See raw diff
 
grt-t5-xl/encode.py ADDED
@@ -0,0 +1,73 @@
+ import faiss
+ import numpy as np
+ import json
+ from tqdm import tqdm
+ from multiprocessing import Pool, set_start_method
+ import torch
+ from sentence_transformers import SentenceTransformer
+
+ model_path = 'sentence-transformers/gtr-t5-xl'
+
+ def encode_sentences_on_gpu(params):
+     # Each worker loads its own copy of the model onto a dedicated GPU
+     sentences_chunk, device_id = params
+     device = torch.device(f'cuda:{device_id}')
+     model = SentenceTransformer(model_path, device=device)
+     embeddings = model.encode(
+         sentences_chunk,
+         batch_size=424,
+         show_progress_bar=True,
+         convert_to_numpy=True,
+         normalize_embeddings=True,
+     )
+     return embeddings
+
+ if __name__ == '__main__':
+     # 'spawn' gives each worker process a clean CUDA context
+     set_start_method('spawn', force=True)
+
+     print("Loading data...")
+     with open("merged_triple_processed_new_withID.json", "r") as fi:
+         data = json.load(fi)
+
+     sentences = [_['contents'] for _ in data]
+     print(f"Number of chunks: {len(sentences)}")
+
+     num_gpus = torch.cuda.device_count()
+     print(f"Number of GPUs: {num_gpus}")
+
+     # Split the corpus into one chunk per GPU
+     sentences_chunks = np.array_split(sentences, num_gpus)
+     params = [(sentences_chunks[i].tolist(), i) for i in range(num_gpus)]
+
+     # Overall progress bar across the GPU workers
+     print("Starting encoding process...")
+     with Pool(processes=num_gpus) as pool:
+         embeddings_list = list(tqdm(
+             pool.imap(encode_sentences_on_gpu, params),
+             total=num_gpus,
+             desc='Overall progress'
+         ))
+
+     print("Concatenating embeddings...")
+     sentence_embeddings = np.concatenate(embeddings_list, axis=0)
+
+     # Build a flat inner-product index; with L2-normalized embeddings,
+     # inner product equals cosine similarity
+     print("Creating FAISS index...")
+     dim = sentence_embeddings.shape[1]
+     faiss_index = faiss.IndexFlatIP(dim)
+
+     print("Adding embeddings to FAISS index...")
+     faiss_index.add(sentence_embeddings)
+
+     # Save the index and the embeddings
+     print("Saving FAISS index...")
+     faiss_index_file = 'faiss_index.bin'
+     faiss.write_index(faiss_index, faiss_index_file)
+     print(f"FAISS index saved to {faiss_index_file}")
+
+     print("Saving embeddings...")
+     embeddings_file = 'document_embeddings.npy'
+     np.save(embeddings_file, sentence_embeddings)
+     print(f"Document embeddings saved to {embeddings_file}")
grt-t5-xl/faiss_index.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7964d1a43f2f893a5b5822410e8fa12bed662383aeae016c669abcb02248b9aa
+ size 15919002669
grt-t5-xl/merged_triple_processed_new_withID.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94b1b64e4217e11c1a4eacb788d08e24c1b9fe62efa0870956bf4150105d63f4
+ size 4428096608
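
For reference, encode.py reads each record's 'contents' field from this JSON file, so it is presumably a list of objects shaped roughly like the sketch below. Only 'contents' is actually required by the script; the 'id' key is an assumption based on the "withID" suffix in the file name:

    [
      {"id": "0", "contents": "text of the first chunk ..."},
      {"id": "1", "contents": "text of the second chunk ..."}
    ]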