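# Encode every chunk of a JSON corpus with sentence-transformers/gtr-t5-large,
# sharding the work across all visible GPUs, then store the normalized
# embeddings in a FAISS inner-product index for dense retrieval.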
import json
from multiprocessing import Pool

import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# An overall tqdm progress bar wraps the GPU workers below.
print("Loading data...")
with open("merged_triple_processed_new_withID.json", "r") as fi:
    data = json.load(fi)
sentences = [doc['contents'] for doc in data]
print(f"Number of chunks: {len(sentences)}")
model_path = 'sentence-transformers/gtr-t5-large'
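# Each worker process loads its own copy of the model onto its assigned GPU.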
def encode_sentences_on_gpu(params):
    sentences_chunk, device_id = params
    device = torch.device(f'cuda:{device_id}')
    model = SentenceTransformer(model_path, device=device)
    # Note: encode() has no 'desc' kwarg; show_progress_bar gives the per-GPU bar.
    embeddings = model.encode(
        sentences_chunk,
        batch_size=512,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return embeddings
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs: {num_gpus}")
# np.array_split returns numpy arrays; convert each chunk back to a plain list.
sentences_chunks = [list(chunk) for chunk in np.array_split(sentences, num_gpus)]
params = [(sentences_chunks[i], i) for i in range(num_gpus)]
print("Starting encoding process...")
with Pool(processes=num_gpus) as pool:
    embeddings_list = list(tqdm(
        pool.imap(encode_sentences_on_gpu, params),
        total=num_gpus,
        desc='Overall progress'
    ))
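# pool.imap yields results in input order, so row i of the concatenated
# matrix lines up with sentences[i] (and data[i]).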
print("Concatenating embeddings...")
sentence_embeddings = np.concatenate(embeddings_list, axis=0)
# Create a FAISS index
print("Creating FAISS index...")
dim = sentence_embeddings.shape[1]
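# Exact inner-product index; with L2-normalized embeddings, inner product
# equals cosine similarity. FAISS expects float32, which encode() returns
# by default.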
faiss_index = faiss.IndexFlatIP(dim)
# Progress message
print("Adding embeddings to FAISS index...")
faiss_index.add(sentence_embeddings)
# Save the index and the embeddings
print("Saving FAISS index...")
faiss_index_file = 'faiss_index.bin'
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")
print("Saving embeddings...")
embeddings_file = 'document_embeddings.npy'
np.save(embeddings_file, sentence_embeddings)
print(f"Document embeddings saved to {embeddings_file}")