Spaces:

sunil448832
/

retrieval-augment-generation

Runtime error

App Files Files Community

retrieval-augment-generation / ingest.py

sunil448832

Initial Commit

eccde2c over 2 years ago

raw

history blame contribute delete

2.37 kB

	from data_processor import DocumentReader, SentenceSplitter
	from models import EmbeddingModel
	from vector_store import FaissVectorStore
	from tqdm import tqdm
	import argparse

	def build_embedding_database(documents, embedding_model, batch_size=16, progress=None):
	    """Embed *documents* in batches and index the results by document id.

	    Args:
	        documents: Sliceable sequence of chunk objects; each one must expose
	            a ``text`` attribute (the string to embed) and a ``doc_id``
	            attribute (used as the dictionary key).
	        embedding_model: Object whose ``encode(list_of_texts)`` returns an
	            indexable collection holding one vector per input text, in order.
	        batch_size: Number of documents passed to ``encode`` per call.
	        progress: Optional callable that wraps the iterable of batch offsets
	            (for example ``tqdm``) to report progress. ``None`` disables it.

	    Returns:
	        A dict mapping ``doc_id`` to ``{'document': <chunk>, 'vector': <embedding>}``.
	        If two chunks share a ``doc_id`` the later one wins.
	    """
	    batch_starts = range(0, len(documents), batch_size)
	    if progress is not None:
	        batch_starts = progress(batch_starts)

	    database_documents = {}
	    for start in batch_starts:
	        batch = documents[start:start + batch_size]

	        # Generate embeddings for the whole batch in a single model call.
	        embeddings = embedding_model.encode([doc.text for doc in batch])

	        # Pair each chunk with its vector. The index gets its own name so it
	        # cannot clobber the batch offset of the enclosing loop.
	        for position, doc in enumerate(batch):
	            database_documents[doc.doc_id] = {'document': doc, 'vector': embeddings[position]}
	    return database_documents


	def main():
	    """Read a document, embed its chunks and write them to a Faiss vector store."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--data_path", default='data/KnowledgeDocument(pan_card_services).txt',help="Input file name")
	    parser.add_argument("--vector_database_path", default='vector_db',help="Vector database which store embeddings vector")
	    args = parser.parse_args()

	    # Paths to the input data and to the output vector database.
	    data_path = args.data_path
	    vector_database_path = args.vector_database_path

	    # Read the document from the specified path.
	    documents = DocumentReader.read_document(data_path)

	    # Split the document into chunks with the specified chunk parameters.
	    splitter = SentenceSplitter(chunk_size=60, chunk_overlap=20)
	    splitted_documents = splitter.split_texts(documents)

	    # Initialize the embedding model.
	    embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')

	    print("Generating embedding vectors....")
	    database_documents = build_embedding_database(
	        splitted_documents, embedding_model, batch_size=16, progress=tqdm
	    )
	    print("Total embeddings: ",len(database_documents))

	    # Create a Faiss vector store from the processed documents and vectors.
	    # NOTE(review): nlists/nprobe are presumably the IVF cluster count and the
	    # number of clusters probed per query; confirm against FaissVectorStore
	    # that very small inputs (fewer chunks than nlists) are handled.
	    vector_store = FaissVectorStore.from_documents(database_documents, dimension=embedding_model.embedding_dim, nlists=100, nprobe=10)

	    # Write the vector store to the specified path.
	    vector_store.write(vector_database_path)
	    print(f"Successfully written embedding vectors to {vector_database_path} .")


	if __name__ == '__main__':
	    main()