|
|
import argparse |
|
|
|
|
|
import weave |
|
|
from dataloaders.langchain import FinanceBenchDataloader |
|
|
|
|
|
from rag_pipelines.embeddings.dense import DenseEmbeddings |
|
|
from rag_pipelines.embeddings.sparse_pinecone_text import SparseEmbeddings |
|
|
from rag_pipelines.vectordb.pinecone_hybrid_index import PineconeHybridVectorDB |
|
|
|
|
|
|
|
|
def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments for the FinanceBench pipeline.

    Arguments are grouped by concern: Weave tracing, dataset selection,
    dense-embedding configuration, sparse-embedding configuration, semantic
    chunking, and Pinecone index settings.

    Returns:
        argparse.Namespace: Parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Process FinanceBench data, generate embeddings, and add processed documents to a Pinecone hybrid index."
    )

    # --- Weave tracing ---
    parser.add_argument(
        "--project_name",
        type=str,  # added for consistency with every other string argument
        required=True,
        help="Weave project name to initialize tracing.",
    )

    # --- Dataset selection ---
    parser.add_argument(
        "--dataset_name",
        type=str,
        required=True,
        help="Name of the FinanceBench dataset (e.g., 'PatronusAI/financebench').",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="train[:1]",
        help="Dataset split to use (e.g., 'train[:1]').",
    )

    # --- Dense embeddings ---
    parser.add_argument(
        "--dense_model_name",
        type=str,
        required=True,
        help="Dense embedding model name (e.g., 'sentence-transformers/all-MiniLM-L6-v2').",
    )
    parser.add_argument(
        "--dense_device",
        type=str,
        default="cpu",
        help="Device to run the dense embedding model (e.g., 'cpu' or 'cuda').",
    )
    parser.add_argument(
        "--normalize_embeddings",
        action="store_true",
        help="Flag to normalize embeddings during encoding.",
    )
    parser.add_argument(
        "--show_progress",
        action="store_true",
        help="Flag to show progress during embedding generation.",
    )

    # --- Sparse embeddings ---
    parser.add_argument(
        "--sparse_max_seq_length",
        type=int,
        required=True,
        help="Maximum sequence length for sparse embeddings.",
    )

    # --- Semantic chunking ---
    # NOTE(review): this option is parsed but never read by main() in this
    # file — confirm whether downstream code consumes it or it can be removed.
    parser.add_argument(
        "--chunking_threshold_type",
        type=str,
        default="percentile",
        help="Threshold type for semantic chunking (e.g., 'percentile' or 'absolute').",
    )

    # --- Pinecone index settings ---
    # NOTE(review): passing the API key on the command line exposes it in
    # shell history and process listings; consider reading it from an
    # environment variable instead.
    parser.add_argument(
        "--pinecone_api_key",
        type=str,
        required=True,
        help="API key for the Pinecone vector database.",
    )
    parser.add_argument(
        "--pinecone_index_name",
        type=str,
        required=True,
        help="Name of the Pinecone index.",
    )
    parser.add_argument(
        "--pinecone_dimension",
        type=int,
        required=True,
        help="Vector dimension in the Pinecone index.",
    )
    parser.add_argument(
        "--pinecone_metric",
        type=str,
        required=True,
        help="Similarity metric for the Pinecone index (e.g., 'dotproduct' or 'cosine').",
    )
    parser.add_argument(
        "--pinecone_region",
        type=str,
        required=True,
        help="Pinecone region (e.g., 'us-east-1').",
    )
    parser.add_argument(
        "--pinecone_cloud",
        type=str,
        required=True,
        help="Pinecone cloud provider (e.g., 'aws').",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        required=True,
        help="Namespace for document storage in Pinecone.",
    )

    return parser.parse_args()
|
|
|
|
|
|
|
|
def main() -> None:
    """Run the FinanceBench ingestion pipeline end to end.

    Steps:
        1. Parse CLI arguments and initialize Weave tracing.
        2. Download the FinanceBench corpus PDFs and build documents.
        3. Construct the dense and sparse embedding models.
        4. Connect to the Pinecone hybrid index.
        5. Add the documents to the index under the requested namespace.
    """
    cli_args = parse_arguments()

    # Tracing must be initialized before any pipeline work is traced.
    weave.init(cli_args.project_name)

    loader = FinanceBenchDataloader(
        dataset_name=cli_args.dataset_name,
        split=cli_args.split,
    )
    loader.get_corpus_pdfs()  # fetch source PDFs for the selected split
    documents = loader.create_documents()
    print("Loaded Documents:")
    print(documents)

    # Dense (semantic) embedding model.
    dense = DenseEmbeddings(
        model_name=cli_args.dense_model_name,
        model_kwargs={"device": cli_args.dense_device},
        encode_kwargs={"normalize_embeddings": cli_args.normalize_embeddings},
        show_progress=cli_args.show_progress,
    )

    # Sparse (lexical) embedding model — the second half of hybrid retrieval.
    sparse = SparseEmbeddings(
        model_kwargs={"max_seq_length": cli_args.sparse_max_seq_length}
    )

    vector_db = PineconeHybridVectorDB(
        api_key=cli_args.pinecone_api_key,
        index_name=cli_args.pinecone_index_name,
        dimension=cli_args.pinecone_dimension,
        metric=cli_args.pinecone_metric,
        region=cli_args.pinecone_region,
        cloud=cli_args.pinecone_cloud,
    )

    vector_db.add_documents(
        documents=documents,
        dense_embedding_model=dense.embedding_model,
        sparse_embedding_model=sparse.sparse_embedding_model,
        namespace=cli_args.namespace,
    )

    print("Documents have been indexed successfully in Pinecone.")
|
|
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":


    main()
|
|
|