"""Index the Q&A records from ``metadata.jsonl`` into a persistent Chroma store.

Each line of ``metadata.jsonl`` is one JSON object that must provide the keys
``Question``, ``Final answer`` and ``task_id``.  Every record becomes one
document (question + final answer as text, ``task_id`` as the ``source``
metadata entry), is embedded with a HuggingFace sentence-transformers model and
written to ``./chroma_db``.

To query the store afterwards, re-create the same embeddings object and point
Chroma at the same directory::

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    vector_store = Chroma(
        embedding_function=embeddings,
        persist_directory="./chroma_db",  # same path used when saving
    )
    retriever = vector_store.as_retriever()
    query = (
        "How many more blocks (also denoted as layers) in BERT base encoder "
        "than the encoder from the architecture proposed in Attention is All "
        "You Need?"
    )
    results = retriever.invoke(query)
    print(results[0].page_content)
"""
import json
from typing import Any, Dict, List

import chromadb
from langchain.schema import Document
from langchain.tools.retriever import create_retriever_tool  # noqa: F401 (unused in this script)
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# NOTE(review): chromadb's documented telemetry opt-out is the
# `anonymized_telemetry` setting; confirm that assigning this class attribute
# actually has an effect before relying on it.
chromadb.config.Settings.telemetry_enabled = False

JSONL_PATH = "metadata.jsonl"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
PERSIST_DIRECTORY = "./chroma_db"


def _load_records(path: str) -> List[Dict[str, Any]]:
    """Return one parsed JSON object per non-blank line of the file at *path*.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as jsonl_file:
        # Blank lines (e.g. a trailing newline at EOF) are skipped instead of
        # being handed to json.loads, which would raise on an empty string.
        return [json.loads(line) for line in jsonl_file if line.strip()]


def _to_document(sample: Dict[str, Any]) -> Document:
    """Build the Document for one Q&A record.

    Raises:
        KeyError: if ``Question``, ``Final answer`` or ``task_id`` is missing.
    """
    content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
    return Document(page_content=content, metadata={"source": sample["task_id"]})


def main() -> None:
    """Read the JSONL file, embed every record and persist the vector store."""
    records = _load_records(JSONL_PATH)
    print("orig:", len(records))

    # Build the documents before loading the embedding model so that a
    # malformed record fails fast, before the expensive model load.
    docs = [_to_document(sample) for sample in records]

    # The same model name must be used again when the store is loaded for
    # querying (see the module docstring).
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # Uses langchain_chroma.Chroma (imported above) rather than the deprecated
    # langchain_community.vectorstores.Chroma.
    # NOTE(review): no explicit ids are passed, so re-running against an
    # existing ./chroma_db presumably adds the documents again — confirm and
    # clear the directory first if duplicates matter.
    Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=PERSIST_DIRECTORY,
    )


if __name__ == "__main__":
    main()