# Final_Assignment_Template / build_knowledge.py
# (uploaded by dgsilvia — "Upload 4 files", commit a64f470, verified)
# Load .jsonl
import json
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
# Imported for the retriever-tool workflow shown (commented out) at the bottom
# of this file; unused in the index-building path below.
from langchain.tools.retriever import create_retriever_tool
import chromadb
# Best-effort attempt to silence Chroma's anonymous telemetry.
# NOTE(review): chromadb's documented opt-out is Settings(anonymized_telemetry=False)
# passed at client construction; "telemetry_enabled" is not an obvious Settings
# field, so this class-attribute assignment may be a no-op — confirm against the
# installed chromadb version.
chromadb.config.Settings.telemetry_enabled = False
if __name__ == '__main__':
    # Build a persistent Chroma vector store from a JSONL file of Q/A records
    # so that a retriever can later look up previously answered questions.

    # Document is only needed in this build path, so keep the import local.
    from langchain.schema import Document

    # Load the metadata records: one JSON object per line. Blank lines are
    # skipped defensively — a trailing newline would otherwise crash json.loads.
    with open('metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:
        json_QA = [json.loads(line) for line in jsonl_file if line.strip()]

    # Use the same embedding model at build time and at query time so stored
    # vectors and query vectors live in the same space.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    print("orig:", len(json_QA))

    # One Document per Q/A pair; task_id is kept as metadata so a retrieved
    # answer can be traced back to its source record.
    # NOTE(review): assumes every record has 'Question', 'Final answer' and
    # 'task_id' keys — records missing any of these will raise KeyError.
    docs = [
        Document(
            page_content=(
                f"Question : {sample['Question']}\n\n"
                f"Final answer : {sample['Final answer']}"
            ),
            metadata={"source": sample['task_id']},
        )
        for sample in json_QA
    ]

    # Embed the documents and persist the store to disk. Uses the
    # langchain_chroma.Chroma imported at the top of the file — the previous
    # re-import from langchain_community.vectorstores is deprecated and was
    # shadowing it.
    vector_store = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory="./chroma_db",
    )

    # Example of loading the persisted store later (kept for reference):
    #
    #   # Recreate the same embeddings object used at build time.
    #   embeddings = HuggingFaceEmbeddings(
    #       model_name="sentence-transformers/all-mpnet-base-v2")
    #   # Load the vector store saved above (same persist path).
    #   vector_store = Chroma(
    #       embedding_function=embeddings,
    #       persist_directory="./chroma_db")
    #   retriever = vector_store.as_retriever()
    #   query = ("How many more blocks (also denoted as layers) in BERT base "
    #            "encoder than the encoder from the architecture proposed in "
    #            "Attention is All You Need?")
    #   results = retriever.invoke(query)
    #   print(results[0].page_content)