# Load the Q/A records from a .jsonl file and index them in a Chroma vector store.
import json
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.tools.retriever import create_retriever_tool
import chromadb
# Presumably intended to switch off Chroma telemetry: sets a class-level
# attribute on chromadb's Settings before the vector store is created below.
# NOTE(review): Chroma's documented opt-out setting is `anonymized_telemetry`;
# confirm that assigning `telemetry_enabled` on the class has any effect.
chromadb.config.Settings.telemetry_enabled = False
def load_jsonl(path, encoding="utf-8"):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the ``.jsonl`` file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default locale.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding=encoding) as jsonl_file:
        # Blank lines (e.g. an extra newline at the end of the file) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in jsonl_file if line.strip()]


if __name__ == "__main__":
    # Only this index-building entry point needs Document, so it is imported
    # locally rather than at module level.
    from langchain.schema import Document

    qa_records = load_jsonl("metadata.jsonl")
    print("orig:", len(qa_records))

    # The same embedding model must be used again whenever the persisted
    # store is reloaded for querying.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    # One document per Q/A record; the task id is kept as the source so a
    # retrieved document can be traced back to its record.
    docs = [
        Document(
            page_content=(
                f"Question : {sample['Question']}\n\n"
                f"Final answer : {sample['Final answer']}"
            ),
            metadata={"source": sample["task_id"]},
        )
        for sample in qa_records
    ]

    # `Chroma` is the module-level import from langchain_chroma. The deprecated
    # langchain_community class is deliberately not re-imported here, because
    # doing so would shadow it.
    vector_store = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory="./chroma_db",
    )
'''
# Example (disabled): reload the persisted vector store and query it.
# Recreate the same embeddings object that was used when the store was created
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Load the previously saved vector store
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db"  # same path used when saving
)
# Get the retriever
retriever = vector_store.as_retriever()
query = "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?"
results = retriever.invoke(query)
print(results[0].page_content)
'''