Spaces:
Sleeping
Sleeping
| # Load .jsonl | |
| import json | |
| from langchain_chroma import Chroma | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain.tools.retriever import create_retriever_tool | |
| import chromadb | |
| chromadb.config.Settings.telemetry_enabled = False | |
# Defaults shared by the index builder and the retrieval smoke test, so both
# always point at the same data file, embedding model and on-disk store.
QA_JSONL_PATH = "metadata.jsonl"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
CHROMA_PERSIST_DIR = "./chroma_db"
SAMPLE_QUERY = (
    "How many more blocks (also denoted as layers) in BERT base encoder than "
    "the encoder from the architecture proposed in Attention is All You Need?"
)


def load_qa_records(path: str = QA_JSONL_PATH) -> list[dict]:
    """Parse a JSON Lines file into a list of objects.

    Args:
        path: Location of the ``.jsonl`` file; one JSON object per line.

    Returns:
        The parsed objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps non-ASCII text intact regardless of the platform
    # locale; blank lines (e.g. a trailing newline) are skipped, not parsed.
    with open(path, "r", encoding="utf-8") as jsonl_file:
        return [json.loads(line) for line in jsonl_file if line.strip()]


def format_qa_content(sample: dict) -> str:
    """Return the text that gets embedded for one question/answer record.

    Args:
        sample: A record with ``'Question'`` and ``'Final answer'`` keys.

    Raises:
        KeyError: If either key is missing.
    """
    return f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"


def build_documents(records: list[dict]) -> list:
    """Wrap every record in a LangChain ``Document``.

    The page content comes from :func:`format_qa_content`; the record's
    ``'task_id'`` is stored under the ``"source"`` metadata key.
    """
    # Imported lazily so the pure-stdlib helpers above stay importable
    # without LangChain installed.
    from langchain.schema import Document

    return [
        Document(
            page_content=format_qa_content(sample),
            metadata={"source": sample["task_id"]},
        )
        for sample in records
    ]


def main() -> None:
    """Build the persisted Chroma vector store from the Q/A JSONL file."""
    records = load_qa_records()
    print("orig:", len(records))

    # The same embedding model must be used when the store is reloaded.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    docs = build_documents(records)

    # `Chroma` is the class imported at the top of the file from
    # `langchain_chroma`; the former in-script re-import from
    # `langchain_community.vectorstores` shadowed it with the deprecated one.
    # NOTE(review): running this twice against an existing persist directory
    # presumably appends the documents again - confirm, and clear the
    # directory or pass stable ids if that matters.
    Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=CHROMA_PERSIST_DIR,
    )


def preview_retrieval(
    query: str = SAMPLE_QUERY,
    persist_directory: str = CHROMA_PERSIST_DIR,
) -> None:
    """Reload the persisted store and print the top match for *query*.

    Manual smoke test; it is not invoked when the script runs. It replaces
    the code that used to be parked in a string literal at the end of the
    file.
    """
    # Recreate the same embeddings object that was used to build the store.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    # Open the store saved earlier (same path as used when building it).
    vector_store = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )
    retriever = vector_store.as_retriever()
    results = retriever.invoke(query)
    if results:  # an empty store yields no hits; avoid an IndexError
        print(results[0].page_content)


if __name__ == "__main__":
    main()