dgsilvia committed on
Commit
a64f470
·
verified ·
1 Parent(s): 1c0ea74

Upload 4 files

Browse files
Files changed (4) hide show
  1. build_knowledge.py +62 -0
  2. metadata.jsonl +0 -0
  3. requirements.txt +10 -1
  4. system_prompt.txt +5 -0
build_knowledge.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load .jsonl
2
+ import json
3
+ from langchain_chroma import Chroma
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain.tools.retriever import create_retriever_tool
6
+
7
+ import chromadb
8
+ chromadb.config.Settings.telemetry_enabled = False
9
+
10
+
11
def load_jsonl(path, encoding="utf-8"):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the ``.jsonl`` file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding=encoding) as jsonl_file:
        # Whitespace-only lines (for example a trailing blank line at EOF)
        # are skipped; json.loads would otherwise raise on them.
        return [json.loads(line) for line in jsonl_file if line.strip()]


def format_qa(sample):
    """Return the text stored in the vector store for one record.

    Args:
        sample: Mapping that provides the ``'Question'`` and
            ``'Final answer'`` keys.

    Returns:
        The question and its final answer joined into a single string.

    Raises:
        KeyError: If either key is missing from ``sample``.
    """
    return f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"


def build_documents(records):
    """Convert question/answer records into LangChain ``Document`` objects.

    Args:
        records: Iterable of mappings with the ``'Question'``,
            ``'Final answer'`` and ``'task_id'`` keys.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the
        formatted question/answer text and whose metadata stores the
        record's ``task_id`` under ``"source"``.
    """
    # Imported here because the file's top-level import section does not
    # bring ``Document`` into scope.
    from langchain.schema import Document

    return [
        Document(
            page_content=format_qa(sample),
            metadata={"source": sample['task_id']},
        )
        for sample in records
    ]


def main():
    """Embed the records of ``metadata.jsonl`` into a Chroma store.

    The store is created with ``persist_directory="./chroma_db"``.
    """
    records = load_jsonl('metadata.jsonl')
    print("orig:", len(records))

    # Use the same embedding model when the store is loaded again later.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    docs = build_documents(records)

    # ``Chroma`` is the class imported at the top of the file from
    # ``langchain_chroma``; it is no longer shadowed by a second, nested
    # import from ``langchain_community.vectorstores``.
    # NOTE(review): running the script again against an existing
    # ./chroma_db presumably adds the documents a second time -- confirm,
    # and clear the directory or pass stable ids if that is unwanted.
    Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory="./chroma_db",
    )


# Example: loading the saved store and querying it.
#
#     # Recreate the same embeddings object that was used at build time.
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
#
#     # Load the previously saved vector store (same path as when saving).
#     vector_store = Chroma(
#         embedding_function=embeddings,
#         persist_directory="./chroma_db",
#     )
#
#     # Get the retriever and run a query.
#     retriever = vector_store.as_retriever()
#     query = "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?"
#     results = retriever.invoke(query)
#     print(results[0].page_content)


if __name__ == '__main__':
    main()
metadata.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -15,4 +15,13 @@ arxiv
15
  pymupdf
16
  wikipedia
17
  pgvector
18
- python-dotenv
 
 
 
 
 
 
 
 
 
 
15
  pymupdf
16
  wikipedia
17
  pgvector
18
+ python-dotenv
19
+ langgraph
20
+ langchain
21
+ langchain-core
22
+ langchain-community
23
+ duckduckgo-search
24
+ sentence-transformers
25
+ chromadb
26
+ arxiv
27
+ wikipedia
system_prompt.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ You are a helpful assistant tasked with answering questions using a set of tools.
2
+ Now, I will ask you a question. Report your thoughts, and finish your answer with the following template:
3
+ FINAL ANSWER: [YOUR FINAL ANSWER].
4
+ YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use commas to write your number, nor units such as $ or percent sign, unless specified otherwise. If you are asked for a string, don't use articles or abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string. If I provide you a similar question and answer for reference, use this information before using any other tools.
5
+ Your answer should only start with "FINAL ANSWER: ", followed by the answer.