Abdullraffayy committed on
Commit
b42528f
·
verified ·
1 Parent(s): 7440b8b

Upload 2 files

Browse files
Files changed (2) hide show
  1. chatbot_ingestion.py +77 -0
  2. chatbot_retrivel.py +40 -0
chatbot_ingestion.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Ingest PDFs from ./document into a Pinecone serverless index.

Loads every PDF in the ``document/`` directory, splits it into overlapping
chunks, embeds the chunks with a local HuggingFace model, and upserts them
into a Pinecone index (created on first run).

Requires environment variables (read via python-dotenv):
  PINECONE_API_KEY    - Pinecone API key
  PINECONE_INDEX_NAME - name of the index to create/use
"""

# stdlib
import os
import time

from dotenv import load_dotenv

# Pinecone client + serverless index spec
from pinecone import Pinecone, ServerlessSpec

# LangChain integrations: Pinecone vector store + local HF embedding model
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

# document loading and chunking
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()

# initialize the Pinecone client
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = os.environ.get("PINECONE_INDEX_NAME")  # change if desired
if not index_name:
    # fail fast with a clear message instead of a confusing SDK error later
    raise RuntimeError("PINECONE_INDEX_NAME environment variable is not set")

# create the index if it does not exist yet, then wait until it is ready
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # must match the embedding model's output size (all-mpnet-base-v2 -> 768)
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

# initialize embeddings model + vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# load every PDF in the directory
loader = PyPDFDirectoryLoader("document/")
raw_documents = loader.load()

# split into chunks; 400/800 = 50% overlap keeps context across chunk boundaries
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=400,
    length_function=len,
    is_separator_regex=False,
)
documents = text_splitter.split_documents(raw_documents)

# deterministic ids ("id1", "id2", ...) — same scheme as before, so re-running
# the script overwrites the previous chunks instead of duplicating them
uuids = [f"id{i}" for i in range(1, len(documents) + 1)]

# add to database
vector_store.add_documents(documents=documents, ids=uuids)
chatbot_retrivel.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Query the Pinecone index built by chatbot_ingestion.py.

Connects to the existing index, runs a similarity search with a score
threshold for a sample question, and prints the matching chunks.

Requires environment variables (read via python-dotenv):
  PINECONE_API_KEY    - Pinecone API key
  PINECONE_INDEX_NAME - name of the existing index
"""

# stdlib
import os

from dotenv import load_dotenv

# Pinecone client
from pinecone import Pinecone

# LangChain integrations: Pinecone vector store + local HF embedding model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

load_dotenv()

# initialize the Pinecone client
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# set the pinecone index
index_name = os.environ.get("PINECONE_INDEX_NAME")
if not index_name:
    # fail fast with a clear message instead of a confusing SDK error later
    raise RuntimeError("PINECONE_INDEX_NAME environment variable is not set")
index = pc.Index(index_name)

# embedding model must match the one used at ingestion time
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# retrieval: top 5 chunks with cosine similarity >= 0.5
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.5},
)
results = retriever.invoke("what is retrieval augmented generation?")

# show results
print("RESULTS:")
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")