kheopss committed on
Commit
c107328
·
verified ·
1 Parent(s): f4dd31d

Update vdb.py

Browse files
Files changed (1) hide show
  1. vdb.py +138 -129
vdb.py CHANGED
@@ -1,129 +1,138 @@
1
- import hashlib
2
- import json
3
- import re
4
- from pathlib import Path
5
-
6
- from dotenv import load_dotenv
7
- from llama_index.core import (QueryBundle)
8
- from llama_index.core.postprocessor import LLMRerank
9
- from nest_asyncio import apply
10
- from openai import OpenAI
11
- from tqdm import tqdm
12
-
13
- from llama_index.core import VectorStoreIndex
14
- from llama_index.embeddings.openai import OpenAIEmbedding
15
-
16
- from llama_index.core import Document
17
-
18
- # Load variables from .env
19
- load_dotenv()
20
-
21
- def build_documents(sections):
22
- docs = []
23
- for s in sections:
24
- metadata = {"section_title": s["title"]}
25
- docs.append(Document(text=s["content"], metadata=metadata))
26
- return docs
27
-
28
- def create_vector_index(docs):
29
- embed_model = OpenAIEmbedding()
30
- index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
31
- return index
32
-
33
- def split_markdown_by_section(md_path: str):
34
- text = Path(md_path).read_text(encoding="utf-8")
35
- sections = re.split(r"(?m)^# ", text)
36
- chunks = []
37
- for section in sections:
38
- if not section.strip():
39
- continue
40
- title, *content = section.split("\n", 1)
41
- body = content[0].strip() if content else ""
42
- chunks.append({"title": title.strip(), "content": body})
43
- return chunks
44
-
45
-
46
-
47
- client = OpenAI()
48
-
49
- apply()
50
-
51
- tqdm.pandas()
52
-
53
-
54
- def hash_data(data):
55
- json_str = json.dumps(data, sort_keys=True)
56
-
57
- json_bytes = json_str.encode('utf-8')
58
-
59
- hash_hex = hashlib.sha256(json_bytes).hexdigest()
60
-
61
- return hash_hex
62
-
63
-
64
- def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
65
- query_bundle = QueryBundle(query)
66
- retriever = index.as_retriever(similarity_top_k=vector_top_k)
67
- retrieved_nodes = retriever.retrieve(query_bundle)
68
-
69
- if with_reranker:
70
- reranker = LLMRerank(choice_batch_size=5, top_n=reranker_top_n)
71
- retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
72
-
73
- return retrieved_nodes
74
-
75
-
76
- def get_all_text(nodes):
77
- return ' '.join(f"\n- {node.get_text()}" for node in nodes)
78
-
79
-
80
- async def further_retrieve(query, index, messages):
81
- try:
82
- retrieved_nodes = get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False)
83
- return completion(query, get_all_text(retrieved_nodes), messages)
84
- except Exception as e:
85
- print(e)
86
- return None
87
-
88
-
89
- async def completion(query, docs, messages):
90
- messages.extend([
91
- {
92
- "role": "system",
93
- "content": f"""
94
- Given tone and voice guidelines and customer support help documents, act as a customer support bot.
95
- Answer any further questions as if you are customer support bot.
96
- TONE AND VOICE:
97
- promote the society, be gentle, be kind always positive.
98
-
99
- DOCUMENT:
100
- {docs}
101
-
102
-
103
-
104
- INSTRUCTIONS:
105
-
106
- - Answer the users QUESTION using the DOCUMENT text above.
107
- - Format formula into latex format between $...$ or \[...\]
108
- - Keep your answer ground in the facts of the DOCUMENT or chat history.
109
- - If document has an image markdown ,use it in your answer
110
- - Respond in same language as user Question
111
- - Use Markdown Structure
112
- - DOCUMENT can have images with there descriptions
113
- - if a text is followed by an image dont skip the image
114
- QUESTION:
115
- """
116
- },
117
- {
118
- "role": "system",
119
- "content": query
120
- }
121
- ])
122
- completion = client.chat.completions.create(
123
- model="gpt-4o-mini",
124
- messages=messages,
125
- stream=True
126
- )
127
- for chunk in completion:
128
- if chunk.choices[0].delta.content:
129
- yield chunk.choices[0].delta.content
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import re
4
+ from pathlib import Path
5
+
6
+ from dotenv import load_dotenv
7
+ from llama_index.core import (QueryBundle)
8
+ from llama_index.core.postprocessor import LLMRerank
9
+ from nest_asyncio import apply
10
+ from openai import OpenAI
11
+ from tqdm import tqdm
12
+
13
+ from llama_index.core import VectorStoreIndex, Settings
14
+ from llama_index.embeddings.openai import OpenAIEmbedding
15
+
16
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
17
+
18
+ from llama_index.core import Document
19
+
20
+
21
# Use a local HuggingFace sentence-transformer as the process-wide default
# embedding model for llama_index (Settings.embed_model), replacing the
# previous OpenAIEmbedding-based indexing.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
Settings.embed_model = embed_model

# Load variables from .env
# (presumably OPENAI_API_KEY for the chat client created below — confirm)
load_dotenv()
28
+
29
def build_documents(sections):
    """Wrap parsed markdown sections into llama_index ``Document`` objects.

    Each section dict must provide "title" and "content"; the title is
    carried along as the ``section_title`` metadata field.
    """
    return [
        Document(text=section["content"], metadata={"section_title": section["title"]})
        for section in sections
    ]
35
+
36
def create_vector_index(docs):
    """Build an in-memory ``VectorStoreIndex`` over *docs*.

    Embeddings come from the globally configured ``Settings.embed_model``
    (set at module import time to a HuggingFace sentence-transformer).

    Args:
        docs: iterable of llama_index ``Document`` objects.

    Returns:
        The constructed ``VectorStoreIndex``.
    """
    # Dead commented-out OpenAIEmbedding code removed; the global
    # Settings.embed_model already controls which embedder is used.
    return VectorStoreIndex.from_documents(docs)
41
+
42
def split_markdown_by_section(md_path: str):
    """Split a markdown file into one chunk per top-level ``# `` heading.

    Args:
        md_path: path to a UTF-8 markdown file.

    Returns:
        List of ``{"title": ..., "content": ...}`` dicts, both values
        stripped of surrounding whitespace; empty sections are skipped.
    """
    raw = Path(md_path).read_text(encoding="utf-8")
    chunks = []
    for piece in re.split(r"(?m)^# ", raw):
        if not piece.strip():
            continue
        # First line is the heading text, everything after is the body
        # (partition yields "" for the body when there is no newline).
        heading, _, body = piece.partition("\n")
        chunks.append({"title": heading.strip(), "content": body.strip()})
    return chunks
53
+
54
+
55
+
56
# OpenAI chat client; reads OPENAI_API_KEY from the environment
# (loaded above via load_dotenv).
client = OpenAI()

# nest_asyncio.apply(): patch the running event loop to be re-entrant
# (needed when this module runs inside notebooks/servers with a live loop).
apply()

# Enable tqdm progress bars for pandas .progress_apply calls.
tqdm.pandas()
61
+
62
+
63
def hash_data(data):
    """Return a deterministic SHA-256 hex digest of *data*.

    The value is serialized to canonical JSON (keys sorted) so that
    logically equal dicts hash identically regardless of key order.
    """
    canonical = json.dumps(data, sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
71
+
72
+
73
def get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=True):
    """Retrieve the top-k nodes for *query* from *index*.

    Args:
        query: query string.
        index: a llama_index index exposing ``as_retriever``.
        vector_top_k: number of candidates from vector similarity search.
        reranker_top_n: nodes kept after LLM reranking.
        with_reranker: when True, rerank candidates with ``LLMRerank``.

    Returns:
        The retrieved (and optionally reranked) nodes.
    """
    bundle = QueryBundle(query)
    candidates = index.as_retriever(similarity_top_k=vector_top_k).retrieve(bundle)
    if not with_reranker:
        return candidates
    return LLMRerank(choice_batch_size=5, top_n=reranker_top_n).postprocess_nodes(candidates, bundle)
83
+
84
+
85
def get_all_text(nodes):
    """Concatenate node texts as space-separated ``\\n- <text>`` bullet items."""
    bullets = ["\n- " + node.get_text() for node in nodes]
    return ' '.join(bullets)
87
+
88
+
89
async def further_retrieve(query, index, messages):
    """Fetch context for *query* and hand off to the streaming completion.

    Retrieves the top-10 nodes without reranking; on any retrieval error
    the exception is printed and None is returned instead of raising.

    Returns:
        The async generator produced by ``completion`` (not yet consumed),
        or None when retrieval failed.
    """
    try:
        nodes = get_retrieved_nodes(query, index, vector_top_k=10, reranker_top_n=3, with_reranker=False)
        context = get_all_text(nodes)
    except Exception as exc:
        print(exc)
        return None
    # completion() is an async generator function: calling it only creates
    # the generator and cannot raise, so it sits outside the try block.
    return completion(query, context, messages)
96
+
97
+
98
async def completion(query, docs, messages):
    """Stream a grounded customer-support answer for *query*.

    Appends two messages (a RAG system prompt built around *docs*, then the
    user's query) to *messages* IN PLACE — the caller's history list is
    mutated — and yields response text chunks from gpt-4o-mini as they arrive.

    Args:
        query: the user's question.
        docs: retrieved document text the answer must be grounded in.
        messages: chat history as a list of {"role", "content"} dicts;
            extended in place.

    Yields:
        str: incremental content deltas from the streaming response.
    """
    messages.extend([
        {
            "role": "system",
            "content": f"""
            Given tone and voice guidelines and customer support help documents, act as a customer support bot.
            Answer any further questions as if you are customer support bot.
            TONE AND VOICE:
            promote the society, be gentle, be kind always positive.

            DOCUMENT:
            {docs}



            INSTRUCTIONS:

            - Answer the users QUESTION using the DOCUMENT text above.
            - Format formula into latex format between $...$ or \\[...\\]
            - Keep your answer ground in the facts of the DOCUMENT or chat history.
            - If document has an image markdown ,use it in your answer
            - Respond in same language as user Question
            - Use Markdown Structure
            - DOCUMENT can have images with there descriptions
            - if a text is followed by an image dont skip the image
            QUESTION:
            """
        },
        {
            # NOTE(review): the query is sent with role "system", not "user";
            # left as-is to preserve behavior — confirm this is intentional.
            "role": "system",
            "content": query
        }
    ])
    # Renamed local (was `completion`) to stop shadowing this function's name.
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        stream=True
    )
    for chunk in stream:
        # Delta content is None on role/finish chunks; only yield real text.
        if chunk.choices[0].delta.content:
            yield chunk.choices[0].delta.content