# NOTE(review): removed scraped-page chrome that was not part of the source
# (Hugging Face Spaces status banner, file-size line, editor line-number gutter).
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from tqdm import tqdm
from openai import OpenAI
import string
import pickle
import os
import time
from langchain_community.document_loaders import PyMuPDFLoader
class API_Interface:
    """RAG helper over two solar-eclipse PDFs.

    On construction this chunks "Solar Eclipse Information.pdf" (free text) and
    "Solar Eclipse Table.pdf" (tabular data), embeds the chunks with OpenAI
    (pickle-cached on disk), upserts them into the Pinecone index 'eep596mp2'
    under two namespaces, and exposes similarity search plus a chat wrapper.

    Public attributes (kept for backward compatibility):
        chunk_size, embed_model, chat_model,
        chunked_texts, chunked_pnums, table_texts, table_pnums,
        embeddings, tmbeddings, namespace, vectorstore, tablespace, tablestore.
    """

    # CSV-style column header prepended to every table chunk.
    _TABLE_HEADER = "Catalog Number, Canon Plate, Calendar Date, Terrestrial Dynamical Time of Greatest Eclipse, UT - TD (s), Luna Number, Saros Number, Eclipse Type, QLE, Gamma, Eclipse Magnitude, Latitude, Longitude, Sun Altitude, Sun Azimuth, Path Width (km), Central Line Duration"

    def __init__(self, OPEN_AI_KEY, PINECONE_KEY, chunk_size: int = 1500,
                 embed_model: str = "text-embedding-3-small",
                 chat_model: str = "gpt-3.5-turbo"):
        """Build both vector stores.

        Args:
            OPEN_AI_KEY: OpenAI API key.
            PINECONE_KEY: Pinecone API key.
            chunk_size: character size of text chunks (also keys the embedding
                cache file and the Pinecone namespace).
            embed_model: OpenAI embedding model name.
            chat_model: default OpenAI chat model for client_chat().
        """
        self.chunk_size = chunk_size
        self.embed_model = embed_model
        self.chat_model = chat_model
        self.__client = OpenAI(api_key=OPEN_AI_KEY)
        self.__pc = Pinecone(api_key=PINECONE_KEY)
        self.__index = self.__pc.Index('eep596mp2')
        print("Chunking documents.")
        self.chunked_texts, self.chunked_pnums = self.__chunk_document()
        self.table_texts, self.table_pnums = self.__chunk_tables()
        print("Initializing vector store.")
        self.namespace, self.vectorstore = self.__init_vectorstore(OPEN_AI_KEY)
        print("Initializing table store.")
        self.tablespace, self.tablestore = self.__init_tablestore(OPEN_AI_KEY)

    def __chunk_document(self) -> tuple[list[str], list[int]]:
        """Split the information PDF into overlapping chunks.

        Returns:
            (chunk_texts, chunk_page_numbers), parallel lists. The last chunk
            of each page is carried forward and re-split with the next page so
            content spanning a page break stays together.
        """
        loader = PyMuPDFLoader(file_path="Solar Eclipse Information.pdf", mode="page")
        docs = loader.load()
        page_texts = [page.page_content for page in docs]
        page_numbers = [page.metadata["page"] for page in docs]
        splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size,
                                                  chunk_overlap=500)
        chunked_texts: list[str] = []
        chunk_page_numbers: list[int] = []
        previous_page_tail = ""
        for text, pnum in zip(page_texts, page_numbers):
            chunks = splitter.split_text(previous_page_tail + " " + text)
            # Emit all but the final chunk now; the tail is deferred so it can
            # absorb the start of the next page.
            chunked_texts.extend(chunks[:-1])
            chunk_page_numbers.extend([pnum] * (len(chunks) - 1))
            previous_page_tail = chunks[-1]
        # Flush the tail of the final page.
        chunked_texts.append(chunks[-1])
        chunk_page_numbers.append(pnum)
        return chunked_texts, chunk_page_numbers

    def __chunk_tables(self):
        """Parse the table PDF into one text block per page.

        PyMuPDF flattens each table row into newline-separated values; this
        state machine re-joins them into rows. `idv` counts fields within a
        row; it is reset to a negative value at the Longitude field (ends in
        W/E) so it reaches 0 exactly after the row's trailing fields — 4 for
        central eclipses, 2 for partial ones (flagged by an Eclipse Type
        starting with "P"), at which point a row break is emitted.

        Returns:
            (table_texts, table_page_numbers), parallel lists.
        """
        tabler = PyMuPDFLoader(file_path="Solar Eclipse Table.pdf", mode="page")
        tables = tabler.load()
        table_texts = []
        for page in tables:
            c = page.page_content
            # Data starts right after the "km" unit marker in the page header.
            i = c.find("km")
            values = c[i + 3:].split("\n")
            text = ""
            idv = 0
            dates = [None, None]  # first/last eclipse year seen on this page
            partial_flag = False
            for val in values:
                if idv == 2:
                    # Field 2 is the Calendar Date; track page date range.
                    year = val
                    if dates[0] is None:
                        dates[0] = year
                    else:
                        dates[1] = year
                if idv % 16 == 4:
                    # NOTE(review): as written this replaces a space with a
                    # space (no-op). Likely meant to collapse doubled spaces
                    # in the time field but was mangled in transit — confirm
                    # against the original source before "fixing".
                    val = val.replace(" ", " ")
                text += val + " "
                idv += 1
                if val.startswith("P"):
                    partial_flag = True
                if val.endswith("W") or val.endswith("E"):
                    # Longitude column: 2 fields remain for partial eclipses,
                    # 4 for central ones.
                    if partial_flag:
                        idv = -2
                    else:
                        idv = -4
                if idv == 0:
                    # Row complete.
                    text += "\n"
                    partial_flag = False
            table_texts.append(f"Solar eclipses between {dates[0]} and {dates[1]}:\n\n"
                               + self._TABLE_HEADER + "\n" + text)
        table_numbers = [page.metadata["page"] for page in tables]
        return table_texts, table_numbers

    def __init_store(self, namespace, texts, pnums, generate_embeddings, OPEN_AI_KEY):
        """Shared backend for __init_vectorstore / __init_tablestore.

        Ensures `namespace` in the Pinecone index holds one vector per
        (text, page, embedding) triple, upserting only when the namespace is
        missing or empty.

        Args:
            generate_embeddings: zero-arg callable returning the embedding
                list (called lazily, only when an upsert is needed).

        Returns:
            (namespace, PineconeVectorStore) wrapping the shared index.
        """
        ns_stats = self.__index.describe_index_stats()['namespaces'].get(namespace)
        # A namespace that exists but holds zero vectors is a stale shell from
        # a failed run: wipe it and rebuild below.
        if ns_stats is not None and ns_stats.get('vector_count') in (None, 0):
            self.__index.delete(delete_all=True, namespace=namespace)
            ns_stats = None
        if ns_stats is None:
            embeddings = generate_embeddings()
            records = [
                {
                    "id": f"chunk{i}",
                    "values": embedding,
                    "metadata": {"text": text, "page_number": pnum},
                }
                for i, (text, pnum, embedding) in enumerate(zip(texts, pnums, embeddings))
            ]
            print(len(records))
            batch_size = 180  # keep each upsert request under Pinecone's size limit
            print("... upserting records.")
            for b in tqdm(range((len(records) - 1) // batch_size + 1)):
                self.__index.upsert(records[b * batch_size:(b + 1) * batch_size],
                                    namespace=namespace)
            # Upserts are eventually consistent; poll until the namespace
            # shows up in the index stats.
            while self.__index.describe_index_stats()['namespaces'].get(namespace) is None:
                time.sleep(1)
            print("Index stats:", self.__index.describe_index_stats())
        openaiembs = OpenAIEmbeddings(api_key=OPEN_AI_KEY, model=self.embed_model)
        vectorstore = PineconeVectorStore(self.__index, embedding=openaiembs)
        return namespace, vectorstore

    def __init_vectorstore(self, OPEN_AI_KEY):
        """Build/reuse the namespace for the information-PDF chunks."""
        def generate():
            self.__generate_embeddings()
            return self.embeddings
        return self.__init_store(f"ns_eclipse_{self.chunk_size}",
                                 self.chunked_texts, self.chunked_pnums,
                                 generate, OPEN_AI_KEY)

    def __init_tablestore(self, OPEN_AI_KEY):
        """Build/reuse the namespace for the table-PDF chunks."""
        def generate():
            self.__generate_table_embeddings()
            return self.tmbeddings
        return self.__init_store("ts_eclipse",
                                 self.table_texts, self.table_pnums,
                                 generate, OPEN_AI_KEY)

    def __embed_text(self, text: str) -> list[float]:
        """Embed one text via the OpenAI API (newlines flattened to spaces)."""
        response = self.__client.embeddings.create(
            input=[text.replace("\n", " ")], model=self.embed_model)
        return response.data[0].embedding

    def __load_or_embed(self, cache_path: str, texts: list[str]) -> list[list[float]]:
        """Return embeddings for `texts`, reusing the pickle cache at
        `cache_path` when it exists; otherwise embed and write the cache."""
        if os.path.exists(cache_path):
            print("--- found existing embeddings file. Shortcutting.")
            with open(cache_path, "rb") as infile:
                return pickle.load(infile)
        embeddings = [self.__embed_text(text) for text in tqdm(texts)]
        with open(cache_path, "wb") as outfile:
            pickle.dump(embeddings, outfile)
        return embeddings

    def __generate_embeddings(self) -> None:
        """Populate self.embeddings for the text chunks (pickle-cached,
        keyed by chunk_size)."""
        self.embeddings: list[list[float]] = self.__load_or_embed(
            f"eclipse_text_embeddings_{self.chunk_size}.pkl", self.chunked_texts)

    def __generate_table_embeddings(self) -> None:
        """Populate self.tmbeddings for the table chunks (pickle-cached)."""
        self.tmbeddings: list[list[float]] = self.__load_or_embed(
            "eclipse_table_embeddings.pkl", self.table_texts)

    def query_pinecone_vector_store(self, query: str, top_k_docs: int = 5,
                                    top_k_tbls: int = 5,
                                    namespace: str = None, tablespace: str = None):
        """Similarity-search both namespaces for `query`.

        Returns:
            [(docs, scores), (table_docs, table_scores)] — each element is a
            pair of parallel tuples (transposed (document, score) hits).

        Raises:
            ValueError: if either namespace is absent from the index.
        """
        namespace = namespace or self.namespace
        tablespace = tablespace or self.tablespace
        known = self.__index.describe_index_stats().get('namespaces')
        # Real exceptions, not `assert`: asserts vanish under `python -O`.
        if namespace not in known:
            raise ValueError(f"unknown namespace: {namespace}")
        if tablespace not in known:
            raise ValueError(f"unknown table namespace: {tablespace}")
        doc_hits = self.vectorstore.similarity_search_with_relevance_scores(
            query=query, k=top_k_docs, namespace=namespace)
        table_hits = self.vectorstore.similarity_search_with_relevance_scores(
            query=query, k=top_k_tbls, namespace=tablespace)
        # Transpose [(doc, score), ...] into ((docs...), (scores...)).
        return [tuple(zip(*doc_hits)), tuple(zip(*table_hits))]

    def client_chat(self, messages, model=None):
        """Send `messages` (OpenAI chat format) to the chat completions API
        and return the assistant's reply text. `model` defaults to
        self.chat_model."""
        model = model or self.chat_model
        response = self.__client.chat.completions.create(messages=messages, model=model)
        return response.choices[0].message.content
if __name__ == "__main__":
    # Smoke test. API_Interface requires both API keys (the previous version
    # called the constructor with no arguments, which raises TypeError), so
    # read them from the environment.
    tester = API_Interface(os.environ["OPENAI_API_KEY"],
                           os.environ["PINECONE_API_KEY"])
    my_query = "What is the backpropagation algorithm?"
    # query_pinecone_vector_store returns [(docs, scores), (table_docs,
    # table_scores)]; iterating the outer list directly (as before) raised
    # AttributeError on .metadata. Unpack the document hits instead.
    (docs, scores), _table_hits = tester.query_pinecone_vector_store(my_query)
    for doc in docs:
        print(doc.metadata["page_number"], doc.page_content, "\n\n")