Spaces:
Sleeping
Sleeping
whymath
committed on
Commit
·
08f0208
1
Parent(s):
c1a97fb
Removing openai from requirements
Browse files
- requirements.txt +0 -1
- utils.py +20 -3
requirements.txt
CHANGED
|
@@ -11,4 +11,3 @@ pymupdf
|
|
| 11 |
wandb
|
| 12 |
chainlit
|
| 13 |
huggingface_hub
|
| 14 |
-
openai
|
|
|
|
| 11 |
wandb
|
| 12 |
chainlit
|
| 13 |
huggingface_hub
|
|
|
utils.py
CHANGED
|
@@ -23,8 +23,11 @@ def chunk_documents(docs, tiktoken_len):
|
|
| 23 |
chunk_overlap = 0,
|
| 24 |
length_function = tiktoken_len,
|
| 25 |
)
|
|
|
|
| 26 |
split_chunks = text_splitter.split_documents(docs)
|
|
|
|
| 27 |
print('len(split_chunks) =', len(split_chunks))
|
|
|
|
| 28 |
return split_chunks
|
| 29 |
|
| 30 |
|
|
@@ -32,22 +35,31 @@ def create_raqa_chain_from_docs():
|
|
| 32 |
# Load the documents from a PDF file using PyMuPDFLoader
|
| 33 |
# docs = PyMuPDFLoader("data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
|
| 34 |
docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
|
|
|
|
|
|
|
| 35 |
print("Loaded", len(docs), "documents")
|
|
|
|
|
|
|
| 36 |
print(docs[0])
|
| 37 |
|
| 38 |
-
#
|
| 39 |
split_chunks = chunk_documents(docs, tiktoken_len)
|
|
|
|
|
|
|
| 40 |
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
|
|
|
|
|
|
|
| 41 |
qdrant_vectorstore = Qdrant.from_documents(
|
| 42 |
split_chunks,
|
| 43 |
embedding_model,
|
| 44 |
location=":memory:",
|
| 45 |
collection_name="Meta 10-k Filings",
|
| 46 |
)
|
|
|
|
|
|
|
| 47 |
qdrant_retriever = qdrant_vectorstore.as_retriever()
|
| 48 |
|
| 49 |
# Define the RAG prompt template
|
| 50 |
-
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
|
| 51 |
RAG_PROMPT = """
|
| 52 |
CONTEXT:
|
| 53 |
{context}
|
|
@@ -57,9 +69,14 @@ def create_raqa_chain_from_docs():
|
|
| 57 |
|
| 58 |
Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
|
| 59 |
"""
|
|
|
|
|
|
|
| 60 |
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
|
| 61 |
|
| 62 |
-
#
|
|
|
|
|
|
|
|
|
|
| 63 |
retrieval_augmented_qa_chain = (
|
| 64 |
{"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
|
| 65 |
| RunnablePassthrough.assign(context=itemgetter("context"))
|
|
|
|
| 23 |
chunk_overlap = 0,
|
| 24 |
length_function = tiktoken_len,
|
| 25 |
)
|
| 26 |
+
|
| 27 |
split_chunks = text_splitter.split_documents(docs)
|
| 28 |
+
|
| 29 |
print('len(split_chunks) =', len(split_chunks))
|
| 30 |
+
|
| 31 |
return split_chunks
|
| 32 |
|
| 33 |
|
|
|
|
| 35 |
# Load the documents from a PDF file using PyMuPDFLoader
|
| 36 |
# docs = PyMuPDFLoader("data/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
|
| 37 |
docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
|
| 38 |
+
|
| 39 |
+
# Print the number of loaded documents
|
| 40 |
print("Loaded", len(docs), "documents")
|
| 41 |
+
|
| 42 |
+
# Print the first document
|
| 43 |
print(docs[0])
|
| 44 |
|
| 45 |
+
# Split the documents into chunks based on their length
|
| 46 |
split_chunks = chunk_documents(docs, tiktoken_len)
|
| 47 |
+
|
| 48 |
+
# Create an instance of the OpenAIEmbeddings model for text embeddings
|
| 49 |
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 50 |
+
|
| 51 |
+
# Create a Qdrant vector store from the split chunks
|
| 52 |
qdrant_vectorstore = Qdrant.from_documents(
|
| 53 |
split_chunks,
|
| 54 |
embedding_model,
|
| 55 |
location=":memory:",
|
| 56 |
collection_name="Meta 10-k Filings",
|
| 57 |
)
|
| 58 |
+
|
| 59 |
+
# Create a retriever from the Qdrant vector store
|
| 60 |
qdrant_retriever = qdrant_vectorstore.as_retriever()
|
| 61 |
|
| 62 |
# Define the RAG prompt template
|
|
|
|
| 63 |
RAG_PROMPT = """
|
| 64 |
CONTEXT:
|
| 65 |
{context}
|
|
|
|
| 69 |
|
| 70 |
Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
|
| 71 |
"""
|
| 72 |
+
|
| 73 |
+
# Create a ChatPromptTemplate instance from the RAG prompt template
|
| 74 |
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
|
| 75 |
|
| 76 |
+
# Create an instance of the ChatOpenAI model
|
| 77 |
+
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
|
| 78 |
+
|
| 79 |
+
# Define the retrieval augmented QA chain
|
| 80 |
retrieval_augmented_qa_chain = (
|
| 81 |
{"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
|
| 82 |
| RunnablePassthrough.assign(context=itemgetter("context"))
|