Spaces:
Paused
Paused
whymath
committed on
Commit
·
a3074e5
1
Parent(s):
6153200
Adding old base files for RAG QA PDF
Browse files- .gitignore +2 -0
- Dockerfile +11 -0
- README.md +8 -1
- app.py +37 -0
- chainlit.md +8 -0
- requirements.txt +13 -0
- utils.py +86 -0
.gitignore
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
# Byte-compiled / optimized / DLL files
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
|
|
|
| 1 |
+
wandb/
|
| 2 |
+
|
| 3 |
# Byte-compiled / optimized / DLL files
|
| 4 |
__pycache__/
|
| 5 |
*.py[cod]
|
Dockerfile
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.9

# HF Spaces convention: run as a non-root user with uid 1000.
RUN useradd -m -u 1000 user
USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Install dependencies before copying the source so the pip layer is
# cached across code-only changes. NOTE: the original copied to
# "~/app/requirements.txt" — Dockerfile COPY does not expand "~", which
# put the file in a literal "~" directory; use $HOME instead.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source, owned by the runtime user (the original's
# extra "COPY . ." created root-owned duplicates under a non-root USER).
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
license: gpl-3.0
|
| 3 |
-
title: T2LVirtualStudent
|
| 4 |
---
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: RAG QA PDF
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
license: gpl-3.0
|
|
|
|
| 9 |
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import chainlit as cl
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
import utils
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@cl.on_chat_start
async def start_chat():
    """Session start hook: build the RAQA chain and keep it in the session.

    The chain is constructed once per chat session and stored under the
    "settings" key so the message handler can retrieve it later.
    """
    chain = utils.create_raqa_chain_from_docs()
    cl.user_session.set("settings", {"chain": chain})
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@cl.on_message
async def main(message: cl.Message):
    """Message hook: answer the user's query via the session's RAQA chain.

    Pulls the chain stored by start_chat, invokes it with the incoming
    question, and sends the model's answer back to the UI.
    """
    query = message.content
    print('user_query =', query)

    # Retrieve the chain stored at session start.
    chain = cl.user_session.get("settings")["chain"]

    # Run the chain; the answer lives under result["response"].content.
    result = chain.invoke({"question" : query})
    answer = result["response"].content
    print('query_answer =', answer)

    # Send the answer back to the chat UI.
    await cl.Message(content=answer).send()
chainlit.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# RAG Pipeline Demo
|
| 3 |
+
|
| 4 |
+
This app uses a LangChain-based RAG pipeline to demonstrate conversational Q&A from PDFs, as part of the AIM AIE2 Midterm Project.
|
| 5 |
+
|
| 6 |
+
Ask me questions about the 10-k filings of Meta Platforms, Inc. for FY 2023.
|
| 7 |
+
|
| 8 |
+
*By Yohan Mathew*
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ipykernel
|
| 2 |
+
numpy
|
| 3 |
+
pandas
|
| 4 |
+
langchain
|
| 5 |
+
langchain-core
|
| 6 |
+
langchain-community
|
| 7 |
+
langchain-openai
|
| 8 |
+
qdrant-client
|
| 9 |
+
tiktoken
|
| 10 |
+
pymupdf
|
| 11 |
+
wandb
|
| 12 |
+
chainlit
|
| 13 |
+
huggingface_hub
|
utils.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tiktoken
|
| 2 |
+
from langchain.document_loaders import PyMuPDFLoader
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
| 5 |
+
from langchain_community.vectorstores import Qdrant
|
| 6 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 7 |
+
from langchain_openai import ChatOpenAI
|
| 8 |
+
from operator import itemgetter
|
| 9 |
+
# from langchain.schema.output_parser import StrOutputParser
|
| 10 |
+
from langchain.schema.runnable import RunnablePassthrough
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Cache the tokenizer: encoding_for_model() performs a registry lookup and
# encoder construction, so resolve it lazily once instead of on every call
# (this function runs once per chunk-length measurement during splitting).
_GPT35_ENCODING = None


def tiktoken_len(text):
    """Return the number of gpt-3.5-turbo tokens in *text*.

    Used as the length function for document chunking so chunk sizes are
    measured in model tokens rather than characters.

    Args:
        text: The string to tokenize.

    Returns:
        int: Token count of *text* under the gpt-3.5-turbo encoding.
    """
    global _GPT35_ENCODING
    if _GPT35_ENCODING is None:
        _GPT35_ENCODING = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(_GPT35_ENCODING.encode(text))
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def chunk_documents(docs, tiktoken_len, chunk_size=200, chunk_overlap=0):
    """Split *docs* into chunks whose size is measured by *tiktoken_len*.

    Args:
        docs: Sequence of LangChain documents to split.
        tiktoken_len: Callable mapping text -> length; used as the
            splitter's length function (token-based in this project).
        chunk_size: Maximum chunk length, in units of *tiktoken_len*.
            Defaults to 200, the previously hard-coded value.
        chunk_overlap: Overlap between adjacent chunks. Defaults to 0,
            the previously hard-coded value.

    Returns:
        list: The split document chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=tiktoken_len,
    )

    split_chunks = text_splitter.split_documents(docs)

    print('len(split_chunks) =', len(split_chunks))

    return split_chunks
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def create_raqa_chain_from_docs(
    pdf_source="https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf",
):
    """Build a retrieval-augmented QA (RAQA) chain over a PDF document.

    Loads the PDF at *pdf_source*, splits it into token-sized chunks,
    embeds the chunks into an in-memory Qdrant collection, and composes a
    retriever + RAG prompt + chat model into a runnable chain.

    Args:
        pdf_source: URL or local path of the PDF to index. Defaults to the
            Meta FY2023 10-K filing (the previously hard-coded URL), so
            existing zero-argument callers behave identically.

    Returns:
        A LangChain runnable. Invoke it with ``{"question": ...}``; the
        answer is at ``result["response"].content`` and the retrieved
        context at ``result["context"]``.
    """
    # Load the documents from the PDF using PyMuPDFLoader
    docs = PyMuPDFLoader(pdf_source).load()

    # Print the number of loaded documents
    print("Loaded", len(docs), "documents")

    # Print the first document
    print(docs[0])

    # Split the documents into chunks based on their token length
    split_chunks = chunk_documents(docs, tiktoken_len)

    # Create an instance of the OpenAIEmbeddings model for text embeddings
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

    # Create an in-memory Qdrant vector store from the split chunks
    qdrant_vectorstore = Qdrant.from_documents(
        split_chunks,
        embedding_model,
        location=":memory:",
        collection_name="Meta 10-k Filings",
    )

    # Create a retriever from the Qdrant vector store
    qdrant_retriever = qdrant_vectorstore.as_retriever()

    # Define the RAG prompt template (kept verbatim — it is runtime text)
    RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
"""

    # Create a ChatPromptTemplate instance from the RAG prompt template
    rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

    # Create an instance of the ChatOpenAI model
    openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    # Compose the chain: fetch context for the question, carry both
    # forward, then emit the model response alongside the raw context.
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
    )

    return retrieval_augmented_qa_chain