whymath committed on
Commit
a3074e5
·
1 Parent(s): 6153200

Adding old base files for RAG QA PDF

Browse files
Files changed (7) hide show
  1. .gitignore +2 -0
  2. Dockerfile +11 -0
  3. README.md +8 -1
  4. app.py +37 -0
  5. chainlit.md +8 -0
  6. requirements.txt +13 -0
  7. utils.py +86 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
 
1
+ wandb/
2
+
3
  # Byte-compiled / optimized / DLL files
4
  __pycache__/
5
  *.py[cod]
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.9

# Run as a non-root user (required for Hugging Face Spaces Docker SDK).
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Install dependencies first so Docker layer caching skips reinstalls
# when only application code changes.
# NOTE: COPY does not expand "~" — the original "~/app" target created a
# literal "~" directory; use the explicit $HOME/app path instead.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install -r requirements.txt

# Copy the application code, owned by the runtime user so the app can
# write alongside its sources if needed.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md CHANGED
@@ -1,4 +1,11 @@
1
  ---
 
 
 
 
 
 
2
  license: gpl-3.0
3
- title: T2LVirtualStudent
4
  ---
 
 
 
1
  ---
2
+ title: RAG QA PDF
3
+ emoji: 📊
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
  license: gpl-3.0
 
9
  ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import chainlit as cl
3
+ from dotenv import load_dotenv
4
+ import utils
5
+
6
+
7
+ load_dotenv()
8
+
9
+
@cl.on_chat_start
async def start_chat():
    """Build the RAQA chain once per session and stash it for later messages."""
    chain = utils.create_raqa_chain_from_docs()
    cl.user_session.set("settings", {"chain": chain})
19
+
@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming user message with the session's RAQA chain."""
    question = message.content
    print('user_query =', question)

    # The chain was stored in the session by start_chat.
    chain = cl.user_session.get("settings")["chain"]

    # Run the retrieval-augmented chain and pull out the model's reply.
    result = chain.invoke({"question": question})
    answer = result["response"].content
    print('query_answer =', answer)

    # Send the answer back to the UI.
    await cl.Message(content=answer).send()
chainlit.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ # RAG Pipeline Demo
3
+
4
+ This app uses a LangChain-based RAG pipeline to demonstrate conversational Q&A from PDFs, as part of the AIM AIE2 Midterm Project.
5
+
6
+ Ask me questions about the 10-k filings of Meta Platforms, Inc. for FY 2023.
7
+
8
+ *By Yohan Mathew*
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ipykernel
2
+ numpy
3
+ pandas
4
+ langchain
5
+ langchain-core
6
+ langchain-community
7
+ langchain-openai
8
+ qdrant-client
9
+ tiktoken
10
+ pymupdf
11
+ wandb
12
+ chainlit
13
+ huggingface_hub
utils.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ from langchain.document_loaders import PyMuPDFLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_openai.embeddings import OpenAIEmbeddings
5
+ from langchain_community.vectorstores import Qdrant
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from langchain_openai import ChatOpenAI
8
+ from operator import itemgetter
9
+ # from langchain.schema.output_parser import StrOutputParser
10
+ from langchain.schema.runnable import RunnablePassthrough
11
+
12
+
def tiktoken_len(text):
    """Return the number of tokens in *text* under the gpt-3.5-turbo encoding.

    The tiktoken encoding is resolved once and cached on the function
    object: ``encoding_for_model`` does non-trivial lookup work, and this
    helper is invoked for every candidate chunk during document splitting.
    """
    encoding = getattr(tiktoken_len, "_encoding", None)
    if encoding is None:
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        tiktoken_len._encoding = encoding
    return len(encoding.encode(text))
18
+
19
+
def chunk_documents(docs, tiktoken_len, chunk_size=200, chunk_overlap=0):
    """Split documents into token-bounded chunks.

    Parameters
    ----------
    docs : list
        LangChain Documents to split.
    tiktoken_len : callable
        Length function mapping a string to its token count.
    chunk_size : int, optional
        Maximum chunk length in tokens (default 200, matching the
        original hard-coded value).
    chunk_overlap : int, optional
        Token overlap between consecutive chunks (default 0).

    Returns
    -------
    list
        The split Document chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=tiktoken_len,
    )

    split_chunks = text_splitter.split_documents(docs)

    # Kept for parity with the original output; handy when tuning chunking.
    print('len(split_chunks) =', len(split_chunks))

    return split_chunks
32
+
33
+
def create_raqa_chain_from_docs(
    pdf_url="https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf",
):
    """Build a retrieval-augmented QA chain over a PDF document.

    Loads the PDF, splits it into token-bounded chunks, embeds the chunks
    into an in-memory Qdrant vector store, and wires a retriever + prompt +
    chat model into a runnable chain.

    Parameters
    ----------
    pdf_url : str, optional
        URL or path of the PDF to index. Defaults to Meta Platforms'
        FY2023 10-K filing (the original hard-coded source), so existing
        callers are unaffected.

    Returns
    -------
    A LangChain runnable; ``chain.invoke({"question": ...})`` yields a dict
    with ``"response"`` (the chat model message) and ``"context"`` (the
    retrieved documents).
    """
    # Load the documents from the PDF using PyMuPDFLoader.
    docs = PyMuPDFLoader(pdf_url).load()

    # Diagnostics kept from the original implementation.
    print("Loaded", len(docs), "documents")
    print(docs[0])

    # Split the documents into token-bounded chunks.
    split_chunks = chunk_documents(docs, tiktoken_len)

    # Embedding model used to vectorize chunks (and, later, queries).
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

    # In-memory Qdrant vector store built from the chunks.
    qdrant_vectorstore = Qdrant.from_documents(
        split_chunks,
        embedding_model,
        location=":memory:",
        collection_name="Meta 10-k Filings",
    )

    # Retriever over the vector store.
    qdrant_retriever = qdrant_vectorstore.as_retriever()

    # Prompt restricting the model to the retrieved context.
    RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Use the provided context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, respond with "I don't know".
"""
    rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

    # Chat model that generates the final answer.
    openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    # Assemble the chain: fetch context for the question, then feed both
    # into the prompt/model while passing the retrieved context through to
    # the caller. The assign() step mirrors the original wiring (it is an
    # identity on "context") and is retained for behavioral parity.
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
    )

    return retrieval_augmented_qa_chain