austinbv committed on
Commit ·
2c9f2c4
1
Parent(s): 59bc1bb
End of Video 1
Browse files
- .gitignore +1 -0
- .idea/misc.xml +3 -0
- app/rag_chain.py +45 -0
- app/server.py +3 -1
- config.py +2 -0
- importer/__init__.py +0 -0
- importer/load_and_process.py +37 -0
- poetry.lock +0 -0
- pyproject.toml +10 -1
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
__pycache__
|
|
|
|
|
|
| 1 |
__pycache__
|
| 2 |
+
.env
|
.idea/misc.xml
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
<project version="4">
|
|
|
|
|
|
|
|
|
|
| 3 |
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (pdf_rag)" project-jdk-type="Python SDK" />
|
| 4 |
</project>
|
|
|
|
| 1 |
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
<project version="4">
|
| 3 |
+
<component name="Black">
|
| 4 |
+
<option name="sdkName" value="Poetry (pdf_rag)" />
|
| 5 |
+
</component>
|
| 6 |
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (pdf_rag)" project-jdk-type="Python SDK" />
|
| 7 |
</project>
|
app/rag_chain.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from operator import itemgetter
|
| 3 |
+
from typing import TypedDict
|
| 4 |
+
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from langchain_community.vectorstores.pgvector import PGVector
|
| 7 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 8 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 9 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 10 |
+
|
| 11 |
+
from config import PG_COLLECTION_NAME
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
vector_store = PGVector(
|
| 16 |
+
collection_name=PG_COLLECTION_NAME,
|
| 17 |
+
connection_string=os.getenv("POSTGRES_URL"),
|
| 18 |
+
embedding_function=OpenAIEmbeddings()
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
template = """
|
| 22 |
+
Answer given the following context:
|
| 23 |
+
{context}
|
| 24 |
+
|
| 25 |
+
Question: {question}
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)
|
| 29 |
+
|
| 30 |
+
llm = ChatOpenAI(temperature=0, model='gpt-4-1106-preview', streaming=True)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class RagInput(TypedDict):
|
| 34 |
+
question: str
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
final_chain = (
|
| 38 |
+
{
|
| 39 |
+
"context": itemgetter("question") | vector_store.as_retriever(),
|
| 40 |
+
"question": itemgetter("question")
|
| 41 |
+
}
|
| 42 |
+
| ANSWER_PROMPT
|
| 43 |
+
| llm
|
| 44 |
+
| StrOutputParser()
|
| 45 |
+
).with_types(input_type=RagInput)
|
app/server.py
CHANGED
|
@@ -2,6 +2,8 @@ from fastapi import FastAPI
|
|
| 2 |
from fastapi.responses import RedirectResponse
|
| 3 |
from langserve import add_routes
|
| 4 |
|
|
|
|
|
|
|
| 5 |
app = FastAPI()
|
| 6 |
|
| 7 |
|
|
@@ -11,7 +13,7 @@ async def redirect_root_to_docs():
|
|
| 11 |
|
| 12 |
|
| 13 |
# Edit this to add the chain you want to add
|
| 14 |
-
add_routes(app,
|
| 15 |
|
| 16 |
if __name__ == "__main__":
|
| 17 |
import uvicorn
|
|
|
|
| 2 |
from fastapi.responses import RedirectResponse
|
| 3 |
from langserve import add_routes
|
| 4 |
|
| 5 |
+
from app.rag_chain import final_chain
|
| 6 |
+
|
| 7 |
app = FastAPI()
|
| 8 |
|
| 9 |
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
# Edit this to add the chain you want to add
|
| 16 |
+
add_routes(app, final_chain, path="/rag")
|
| 17 |
|
| 18 |
if __name__ == "__main__":
|
| 19 |
import uvicorn
|
config.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
EMBEDDING_MODEL = 'text-embedding-ada-002'
|
| 2 |
+
PG_COLLECTION_NAME = "pdf_rag"
|
importer/__init__.py
ADDED
|
File without changes
|
importer/load_and_process.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
|
| 5 |
+
from langchain_community.vectorstores.pgvector import PGVector
|
| 6 |
+
from langchain_experimental.text_splitter import SemanticChunker
|
| 7 |
+
from langchain_openai import OpenAIEmbeddings
|
| 8 |
+
|
| 9 |
+
from config import EMBEDDING_MODEL, PG_COLLECTION_NAME
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
loader = DirectoryLoader(
|
| 14 |
+
os.path.abspath("../source_docs"),
|
| 15 |
+
glob="**/*.pdf",
|
| 16 |
+
use_multithreading=True,
|
| 17 |
+
show_progress=True,
|
| 18 |
+
max_concurrency=50,
|
| 19 |
+
loader_cls=UnstructuredPDFLoader,
|
| 20 |
+
)
|
| 21 |
+
docs = loader.load()
|
| 22 |
+
|
| 23 |
+
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, )
|
| 24 |
+
|
| 25 |
+
text_splitter = SemanticChunker(
|
| 26 |
+
embeddings=OpenAIEmbeddings()
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
chunks = text_splitter.split_documents(docs)
|
| 30 |
+
|
| 31 |
+
PGVector.from_documents(
|
| 32 |
+
documents=chunks,
|
| 33 |
+
embedding=embeddings,
|
| 34 |
+
collection_name=PG_COLLECTION_NAME,
|
| 35 |
+
connection_string="postgresql+psycopg://postgres@localhost:5432/pdf_rag_vectors",
|
| 36 |
+
pre_delete_collection=True,
|
| 37 |
+
)
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -9,10 +9,19 @@ packages = [
|
|
| 9 |
]
|
| 10 |
|
| 11 |
[tool.poetry.dependencies]
|
| 12 |
-
python = "
|
| 13 |
uvicorn = "^0.23.2"
|
| 14 |
langserve = {extras = ["server"], version = ">=0.0.30"}
|
| 15 |
pydantic = "<2"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
[tool.poetry.group.dev.dependencies]
|
|
|
|
| 9 |
]
|
| 10 |
|
| 11 |
[tool.poetry.dependencies]
|
| 12 |
+
python = ">=3.11,<3.12"
|
| 13 |
uvicorn = "^0.23.2"
|
| 14 |
langserve = {extras = ["server"], version = ">=0.0.30"}
|
| 15 |
pydantic = "<2"
|
| 16 |
+
tqdm = "^4.66.1"
|
| 17 |
+
unstructured = {extras = ["all-docs"], version = "^0.12.2"}
|
| 18 |
+
langchain-experimental = "^0.0.49"
|
| 19 |
+
python-dotenv = "^1.0.0"
|
| 20 |
+
openai = "^1.9.0"
|
| 21 |
+
tiktoken = "^0.5.2"
|
| 22 |
+
langchain-openai = "^0.0.3"
|
| 23 |
+
psycopg = "^3.1.17"
|
| 24 |
+
pgvector = "^0.2.4"
|
| 25 |
|
| 26 |
|
| 27 |
[tool.poetry.group.dev.dependencies]
|