Spaces:
Runtime error
Runtime error
Tried vectorstore cache but failed.
Browse files
- app.py +38 -11
- getVectorstore.py +16 -9
- pyproject.toml +1 -1
- uv.lock +22 -6
app.py
CHANGED
|
@@ -34,6 +34,7 @@ async def on_chat_start():
|
|
| 34 |
).send()
|
| 35 |
|
| 36 |
file = files[0]
|
|
|
|
| 37 |
|
| 38 |
doc = pymupdf.Document(file.path)
|
| 39 |
toc = doc.get_toc()
|
|
@@ -61,12 +62,18 @@ async def on_chat_start():
|
|
| 61 |
# need a rect that will exclude headers and footers
|
| 62 |
rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
|
| 63 |
|
| 64 |
-
#
|
| 65 |
extracted_text = ""
|
|
|
|
|
|
|
| 66 |
for page in doc.pages():
|
| 67 |
-
if page.number in
|
|
|
|
|
|
|
| 68 |
# print(page.get_text(clip=rect))
|
| 69 |
extracted_text += page.get_text(clip=rect)
|
|
|
|
|
|
|
| 70 |
msg = cl.Message(
|
| 71 |
content=f"""Processing selected file: `{file.name}`...
|
| 72 |
Extraction beginning on page {start_page} and ending on page {end_page}.
|
|
@@ -102,29 +109,29 @@ async def on_chat_start():
|
|
| 102 |
|
| 103 |
await msg.send()
|
| 104 |
|
|
|
|
| 105 |
|
| 106 |
-
|
|
|
|
| 107 |
|
| 108 |
-
document_titles = ["protocol.pdf", "consent.pdf"]
|
| 109 |
|
| 110 |
-
# protocol_retriever = qdrant_vectorstore.as_retriever()
|
| 111 |
-
|
| 112 |
-
# protocol_retriever = create_protocol_retriever(document_titles)
|
| 113 |
protocol_retriever = qdrant_vectorstore.as_retriever(
|
| 114 |
search_kwargs={
|
| 115 |
'filter': rest.Filter(
|
| 116 |
-
|
| 117 |
rest.FieldCondition(
|
| 118 |
key="metadata.document_title",
|
| 119 |
-
|
| 120 |
)
|
| 121 |
]
|
| 122 |
),
|
| 123 |
-
'k':15,
|
| 124 |
}
|
| 125 |
)
|
|
|
|
| 126 |
|
| 127 |
-
|
|
|
|
| 128 |
# Create prompt
|
| 129 |
rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
|
| 130 |
|
|
@@ -134,3 +141,23 @@ async def on_chat_start():
|
|
| 134 |
{"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
|
| 135 |
| rag_prompt | llm | StrOutputParser()
|
| 136 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
).send()
|
| 35 |
|
| 36 |
file = files[0]
|
| 37 |
+
print(f"filename is {file.name}")
|
| 38 |
|
| 39 |
doc = pymupdf.Document(file.path)
|
| 40 |
toc = doc.get_toc()
|
|
|
|
| 62 |
# need a rect that will exclude headers and footers
|
| 63 |
rect = pymupdf.Rect(0.0, 100.0, 612.0, 650.0)
|
| 64 |
|
| 65 |
+
# capture the first 3 pages (page numbers 0, 1, 2)
|
| 66 |
extracted_text = ""
|
| 67 |
+
|
| 68 |
+
|
| 69 |
for page in doc.pages():
|
| 70 |
+
if page.number in [0, 1, 2]:
|
| 71 |
+
extracted_text += page.get_text()
|
| 72 |
+
elif page.number in range(start_page-1, end_page):
|
| 73 |
# print(page.get_text(clip=rect))
|
| 74 |
extracted_text += page.get_text(clip=rect)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
msg = cl.Message(
|
| 78 |
content=f"""Processing selected file: `{file.name}`...
|
| 79 |
Extraction beginning on page {start_page} and ending on page {end_page}.
|
|
|
|
| 109 |
|
| 110 |
await msg.send()
|
| 111 |
|
| 112 |
+
qdrant_vectorstore = getVectorstore(document, file.name)
|
| 113 |
|
| 114 |
+
protocol_retriever = qdrant_vectorstore.as_retriever(search_kwargs={"k":15})
|
| 115 |
+
# document_titles = [file.name]
|
| 116 |
|
|
|
|
| 117 |
|
|
|
|
|
|
|
|
|
|
| 118 |
protocol_retriever = qdrant_vectorstore.as_retriever(
|
| 119 |
search_kwargs={
|
| 120 |
'filter': rest.Filter(
|
| 121 |
+
must=[
|
| 122 |
rest.FieldCondition(
|
| 123 |
key="metadata.document_title",
|
| 124 |
+
match=rest.MatchAny(any=[file.name])
|
| 125 |
)
|
| 126 |
]
|
| 127 |
),
|
| 128 |
+
'k': 15,
|
| 129 |
}
|
| 130 |
)
|
| 131 |
+
# # protocol_retriever = qdrant_vectorstore.as_retriever()
|
| 132 |
|
| 133 |
+
# protocol_retriever = create_protocol_retriever(document_titles)
|
| 134 |
+
|
| 135 |
# Create prompt
|
| 136 |
rag_prompt = ChatPromptTemplate.from_template(prompts.rag_prompt_template)
|
| 137 |
|
|
|
|
| 141 |
{"context": itemgetter("question") | protocol_retriever, "question": itemgetter("question")}
|
| 142 |
| rag_prompt | llm | StrOutputParser()
|
| 143 |
)
|
| 144 |
+
|
| 145 |
+
from datetime import date
|
| 146 |
+
# Heading for top of ICF document
|
| 147 |
+
protocol_title = rag_chain.invoke({"question": "What is the exact title of this protocol? Only return the title itself without any other description."})
|
| 148 |
+
principal_investigator = rag_chain.invoke({"question":"What is the name of the principal investigator of the study? Only return the name itself without any other description."})
|
| 149 |
+
support = rag_chain.invoke({"question":"What agency is funding the study? Only return the name of the agency without any other description."})
|
| 150 |
+
version_date = date.today().strftime("%B %d, %Y")
|
| 151 |
+
|
| 152 |
+
msg = cl.Message(
|
| 153 |
+
content=f"""
|
| 154 |
+
**Study Title:** {protocol_title}
|
| 155 |
+
**Principal Investigator:** {principal_investigator}
|
| 156 |
+
**Version Date:** {version_date}
|
| 157 |
+
**Source of Support:** {support}
|
| 158 |
+
---
|
| 159 |
+
"""
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
await msg.send()
|
| 163 |
+
|
getVectorstore.py
CHANGED
|
@@ -19,29 +19,36 @@ This could also be useful if different versions of documents are in existence.
|
|
| 19 |
recreate a large vectorstore. But the user could select the most recent version.
|
| 20 |
"""
|
| 21 |
|
| 22 |
-
|
| 23 |
def get_document_hash(doc_content):
|
| 24 |
"""Generate a unique hash for the document content."""
|
| 25 |
return hashlib.md5(doc_content.encode()).hexdigest()
|
| 26 |
|
| 27 |
-
def getVectorstore(document,
|
| 28 |
# Add a unique hash to your documents
|
| 29 |
for doc in document:
|
| 30 |
doc.metadata['content_hash'] = get_document_hash(doc.page_content)
|
| 31 |
|
| 32 |
# Add the document title
|
| 33 |
for doc in document:
|
| 34 |
-
doc.metadata['document_title'] =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
|
|
|
|
|
|
|
|
|
| 36 |
client = QdrantClient( url=qdrant_url)
|
| 37 |
-
|
|
|
|
| 38 |
# If the collection exists, then we need to check to see if our document is already
|
| 39 |
# present, in which case we would not want to store it again.
|
| 40 |
-
if client.collection_exists(
|
| 41 |
print("Collection exists")
|
| 42 |
qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
|
| 43 |
embedding=embedding_model,
|
| 44 |
-
collection_name=
|
| 45 |
url=qdrant_url
|
| 46 |
# location = ":memory:"
|
| 47 |
)
|
|
@@ -61,7 +68,7 @@ def getVectorstore(document, file_path):
|
|
| 61 |
)
|
| 62 |
|
| 63 |
scroll_results = client.scroll(
|
| 64 |
-
collection_name=
|
| 65 |
scroll_filter=scroll_filter,
|
| 66 |
limit=len(document) # Adjust this if you have a large number of documents
|
| 67 |
)
|
|
@@ -82,8 +89,8 @@ def getVectorstore(document, file_path):
|
|
| 82 |
qdrant_vectorstore = QdrantVectorStore.from_documents(
|
| 83 |
documents=document,
|
| 84 |
embedding=embedding_model,
|
| 85 |
-
collection_name=
|
| 86 |
# location = ":memory:"
|
| 87 |
url=qdrant_url
|
| 88 |
)
|
| 89 |
-
return qdrant_vectorstore
|
|
|
|
| 19 |
recreate a large vectorstore. But the user could select the most recent version.
|
| 20 |
"""
|
| 21 |
|
|
|
|
| 22 |
def get_document_hash(doc_content):
|
| 23 |
"""Generate a unique hash for the document content."""
|
| 24 |
return hashlib.md5(doc_content.encode()).hexdigest()
|
| 25 |
|
| 26 |
+
def getVectorstore(document, file_name):
|
| 27 |
# Add a unique hash to your documents
|
| 28 |
for doc in document:
|
| 29 |
doc.metadata['content_hash'] = get_document_hash(doc.page_content)
|
| 30 |
|
| 31 |
# Add the document title
|
| 32 |
for doc in document:
|
| 33 |
+
doc.metadata['document_title'] = file_name
|
| 34 |
+
|
| 35 |
+
# Add a sequential source id (source_0, source_1, ...) to metadata
|
| 36 |
+
for i, doc in enumerate(document):
|
| 37 |
+
doc.metadata['source'] = f"source_{i}"
|
| 38 |
|
| 39 |
+
# collection_name = f"pdf_to_parse_{uuid.uuid4()}"
|
| 40 |
+
collection_name = "protocol_collection"
|
| 41 |
+
|
| 42 |
client = QdrantClient( url=qdrant_url)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
# If the collection exists, then we need to check to see if our document is already
|
| 46 |
# present, in which case we would not want to store it again.
|
| 47 |
+
if client.collection_exists(collection_name):
|
| 48 |
print("Collection exists")
|
| 49 |
qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
|
| 50 |
embedding=embedding_model,
|
| 51 |
+
collection_name=collection_name,
|
| 52 |
url=qdrant_url
|
| 53 |
# location = ":memory:"
|
| 54 |
)
|
|
|
|
| 68 |
)
|
| 69 |
|
| 70 |
scroll_results = client.scroll(
|
| 71 |
+
collection_name=collection_name,
|
| 72 |
scroll_filter=scroll_filter,
|
| 73 |
limit=len(document) # Adjust this if you have a large number of documents
|
| 74 |
)
|
|
|
|
| 89 |
qdrant_vectorstore = QdrantVectorStore.from_documents(
|
| 90 |
documents=document,
|
| 91 |
embedding=embedding_model,
|
| 92 |
+
collection_name=collection_name,
|
| 93 |
# location = ":memory:"
|
| 94 |
url=qdrant_url
|
| 95 |
)
|
| 96 |
+
return qdrant_vectorstore
|
pyproject.toml
CHANGED
|
@@ -14,8 +14,8 @@ dependencies = [
|
|
| 14 |
"langchain-openai>=0.2.14",
|
| 15 |
"langgraph>=0.2.60",
|
| 16 |
"pymupdf>=1.25.1",
|
| 17 |
-
"chainlit>=1.1.202",
|
| 18 |
"websockets>=14.1",
|
|
|
|
| 19 |
]
|
| 20 |
|
| 21 |
[dependency-groups]
|
|
|
|
| 14 |
"langchain-openai>=0.2.14",
|
| 15 |
"langgraph>=0.2.60",
|
| 16 |
"pymupdf>=1.25.1",
|
|
|
|
| 17 |
"websockets>=14.1",
|
| 18 |
+
"chainlit>=1.1.202",
|
| 19 |
]
|
| 20 |
|
| 21 |
[dependency-groups]
|
uv.lock
CHANGED
|
@@ -258,7 +258,7 @@ dependencies = [
|
|
| 258 |
{ name = "pydantic", marker = "python_full_version >= '3.13'" },
|
| 259 |
{ name = "pyjwt", marker = "python_full_version >= '3.13'" },
|
| 260 |
{ name = "python-dotenv", marker = "python_full_version >= '3.13'" },
|
| 261 |
-
{ name = "python-multipart", marker = "python_full_version >= '3.13'" },
|
| 262 |
{ name = "starlette", version = "0.37.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
| 263 |
{ name = "syncer", marker = "python_full_version >= '3.13'" },
|
| 264 |
{ name = "tomli", marker = "python_full_version >= '3.13'" },
|
|
@@ -273,7 +273,7 @@ wheels = [
|
|
| 273 |
|
| 274 |
[[package]]
|
| 275 |
name = "chainlit"
|
| 276 |
-
version = "
|
| 277 |
source = { registry = "https://pypi.org/simple" }
|
| 278 |
resolution-markers = [
|
| 279 |
"python_full_version < '3.12.4'",
|
|
@@ -295,7 +295,7 @@ dependencies = [
|
|
| 295 |
{ name = "pydantic", marker = "python_full_version < '3.13'" },
|
| 296 |
{ name = "pyjwt", marker = "python_full_version < '3.13'" },
|
| 297 |
{ name = "python-dotenv", marker = "python_full_version < '3.13'" },
|
| 298 |
-
{ name = "python-multipart", marker = "python_full_version < '3.13'" },
|
| 299 |
{ name = "python-socketio", marker = "python_full_version < '3.13'" },
|
| 300 |
{ name = "starlette", version = "0.41.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
| 301 |
{ name = "syncer", marker = "python_full_version < '3.13'" },
|
|
@@ -304,9 +304,9 @@ dependencies = [
|
|
| 304 |
{ name = "uvicorn", marker = "python_full_version < '3.13'" },
|
| 305 |
{ name = "watchfiles", marker = "python_full_version < '3.13'" },
|
| 306 |
]
|
| 307 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 308 |
wheels = [
|
| 309 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 310 |
]
|
| 311 |
|
| 312 |
[[package]]
|
|
@@ -783,7 +783,7 @@ version = "0.1.0"
|
|
| 783 |
source = { virtual = "." }
|
| 784 |
dependencies = [
|
| 785 |
{ name = "chainlit", version = "1.1.202", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
| 786 |
-
{ name = "chainlit", version = "
|
| 787 |
{ name = "langchain" },
|
| 788 |
{ name = "langchain-community" },
|
| 789 |
{ name = "langchain-openai" },
|
|
@@ -1808,11 +1808,27 @@ wheels = [
|
|
| 1808 |
name = "python-multipart"
|
| 1809 |
version = "0.0.9"
|
| 1810 |
source = { registry = "https://pypi.org/simple" }
|
|
|
|
|
|
|
|
|
|
| 1811 |
sdist = { url = "https://files.pythonhosted.org/packages/5c/0f/9c55ac6c84c0336e22a26fa84ca6c51d58d7ac3a2d78b0dfa8748826c883/python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", size = 31516 }
|
| 1812 |
wheels = [
|
| 1813 |
{ url = "https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215", size = 22299 },
|
| 1814 |
]
|
| 1815 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1816 |
[[package]]
|
| 1817 |
name = "python-socketio"
|
| 1818 |
version = "5.12.1"
|
|
|
|
| 258 |
{ name = "pydantic", marker = "python_full_version >= '3.13'" },
|
| 259 |
{ name = "pyjwt", marker = "python_full_version >= '3.13'" },
|
| 260 |
{ name = "python-dotenv", marker = "python_full_version >= '3.13'" },
|
| 261 |
+
{ name = "python-multipart", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
| 262 |
{ name = "starlette", version = "0.37.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
| 263 |
{ name = "syncer", marker = "python_full_version >= '3.13'" },
|
| 264 |
{ name = "tomli", marker = "python_full_version >= '3.13'" },
|
|
|
|
| 273 |
|
| 274 |
[[package]]
|
| 275 |
name = "chainlit"
|
| 276 |
+
version = "2.0.0"
|
| 277 |
source = { registry = "https://pypi.org/simple" }
|
| 278 |
resolution-markers = [
|
| 279 |
"python_full_version < '3.12.4'",
|
|
|
|
| 295 |
{ name = "pydantic", marker = "python_full_version < '3.13'" },
|
| 296 |
{ name = "pyjwt", marker = "python_full_version < '3.13'" },
|
| 297 |
{ name = "python-dotenv", marker = "python_full_version < '3.13'" },
|
| 298 |
+
{ name = "python-multipart", version = "0.0.18", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
| 299 |
{ name = "python-socketio", marker = "python_full_version < '3.13'" },
|
| 300 |
{ name = "starlette", version = "0.41.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
| 301 |
{ name = "syncer", marker = "python_full_version < '3.13'" },
|
|
|
|
| 304 |
{ name = "uvicorn", marker = "python_full_version < '3.13'" },
|
| 305 |
{ name = "watchfiles", marker = "python_full_version < '3.13'" },
|
| 306 |
]
|
| 307 |
+
sdist = { url = "https://files.pythonhosted.org/packages/45/24/424679b769664876093b3e42167911535d1739bc1bc88f3963c69affed9e/chainlit-2.0.0.tar.gz", hash = "sha256:47b3a274a20cefb443f356d69f1c6a48818d67eb4a11552c749bfa6f414423ed", size = 4637040 }
|
| 308 |
wheels = [
|
| 309 |
+
{ url = "https://files.pythonhosted.org/packages/87/2a/e2bbb86fc3a34c7bf798644edb95bf14fd79a8b3f6c99e4b27e5df1e24f0/chainlit-2.0.0-py3-none-any.whl", hash = "sha256:2b58ac6b513d94aef0380d1d68b73f74718c0c844586b050ce8d5e0a82eb8133", size = 4703622 },
|
| 310 |
]
|
| 311 |
|
| 312 |
[[package]]
|
|
|
|
| 783 |
source = { virtual = "." }
|
| 784 |
dependencies = [
|
| 785 |
{ name = "chainlit", version = "1.1.202", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
|
| 786 |
+
{ name = "chainlit", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
|
| 787 |
{ name = "langchain" },
|
| 788 |
{ name = "langchain-community" },
|
| 789 |
{ name = "langchain-openai" },
|
|
|
|
| 1808 |
name = "python-multipart"
|
| 1809 |
version = "0.0.9"
|
| 1810 |
source = { registry = "https://pypi.org/simple" }
|
| 1811 |
+
resolution-markers = [
|
| 1812 |
+
"python_full_version >= '3.13'",
|
| 1813 |
+
]
|
| 1814 |
sdist = { url = "https://files.pythonhosted.org/packages/5c/0f/9c55ac6c84c0336e22a26fa84ca6c51d58d7ac3a2d78b0dfa8748826c883/python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026", size = 31516 }
|
| 1815 |
wheels = [
|
| 1816 |
{ url = "https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl", hash = "sha256:97ca7b8ea7b05f977dc3849c3ba99d51689822fab725c3703af7c866a0c2b215", size = 22299 },
|
| 1817 |
]
|
| 1818 |
|
| 1819 |
+
[[package]]
|
| 1820 |
+
name = "python-multipart"
|
| 1821 |
+
version = "0.0.18"
|
| 1822 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1823 |
+
resolution-markers = [
|
| 1824 |
+
"python_full_version < '3.12.4'",
|
| 1825 |
+
"python_full_version >= '3.12.4' and python_full_version < '3.13'",
|
| 1826 |
+
]
|
| 1827 |
+
sdist = { url = "https://files.pythonhosted.org/packages/b4/86/b6b38677dec2e2e7898fc5b6f7e42c2d011919a92d25339451892f27b89c/python_multipart-0.0.18.tar.gz", hash = "sha256:7a68db60c8bfb82e460637fa4750727b45af1d5e2ed215593f917f64694d34fe", size = 36622 }
|
| 1828 |
+
wheels = [
|
| 1829 |
+
{ url = "https://files.pythonhosted.org/packages/13/6b/b60f47101ba2cac66b4a83246630e68ae9bbe2e614cbae5f4465f46dee13/python_multipart-0.0.18-py3-none-any.whl", hash = "sha256:efe91480f485f6a361427a541db4796f9e1591afc0fb8e7a4ba06bfbc6708996", size = 24389 },
|
| 1830 |
+
]
|
| 1831 |
+
|
| 1832 |
[[package]]
|
| 1833 |
name = "python-socketio"
|
| 1834 |
version = "5.12.1"
|