Rafael Uzarowski committed on
feat: DocumentQuery initial version
- prompts/default/agent.system.tool.document_query.md +58 -0
- prompts/default/agent.system.tools.md +2 -0
- prompts/default/fw.document_query.optmimize_query.md +7 -0
- prompts/default/fw.document_query.system_prompt.md +5 -0
- prompts/default/{tool.knowledge.response.md → fw.knowledge_tool.response.md} +0 -0
- python/helpers/document_query.py +702 -0
- python/tools/document_query.py +20 -0
- python/tools/knowledge_tool.py +44 -2
- requirements.txt +7 -2
prompts/default/agent.system.tool.document_query.md
ADDED
@@ -0,0 +1,58 @@
+### document_query:
+This tool can be used to read or analyze remote and local documents.
+It can be used to:
+* Get webpage or remote document text content
+* Get local document text content
+* Answer queries about a webpage, remote or local document
+By default, when the "queries" argument is empty, this tool returns the text content of the document retrieved using OCR.
+Additionally, you can pass a list of "queries" - in this case, the tool returns the answers to all the passed queries about the document.
+!!! This is a universal document reader and query tool
+!!! Supported document formats: HTML, PDF, Office documents (Word, Excel, PowerPoint), text files and many more.
+
+#### Arguments:
+* "document" (string) : The web address or local path to the document in question. Web documents need the "http://" or "https://" protocol prefix. For local files the "file:" protocol prefix is optional. Local files MUST be passed with their full filesystem path.
+* "queries" (Optional, list[str]) : One or more queries to be answered using and/or about the document
+
+#### Usage example 1:
+##### Request:
+```json
+{
+    "thoughts": [
+        "..."
+    ],
+    "tool_name": "document_query",
+    "tool_args": {
+        "document": "https://...somexample"
+    }
+}
+```
+##### Response:
+```plaintext
+... Here is the entire content of the web document requested ...
+```
+
+#### Usage example 2:
+##### Request:
+```json
+{
+    "thoughts": [
+        "..."
+    ],
+    "tool_name": "document_query",
+    "tool_args": {
+        "document": "https://...somexample",
+        "queries": [
+            "What is the topic?",
+            "Who is the audience?"
+        ]
+    }
+}
+```
+##### Response:
+```plaintext
+# What is the topic?
+... Description of the document topic ...
+
+# Who is the audience?
+... The intended document audience list with short descriptions ...
+```
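Reviewer note: the two accepted "document" forms described above can be exercised end to end through the helper this commit introduces below. A minimal sketch, assuming an `agent` instance is available; the URL and file path are made-up examples:

```python
# Illustrative only, not part of this commit; the example URIs are hypothetical.
from python.helpers.document_query import DocumentQueryHelper

async def demo(agent):
    helper = DocumentQueryHelper(agent)
    # Remote document: the "http://" or "https://" protocol prefix is required.
    web_text = await helper.document_get_content("https://example.com/whitepaper.pdf")
    # Local document: full filesystem path; the "file:" prefix is optional.
    local_text = await helper.document_get_content("file:/home/user/reports/q1.docx")
    return web_text, local_text
```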
prompts/default/agent.system.tools.md
CHANGED
@@ -19,3 +19,5 @@
 {{ include './agent.system.tool.browser.md' }}
 
 {{ include './agent.system.tool.scheduler.md' }}
+
+{{ include './agent.system.tool.document_query.md' }}
prompts/default/fw.document_query.optmimize_query.md
ADDED
@@ -0,0 +1,7 @@
+You are an AI assistant that is part of a larger RAG system based on vector similarity search.
+Your job is to take a human-written question and convert it into a concise vector store search query.
+The goal is to yield as many correct results and as few false positives as possible.
+!! You will be given a "Search Query".
+!! The response should ONLY contain the optimized search query.
+!! Do not include any other text, confirmations or explanations. Do not prefix your response.
+!! You are working as a tool and not as a conversational agent.
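This prompt is consumed by `DocumentQueryHelper.document_qa` (added below in python/helpers/document_query.py). A minimal sketch of the call pattern, mirroring that code:

```python
# Sketch mirroring document_qa's use of the optimizer prompt (see the helper below).
async def optimize_query(agent, question: str) -> str:
    system_content = agent.parse_prompt("fw.document_query.optmimize_query.md")
    human_content = f'Search Query: "{question}"'
    # Per the prompt contract, the utility model returns only the optimized query.
    optimized = await agent.call_utility_model(system=system_content, message=human_content)
    return str(optimized)
```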
prompts/default/fw.document_query.system_prompt.md
ADDED
@@ -0,0 +1,5 @@
+You are an AI assistant who can answer questions about a given document text.
+The assistant is part of a larger application that is used to answer questions about a document.
+The assistant is given a document and a list of queries and must answer the queries based on the document.
+!! The response should be in markdown format.
+!! The response should only include the queries as headings and the answers to the queries. The markdown should contain paragraphs with "#### <Query>" as headings (<Query> being the original query) followed by the answer as the paragraph text content.
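The contract above pins the answer format to "#### <Query>" headings followed by answer paragraphs. A hedged illustration of assembling that shape from (query, answer) pairs, not part of the commit:

```python
# Illustration of the mandated response shape; the function name is hypothetical.
def format_qa_markdown(pairs: list[tuple[str, str]]) -> str:
    # Each query becomes a "#### <Query>" heading followed by its answer paragraph.
    return "\n\n".join(f"#### {query}\n{answer}" for query, answer in pairs)

print(format_qa_markdown([("What is the topic?", "A document reader and query tool.")]))
```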
prompts/default/{tool.knowledge.response.md → fw.knowledge_tool.response.md}
RENAMED
File without changes
python/helpers/document_query.py
ADDED
@@ -0,0 +1,702 @@
+import mimetypes
+import os
+import asyncio
+import aiohttp
+import json
+
+os.environ["USER_AGENT"] = "@mixedbread-ai/unstructured"  # noqa E402
+from langchain_unstructured import UnstructuredLoader  # noqa E402
+
+from urllib.parse import urlparse
+from typing import Sequence, List, Optional, Tuple
+from datetime import datetime
+
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_community.document_loaders.text import TextLoader
+from langchain_community.document_loaders.pdf import PyMuPDFLoader
+from langchain_community.document_transformers import MarkdownifyTransformer
+from langchain_community.document_loaders.parsers.images import TesseractBlobParser
+
+from langchain_core.documents import Document
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import SystemMessage, HumanMessage
+from langchain.storage import LocalFileStore
+from langchain.embeddings import CacheBackedEmbeddings
+
+from langchain_community.vectorstores import FAISS
+import faiss
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain_community.vectorstores.utils import (
+    DistanceStrategy,
+)
+from langchain_core.embeddings import Embeddings
+
+from python.helpers.print_style import PrintStyle
+from python.helpers import files
+from agent import Agent
+import models
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+class DocumentQueryStore:
+    """
+    FAISS store for document query results.
+    Manages documents identified by URI for storage, retrieval, and searching.
+    """
+
+    # Default chunking parameters
+    DEFAULT_CHUNK_SIZE = 1000
+    DEFAULT_CHUNK_OVERLAP = 100
+
+    # Cache for initialized stores
+    _stores: dict[str, "DocumentQueryStore"] = {}
+
+    @staticmethod
+    async def get(agent: Agent):
+        """Get or create a DocumentQueryStore instance for the specified agent."""
+        if not agent or not agent.config:
+            raise ValueError("Agent and agent config must be provided")
+
+        memory_subdir = agent.config.memory_subdir or "default"
+        store_key = f"{memory_subdir}/document_query"
+
+        if store_key not in DocumentQueryStore._stores:
+            # Initialize embeddings model from agent config
+            embeddings_model = agent.get_embedding_model()
+
+            # Initialize store
+            store = DocumentQueryStore(agent, embeddings_model, memory_subdir)
+            DocumentQueryStore._stores[store_key] = store
+            return store
+        else:
+            return DocumentQueryStore._stores[store_key]
+
+    @staticmethod
+    async def reload(agent: Agent):
+        """Reload the DocumentQueryStore for the specified agent."""
+        memory_subdir = agent.config.memory_subdir or "default"
+        store_key = f"{memory_subdir}/document_query"
+
+        if store_key in DocumentQueryStore._stores:
+            del DocumentQueryStore._stores[store_key]
+
+        return await DocumentQueryStore.get(agent)
+
+    def __init__(
+        self,
+        agent: Agent,
+        embeddings_model: Embeddings,
+        memory_subdir: str,
+    ):
+        """Initialize a DocumentQueryStore instance."""
+        self.agent = agent
+        self.memory_subdir = memory_subdir
+
+        # Get directory paths
+        db_dir = self._get_db_dir()
+        em_dir = os.path.join(db_dir, "embeddings")
+
+        # Create directories
+        os.makedirs(db_dir, exist_ok=True)
+        os.makedirs(em_dir, exist_ok=True)
+
+        # Setup embeddings cache
+        store = LocalFileStore(em_dir)
+        self.embeddings = CacheBackedEmbeddings.from_bytes_store(
+            embeddings_model,
+            store,
+            namespace=f"document_query_{getattr(embeddings_model, 'model', getattr(embeddings_model, 'model_name', 'default'))}"
+        )
+
+        # Initialize vector store
+        index_path = os.path.join(db_dir, "index.faiss")
+        docstore_path = os.path.join(db_dir, "docstore.json")
+
+        if os.path.exists(index_path) and os.path.exists(docstore_path):
+            PrintStyle.standard(f"Loading existing vector store from {db_dir}")
+            try:
+                self.vectorstore = FAISS.load_local(
+                    folder_path=db_dir,
+                    embeddings=self.embeddings,
+                    allow_dangerous_deserialization=True,
+                    distance_strategy=DistanceStrategy.COSINE,
+                )
+            except Exception as e:
+                PrintStyle.error(f"Error loading vector store: {str(e)}")
+                self._initialize_new_vectorstore()
+        else:
+            PrintStyle.standard(f"Creating new vector store in '{db_dir}'")
+            self._initialize_new_vectorstore()
+
+    def _initialize_new_vectorstore(self):
+        """Initialize a new vector store."""
+        dimension = len(self.embeddings.embed_query("test"))
+        index = faiss.IndexFlatIP(dimension)
+        self.vectorstore = FAISS(
+            embedding_function=self.embeddings,
+            index=index,
+            docstore=InMemoryDocstore(),
+            index_to_docstore_id={},
+            distance_strategy=DistanceStrategy.COSINE,
+        )
+
+    def _get_db_dir(self) -> str:
+        """Get the absolute path to the database directory."""
+        return files.get_abs_path("memory", self.memory_subdir, "document_query")
+
+    def _save_vectorstore(self):
+        """Save the vector store to disk."""
+        db_dir = self._get_db_dir()
+        PrintStyle.standard(f"Saving vector store to {db_dir}")
+        self.vectorstore.save_local(folder_path=db_dir)
+        PrintStyle.standard(f"Vector store saved with {len(self.vectorstore.index_to_docstore_id)} documents")
+
+    @staticmethod
+    def _normalize_uri(uri: str) -> str:
+        """
+        Normalize a document URI to ensure consistent lookup.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            Normalized URI
+        """
+        # Convert to lowercase
+        normalized = uri.lower()
+
+        # Parse the URL to get scheme
+        parsed = urlparse(normalized)
+        scheme = parsed.scheme or "file"
+
+        # Normalize based on scheme
+        if scheme == "file":
+            if not normalized.startswith("file:"):
+                normalized = "file:" + normalized
+            if normalized.startswith("file://"):
+                normalized = normalized.replace("file://", "file:")
+        elif scheme in ["http", "https"]:
+            # Always use https for web URLs
+            normalized = normalized.replace("http://", "https://")
+
+        return normalized
+
+    async def add_document(self, text: str, document_uri: str, metadata: Optional[dict] = None) -> bool:
+        """
+        Add a document to the store with the given URI.
+
+        Args:
+            text: The document text content
+            document_uri: The URI that uniquely identifies this document
+            metadata: Optional metadata for the document
+
+        Returns:
+            True if successful, False otherwise
+        """
+        # Normalize the URI
+        document_uri = self._normalize_uri(document_uri)
+
+        # Delete existing document if it exists to avoid duplicates
+        await self.delete_document(document_uri)
+
+        # Initialize metadata
+        doc_metadata = metadata or {}
+        doc_metadata["document_uri"] = document_uri
+        doc_metadata["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Split text into chunks
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.DEFAULT_CHUNK_SIZE,
+            chunk_overlap=self.DEFAULT_CHUNK_OVERLAP
+        )
+        chunks = text_splitter.split_text(text)
+
+        # Create documents
+        docs = []
+        for i, chunk in enumerate(chunks):
+            chunk_metadata = doc_metadata.copy()
+            chunk_metadata["chunk_index"] = i
+            chunk_metadata["total_chunks"] = len(chunks)
+            docs.append(Document(page_content=chunk, metadata=chunk_metadata))
+
+        if not docs:
+            PrintStyle.error(f"No chunks created for document: {document_uri}")
+            return False
+
+        # Apply rate limiter
+        try:
+            docs_text = "".join(chunk.page_content for chunk in docs)
+            await self.agent.rate_limiter(
+                model_config=self.agent.config.embeddings_model,
+                input=docs_text
+            )
+
+            # Add documents to vector store
+            self.vectorstore.add_documents(documents=docs)
+            self._save_vectorstore()
+            PrintStyle.standard(f"Added document '{document_uri}' with {len(docs)} chunks")
+            return True
+        except Exception as e:
+            PrintStyle.error(f"Error adding document '{document_uri}': {str(e)}")
+            return False
+
+    async def get_document(self, document_uri: str) -> Optional[Document]:
+        """
+        Retrieve a document by its URI.
+
+        Args:
+            document_uri: The URI of the document to retrieve
+
+        Returns:
+            The complete document if found, None otherwise
+        """
+        # Normalize the URI
+        document_uri = self._normalize_uri(document_uri)
+
+        # Get all chunks for this document
+        docs = await self._get_document_chunks(document_uri)
+        if not docs:
+            PrintStyle.error(f"Document not found: {document_uri}")
+            return None
+
+        # Combine chunks into a single document
+        chunks = sorted(docs, key=lambda x: x.metadata.get("chunk_index", 0))
+        full_content = "\n".join(chunk.page_content for chunk in chunks)
+
+        # Use metadata from first chunk
+        metadata = chunks[0].metadata.copy()
+        metadata.pop("chunk_index", None)
+        metadata.pop("total_chunks", None)
+
+        return Document(page_content=full_content, metadata=metadata)
+
+    async def _get_document_chunks(self, document_uri: str) -> List[Document]:
+        """
+        Get all chunks for a document.
+
+        Args:
+            document_uri: The URI of the document
+
+        Returns:
+            List of document chunks
+        """
+        # Normalize the URI
+        document_uri = self._normalize_uri(document_uri)
+
+        # Access docstore directly
+        chunks = []
+        for doc_id, doc in self.vectorstore.docstore._dict.items():  # type: ignore
+            if isinstance(doc.metadata, dict) and doc.metadata.get("document_uri") == document_uri:
+                chunks.append(doc)
+
+        PrintStyle.standard(f"Found {len(chunks)} chunks for document: {document_uri}")
+        return chunks
+
+    async def document_exists(self, document_uri: str) -> bool:
+        """
+        Check if a document exists in the store.
+
+        Args:
+            document_uri: The URI of the document to check
+
+        Returns:
+            True if the document exists, False otherwise
+        """
+        # Normalize the URI
+        document_uri = self._normalize_uri(document_uri)
+
+        chunks = await self._get_document_chunks(document_uri)
+        return len(chunks) > 0
+
+    async def delete_document(self, document_uri: str) -> bool:
+        """
+        Delete a document from the store.
+
+        Args:
+            document_uri: The URI of the document to delete
+
+        Returns:
+            True if deleted, False if not found
+        """
+        # Normalize the URI
+        document_uri = self._normalize_uri(document_uri)
+
+        chunks = await self._get_document_chunks(document_uri)
+        if not chunks:
+            return False
+
+        # Collect IDs to delete
+        ids_to_delete = []
+        for chunk in chunks:
+            for doc_id, doc_ref in self.vectorstore.docstore._dict.items():  # type: ignore
+                if doc_ref == chunk:
+                    ids_to_delete.append(doc_id)
+
+        # Delete from vector store
+        if ids_to_delete:
+            self.vectorstore.delete(ids_to_delete)
+            self._save_vectorstore()
+            PrintStyle.standard(f"Deleted document '{document_uri}' with {len(ids_to_delete)} chunks")
+            return True
+
+        return False
+
+    async def expire_documents(self, older_than_days: float) -> int:
+        """
+        Delete documents older than the specified number of days.
+
+        Args:
+            older_than_days: Number of days (can be fractional) before current time
+
+        Returns:
+            Number of documents deleted
+        """
+        if older_than_days <= 0:
+            return 0
+
+        # Calculate cutoff timestamp
+        cutoff_date = datetime.now().timestamp() - (older_than_days * 24 * 60 * 60)
+
+        # Find expired documents
+        expired_uris = set()
+
+        # Check all documents in the store
+        for doc_id, doc in self.vectorstore.docstore._dict.items():  # type: ignore
+            if not isinstance(doc.metadata, dict):
+                continue
+
+            # Only process each document once (first chunk)
+            if doc.metadata.get("chunk_index", 0) != 0:
+                continue
+
+            doc_uri = doc.metadata.get("document_uri")
+            if not doc_uri:
+                continue
+
+            try:
+                # Check timestamp
+                timestamp_str = doc.metadata.get("timestamp")
+                if not timestamp_str:
+                    continue
+
+                doc_timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").timestamp()
+                if doc_timestamp < cutoff_date:
+                    expired_uris.add(doc_uri)
+            except (ValueError, TypeError):
+                # Skip documents with invalid timestamps
+                continue
+
+        # Delete expired documents
+        deleted_count = 0
+        for uri in expired_uris:
+            if await self.delete_document(uri):
+                deleted_count += 1
+
+        PrintStyle.standard(f"Expired {deleted_count} documents older than {older_than_days} days")
+        return deleted_count
+
+    async def search_documents(self, query: str, limit: int = 10, threshold: float = 0.5) -> List[Document]:
+        """
+        Search for documents similar to the query across the entire store.
+
+        Args:
+            query: The search query string
+            limit: Maximum number of results to return
+            threshold: Minimum similarity score threshold (0-1)
+
+        Returns:
+            List of matching documents
+        """
+        # Handle empty query
+        if not query:
+            PrintStyle.standard("Empty search query, returning empty results")
+            return []
+
+        # Apply rate limiter
+        await self.agent.rate_limiter(
+            model_config=self.agent.config.embeddings_model,
+            input=query
+        )
+
+        # Perform search
+        try:
+            results = self.vectorstore.similarity_search_with_score(
+                query=query,
+                k=limit,
+                score_threshold=threshold
+            )
+
+            # Extract documents from results (which are (doc, score) pairs)
+            docs = [doc for doc, score in results]
+            PrintStyle.standard(f"Search '{query}' returned {len(docs)} results")
+            return docs
+        except Exception as e:
+            PrintStyle.error(f"Error searching documents: {str(e)}")
+            return []
+
+    async def search_document(self, document_uri: str, query: str, limit: int = 10, threshold: float = 0.5) -> List[Document]:
+        """
+        Search for content within a specific document.
+
+        Args:
+            document_uri: The URI of the document to search within
+            query: The search query string
+            limit: Maximum number of results to return
+            threshold: Minimum similarity score threshold (0-1)
+
+        Returns:
+            List of matching document chunks
+        """
+        # Normalize the URI
+        document_uri = self._normalize_uri(document_uri)
+
+        # Handle empty query
+        if not query:
+            PrintStyle.standard("Empty search query, returning empty results")
+            return []
+
+        # Check if document exists
+        if not await self.document_exists(document_uri):
+            PrintStyle.error(f"Document not found: {document_uri}")
+            return []
+
+        # Apply rate limiter
+        await self.agent.rate_limiter(
+            model_config=self.agent.config.embeddings_model,
+            input=query
+        )
+
+        # Perform search with document filter
+        try:
+            # Create metadata filter function
+            def filter_fn(doc_metadata):
+                return doc_metadata.get("document_uri") == document_uri
+
+            results = self.vectorstore.similarity_search_with_score(
+                query=query,
+                k=limit,
+                score_threshold=threshold,
+                filter=filter_fn
+            )
+
+            # Extract documents from results
+            docs = [doc for doc, score in results]
+            PrintStyle.standard(f"Search '{query}' in document '{document_uri}' returned {len(docs)} results")
+
+            # Try with lower threshold if no results
+            if not docs and threshold > 0.3:
+                PrintStyle.standard("No results found, trying with lower threshold (0.3)")
+                results = self.vectorstore.similarity_search_with_score(
+                    query=query,
+                    k=limit,
+                    score_threshold=0.3,
+                    filter=filter_fn
+                )
+                docs = [doc for doc, score in results]
+                PrintStyle.standard(f"Retry search returned {len(docs)} results")
+
+            return docs
+        except Exception as e:
+            PrintStyle.error(f"Error searching within document: {str(e)}")
+            return []
+
+    async def list_documents(self) -> List[str]:
+        """
+        Get a list of all document URIs in the store.
+
+        Returns:
+            List of document URIs
+        """
+        # Extract unique URIs
+        uris = set()
+        for doc in self.vectorstore.docstore._dict.values():  # type: ignore
+            if isinstance(doc.metadata, dict):
+                uri = doc.metadata.get("document_uri")
+                if uri:
+                    uris.add(uri)
+
+        return sorted(list(uris))
+
+
+class DocumentQueryHelper:
+
+    def __init__(self, agent: Agent):
+        self.agent = agent
+        self.store: DocumentQueryStore = asyncio.run(DocumentQueryStore.get(agent))
+
+    async def document_qa(self, document_uri: str, questions: Sequence[str]) -> Tuple[bool, str]:
+        _ = await self.document_get_content(document_uri)
+        content = ""
+        for question in questions:
+            human_content = f'Search Query: "{question}"'
+            system_content = self.agent.parse_prompt("fw.document_query.optmimize_query.md")
+
+            optimized_query = await self.agent.call_utility_model(
+                system=system_content,
+                message=human_content
+            )
+
+            chunks = await self.store.search_document(
+                document_uri=document_uri,
+                query=str(optimized_query),
+                limit=10000,
+                threshold=0.66
+            )
+            # only extend content when the search matched, so the no-content check below can fire
+            if chunks:
+                content += "\n\n----\n\n".join([chunk.page_content for chunk in chunks]) + "\n\n----\n\n"
+
+        if not content:
+            content = f"!!! No content found for document: {document_uri} matching queries: {json.dumps(questions)}"
+            return False, content
+
+        questions_str = "\n".join([f" * {question}" for question in questions])
+
+        qa_system_message = self.agent.parse_prompt("fw.document_query.system_prompt.md")
+        qa_user_message = f"# Document:\n{content}\n\n# Queries:\n{questions_str}"
+
+        ai_response = await self.agent.call_chat_model(
+            prompt=ChatPromptTemplate.from_messages([
+                SystemMessage(content=qa_system_message),
+                HumanMessage(content=qa_user_message),
+            ])
+        )
+
+        return True, str(ai_response)
+
+    async def document_get_content(self, document_uri: str) -> str:
+        url = urlparse(document_uri)
+        scheme = url.scheme or "file"
+        mimetype, encoding = mimetypes.guess_type(document_uri)
+        mimetype = mimetype or "application/octet-stream"
+
+        if mimetype == "application/octet-stream":
+            if url.scheme in ["http", "https"]:
+                response: aiohttp.ClientResponse | None = None
+                retries = 0
+                last_error = ""
+                while not response and retries < 3:
+                    try:
+                        async with aiohttp.ClientSession() as session:
+                            response = await session.head(document_uri, timeout=aiohttp.ClientTimeout(total=2.0), allow_redirects=True)
+                            if response.status > 399:
+                                raise Exception(response.status)
+                        break
+                    except Exception as e:
+                        response = None  # reset so the loop retries after an error status
+                        await asyncio.sleep(1)
+                        last_error = str(e)
+                        retries += 1
+
+                if not response:
+                    raise ValueError(f"DocumentQueryHelper::document_get_content: Document fetch error: {document_uri} ({last_error})")
+
+                mimetype = response.headers["content-type"]
+                if "content-length" in response.headers:
+                    content_length = float(response.headers["content-length"]) / 1024 / 1024  # MB
+                    if content_length > 25.0:
+                        raise ValueError(f"Document content length exceeds max. 25MB: {content_length} MB ({document_uri})")
+                if mimetype and '; charset=' in mimetype:
+                    mimetype = mimetype.split('; charset=')[0]
+
+        if scheme == "file":
+            try:
+                document_uri = os.path.abspath(url.path)
+            except Exception as e:
+                raise ValueError(f"Invalid document path '{url.path}'") from e
+
+        if encoding:
+            raise ValueError(f"Compressed documents are unsupported '{encoding}' ({document_uri})")
+
+        if mimetype == "application/octet-stream":
+            raise ValueError(f"Unsupported document mimetype '{mimetype}' ({document_uri})")
+
+        # Use the store's normalization method
+        document_uri_norm = self.store._normalize_uri(document_uri)
+
+        await self.store.expire_documents(7)
+        exists = await self.store.document_exists(document_uri_norm)
+        document_content = ""
+        if not exists:
+            if mimetype.startswith("image/"):
+                document_content = self.handle_image_document(document_uri, scheme)
+            elif mimetype == "text/html":
+                document_content = self.handle_html_document(document_uri, scheme)
+            elif mimetype.startswith("text/") or mimetype == "application/json":
+                document_content = self.handle_text_document(document_uri, scheme)
+            elif mimetype == "application/pdf":
+                document_content = self.handle_pdf_document(document_uri, scheme)
+            else:
+                document_content = self.handle_unstructured_document(document_uri, scheme)
+            await self.store.add_document(document_content, document_uri_norm)
+        else:
+            doc = await self.store.get_document(document_uri_norm)
+            if doc:
+                document_content = doc.page_content
+            else:
+                raise ValueError(f"DocumentQueryHelper::document_get_content: Document not found: {document_uri_norm}")
+        return document_content
+
+    def handle_image_document(self, document: str, scheme: str) -> str:
+        return self.handle_unstructured_document(document, scheme)
+
+    def handle_html_document(self, document: str, scheme: str) -> str:
+        if scheme in ["http", "https"]:
+            loader = AsyncHtmlLoader(web_path=document)
+        elif scheme == "file":
+            loader = TextLoader(file_path=document)
+        else:
+            raise ValueError(f"Unsupported scheme: {scheme}")
+
+        parts: list[Document] = loader.load()
+        return "\n".join([element.page_content for element in MarkdownifyTransformer().transform_documents(parts)])
+
+    def handle_text_document(self, document: str, scheme: str) -> str:
+        if scheme in ["http", "https"]:
+            loader = AsyncHtmlLoader(web_path=document)
+        elif scheme == "file":
+            loader = TextLoader(file_path=document)
+        else:
+            raise ValueError(f"Unsupported scheme: {scheme}")
+
+        elements: list[Document] = loader.load()
+        return "\n".join([element.page_content for element in elements])
+
+    def handle_pdf_document(self, document: str, scheme: str) -> str:
+        if scheme not in ["file", "http", "https"]:
+            raise ValueError(f"Unsupported scheme: {scheme}")
+
+        loader = PyMuPDFLoader(
+            document,
+            mode="single",
+            extract_tables="markdown",
+            extract_images=True,
+            images_inner_format="text",
+            images_parser=TesseractBlobParser(),
+            pages_delimiter="\n",
+        )
+
+        elements: list[Document] = loader.load()
+        return "\n".join([element.page_content for element in elements])
+
+    def handle_unstructured_document(self, document: str, scheme: str) -> str:
+        if scheme in ["http", "https"]:
+            # loader = UnstructuredURLLoader(urls=[document], mode="single")
+            loader = UnstructuredLoader(
+                web_url=document,
+                mode="single",
+                partition_via_api=False,
+                # chunking_strategy="by_page",
+                strategy="hi_res",
+            )
+        elif scheme == "file":
+            loader = UnstructuredLoader(
+                file_path=document,
+                mode="single",
+                partition_via_api=False,
+                # chunking_strategy="by_page",
+                strategy="hi_res",
+            )
+        else:
+            raise ValueError(f"Unsupported scheme: {scheme}")
+
+        elements: list[Document] = loader.load()
+        return "\n".join([element.page_content for element in elements])
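Reviewer note: the `_normalize_uri` rules above (lowercase, `file://` collapsed to `file:`, bare paths prefixed with `file:`, `http` upgraded to `https`) are easiest to check as expected mappings. A sketch, assuming the method behaves as written:

```python
# Expected mappings per the implementation above (illustrative checks only).
from python.helpers.document_query import DocumentQueryStore

assert DocumentQueryStore._normalize_uri("HTTP://Example.com/Doc.PDF") == "https://example.com/doc.pdf"
assert DocumentQueryStore._normalize_uri("file:///tmp/report.txt") == "file:/tmp/report.txt"
assert DocumentQueryStore._normalize_uri("/tmp/report.txt") == "file:/tmp/report.txt"
```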
python/tools/document_query.py
ADDED
@@ -0,0 +1,20 @@
+from python.helpers.tool import Tool, Response
+from python.helpers.document_query import DocumentQueryHelper
+
+
+class DocumentQueryTool(Tool):
+
+    async def execute(self, **kwargs):
+        document_uri = kwargs.get("document") or None  # .get avoids a KeyError before the error path below
+        queries = kwargs["queries"] if "queries" in kwargs else [kwargs["query"]] if ("query" in kwargs and kwargs["query"]) else []
+        if not isinstance(document_uri, str) or not document_uri:
+            return Response(message="Error: no document provided", break_loop=False)
+        try:
+            helper = DocumentQueryHelper(self.agent)
+            if not queries:
+                content = await helper.document_get_content(document_uri)
+            else:
+                _, content = await helper.document_qa(document_uri, queries)
+            return Response(message=content, break_loop=False)
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            return Response(message=f"Error processing document: {e}", break_loop=False)
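A minimal driver for the tool's two modes, assuming a wired-up `DocumentQueryTool` instance; the URL is a made-up example:

```python
# Hypothetical driver, not part of the commit: exercises both tool modes.
async def demo(tool):
    # No queries: returns the full extracted text of the document.
    full = await tool.execute(document="https://example.com/page.html")
    # With queries: returns per-query answers grounded in the document.
    qa = await tool.execute(
        document="https://example.com/page.html",
        queries=["What is the topic?", "Who is the audience?"],
    )
    return full.message, qa.message
```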
python/tools/knowledge_tool.py
CHANGED
@@ -6,6 +6,7 @@ from python.helpers.print_style import PrintStyle
 from python.helpers.errors import handle_error
 from python.helpers.searxng import search as searxng
 from python.tools.memory_load import DEFAULT_THRESHOLD as DEFAULT_MEMORY_THRESHOLD
+from python.helpers.document_query import DocumentQueryHelper
 
 SEARCH_ENGINE_RESULTS = 10
 
@@ -26,6 +27,9 @@ class Knowledge(Tool):
         # perplexity_result, duckduckgo_result, memory_result = results
         searxng_result, memory_result = results
 
+        # enrich results with qa
+        searxng_result = await self.searxng_document_qa(searxng_result, question)
+
         # Handle exceptions and format results
         # perplexity_result = self.format_result(perplexity_result, "Perplexity")
         # duckduckgo_result = self.format_result(duckduckgo_result, "DuckDuckGo")
@@ -33,7 +37,7 @@ class Knowledge(Tool):
         memory_result = self.format_result(memory_result, "Memory")
 
         msg = self.agent.read_prompt(
-            "tool.knowledge.response.md",
+            "fw.knowledge_tool.response.md",
             # online_sources = ((perplexity_result + "\n\n") if perplexity_result else "") + str(duckduckgo_result),
             online_sources=((searxng_result + "\n\n") if searxng_result else ""),
             memory=memory_result,
@@ -66,6 +70,30 @@ class Knowledge(Tool):
     async def searxng_search(self, question):
         return await searxng(question)
 
+    async def searxng_document_qa(self, result, query):
+        if isinstance(result, Exception) or not query or not result or not result["results"]:
+            return result
+
+        result["results"] = result["results"][:SEARCH_ENGINE_RESULTS]
+
+        tasks = []
+        helper = DocumentQueryHelper(self.agent)
+
+        for index, item in enumerate(result["results"]):
+            tasks.append(helper.document_qa(item["url"], [query]))
+
+        task_results = list(await asyncio.gather(*tasks, return_exceptions=True))
+
+        for index, item in enumerate(result["results"]):
+            if isinstance(task_results[index], BaseException):
+                continue
+            found, qa = task_results[index]  # type: ignore
+            if not found:
+                continue
+            result["results"][index]["qa"] = qa
+
+        return result
+
     async def mem_search(self, question: str):
         db = await memory.Memory.get(self.agent)
         docs = await db.search_similarity_threshold(
@@ -87,6 +115,20 @@ class Knowledge(Tool):
 
         outputs = []
         for item in result["results"]:
-
+            if "qa" in item:
+                outputs.append(
+                    f"## Next Result\n"
+                    f"Title: {item['title'].strip()}\n"
+                    f"URL: {item['url'].strip()}\n"
+                    f"Search Engine Summary: {item['content'].strip()}\n"
+                    f"Query Result: {item['qa'].strip()}"
+                )
+            else:
+                outputs.append(
+                    f"## Next Result\n"
+                    f"Title: {item['title'].strip()}\n"
+                    f"URL: {item['url'].strip()}\n"
+                    f"Search Engine Summary: {item['content'].strip()}"
+                )
 
         return "\n\n".join(outputs[:SEARCH_ENGINE_RESULTS]).strip()
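The enrichment added above fans out one `document_qa` call per search hit and tolerates per-URL failures through `return_exceptions=True`, so one unreachable page never sinks the whole search. A stripped-down sketch of the same pattern:

```python
import asyncio

# Minimal reproduction of the fan-out used in searxng_document_qa (illustrative).
async def enrich(urls: list[str], fetch) -> dict[str, str]:
    results = await asyncio.gather(*(fetch(u) for u in urls), return_exceptions=True)
    enriched = {}
    for url, res in zip(urls, results):
        if isinstance(res, BaseException):
            continue  # skip failed URLs instead of failing the batch
        enriched[url] = res
    return enriched
```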
requirements.txt
CHANGED
|
@@ -19,6 +19,7 @@ langchain-huggingface==0.1.2
|
|
| 19 |
langchain-mistralai==0.2.4
|
| 20 |
langchain-ollama==0.2.2
|
| 21 |
langchain-openai==0.3.1
|
|
|
|
| 22 |
openai-whisper==20240930
|
| 23 |
lxml_html_clean==0.3.1
|
| 24 |
markdown==3.7
|
|
@@ -31,8 +32,12 @@ python-dotenv==1.1.0
|
|
| 31 |
pytz==2024.2
|
| 32 |
sentence-transformers==3.0.1
|
| 33 |
tiktoken==0.8.0
|
| 34 |
-
unstructured==0.
|
| 35 |
-
unstructured-client==0.
|
| 36 |
webcolors==24.6.0
|
| 37 |
nest-asyncio==1.6.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
crontab==1.0.1
|
|
|
|
| 19 |
langchain-mistralai==0.2.4
|
| 20 |
langchain-ollama==0.2.2
|
| 21 |
langchain-openai==0.3.1
|
| 22 |
+
langchain-unstructured[all-docs]==0.1.6
|
| 23 |
openai-whisper==20240930
|
| 24 |
lxml_html_clean==0.3.1
|
| 25 |
markdown==3.7
|
|
|
|
| 32 |
pytz==2024.2
|
| 33 |
sentence-transformers==3.0.1
|
| 34 |
tiktoken==0.8.0
|
| 35 |
+
unstructured[all-docs]==0.16.23
|
| 36 |
+
unstructured-client==0.31.0
|
| 37 |
webcolors==24.6.0
|
| 38 |
nest-asyncio==1.6.0
|
| 39 |
+
markdownify==0.14.1
|
| 40 |
+
pymupdf==1.25.3
|
| 41 |
+
pytesseract==0.3.13
|
| 42 |
+
pdf2image==1.17.0
|
| 43 |
crontab==1.0.1
|