# NOTE(review): the following header lines were a copy-paste artifact from the
# GitHub UI (author "LONGYKING", commit message "update", commit ed3fa91) and
# were not valid Python; preserved here as a comment so the module imports.
import json
from fs_s3fs import S3FS
from src.libs.logger import logger
from src.libs.s3fs import get_s3_credentials
from phi.vectordb.pgvector import PgVector2
from phi.knowledge.json import JSONKnowledgeBase
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from src.databases.postgres import sqlalchemy_engine
from phi.embedder.ollama import OllamaEmbedder
class PDFUrlKnowledgeBaseExtended(PDFUrlKnowledgeBase):
    """PDF-URL knowledge base whose documents live in an S3-compatible bucket.

    Extends ``phi``'s ``PDFUrlKnowledgeBase`` by attaching an ``S3FS``
    filesystem handle so callers can point the knowledge base at a bucket
    instead of local paths/URLs.
    """

    # Explicitly declare the s3fs attribute at class level (phi knowledge
    # bases are pydantic models — undeclared instance attributes are rejected).
    s3fs: S3FS = None

    def __init__(
        self,
        s3_bucket_name,
        vector_db,
        s3_access_key_id,
        s3_secret_access_key,
        s3_endpoint_url,
        s3_region,
    ):
        """Build the knowledge base and open the S3 filesystem.

        Args:
            s3_bucket_name: Bucket holding the PDF documents; also passed to
                the parent as ``path``/``bucket_name``.
            vector_db: Vector store the parent class loads documents into.
            s3_access_key_id / s3_secret_access_key: S3 credentials.
            s3_endpoint_url: Endpoint for S3-compatible storage.
            s3_region: Bucket region.
        """
        # NOTE(review): the parent class is named PDFUrlKnowledgeBase but is
        # given path/bucket_name kwargs here — confirm the parent accepts
        # these (phi's stock PDFUrlKnowledgeBase takes `urls`).
        super().__init__(path=s3_bucket_name, vector_db=vector_db, bucket_name=s3_bucket_name)
        # Initialize the S3 filesystem handle for the bucket.
        self.s3fs = S3FS(
            bucket_name=s3_bucket_name,
            aws_access_key_id=s3_access_key_id,
            aws_secret_access_key=s3_secret_access_key,
            endpoint_url=s3_endpoint_url,
            region=s3_region,
        )

    def load_knowledge_base(self, recreate: bool = False):
        """Load the documents into the vector DB.

        Args:
            recreate: When True, drop and rebuild the vector collection.
        """
        # BUG FIX: originally called the module-level global
        # `pdf_knowledge_base` instead of operating on this instance —
        # a NameError when the class is used on its own, and a wrong-object
        # call otherwise.
        self.load(recreate=recreate)

    def chunk_and_store_to_vector_db(self, urls):
        """Point this knowledge base at *urls* for subsequent loading.

        Args:
            urls: List of PDF URLs to be chunked/embedded on the next load.
        """
        # BUG FIX: assign to this instance, not the module-level global.
        self.urls = urls
# S3 credentials — assumed to yield the s3_* keyword arguments expected by
# PDFUrlKnowledgeBaseExtended.__init__ (bucket name, keys, endpoint, region);
# TODO(review): confirm against get_s3_credentials().
_s3_credentials = get_s3_credentials()  # FIX: was misspelled `_s3_credendtials`

_pdf_knowledge_base_arguments = {
    # pgvector-backed store over the shared SQLAlchemy engine; embeddings
    # are produced by a local Ollama embedder.
    "vector_db": PgVector2(
        collection="pdf_documents",
        db_engine=sqlalchemy_engine,
        embedder=OllamaEmbedder(),
    ),
    **_s3_credentials,
}

# Module-level singleton: the extended PDF knowledge base wired to the
# S3 bucket and credentials above.
pdf_knowledge_base = PDFUrlKnowledgeBaseExtended(
    **_pdf_knowledge_base_arguments
)