import json

from fs_s3fs import S3FS
from phi.embedder.ollama import OllamaEmbedder
from phi.knowledge.json import JSONKnowledgeBase
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from phi.vectordb.pgvector import PgVector2

from src.databases.postgres import sqlalchemy_engine
from src.libs.logger import logger
from src.libs.s3fs import get_s3_credentials


class PDFUrlKnowledgeBaseExtended(PDFUrlKnowledgeBase):
    """PDF-URL knowledge base that additionally holds an S3 filesystem handle.

    Extends ``PDFUrlKnowledgeBase`` so the knowledge base can read PDFs out of
    an S3 bucket via ``fs-s3fs`` while storing chunk embeddings in the vector
    database passed to ``__init__``.
    """

    # Explicitly declared model field so assignment in __init__ is accepted
    # (pydantic-style base class). None only until __init__ populates it.
    s3fs: S3FS = None

    def __init__(
        self,
        s3_bucket_name,
        vector_db,
        s3_access_key_id,
        s3_secret_access_key,
        s3_endpoint_url,
        s3_region,
    ):
        """Wire the knowledge base to an S3 bucket and a vector store.

        Args:
            s3_bucket_name: Bucket holding the source PDFs; also forwarded to
                the base class as both ``path`` and ``bucket_name``.
            vector_db: Vector store (e.g. ``PgVector2``) for chunk embeddings.
            s3_access_key_id: AWS access key id.
            s3_secret_access_key: AWS secret access key.
            s3_endpoint_url: Custom S3 endpoint (e.g. MinIO) or None for AWS.
            s3_region: Bucket region.
        """
        # NOTE(review): PDFUrlKnowledgeBase conventionally takes `urls`;
        # `path`/`bucket_name` are forwarded as-is here — confirm this phi
        # version's base class actually declares these fields.
        super().__init__(
            path=s3_bucket_name,
            vector_db=vector_db,
            bucket_name=s3_bucket_name,
        )
        # S3 filesystem used to read PDFs out of the bucket.
        self.s3fs = S3FS(
            bucket_name=s3_bucket_name,
            aws_access_key_id=s3_access_key_id,
            aws_secret_access_key=s3_secret_access_key,
            endpoint_url=s3_endpoint_url,
            region=s3_region,
        )

    def load_knowledge_base(self, recreate: bool = False):
        """(Re)load this knowledge base into the vector store.

        Args:
            recreate: When True, drop and rebuild the vector collection.

        Fix: operate on *self* instead of the module-level singleton
        ``pdf_knowledge_base`` — the original silently broke for any
        independently-constructed instance (for the singleton itself the
        behavior is identical, since it *is* ``self``).
        """
        self.load(recreate=recreate)

    def chunk_and_store_to_vector_db(self, urls):
        """Point this knowledge base at *urls* for subsequent loading.

        Args:
            urls: List of PDF URLs to be chunked/embedded on the next load.

        Fix: assign to ``self.urls`` rather than the module-level singleton
        (same object for the singleton, correct for every other instance).
        Note the method only records the URLs; call ``load_knowledge_base``
        to actually chunk and store them.
        """
        self.urls = urls


# S3 credentials; returned keys must match the __init__ keyword arguments
# (s3_bucket_name, s3_access_key_id, ...) since they are splatted below.
_s3_credentials = get_s3_credentials()

_pdf_knowledge_base_arguments = {
    "vector_db": PgVector2(
        collection="pdf_documents",
        db_engine=sqlalchemy_engine,
        embedder=OllamaEmbedder(),
    ),
    **_s3_credentials,
}

# Module-level singleton wired to the configured S3 bucket and pgvector store.
pdf_knowledge_base = PDFUrlKnowledgeBaseExtended(**_pdf_knowledge_base_arguments)