|
|
import json |
|
|
from fs_s3fs import S3FS |
|
|
from src.libs.logger import logger |
|
|
from src.libs.s3fs import get_s3_credentials |
|
|
from phi.vectordb.pgvector import PgVector2 |
|
|
from phi.knowledge.json import JSONKnowledgeBase |
|
|
from phi.knowledge.pdf import PDFUrlKnowledgeBase |
|
|
from src.databases.postgres import sqlalchemy_engine |
|
|
from phi.embedder.ollama import OllamaEmbedder |
|
|
|
|
|
|
|
|
class PDFUrlKnowledgeBaseExtended(PDFUrlKnowledgeBase):
    """PDF-URL knowledge base that additionally holds an S3 filesystem handle.

    The instance is built from S3 connection settings plus a vector database;
    the resulting ``S3FS`` object is exposed as ``self.s3fs``.
    """

    # S3 filesystem handle; remains None until __init__ has constructed it.
    s3fs: S3FS = None

    def __init__(
        self,
        s3_bucket_name,
        vector_db,
        s3_access_key_id,
        s3_secret_access_key,
        s3_endpoint_url,
        s3_region,
    ):
        """Initialise the knowledge base and open the S3 filesystem.

        Args:
            s3_bucket_name: Bucket name; passed to the base initialiser and to S3FS.
            vector_db: Vector database forwarded to the base initialiser.
            s3_access_key_id: Access key id handed to S3FS.
            s3_secret_access_key: Secret access key handed to S3FS.
            s3_endpoint_url: Endpoint URL handed to S3FS.
            s3_region: Region handed to S3FS.
        """
        # NOTE(review): `path` and `bucket_name` are forwarded to the base
        # class as-is; confirm the base actually declares/uses these fields
        # rather than silently dropping them.
        super().__init__(path=s3_bucket_name, vector_db=vector_db, bucket_name=s3_bucket_name)

        self.s3fs = S3FS(
            bucket_name=s3_bucket_name,
            aws_access_key_id=s3_access_key_id,
            aws_secret_access_key=s3_secret_access_key,
            endpoint_url=s3_endpoint_url,
            region=s3_region,
        )

    def load_knowledge_base(self, recreate: bool = False) -> None:
        """Run the inherited ``load`` on this knowledge base.

        Args:
            recreate: Forwarded unchanged to ``load(recreate=...)``.
        """
        # Act on this instance rather than on a module-level object, so every
        # instance of the class loads its own data.
        self.load(recreate=recreate)

    def chunk_and_store_to_vector_db(self, urls) -> None:
        """Set the URLs this knowledge base should read from.

        Args:
            urls: Value assigned to ``self.urls``.
        """
        # NOTE(review): despite the name, this only records the URLs; nothing
        # is chunked or stored here. Presumably callers follow up with
        # load_knowledge_base() -- confirm against the call sites.
        self.urls = urls
|
|
|
|
|
|
|
|
|
|
|
# S3 connection settings, expanded below as keyword arguments of the
# knowledge-base constructor.
_s3_credendtials = get_s3_credentials()

# Constructor arguments: the vector store goes in first, then the S3 settings
# are merged on top (so an S3 key would win on any name clash).
_pdf_knowledge_base_arguments = dict(
    vector_db=PgVector2(
        collection="pdf_documents",
        db_engine=sqlalchemy_engine,
        embedder=OllamaEmbedder(),
    ),
)
_pdf_knowledge_base_arguments.update(_s3_credendtials)

# Module-level knowledge base instance, created at import time.
pdf_knowledge_base = PDFUrlKnowledgeBaseExtended(**_pdf_knowledge_base_arguments)
|
|
|