|
|
import os |
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
from database import init_db, check_if_indexed, delete_database_and_index, get_db_connection |
|
|
from create_index import create_initial_index as build_secure_index |
|
|
from search import search as secure_search |
|
|
from ingest_document import ingest_pdf |
|
|
|
|
|
|
|
|
# Name of the sentence-transformers model loaded in KnowledgeBase.__init__;
# used to embed content for indexing and search.
MODEL_NAME = 'clip-ViT-B-32'
|
|
|
|
|
class KnowledgeBase:
    """Facade over the local, secure knowledge base.

    Loads the embedding model, ensures the backing database exists, and
    seeds the index from the bundled text documents on first launch.
    """

    def __init__(self):
        # Embedding model shared by indexing and querying.
        self.model = SentenceTransformer(MODEL_NAME)

        init_db()

        # Nothing else to do when an index is already present.
        if check_if_indexed():
            return

        print("Local knowledge base not found. Building initial knowledge base...")
        self._build_initial_knowledge_base()

    def _build_initial_knowledge_base(self):
        """Read the bundled default documents and build the secure index.

        Missing files are reported and skipped; if none are found at all,
        no index is built.
        """
        data_dir = os.path.join(os.path.dirname(__file__), "knowledge_base_data")

        default_filenames = (
            "healthy_maize_remedy.txt",
            "maize_phosphorus_deficiency_remedy.txt",
            "comic_relief.txt",
        )

        contents = {}
        for name in default_filenames:
            file_path = os.path.join(data_dir, name)
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    contents[name] = handle.read()
            except FileNotFoundError:
                # A missing bundled file is non-fatal; report and move on.
                print(f"Warning: Knowledge base file not found, skipping: {file_path}")

        if not contents:
            print("No initial knowledge base documents found to index.")
            return
        build_secure_index(contents)

    def create_initial_index(self, documents_dict):
        """Build the secure index from a mapping of filename -> document text."""
        build_secure_index(documents_dict)

    def rebuild_from_default_files(self):
        """Re-run first-time indexing against the bundled default documents."""
        self._build_initial_knowledge_base()

    def ingest_pdf(self, file_path, file_name):
        """Ingest a single PDF into the knowledge base.

        NOTE: the bare name below resolves to the module-level ingest_pdf
        imported from ingest_document, not to this method.
        """
        ingest_pdf(file_path, file_name)

    def search(self, query, k=1):
        """Return the top-k matches for *query* from the secure index."""
        return secure_search(query, k)
|
|
|
|
|
def get_retriever(k=1):
    """Build a minimal LangChain-style retriever over the knowledge base.

    Args:
        k: Number of results to request per query. Defaults to 1, which
           matches the previously hard-coded behavior.

    Returns:
        An object exposing ``get_relevant_documents(query)`` that returns a
        list of ``langchain.schema.Document`` built from text-type hits.
    """
    kb = KnowledgeBase()

    class Retriever:
        """Thin adapter exposing the retriever interface LangChain expects."""

        def __init__(self, kb, k):
            self.kb = kb
            self.k = k

        def get_relevant_documents(self, query):
            results = self.kb.search(query, self.k)
            # Imported lazily so langchain is only required when retrieving.
            from langchain.schema import Document

            # Non-text hits (e.g. images) carry no page content; keep text only.
            return [
                Document(page_content=r['content'])
                for r in results
                if r['type'] == 'text'
            ]

    return Retriever(kb, k)
|
|
|