# coding=utf-8 import json import os import sqlite3 import time from werkzeug.security import generate_password_hash from server.app.utils.diskcache_client import diskcache_client from server.constant.constants import SQLITE_DB_DIR, SQLITE_DB_NAME from dotenv import load_dotenv os.makedirs(SQLITE_DB_DIR, exist_ok=True) def init_chroma_db(): # Load environment variables from .env file load_dotenv(override=True) try: from server.constant.env_constants import check_env_variables check_env_variables() from server.rag.index.embedder.document_embedder import document_embedder return True except Exception as e: print(f"[ERROR] init_chroma_db is failed, the exception is {e}") return False def create_table(): conn = sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}') cur = conn.cursor() # Create table to store domain information and status of sitemap cur.execute(''' CREATE TABLE IF NOT EXISTS t_sitemap_domain_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, domain TEXT NOT NULL, domain_status INTEGER NOT NULL, version INTEGER NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') #`domain_status` meanings: # 1 - 'Domain statistics gathering' # 2 - 'Domain statistics gathering collected' # 3 - 'Domain processing' # 4 - 'Domain processed' # Create table to store sitemap webpage information cur.execute(''' CREATE TABLE IF NOT EXISTS t_sitemap_url_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, domain TEXT NOT NULL, url TEXT NOT NULL, content TEXT NOT NULL, content_length INTEGER NOT NULL, content_md5 TEXT NOT NULL, doc_status INTEGER NOT NULL, version INTEGER NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') #`doc_status` meanings: # 0 - 'Process failed' # 1 - 'Sitemaps web page recorded' # 2 - 'Sitemaps web page crawling' # 3 - 'Sitemaps web page crawling completed' # 4 - 'Sitemaps web text Embedding stored in VectorDB' # 5 - 'Sitemaps web page expired and needed crawled again' # Create table to store isolated webpage information cur.execute(''' CREATE TABLE IF NOT EXISTS t_isolated_url_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT NOT NULL, content TEXT NOT NULL, content_length INTEGER NOT NULL, content_md5 TEXT NOT NULL, doc_status INTEGER NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') #`doc_status` meanings: # 0 - 'Process failed' # 1 - 'Isolated web page recorded' # 2 - 'Isolated web page crawling' # 3 - 'Isolated web page crawling completed' # 4 - 'Isolated web text Embedding stored in VectorDB' # Create table to store local file cur.execute(''' CREATE TABLE IF NOT EXISTS t_local_file_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT NOT NULL, origin_file_name TEXT NOT NULL, file_path TEXT NOT NULL, file_type TEXT NOT NULL, content_length INTEGER NOT NULL, content_md5 TEXT NOT NULL, doc_status INTEGER NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') #`doc_status` meanings: # 0 - 'Process failed' # 1 - 'Local files recorded' # 2 - 'Local files parsing' # 3 - 'Local files parsing completed' # 4 - 'Local files text Embedding stored in VectorDB' cur.execute(''' CREATE TABLE IF NOT EXISTS t_local_file_chunk_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, file_id INTEGER NOT NULL, chunk_index INTEGER NOT NULL, content TEXT NOT NULL, content_length INTEGER NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') # Create document embedding map table to link documents to their embeddings cur.execute(''' CREATE TABLE IF NOT EXISTS t_doc_embedding_map_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, doc_id INTEGER NOT NULL, doc_source INTEGER NOT NULL, embedding_id_list TEXT NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') #`doc_source` meanings: # 1 - 'from sitemap URLs' # 2 - 'from isolated URLs' # 3 - 'from local files' # Create user QA record table to store user queries and responses cur.execute(''' CREATE TABLE IF NOT EXISTS t_user_qa_record_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, user_id TEXT NOT NULL, query TEXT NOT NULL, answer TEXT NOT NULL, source TEXT NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') # Create user QA intervene table for manual intervention in QA pairs cur.execute(''' CREATE TABLE IF NOT EXISTS t_user_qa_intervene_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, query TEXT NOT NULL, intervene_answer TEXT NOT NULL, source TEXT NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') # Create account table to store user account information cur.execute(''' CREATE TABLE IF NOT EXISTS t_account_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, account_name TEXT NOT NULL, password_hash TEXT NOT NULL, is_login INTEGER NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') # Create bot setting table to store chatbot settings cur.execute(''' CREATE TABLE IF NOT EXISTS t_bot_setting_tab ( id INTEGER PRIMARY KEY AUTOINCREMENT, initial_messages TEXT NOT NULL, suggested_messages TEXT NOT NULL, bot_name TEXT NOT NULL, bot_avatar TEXT NOT NULL, chat_icon TEXT NOT NULL, placeholder TEXT NOT NULL, model TEXT NOT NULL, ctime INTEGER NOT NULL, mtime INTEGER NOT NULL ) ''') conn.commit() conn.close() def create_index(): with sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}') as conn: # the index of t_sitemap_domain_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_domain ON t_sitemap_domain_tab (domain)') # the index of t_sitemap_url_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_url ON t_sitemap_url_tab (url)') conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_sitemap_url_tab (ctime)') # the index of t_isolated_url_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_url ON t_isolated_url_tab (url)') conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_isolated_url_tab (ctime)') # the index of t_local_file_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_url ON t_local_file_tab (url)') conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_content_md5 ON t_local_file_tab (content_md5)') conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_local_file_tab (ctime)') # the index of t_local_file_chunk_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_file_id_chunk_index ON t_local_file_chunk_tab (file_id, chunk_index)') # the index of t_doc_embedding_map_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_id_doc_source ON t_doc_embedding_map_tab (doc_id, doc_source)') conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_doc_embedding_map_tab (ctime)') # the index of t_user_qa_record_tab conn.execute('CREATE INDEX IF NOT EXISTS idx_user_id ON t_user_qa_record_tab (user_id)') conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_user_qa_record_tab (ctime)') # the index of t_user_qa_intervene_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_query ON t_user_qa_intervene_tab (query)') conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_user_qa_intervene_tab (ctime)') # the index of t_account_tab conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_account_name ON t_account_tab (account_name)') def init_admin_account(): # Initialize admin account with predefined credentials account_name = 'admin' password = 'bestadmin' password_hash = generate_password_hash(password, method='pbkdf2:sha256', salt_length=10) conn = None try: conn = sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}') cur = conn.cursor() # Check if the account name already exists cur.execute('SELECT id FROM t_account_tab WHERE account_name = ?', (account_name,)) account = cur.fetchone() if account: print(f"[INFO] account_name:'{account_name}' already exists.") else: timestamp = int(time.time()) cur.execute('INSERT INTO t_account_tab (account_name, password_hash, is_login, ctime, mtime) VALUES (?, ?, ?, ?, ?)', (account_name, password_hash, 0, timestamp, timestamp)) conn.commit() except Exception as e: print(f"[ERROR] init_admin_account is failed, the exception is {e}") finally: if conn: conn.close() def init_bot_setting(): # Initialize bot settings initial_messages = ['Hi! What can I help you with?'] suggested_messages = [] bot_name = '' bot_avatar = '' chat_icon = '' placeholder = 'Message...' model = 'gpt-3.5-turbo' conn = None try: conn = sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}') cur = conn.cursor() # Check if the setting table is empty cur.execute('SELECT COUNT(*) FROM t_bot_setting_tab') if cur.fetchone()[0] > 0: print("[INFO] the bot setting already exists.") else: timestamp = int(time.time()) cur.execute(''' INSERT INTO t_bot_setting_tab (id, initial_messages, suggested_messages, bot_name, bot_avatar, chat_icon, placeholder, model, ctime, mtime) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( 1, json.dumps(initial_messages), json.dumps(suggested_messages), bot_name, bot_avatar, chat_icon, placeholder, model, timestamp, timestamp) ) conn.commit() # Add bot setting into Cache try: key = "open_kf:bot_setting" bot_setting = { 'id': 1, 'initial_messages': initial_messages, 'suggested_messages': suggested_messages, 'bot_name': bot_name, 'bot_avatar': bot_avatar, 'chat_icon': chat_icon, 'placeholder': placeholder, 'model': model, 'ctime': timestamp, 'mtime': timestamp } diskcache_client.set(key, json.dumps(bot_setting)) except Exception as e: print(f"[ERROR] add bot setting into Cache is failed, the exception is {e}") except Exception as e: print(f"[ERROR] init_bot_setting is failed, the exception is {e}") finally: if conn: conn.close() if __name__ == '__main__': print('Create tables in the SQLite database') create_table() print('Create indexes for the tables') create_index() print('Initialize the admin account') init_admin_account() print('Initialize the bot settings') init_bot_setting() print('SQLite init Done!\n\n') print("Init Chroma DB") ret = init_chroma_db() if ret: print("Init Chroma DB Done!") else: print("Init Chroma DB Failed!")