kyobody-chatbot / create_sqlite_db.py
dahyedahye's picture
.
b9eca80
# coding=utf-8
import json
import os
import sqlite3
import time
from werkzeug.security import generate_password_hash
from server.app.utils.diskcache_client import diskcache_client
from server.constant.constants import SQLITE_DB_DIR, SQLITE_DB_NAME
from dotenv import load_dotenv
os.makedirs(SQLITE_DB_DIR, exist_ok=True)
def init_chroma_db():
# Load environment variables from .env file
load_dotenv(override=True)
try:
from server.constant.env_constants import check_env_variables
check_env_variables()
from server.rag.index.embedder.document_embedder import document_embedder
return True
except Exception as e:
print(f"[ERROR] init_chroma_db is failed, the exception is {e}")
return False
def create_table():
conn = sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}')
cur = conn.cursor()
# Create table to store domain information and status of sitemap
cur.execute('''
CREATE TABLE IF NOT EXISTS t_sitemap_domain_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL,
domain_status INTEGER NOT NULL,
version INTEGER NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
#`domain_status` meanings:
# 1 - 'Domain statistics gathering'
# 2 - 'Domain statistics gathering collected'
# 3 - 'Domain processing'
# 4 - 'Domain processed'
# Create table to store sitemap webpage information
cur.execute('''
CREATE TABLE IF NOT EXISTS t_sitemap_url_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL,
url TEXT NOT NULL,
content TEXT NOT NULL,
content_length INTEGER NOT NULL,
content_md5 TEXT NOT NULL,
doc_status INTEGER NOT NULL,
version INTEGER NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
#`doc_status` meanings:
# 0 - 'Process failed'
# 1 - 'Sitemaps web page recorded'
# 2 - 'Sitemaps web page crawling'
# 3 - 'Sitemaps web page crawling completed'
# 4 - 'Sitemaps web text Embedding stored in VectorDB'
# 5 - 'Sitemaps web page expired and needed crawled again'
# Create table to store isolated webpage information
cur.execute('''
CREATE TABLE IF NOT EXISTS t_isolated_url_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL,
content TEXT NOT NULL,
content_length INTEGER NOT NULL,
content_md5 TEXT NOT NULL,
doc_status INTEGER NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
#`doc_status` meanings:
# 0 - 'Process failed'
# 1 - 'Isolated web page recorded'
# 2 - 'Isolated web page crawling'
# 3 - 'Isolated web page crawling completed'
# 4 - 'Isolated web text Embedding stored in VectorDB'
# Create table to store local file
cur.execute('''
CREATE TABLE IF NOT EXISTS t_local_file_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL,
origin_file_name TEXT NOT NULL,
file_path TEXT NOT NULL,
file_type TEXT NOT NULL,
content_length INTEGER NOT NULL,
content_md5 TEXT NOT NULL,
doc_status INTEGER NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
#`doc_status` meanings:
# 0 - 'Process failed'
# 1 - 'Local files recorded'
# 2 - 'Local files parsing'
# 3 - 'Local files parsing completed'
# 4 - 'Local files text Embedding stored in VectorDB'
cur.execute('''
CREATE TABLE IF NOT EXISTS t_local_file_chunk_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
content_length INTEGER NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
# Create document embedding map table to link documents to their embeddings
cur.execute('''
CREATE TABLE IF NOT EXISTS t_doc_embedding_map_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
doc_id INTEGER NOT NULL,
doc_source INTEGER NOT NULL,
embedding_id_list TEXT NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
#`doc_source` meanings:
# 1 - 'from sitemap URLs'
# 2 - 'from isolated URLs'
# 3 - 'from local files'
# Create user QA record table to store user queries and responses
cur.execute('''
CREATE TABLE IF NOT EXISTS t_user_qa_record_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
query TEXT NOT NULL,
answer TEXT NOT NULL,
source TEXT NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
# Create user QA intervene table for manual intervention in QA pairs
cur.execute('''
CREATE TABLE IF NOT EXISTS t_user_qa_intervene_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
query TEXT NOT NULL,
intervene_answer TEXT NOT NULL,
source TEXT NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
# Create account table to store user account information
cur.execute('''
CREATE TABLE IF NOT EXISTS t_account_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
account_name TEXT NOT NULL,
password_hash TEXT NOT NULL,
is_login INTEGER NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
# Create bot setting table to store chatbot settings
cur.execute('''
CREATE TABLE IF NOT EXISTS t_bot_setting_tab (
id INTEGER PRIMARY KEY AUTOINCREMENT,
initial_messages TEXT NOT NULL,
suggested_messages TEXT NOT NULL,
bot_name TEXT NOT NULL,
bot_avatar TEXT NOT NULL,
chat_icon TEXT NOT NULL,
placeholder TEXT NOT NULL,
model TEXT NOT NULL,
ctime INTEGER NOT NULL,
mtime INTEGER NOT NULL
)
''')
conn.commit()
conn.close()
def create_index():
with sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}') as conn:
# the index of t_sitemap_domain_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_domain ON t_sitemap_domain_tab (domain)')
# the index of t_sitemap_url_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_url ON t_sitemap_url_tab (url)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_sitemap_url_tab (ctime)')
# the index of t_isolated_url_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_url ON t_isolated_url_tab (url)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_isolated_url_tab (ctime)')
# the index of t_local_file_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_url ON t_local_file_tab (url)')
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_content_md5 ON t_local_file_tab (content_md5)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_local_file_tab (ctime)')
# the index of t_local_file_chunk_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_file_id_chunk_index ON t_local_file_chunk_tab (file_id, chunk_index)')
# the index of t_doc_embedding_map_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_id_doc_source ON t_doc_embedding_map_tab (doc_id, doc_source)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_doc_embedding_map_tab (ctime)')
# the index of t_user_qa_record_tab
conn.execute('CREATE INDEX IF NOT EXISTS idx_user_id ON t_user_qa_record_tab (user_id)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_user_qa_record_tab (ctime)')
# the index of t_user_qa_intervene_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_query ON t_user_qa_intervene_tab (query)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_ctime ON t_user_qa_intervene_tab (ctime)')
# the index of t_account_tab
conn.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_account_name ON t_account_tab (account_name)')
def init_admin_account():
# Initialize admin account with predefined credentials
account_name = 'admin'
password = 'bestadmin'
password_hash = generate_password_hash(password, method='pbkdf2:sha256', salt_length=10)
conn = None
try:
conn = sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}')
cur = conn.cursor()
# Check if the account name already exists
cur.execute('SELECT id FROM t_account_tab WHERE account_name = ?', (account_name,))
account = cur.fetchone()
if account:
print(f"[INFO] account_name:'{account_name}' already exists.")
else:
timestamp = int(time.time())
cur.execute('INSERT INTO t_account_tab (account_name, password_hash, is_login, ctime, mtime) VALUES (?, ?, ?, ?, ?)',
(account_name, password_hash, 0, timestamp, timestamp))
conn.commit()
except Exception as e:
print(f"[ERROR] init_admin_account is failed, the exception is {e}")
finally:
if conn:
conn.close()
def init_bot_setting():
# Initialize bot settings
initial_messages = ['Hi! What can I help you with?']
suggested_messages = []
bot_name = ''
bot_avatar = ''
chat_icon = ''
placeholder = 'Message...'
model = 'gpt-3.5-turbo'
conn = None
try:
conn = sqlite3.connect(f'{SQLITE_DB_DIR}/{SQLITE_DB_NAME}')
cur = conn.cursor()
# Check if the setting table is empty
cur.execute('SELECT COUNT(*) FROM t_bot_setting_tab')
if cur.fetchone()[0] > 0:
print("[INFO] the bot setting already exists.")
else:
timestamp = int(time.time())
cur.execute('''
INSERT INTO t_bot_setting_tab (id, initial_messages, suggested_messages, bot_name, bot_avatar, chat_icon, placeholder, model, ctime, mtime)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
1,
json.dumps(initial_messages),
json.dumps(suggested_messages),
bot_name, bot_avatar, chat_icon, placeholder, model,
timestamp, timestamp)
)
conn.commit()
# Add bot setting into Cache
try:
key = "open_kf:bot_setting"
bot_setting = {
'id': 1,
'initial_messages': initial_messages,
'suggested_messages': suggested_messages,
'bot_name': bot_name,
'bot_avatar': bot_avatar,
'chat_icon': chat_icon,
'placeholder': placeholder,
'model': model,
'ctime': timestamp,
'mtime': timestamp
}
diskcache_client.set(key, json.dumps(bot_setting))
except Exception as e:
print(f"[ERROR] add bot setting into Cache is failed, the exception is {e}")
except Exception as e:
print(f"[ERROR] init_bot_setting is failed, the exception is {e}")
finally:
if conn:
conn.close()
if __name__ == '__main__':
print('Create tables in the SQLite database')
create_table()
print('Create indexes for the tables')
create_index()
print('Initialize the admin account')
init_admin_account()
print('Initialize the bot settings')
init_bot_setting()
print('SQLite init Done!\n\n')
print("Init Chroma DB")
ret = init_chroma_db()
if ret:
print("Init Chroma DB Done!")
else:
print("Init Chroma DB Failed!")