dahyedahye's picture
.
1161dd2
# On-disk storage locations.
DISKCACHE_DIR = "diskcache_dir"  # cache files used by the DiskCache library
SQLITE_DB_DIR = "sqlite_dir"  # SQLite database files
SQLITE_DB_NAME = "mydatabase.sqlite3"  # name of the SQLite database file
CHROMA_DB_DIR = "chroma_dir"  # Chroma vector database files
CHROMA_COLLECTION_NAME = "mychroma_collection"  # collection name in the Chroma vector database

# Upper bound on concurrent requests allowed for web crawling.
MAX_CRAWL_PARALLEL_REQUEST = 5
# Text-embedding model names, one per provider.
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-3-small"  # OpenAI
ZHIPUAI_EMBEDDING_MODEL_NAME = "embedding-2"  # ZhipuAI
OLLAMA_EMBEDDING_MODEL_NAME = "mxbai-embed-large"  # Ollama
# Length limits applied to documents and queries.
MAX_CHUNK_LENGTH = 1300  # longest text chunk produced when splitting large documents
CHUNK_OVERLAP = 100  # overlap between consecutive text chunks
MAX_QUERY_LENGTH = 200  # longest allowable single query string
# How many top documents a search operation recalls.
RECALL_TOP_K = 5  # initial retrieval
RERANK_RECALL_TOP_K = 10  # recall when re-ranking is used
# Model used for re-ranking. Choices:
#   'ms-marco-TinyBERT-L-2-v2' - nano (~4MB); blazing fast, competitive ranking precision.
#   'ms-marco-MiniLM-L-12-v2'  - small (~34MB); slightly slower, best ranking precision.
RERANK_MODEL_NAME = "ms-marco-MiniLM-L-12-v2"
# Session handling.
MAX_HISTORY_SESSION_LENGTH = 2  # maximum number of historical user sessions to retain
SESSION_EXPIRE_TIME = 30 * 60  # seconds before a session expires (30 minutes)
# Static file layout: MEDIA_DIR is a sub-directory under STATIC_DIR.
STATIC_DIR = "web"  # base directory for serving static files
MEDIA_DIR = "media_dir"  # where media files are stored
# Distributed lock kept in the DiskCache.
DISTRIBUTED_LOCK_ID = "open_kf:distributed_lock"  # unique identifier of the lock
DISTRIBUTED_LOCK_EXPIRE_TIME = 20  # seconds until the lock expires
# Operation codes for sitemap content.
ADD_SITEMAP_CONTENT = 1  # add content
DELETE_SITEMAP_CONTENT = 2  # delete content
UPDATE_SITEMAP_CONTENT = 3  # update content
# Operation codes for isolated-URL content.
ADD_ISOLATED_URL_CONTENT = 1  # add content
DELETE_ISOLATED_URL_CONTENT = 2  # delete content
# Largest number of isolated URLs that can be processed in one batch.
MAX_ISOLATED_URL_BATCH_LENGTH = 10
# Local file handling.
LOCAL_FILE_DOWNLOAD_DIR = "download_dir"  # where downloaded local files are stored
MAX_CONCURRENT_WRITES = 5  # concurrent requests allowed for file writing
MAX_FILE_SIZE = 30 * 1024 ** 2  # bytes (30MB)
MAX_LOCAL_FILE_BATCH_LENGTH = 10  # files per upload
# File extensions (leading dot included) that are supported, grouped by family.
FILE_LOADER_EXTENSIONS = {
    # plain text and markup
    ".txt", ".md", ".html",
    # PDF and e-books
    ".pdf", ".epub", ".mobi",
    # office documents and tabular data
    ".docx", ".pptx", ".xlsx", ".csv",
}
# `domain_status` values used in t_sitemap_domain_tab.
DOMAIN_STATISTICS_GATHERING = 1  # domain statistics gathering
DOMAIN_STATISTICS_GATHERING_COLLECTED = 2  # domain statistics collected
DOMAIN_PROCESSING = 3  # domain processing
DOMAIN_PROCESSED = 4  # domain processed
# `doc_status` values used in t_sitemap_url_tab.
SITEMAP_URL_PROCESS_FAILED = 0  # process failed
SITEMAP_URL_RECORDED = 1  # sitemap web page recorded
SITEMAP_URL_CRAWLING = 2  # sitemap web page crawling
SITEMAP_URL_CRAWLING_COMPLETED = 3  # sitemap web page crawling completed
SITEMAP_URL_EMBEDDED = 4  # sitemap web text embedding stored in VectorDB
SITEMAP_URL_EXPIRED = 5  # sitemap web page expired and needs to be crawled again
# `doc_status` values used in t_isolated_url_tab.
ISOLATED_URL_PROCESS_FAILED = 0  # process failed
ISOLATED_URL_RECORDED = 1  # isolated web page recorded
ISOLATED_URL_CRAWLING = 2  # isolated web page crawling
ISOLATED_URL_CRAWLING_COMPLETED = 3  # isolated web page crawling completed
ISOLATED_URL_EMBEDDED = 4  # isolated web text embedding stored in VectorDB
# `doc_status` values used in t_local_file_tab.
LOCAL_FILE_PROCESS_FAILED = 0  # process failed
LOCAL_FILE_RECORDED = 1  # local file recorded
LOCAL_FILE_PARSING = 2  # local file parsing
LOCAL_FILE_PARSING_COMPLETED = 3  # local file parsing completed
LOCAL_FILE_EMBEDDED = 4  # local file text embedding stored in VectorDB
# `doc_source` values used in t_doc_embedding_map_tab.
FROM_SITEMAP_URL = 1  # from sitemap URLs
FROM_ISOLATED_URL = 2  # from isolated URLs
FROM_LOCAL_FILE = 3  # from local files