# Directory for storing cache files used by the DiskCache library
DISKCACHE_DIR = "diskcache_dir"

# Directory for storing SQLite database files
SQLITE_DB_DIR = "sqlite_dir"

# Name of the SQLite database file
SQLITE_DB_NAME = "mydatabase.sqlite3"

# Maximum number of concurrent requests allowed for web crawling
MAX_CRAWL_PARALLEL_REQUEST = 5

# Directory for storing Chroma vector database files
CHROMA_DB_DIR = "chroma_dir"

# Name of the collection in the Chroma vector database
CHROMA_COLLECTION_NAME = "mychroma_collection"

# Name of the OpenAI model used for embedding text
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-3-small"

# Name of the ZhipuAI model used for embedding text
ZHIPUAI_EMBEDDING_MODEL_NAME = "embedding-2"

# Name of the Ollama model used for embedding text
OLLAMA_EMBEDDING_MODEL_NAME = "mxbai-embed-large"

# Maximum length of text chunks when splitting up large documents
MAX_CHUNK_LENGTH = 1300

# Amount of overlap between consecutive text chunks
CHUNK_OVERLAP = 100

# Maximum allowable length for a single query string
MAX_QUERY_LENGTH = 200

# Number of top documents to recall for initial retrieval in search operations
RECALL_TOP_K = 5

# Number of top documents to recall when using re-ranking
RERANK_RECALL_TOP_K = 10

# Defines the model used for re-ranking.
# 'ms-marco-TinyBERT-L-2-v2': Nano (~4MB), blazing fast model & competitive performance (ranking precision).
# 'ms-marco-MiniLM-L-12-v2': Small (~34MB), slightly slower & best performance (ranking precision).
RERANK_MODEL_NAME = "ms-marco-MiniLM-L-12-v2"

# Maximum number of historical user sessions to retain
MAX_HISTORY_SESSION_LENGTH = 2

# Duration in seconds before a session expires
SESSION_EXPIRE_TIME = 1800

# Base directory for serving static files
STATIC_DIR = "web"

# Sub-directory under STATIC_DIR where media files are stored
MEDIA_DIR = "media_dir"

# Unique identifier for the distributed lock in the DiskCache
DISTRIBUTED_LOCK_ID = "open_kf:distributed_lock"

# Expiration time for the distributed lock (in seconds)
DISTRIBUTED_LOCK_EXPIRE_TIME = 20

# Constant to indicate adding content to the sitemap
ADD_SITEMAP_CONTENT = 1

# Constant to indicate deleting content from the sitemap
DELETE_SITEMAP_CONTENT = 2

# Constant to indicate updating content in the sitemap
UPDATE_SITEMAP_CONTENT = 3

# Constant to indicate adding content to isolated URLs
ADD_ISOLATED_URL_CONTENT = 1

# Constant to indicate deleting content from isolated URLs
DELETE_ISOLATED_URL_CONTENT = 2

# Maximum number of isolated URLs that can be processed in a batch
MAX_ISOLATED_URL_BATCH_LENGTH = 10

# Directory where downloaded local files are stored
LOCAL_FILE_DOWNLOAD_DIR = "download_dir"

# Maximum number of concurrent requests allowed for file writing
MAX_CONCURRENT_WRITES = 5

# Maximum file size (30MB in bytes)
MAX_FILE_SIZE = 30 * 1024 * 1024

# Maximum number of files per upload
MAX_LOCAL_FILE_BATCH_LENGTH = 10

# Supported file extensions (lower-case, including the leading dot)
FILE_LOADER_EXTENSIONS = {
    ".txt",
    ".md",
    ".pdf",
    ".epub",
    ".mobi",
    ".html",
    ".docx",
    ".pptx",
    ".xlsx",
    ".csv",
}

# in t_sitemap_domain_tab
# `domain_status` meanings:
#   1 - 'Domain statistics gathering'
#   2 - 'Domain statistics gathering collected'
#   3 - 'Domain processing'
#   4 - 'Domain processed'
DOMAIN_STATISTICS_GATHERING = 1
DOMAIN_STATISTICS_GATHERING_COLLECTED = 2
DOMAIN_PROCESSING = 3
DOMAIN_PROCESSED = 4

# in t_sitemap_url_tab
# `doc_status` meanings:
#   0 - 'Process failed'
#   1 - 'Sitemaps web page recorded'
#   2 - 'Sitemaps web page crawling'
#   3 - 'Sitemaps web page crawling completed'
#   4 - 'Sitemaps web text Embedding stored in VectorDB'
#   5 - 'Sitemaps web page expired and needs to be crawled again'
SITEMAP_URL_PROCESS_FAILED = 0
SITEMAP_URL_RECORDED = 1
SITEMAP_URL_CRAWLING = 2
SITEMAP_URL_CRAWLING_COMPLETED = 3
SITEMAP_URL_EMBEDDED = 4
SITEMAP_URL_EXPIRED = 5

# in t_isolated_url_tab
# `doc_status` meanings:
#   0 - 'Process failed'
#   1 - 'Isolated web page recorded'
#   2 - 'Isolated web page crawling'
#   3 - 'Isolated web page crawling completed'
#   4 - 'Isolated web text Embedding stored in VectorDB'
ISOLATED_URL_PROCESS_FAILED = 0
ISOLATED_URL_RECORDED = 1
ISOLATED_URL_CRAWLING = 2
ISOLATED_URL_CRAWLING_COMPLETED = 3
ISOLATED_URL_EMBEDDED = 4

# in t_local_file_tab
# `doc_status` meanings:
#   0 - 'Process failed'
#   1 - 'Local files recorded'
#   2 - 'Local files parsing'
#   3 - 'Local files parsing completed'
#   4 - 'Local files text Embedding stored in VectorDB'
LOCAL_FILE_PROCESS_FAILED = 0
LOCAL_FILE_RECORDED = 1
LOCAL_FILE_PARSING = 2
LOCAL_FILE_PARSING_COMPLETED = 3
LOCAL_FILE_EMBEDDED = 4

# in t_doc_embedding_map_tab
# `doc_source` meanings:
#   1 - 'from sitemap URLs'
#   2 - 'from isolated URLs'
#   3 - 'from local files'
FROM_SITEMAP_URL = 1
FROM_ISOLATED_URL = 2
FROM_LOCAL_FILE = 3