# Provenance: comparIA project — commit 55b1789 (verified)
# "Scale to 25K conversations with local embeddings + improved visualization"
# ---------------------------------------------------------------------------
# Configuration for the comparIA conversation explorer.
# Several values are overridable through environment variables:
#   SAMPLE_SIZE, TOPIC_MAP_MAX_DISPLAY, OPENROUTER_API_KEY, SHOW_CONVERSATIONS
# ---------------------------------------------------------------------------
import os

# Filesystem layout (everything lives under the project directory)
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
LANCEDB_DIR = os.path.join(PROJECT_DIR, "data", "lancedb")
TOPIC_MAP_CACHE = os.path.join(PROJECT_DIR, "data", "topic_map_cache.pkl")

# Hugging Face source datasets
HF_DATASET_NAME = "ministere-culture/comparia-conversations"
HF_VOTES_DATASET = "ministere-culture/comparia-votes"
HF_REACTIONS_DATASET = "ministere-culture/comparia-reactions"

# Local LanceDB table names (one per source dataset)
TABLE_NAME = "comparia_conversations"
TABLE_NAME_VOTES = "comparia_votes"
TABLE_NAME_REACTIONS = "comparia_reactions"

# Number of conversations to ingest (env-overridable)
SAMPLE_SIZE = int(os.environ.get("SAMPLE_SIZE", "1000"))

# Embedding configuration.
# NOTE(review): EMBEDDING_MODEL names an OpenRouter/OpenAI API model, yet the
# dimension comments reference the local multilingual-e5-small model (384D) —
# confirm which backend actually produces the vectors before changing either.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
EMBEDDING_MODEL = "openai/text-embedding-3-small"
EMBEDDING_DIMENSIONS = 384  # multilingual-e5-small output dims (local model)
SEARCH_VECTOR_DIMS = 384  # truncation applied for search/UMAP
EMBED_CACHE_SIZE = 256  # LRU cache slots for query embeddings

# Search defaults
DEFAULT_SEARCH_LIMIT = 20

# Topic map / clustering settings
TOPIC_NUM_CLUSTERS = 15
TOPIC_MAP_MAX_DISPLAY = int(os.environ.get("TOPIC_MAP_MAX_DISPLAY", "2000"))
TOPIC_LABEL_MODEL = "mistralai/mistral-small-3.1-24b-instruct"

# Privacy: set env SHOW_CONVERSATIONS="false" to hide individual conversation
# messages in the UI (any casing of "true" enables display)
SHOW_CONVERSATIONS = os.environ.get("SHOW_CONVERSATIONS", "true").lower() == "true"