# Provenance: comparIA project — commit 55b1789 (verified)
# "Scale to 25K conversations with local embeddings + improved visualization"
# ---------------------------------------------------------------------------
# Configuration for the comparIA conversation explorer.
# Several values are overridable through environment variables:
#   SAMPLE_SIZE, TOPIC_MAP_MAX_DISPLAY, OPENROUTER_API_KEY, SHOW_CONVERSATIONS
# ---------------------------------------------------------------------------
import os

# Filesystem layout (everything lives under the project directory)
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
LANCEDB_DIR = os.path.join(PROJECT_DIR, "data", "lancedb")
TOPIC_MAP_CACHE = os.path.join(PROJECT_DIR, "data", "topic_map_cache.pkl")

# Hugging Face source datasets
HF_DATASET_NAME = "ministere-culture/comparia-conversations"
HF_VOTES_DATASET = "ministere-culture/comparia-votes"
HF_REACTIONS_DATASET = "ministere-culture/comparia-reactions"

# Local LanceDB table names (one per source dataset)
TABLE_NAME = "comparia_conversations"
TABLE_NAME_VOTES = "comparia_votes"
TABLE_NAME_REACTIONS = "comparia_reactions"

# Number of conversations to ingest (env-overridable)
SAMPLE_SIZE = int(os.environ.get("SAMPLE_SIZE", "1000"))

# Embedding configuration.
# NOTE(review): EMBEDDING_MODEL names an OpenRouter/OpenAI API model, yet the
# dimension comments reference the local multilingual-e5-small model (384D) —
# confirm which backend actually produces the vectors before changing either.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
EMBEDDING_MODEL = "openai/text-embedding-3-small"
EMBEDDING_DIMENSIONS = 384  # multilingual-e5-small output dims (local model)
SEARCH_VECTOR_DIMS = 384  # truncation applied for search/UMAP
EMBED_CACHE_SIZE = 256  # LRU cache slots for query embeddings

# Search defaults
DEFAULT_SEARCH_LIMIT = 20

# Topic map / clustering settings
TOPIC_NUM_CLUSTERS = 15
TOPIC_MAP_MAX_DISPLAY = int(os.environ.get("TOPIC_MAP_MAX_DISPLAY", "2000"))
TOPIC_LABEL_MODEL = "mistralai/mistral-small-3.1-24b-instruct"

# Privacy: set env SHOW_CONVERSATIONS="false" to hide individual conversation
# messages in the UI (any casing of "true" enables display)
SHOW_CONVERSATIONS = os.environ.get("SHOW_CONVERSATIONS", "true").lower() == "true"