"""
AIFinder Configuration
Dataset registry, label mappings, and feature parameters.
"""

import os

# --- Paths ---
# BASE_DIR is the directory containing this file; MODEL_DIR is its "models"
# subdirectory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "models")

# --- Dataset Registry ---
# Each entry: (hf_dataset_id, provider, model_name, optional_kwargs)
# optional_kwargs: subset name, split, etc.
# NOTE(review): "max_samples" presumably caps how many rows are taken from a
# dataset — confirm against the loader that consumes this registry.
DATASET_REGISTRY = [
    # Anthropic
    ("TeichAI/claude-4.5-opus-high-reasoning-250x", "Anthropic", "Claude 4.5 Opus", {}),
    ("TeichAI/claude-sonnet-4.5-high-reasoning-250x", "Anthropic", "Claude Sonnet 4.5", {}),
    ("Roman1111111/claude-opus-4.6-10000x", "Anthropic", "Claude Opus 4.6", {"max_samples": 1500}),
    # OpenAI
    ("TeichAI/gpt-5.2-high-reasoning-250x", "OpenAI", "GPT-5.2", {}),
    ("TeichAI/gpt-5.1-high-reasoning-1000x", "OpenAI", "GPT-5.1", {}),
    ("TeichAI/gpt-5.1-codex-max-1000x", "OpenAI", "GPT-5.1 Codex Max", {}),
    ("TeichAI/gpt-5-codex-250x", "OpenAI", "GPT-5 Codex", {}),
    ("TeichAI/gpt-5-codex-1000x", "OpenAI", "GPT-5 Codex", {}),
    # Google
    ("TeichAI/gemini-3-pro-preview-high-reasoning-1000x", "Google", "Gemini 3 Pro", {}),
    ("TeichAI/gemini-3-pro-preview-high-reasoning-250x", "Google", "Gemini 3 Pro", {}),
    ("TeichAI/gemini-2.5-flash-11000x", "Google", "Gemini 2.5 Flash", {"max_samples": 1500}),
    ("TeichAI/Gemini-3-Flash-Preview-VIBE", "Google", "Gemini 3 Flash", {}),
    ("TeichAI/gemini-3-flash-preview-1000x", "Google", "Gemini 3 Flash", {}),
    ("TeichAI/gemini-3-flash-preview-complex-1000x", "Google", "Gemini 3 Flash", {}),
    # xAI
    ("TeichAI/brainstorm-v3.1-grok-4-fast-200x", "xAI", "Grok 4 Fast", {}),
    ("TeichAI/sherlock-thinking-alpha-11000x", "xAI", "Grok 4.1 Fast", {"max_samples": 1500}),
    ("TeichAI/sherlock-dash-alpha-1000x", "xAI", "Grok 4.1 Fast", {}),
    ("TeichAI/sherlock-think-alpha-1000x", "xAI", "Grok 4.1 Fast", {}),
    ("TeichAI/grok-code-fast-1-1000x", "xAI", "Grok Code Fast 1", {}),
    # MoonshotAI
    ("TeichAI/kimi-k2-thinking-250x", "MoonshotAI", "Kimi K2", {}),
    ("TeichAI/kimi-k2-thinking-1000x", "MoonshotAI", "Kimi K2", {}),
    # Mistral
    ("TeichAI/mistral-small-creative-500x", "Mistral", "Mistral Small", {}),
    # MiniMax
    ("TeichAI/MiniMax-M2.1-Code-SFT", "MiniMax", "MiniMax M2.1", {}),
    ("TeichAI/convo-v1", "MiniMax", "MiniMax M2.1", {}),
    # StepFun
    ("TeichAI/Step-3.5-Flash-2600x", "StepFun", "Step 3.5 Flash", {"max_samples": 1500}),
    # Zhipu
    ("TeichAI/Pony-Alpha-15k", "Zhipu", "GLM-5", {"max_samples": 1500}),
    # DeepSeek (TeichAI)
    ("TeichAI/deepseek-v3.2-speciale-1000x", "DeepSeek", "DeepSeek V3.2 Speciale", {}),
    (
        "TeichAI/deepseek-v3.2-speciale-openr1-math-3k",
        "DeepSeek",
        "DeepSeek V3.2 Speciale",
        {"max_samples": 1500},
    ),
]

# DeepSeek (a-m-team) — different format, handled separately
# Entries use the same 4-tuple layout as DATASET_REGISTRY.
# NOTE(review): "name" is presumably the dataset subset/config name — confirm
# against the loader.
DEEPSEEK_AM_DATASETS = [
    (
        "a-m-team/AM-DeepSeek-R1-Distilled-1.4M",
        "DeepSeek",
        "DeepSeek R1",
        {"name": "am_0.9M_sample_1k", "max_samples": 1000},
    ),
]

# --- All providers ---
# Every provider named in DATASET_REGISTRY and DEEPSEEK_AM_DATASETS appears
# here, in order of first appearance. Keep this list in sync when adding a
# dataset for a new provider.
PROVIDERS = [
    "Anthropic",
    "OpenAI",
    "Google",
    "xAI",
    "MoonshotAI",
    "Mistral",
    "MiniMax",
    "StepFun",
    "Zhipu",
    "DeepSeek",
]

# --- Feature parameters ---
# NOTE(review): the keys match scikit-learn's TfidfVectorizer keyword
# arguments; presumably these dicts are unpacked into it — confirm at the
# call site.
# Word-level features: unigrams and bigrams.
TFIDF_WORD_PARAMS = {
    "analyzer": "word",
    "ngram_range": (1, 2),
    "max_features": 20000,
    "sublinear_tf": True,
    "min_df": 3,
}

# Character-level features: 3- to 5-character n-grams ("char_wb" analyzer).
TFIDF_CHAR_PARAMS = {
    "analyzer": "char_wb",
    "ngram_range": (3, 5),
    "max_features": 20000,
    "sublinear_tf": True,
    "min_df": 3,
}

# --- Train/test split ---
TEST_SIZE = 0.2
RANDOM_STATE = 42

# --- Neural Network ---
HIDDEN_DIM = 1024
EMBED_DIM = 256
DROPOUT = 0.3
BATCH_SIZE = 2048
EPOCHS = 50
EARLY_STOP_PATIENCE = 8
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4