import os import re from pathlib import Path from .infra.dotenv import load_dotenv from .infra.user_config import get_config_path, load_user_config # ===================== 路径配置 ===================== # scripts/pipeline/config.py -> scripts/pipeline -> scripts -> Paper-KG-Pipeline CURRENT_DIR = Path(__file__).parent PROJECT_ROOT = CURRENT_DIR.parent.parent REPO_ROOT = PROJECT_ROOT.parent OUTPUT_DIR = PROJECT_ROOT / "output" # 尝试加载 .env(入口脚本也会加载,这里作为兜底) _DOTENV_STATUS = load_dotenv(REPO_ROOT / ".env", override=False) # 加载用户配置文件(非敏感参数) _CONFIG_PATH = get_config_path(REPO_ROOT) _USER_CONFIG = load_user_config(_CONFIG_PATH) def _get_from_cfg(cfg: dict, path: list | None): if not path: return None cur = cfg for key in path: if not isinstance(cur, dict) or key not in cur: return None cur = cur[key] return cur def _to_bool(value): if isinstance(value, bool): return value if value is None: return None if isinstance(value, (int, float)): return value != 0 if isinstance(value, str): return value.strip() == "1" return bool(value) def _cast(value, cast): if cast is None: return value if cast is bool: return _to_bool(value) if cast is int: return int(value) if cast is float: return float(value) if cast is str: return str(value) if cast is Path: return Path(value) return cast(value) def _cast_list_float(value): if isinstance(value, str): parts = [p.strip() for p in value.split(",") if p.strip()] return [float(p) for p in parts] if isinstance(value, (list, tuple)): return [float(v) for v in value] return value def _get(key: str, default, cast=None, cfg_path: list | None = None): env_val = os.getenv(key) if env_val is not None: value = env_val else: cfg_val = _get_from_cfg(_USER_CONFIG, cfg_path) value = cfg_val if cfg_val is not None else default return _cast(value, cast) if cast else value # ===================== LLM API 配置 ===================== # Secret: only from env/.env (do not put in i2p_config.json) LLM_API_KEY = os.getenv("LLM_API_KEY", "") LLM_PROVIDER = _get( "LLM_PROVIDER", "openai_compatible_chat", cast=str, cfg_path=["llm", "provider"], ) LLM_BASE_URL = _get( "LLM_BASE_URL", "", cast=str, cfg_path=["llm", "base_url"], ) LLM_API_URL = _get( "LLM_API_URL", "", cast=str, cfg_path=["llm", "api_url"], ) LLM_MODEL = _get( "LLM_MODEL", "gpt-4o-mini", cast=str, cfg_path=["llm", "model"], ) LLM_ANTHROPIC_VERSION = _get( "LLM_ANTHROPIC_VERSION", "2023-06-01", cast=str, cfg_path=["llm", "anthropic_version"], ) LLM_EXTRA_HEADERS = _get( "LLM_EXTRA_HEADERS_JSON", None, cfg_path=["llm", "extra_headers"], ) LLM_EXTRA_BODY = _get( "LLM_EXTRA_BODY_JSON", None, cfg_path=["llm", "extra_body"], ) # ===================== Embedding API 配置 ===================== # Embedding 可独立配置;默认使用 OpenAI-compatible /v1/embeddings 形态。 EMBEDDING_PROVIDER = _get( "EMBEDDING_PROVIDER", "openai_compatible", cast=str, cfg_path=["embedding", "provider"], ) EMBEDDING_API_URL = _get( "EMBEDDING_API_URL", "https://api.openai.com/v1/embeddings", cast=str, cfg_path=["embedding", "api_url"], ) EMBEDDING_MODEL = _get( "EMBEDDING_MODEL", "text-embedding-3-large", cast=str, cfg_path=["embedding", "model"], ) # Secret: only from env/.env; fallback to LLM_API_KEY EMBEDDING_API_KEY = os.getenv("EMBEDDING_API_KEY", "") or LLM_API_KEY # ===================== Run Logging 配置 ===================== LOG_ROOT = _get( "I2P_LOG_DIR", str(REPO_ROOT / "log"), cast=Path, cfg_path=["logging", "dir"], ) ENABLE_RUN_LOGGING = _get( "I2P_ENABLE_LOGGING", True, cast=bool, cfg_path=["logging", "enable"], ) LOG_MAX_TEXT_CHARS = _get( "I2P_LOG_MAX_TEXT_CHARS", 20000, cast=int, cfg_path=["logging", "max_text_chars"], ) # ===================== Results Bundling 配置 ===================== RESULTS_ROOT = _get( "I2P_RESULTS_DIR", str(REPO_ROOT / "results"), cast=Path, cfg_path=["results", "dir"], ) RESULTS_ENABLE = _get( "I2P_RESULTS_ENABLE", True, cast=bool, cfg_path=["results", "enable"], ) # Hard-coded: always copy results into `results/run_.../` (no symlinks). # This avoids platform-specific symlink issues and makes results fully portable. RESULTS_MODE = "copy" RESULTS_KEEP_LOG = _get( "I2P_RESULTS_KEEP_LOG", True, cast=bool, cfg_path=["results", "keep_log"], ) # ===================== Index Dir Mode 配置 ===================== INDEX_DIR_MODE = _get( "I2P_INDEX_DIR_MODE", "manual", cast=str, cfg_path=["index", "dir_mode"], ) _PROFILE_SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+") def _sanitize_profile_component(value: str) -> str: if value is None: return "" text = str(value) text = text.replace("/", "_").replace(" ", "_") return _PROFILE_SAFE_RE.sub("_", text) def _compute_profile_id(model: str) -> str: model_s = _sanitize_profile_component(model) return model_s or "unknown_model" if INDEX_DIR_MODE == "auto_profile": _PROFILE_ID = _compute_profile_id(EMBEDDING_MODEL) _DEFAULT_NOVELTY_INDEX_DIR = str(OUTPUT_DIR / f"novelty_index__{_PROFILE_ID}") _DEFAULT_RECALL_INDEX_DIR = str(OUTPUT_DIR / f"recall_index__{_PROFILE_ID}") else: _PROFILE_ID = None _DEFAULT_NOVELTY_INDEX_DIR = str(OUTPUT_DIR / "novelty_index") _DEFAULT_RECALL_INDEX_DIR = str(OUTPUT_DIR / "recall_index") # ===================== Novelty Check 配置 ===================== NOVELTY_ENABLE = _get( "I2P_NOVELTY_ENABLE", True, cast=bool, cfg_path=["novelty", "enable"], ) NOVELTY_TOPK = _get( "I2P_NOVELTY_TOPK", 100, cast=int, cfg_path=["novelty", "top_k"], ) NOVELTY_HIGH_TH = _get( "I2P_NOVELTY_HIGH_TH", 0.88, cast=float, cfg_path=["novelty", "high_th"], ) NOVELTY_MEDIUM_TH = _get( "I2P_NOVELTY_MEDIUM_TH", 0.82, cast=float, cfg_path=["novelty", "medium_th"], ) NOVELTY_INDEX_DIR = _get( "I2P_NOVELTY_INDEX_DIR", _DEFAULT_NOVELTY_INDEX_DIR, cast=Path, cfg_path=["novelty", "index_dir"], ) NOVELTY_AUTO_BUILD_INDEX = _get( "I2P_NOVELTY_AUTO_BUILD_INDEX", False, cast=bool, cfg_path=["novelty", "auto_build_index"], ) NOVELTY_INDEX_BUILD_BATCH_SIZE = _get( "I2P_NOVELTY_INDEX_BUILD_BATCH_SIZE", 32, cast=int, cfg_path=["novelty", "index_batch_size"], ) NOVELTY_INDEX_BUILD_RESUME = _get( "I2P_NOVELTY_INDEX_BUILD_RESUME", True, cast=bool, cfg_path=["novelty", "index_resume"], ) NOVELTY_INDEX_BUILD_MAX_RETRIES = _get( "I2P_NOVELTY_INDEX_BUILD_MAX_RETRIES", 3, cast=int, cfg_path=["novelty", "index_max_retries"], ) NOVELTY_INDEX_BUILD_SLEEP_SEC = _get( "I2P_NOVELTY_INDEX_BUILD_SLEEP_SEC", 1.0, cast=float, cfg_path=["novelty", "index_sleep_sec"], ) NOVELTY_ACTION = _get( "I2P_NOVELTY_ACTION", "pivot", cast=str, cfg_path=["novelty", "action"], ) NOVELTY_MAX_PIVOTS = _get( "I2P_NOVELTY_MAX_PIVOTS", 2, cast=int, cfg_path=["novelty", "max_pivots"], ) NOVELTY_REQUIRE_EMBEDDING = _get( "I2P_NOVELTY_REQUIRE_EMBEDDING", False, cast=bool, cfg_path=["novelty", "require_embedding"], ) NOVELTY_REPORT_IN_OUTPUT = _get( "I2P_NOVELTY_REPORT_IN_OUTPUT", False, cast=bool, cfg_path=["novelty", "report_in_output"], ) # ===================== Pipeline 配置 ===================== class PipelineConfig: """Pipeline 配置参数""" # Pattern 选择 SELECT_PATTERN_COUNT = 3 # 选择 3 个不同策略的 Pattern CONSERVATIVE_RANK_RANGE = (0, 2) # 稳健型: Rank 1-3 INNOVATIVE_CLUSTER_SIZE_THRESHOLD = 10 # 创新型: Cluster Size < 10 # Critic 阈值 PASS_SCORE = _get( "I2P_PASS_SCORE", 7.0, cast=float, cfg_path=["pass", "fixed_score"], ) # 评分 >= 7 为通过 MAX_REFINE_ITERATIONS = 3 # 最多修正 3 轮 # Pass mode (pattern-aware) PASS_MODE = _get( "I2P_PASS_MODE", "two_of_three_q75_and_avg_ge_q50", cast=str, cfg_path=["pass", "mode"], ) PASS_MIN_PATTERN_PAPERS = _get( "I2P_PASS_MIN_PATTERN_PAPERS", 20, cast=int, cfg_path=["pass", "min_pattern_papers"], ) PASS_FALLBACK = _get( "I2P_PASS_FALLBACK", "global", cast=str, cfg_path=["pass", "fallback"], ) # global|fixed # LLM Temperature (per stage; defaults preserve current behavior) LLM_TEMPERATURE_DEFAULT = _get( "I2P_LLM_TEMPERATURE_DEFAULT", 0.7, cast=float, cfg_path=["llm", "temperature", "default"], ) LLM_TEMPERATURE_STORY_GENERATOR = _get( "I2P_LLM_TEMPERATURE_STORY_GENERATOR", 0.7, cast=float, cfg_path=["llm", "temperature", "story_generator"], ) LLM_TEMPERATURE_STORY_GENERATOR_REWRITE = _get( "I2P_LLM_TEMPERATURE_STORY_GENERATOR_REWRITE", 0.3, cast=float, cfg_path=["llm", "temperature", "story_generator_rewrite"], ) LLM_TEMPERATURE_STORY_REFLECTOR = _get( "I2P_LLM_TEMPERATURE_STORY_REFLECTOR", 0.5, cast=float, cfg_path=["llm", "temperature", "story_reflector"], ) LLM_TEMPERATURE_PATTERN_SELECTOR = _get( "I2P_LLM_TEMPERATURE_PATTERN_SELECTOR", 0.3, cast=float, cfg_path=["llm", "temperature", "pattern_selector"], ) LLM_TEMPERATURE_IDEA_FUSION = _get( "I2P_LLM_TEMPERATURE_IDEA_FUSION", 0.7, cast=float, cfg_path=["llm", "temperature", "idea_fusion"], ) LLM_TEMPERATURE_IDEA_FUSION_STAGE2 = _get( "I2P_LLM_TEMPERATURE_IDEA_FUSION_STAGE2", 0.8, cast=float, cfg_path=["llm", "temperature", "idea_fusion_stage2"], ) LLM_TEMPERATURE_IDEA_FUSION_STAGE3 = _get( "I2P_LLM_TEMPERATURE_IDEA_FUSION_STAGE3", 0.9, cast=float, cfg_path=["llm", "temperature", "idea_fusion_stage3"], ) LLM_TEMPERATURE_CRITIC_MAIN = _get( "I2P_LLM_TEMPERATURE_CRITIC_MAIN", 0.0, cast=float, cfg_path=["llm", "temperature", "critic_main"], ) LLM_TEMPERATURE_CRITIC_REPAIR = _get( "I2P_LLM_TEMPERATURE_CRITIC_REPAIR", 0.0, cast=float, cfg_path=["llm", "temperature", "critic_repair"], ) LLM_TEMPERATURE_CRITIC_ANCHORED = _get( "I2P_LLM_TEMPERATURE_CRITIC_ANCHORED", 0.3, cast=float, cfg_path=["llm", "temperature", "critic_anchored"], ) # Idea Packaging (optional; defaults preserve current behavior) IDEA_PACKAGING_ENABLE = _get( "I2P_IDEA_PACKAGING_ENABLE", False, cast=bool, cfg_path=["idea", "packaging_enable"], ) IDEA_PACKAGING_TOPN_PATTERNS = _get( "I2P_IDEA_PACKAGING_TOPN_PATTERNS", 5, cast=int, cfg_path=["idea", "packaging_topn_patterns"], ) IDEA_PACKAGING_MAX_EXEMPLAR_PAPERS = _get( "I2P_IDEA_PACKAGING_MAX_EXEMPLAR_PAPERS", 8, cast=int, cfg_path=["idea", "packaging_max_exemplar_papers"], ) IDEA_PACKAGING_CANDIDATE_K = _get( "I2P_IDEA_PACKAGING_CANDIDATE_K", 3, cast=int, cfg_path=["idea", "packaging_candidate_k"], ) IDEA_PACKAGING_SELECT_MODE = _get( "I2P_IDEA_PACKAGING_SELECT_MODE", "llm_then_recall", cast=str, cfg_path=["idea", "packaging_select_mode"], ) IDEA_PACKAGING_FORCE_EN_QUERY = _get( "I2P_IDEA_PACKAGING_FORCE_EN_QUERY", True, cast=bool, cfg_path=["idea", "packaging_force_en_query"], ) # Idea Packaging LLM temperatures LLM_TEMPERATURE_IDEA_PACKAGING_PARSE = _get( "I2P_LLM_TEMPERATURE_IDEA_PACKAGING_PARSE", 0.0, cast=float, cfg_path=["llm", "temperature", "idea_packaging_parse"], ) LLM_TEMPERATURE_IDEA_PACKAGING_PATTERN_GUIDED = _get( "I2P_LLM_TEMPERATURE_IDEA_PACKAGING_PATTERN_GUIDED", 0.3, cast=float, cfg_path=["llm", "temperature", "idea_packaging_pattern_guided"], ) LLM_TEMPERATURE_IDEA_PACKAGING_JUDGE = _get( "I2P_LLM_TEMPERATURE_IDEA_PACKAGING_JUDGE", 0.0, cast=float, cfg_path=["llm", "temperature", "idea_packaging_judge"], ) # 新颖性模式配置 NOVELTY_MODE_MAX_PATTERNS = 3 # 新颖性模式最多尝试的 Pattern 数 NOVELTY_SCORE_THRESHOLD = 6.0 # 新颖性得分阈值 # 召回审计配置(召回候选与分数落盘) RECALL_AUDIT_ENABLE = _get( "I2P_RECALL_AUDIT_ENABLE", True, cast=bool, cfg_path=["recall", "audit_enable"], ) RECALL_AUDIT_TOPN = _get( "I2P_RECALL_AUDIT_TOPN", 50, cast=int, cfg_path=["recall", "audit_topn"], ) RECALL_AUDIT_SNIPPET_CHARS = _get( "I2P_RECALL_AUDIT_SNIPPET_CHARS", 240, cast=int, cfg_path=["recall", "audit_snippet_chars"], ) RECALL_AUDIT_IN_EVENTS = _get( "I2P_RECALL_AUDIT_IN_EVENTS", True, cast=bool, cfg_path=["recall", "audit_in_events"], ) RECALL_EMBED_BATCH_SIZE = _get( "I2P_RECALL_EMBED_BATCH_SIZE", 32, cast=int, cfg_path=["recall", "embed_batch_size"], ) RECALL_EMBED_MAX_RETRIES = _get( "I2P_RECALL_EMBED_MAX_RETRIES", 3, cast=int, cfg_path=["recall", "embed_max_retries"], ) RECALL_EMBED_SLEEP_SEC = _get( "I2P_RECALL_EMBED_SLEEP_SEC", 0.5, cast=float, cfg_path=["recall", "embed_sleep_sec"], ) RECALL_USE_OFFLINE_INDEX = _get( "I2P_RECALL_USE_OFFLINE_INDEX", False, cast=bool, cfg_path=["recall", "use_offline_index"], ) SUBDOMAIN_TAXONOMY_ENABLE = _get( "I2P_SUBDOMAIN_TAXONOMY_ENABLE", False, cast=bool, cfg_path=["recall", "subdomain_taxonomy_enable"], ) SUBDOMAIN_TAXONOMY_PATH = _get( "I2P_SUBDOMAIN_TAXONOMY_PATH", "", cast=str, cfg_path=["recall", "subdomain_taxonomy_path"], ) SUBDOMAIN_TAXONOMY_STOPLIST_MODE = _get( "I2P_SUBDOMAIN_TAXONOMY_STOPLIST_MODE", "drop", cast=str, cfg_path=["recall", "subdomain_taxonomy_stoplist_mode"], ) RECALL_INDEX_DIR = _get( "I2P_RECALL_INDEX_DIR", _DEFAULT_RECALL_INDEX_DIR, cast=Path, cfg_path=["recall", "index_dir"], ) # Index preflight (auto-prepare before run) INDEX_AUTO_PREPARE = _get( "I2P_INDEX_AUTO_PREPARE", True, cast=bool, cfg_path=["index", "auto_prepare"], ) INDEX_ALLOW_BUILD = _get( "I2P_INDEX_ALLOW_BUILD", True, cast=bool, cfg_path=["index", "allow_build"], ) # Phase 4 查重开关 VERIFICATION_ENABLE = _get( "I2P_VERIFICATION_ENABLE", True, cast=bool, cfg_path=["verification", "enable"], ) # RAG 查重阈值 COLLISION_THRESHOLD = _get( "I2P_COLLISION_THRESHOLD", 0.75, cast=float, cfg_path=["verification", "collision_threshold"], ) # 相似度 > 阈值 认为撞车 # Refinement 策略 TAIL_INJECTION_RANK_RANGE = (4, 9) # 长尾注入: Rank 5-10 HEAD_INJECTION_RANK_RANGE = (0, 2) # 头部注入: Rank 1-3 HEAD_INJECTION_CLUSTER_THRESHOLD = 15 # 头部注入: Cluster Size > 15 # Anchored Critic 配置 ANCHOR_QUANTILES = _get( "I2P_ANCHOR_QUANTILES", [0.05, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.95], cast=_cast_list_float, cfg_path=["anchors", "quantiles"], ) ANCHOR_MAX_INITIAL = _get( "I2P_ANCHOR_MAX_INITIAL", 11, cast=int, cfg_path=["anchors", "max_initial"], ) ANCHOR_MAX_TOTAL = _get( "I2P_ANCHOR_MAX_TOTAL", 13, cast=int, cfg_path=["anchors", "max_total"], ) ANCHOR_MAX_EXEMPLARS = _get( "I2P_ANCHOR_MAX_EXEMPLARS", 2, cast=int, cfg_path=["anchors", "max_exemplars"], ) DENSIFY_OFFSETS = _get( "I2P_DENSIFY_OFFSETS", [-0.6, -0.4, -0.2, 0.2, 0.4, 0.6], cast=_cast_list_float, cfg_path=["anchors", "densify_offsets"], ) ANCHOR_BUCKET_SIZE = _get( "I2P_ANCHOR_BUCKET_SIZE", 1.0, cast=float, cfg_path=["anchors", "bucket_size"], ) ANCHOR_BUCKET_COUNT = _get( "I2P_ANCHOR_BUCKET_COUNT", 3, cast=int, cfg_path=["anchors", "bucket_count"], ) SIGMOID_K = _get( "I2P_SIGMOID_K", 1.2, cast=float, cfg_path=["anchors", "sigmoid_k"], ) GRID_STEP = _get( "I2P_GRID_STEP", 0.01, cast=float, cfg_path=["anchors", "grid_step"], ) DENSIFY_LOSS_THRESHOLD = _get( "I2P_DENSIFY_LOSS_THRESHOLD", 0.05, cast=float, cfg_path=["anchors", "densify_loss_threshold"], ) DENSIFY_MIN_AVG_CONF = _get( "I2P_DENSIFY_MIN_AVG_CONF", 0.35, cast=float, cfg_path=["anchors", "densify_min_avg_conf"], ) ANCHOR_DENSIFY_ENABLE = _get( "I2P_ANCHOR_DENSIFY_ENABLE", True, cast=bool, cfg_path=["anchors", "densify_enable"], ) # Critic JSON reliability (quality-first) CRITIC_STRICT_JSON = _get( "I2P_CRITIC_STRICT_JSON", True, cast=bool, cfg_path=["critic", "strict_json"], ) CRITIC_JSON_RETRIES = _get( "I2P_CRITIC_JSON_RETRIES", 2, cast=int, cfg_path=["critic", "json_retries"], ) # Blind Judge tau config JUDGE_TAU_PATH = _get( "I2P_JUDGE_TAU_PATH", str(OUTPUT_DIR / "judge_tau.json"), cast=Path, cfg_path=["critic", "tau_path"], ) JUDGE_TAU_DEFAULT = _get( "I2P_JUDGE_TAU_DEFAULT", 1.0, cast=float, cfg_path=["critic", "tau_default"], ) TAU_METHODOLOGY = _get( "I2P_TAU_METHODOLOGY", 1.0, cast=float, cfg_path=["critic", "tau_methodology"], ) TAU_NOVELTY = _get( "I2P_TAU_NOVELTY", 1.0, cast=float, cfg_path=["critic", "tau_novelty"], ) TAU_STORYTELLER = _get( "I2P_TAU_STORYTELLER", 1.0, cast=float, cfg_path=["critic", "tau_storyteller"], ) CRITIC_COACH_ENABLE = _get( "I2P_CRITIC_COACH_ENABLE", True, cast=bool, cfg_path=["critic", "coach_enable"], ) CRITIC_COACH_TEMPERATURE = _get( "I2P_CRITIC_COACH_TEMPERATURE", 0.3, cast=float, cfg_path=["critic", "coach_temperature"], ) CRITIC_COACH_MAX_TOKENS = _get( "I2P_CRITIC_COACH_MAX_TOKENS", 4096, cast=int, cfg_path=["critic", "coach_max_tokens"], )