# conversor-ideas / config.py
# (repository-hosting metadata preserved as a comment so the module stays importable:
#  uploaded by lukasnet, "Upload 206 files", revision 5e4ec1d, verified)
import os
import re
from pathlib import Path
from .infra.dotenv import load_dotenv
from .infra.user_config import get_config_path, load_user_config
# ===================== Path configuration =====================
# scripts/pipeline/config.py -> scripts/pipeline -> scripts -> Paper-KG-Pipeline
CURRENT_DIR = Path(__file__).parent
PROJECT_ROOT = CURRENT_DIR.parent.parent
REPO_ROOT = PROJECT_ROOT.parent
OUTPUT_DIR = PROJECT_ROOT / "output"
# Best-effort .env load (the entry script also loads it; this is a fallback).
# override=False keeps any variables already present in the process environment.
_DOTENV_STATUS = load_dotenv(REPO_ROOT / ".env", override=False)
# Load the user config file (non-sensitive parameters only); secrets stay in env/.env.
_CONFIG_PATH = get_config_path(REPO_ROOT)
_USER_CONFIG = load_user_config(_CONFIG_PATH)
def _get_from_cfg(cfg: dict, path: list | None):
if not path:
return None
cur = cfg
for key in path:
if not isinstance(cur, dict) or key not in cur:
return None
cur = cur[key]
return cur
def _to_bool(value):
if isinstance(value, bool):
return value
if value is None:
return None
if isinstance(value, (int, float)):
return value != 0
if isinstance(value, str):
return value.strip() == "1"
return bool(value)
def _cast(value, cast):
if cast is None:
return value
if cast is bool:
return _to_bool(value)
if cast is int:
return int(value)
if cast is float:
return float(value)
if cast is str:
return str(value)
if cast is Path:
return Path(value)
return cast(value)
def _cast_list_float(value):
if isinstance(value, str):
parts = [p.strip() for p in value.split(",") if p.strip()]
return [float(p) for p in parts]
if isinstance(value, (list, tuple)):
return [float(v) for v in value]
return value
def _get(key: str, default, cast=None, cfg_path: list | None = None):
env_val = os.getenv(key)
if env_val is not None:
value = env_val
else:
cfg_val = _get_from_cfg(_USER_CONFIG, cfg_path)
value = cfg_val if cfg_val is not None else default
return _cast(value, cast) if cast else value
# ===================== LLM API configuration =====================
# Secret: only from env/.env (do not put in i2p_config.json)
LLM_API_KEY = os.getenv("LLM_API_KEY", "")
# Provider family; selects which request/response adapter is used downstream.
LLM_PROVIDER = _get(
    "LLM_PROVIDER",
    "openai_compatible_chat",
    cast=str,
    cfg_path=["llm", "provider"],
)
LLM_BASE_URL = _get(
    "LLM_BASE_URL",
    "",
    cast=str,
    cfg_path=["llm", "base_url"],
)
LLM_API_URL = _get(
    "LLM_API_URL",
    "",
    cast=str,
    cfg_path=["llm", "api_url"],
)
LLM_MODEL = _get(
    "LLM_MODEL",
    "gpt-4o-mini",
    cast=str,
    cfg_path=["llm", "model"],
)
# Anthropic API version header value (only relevant for Anthropic-style providers).
LLM_ANTHROPIC_VERSION = _get(
    "LLM_ANTHROPIC_VERSION",
    "2023-06-01",
    cast=str,
    cfg_path=["llm", "anthropic_version"],
)
# NOTE(review): when sourced from the env these arrive as raw JSON strings
# (env keys end in _JSON) but from the config file as parsed objects —
# presumably normalized at the call site; confirm downstream.
LLM_EXTRA_HEADERS = _get(
    "LLM_EXTRA_HEADERS_JSON",
    None,
    cfg_path=["llm", "extra_headers"],
)
LLM_EXTRA_BODY = _get(
    "LLM_EXTRA_BODY_JSON",
    None,
    cfg_path=["llm", "extra_body"],
)
# ===================== Embedding API configuration =====================
# Embedding is configured independently; defaults to the OpenAI-compatible
# /v1/embeddings request shape.
EMBEDDING_PROVIDER = _get(
    "EMBEDDING_PROVIDER",
    "openai_compatible",
    cast=str,
    cfg_path=["embedding", "provider"],
)
EMBEDDING_API_URL = _get(
    "EMBEDDING_API_URL",
    "https://api.openai.com/v1/embeddings",
    cast=str,
    cfg_path=["embedding", "api_url"],
)
EMBEDDING_MODEL = _get(
    "EMBEDDING_MODEL",
    "text-embedding-3-large",
    cast=str,
    cfg_path=["embedding", "model"],
)
# Secret: only from env/.env; fallback to LLM_API_KEY
EMBEDDING_API_KEY = os.getenv("EMBEDDING_API_KEY", "") or LLM_API_KEY
# ===================== Run logging configuration =====================
LOG_ROOT = _get(
    "I2P_LOG_DIR",
    str(REPO_ROOT / "log"),
    cast=Path,
    cfg_path=["logging", "dir"],
)
ENABLE_RUN_LOGGING = _get(
    "I2P_ENABLE_LOGGING",
    True,
    cast=bool,
    cfg_path=["logging", "enable"],
)
# Cap on text captured per log entry, in characters.
LOG_MAX_TEXT_CHARS = _get(
    "I2P_LOG_MAX_TEXT_CHARS",
    20000,
    cast=int,
    cfg_path=["logging", "max_text_chars"],
)
# ===================== Results bundling configuration =====================
RESULTS_ROOT = _get(
    "I2P_RESULTS_DIR",
    str(REPO_ROOT / "results"),
    cast=Path,
    cfg_path=["results", "dir"],
)
RESULTS_ENABLE = _get(
    "I2P_RESULTS_ENABLE",
    True,
    cast=bool,
    cfg_path=["results", "enable"],
)
# Hard-coded: always copy results into `results/run_.../` (no symlinks).
# This avoids platform-specific symlink issues and makes results fully portable.
RESULTS_MODE = "copy"
RESULTS_KEEP_LOG = _get(
    "I2P_RESULTS_KEEP_LOG",
    True,
    cast=bool,
    cfg_path=["results", "keep_log"],
)
# ===================== Index directory mode configuration =====================
# "manual" uses fixed index dirs; "auto_profile" derives per-model dirs below.
INDEX_DIR_MODE = _get(
    "I2P_INDEX_DIR_MODE",
    "manual",
    cast=str,
    cfg_path=["index", "dir_mode"],
)
_PROFILE_SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
def _sanitize_profile_component(value: str) -> str:
if value is None:
return ""
text = str(value)
text = text.replace("/", "_").replace(" ", "_")
return _PROFILE_SAFE_RE.sub("_", text)
def _compute_profile_id(model: str) -> str:
    """Derive a directory-safe profile id from the embedding model name."""
    sanitized = _sanitize_profile_component(model)
    if sanitized:
        return sanitized
    return "unknown_model"
# In auto_profile mode the default index dirs embed the embedding-model name,
# so indexes built with different embedding models never collide on disk.
if INDEX_DIR_MODE == "auto_profile":
    _PROFILE_ID = _compute_profile_id(EMBEDDING_MODEL)
    _DEFAULT_NOVELTY_INDEX_DIR = str(OUTPUT_DIR / f"novelty_index__{_PROFILE_ID}")
    _DEFAULT_RECALL_INDEX_DIR = str(OUTPUT_DIR / f"recall_index__{_PROFILE_ID}")
else:
    _PROFILE_ID = None
    _DEFAULT_NOVELTY_INDEX_DIR = str(OUTPUT_DIR / "novelty_index")
    _DEFAULT_RECALL_INDEX_DIR = str(OUTPUT_DIR / "recall_index")
# ===================== Novelty check configuration =====================
NOVELTY_ENABLE = _get(
    "I2P_NOVELTY_ENABLE",
    True,
    cast=bool,
    cfg_path=["novelty", "enable"],
)
# Number of nearest neighbors retrieved from the novelty index per query.
NOVELTY_TOPK = _get(
    "I2P_NOVELTY_TOPK",
    100,
    cast=int,
    cfg_path=["novelty", "top_k"],
)
# Similarity thresholds splitting results into high / medium / low overlap bands.
NOVELTY_HIGH_TH = _get(
    "I2P_NOVELTY_HIGH_TH",
    0.88,
    cast=float,
    cfg_path=["novelty", "high_th"],
)
NOVELTY_MEDIUM_TH = _get(
    "I2P_NOVELTY_MEDIUM_TH",
    0.82,
    cast=float,
    cfg_path=["novelty", "medium_th"],
)
NOVELTY_INDEX_DIR = _get(
    "I2P_NOVELTY_INDEX_DIR",
    _DEFAULT_NOVELTY_INDEX_DIR,
    cast=Path,
    cfg_path=["novelty", "index_dir"],
)
NOVELTY_AUTO_BUILD_INDEX = _get(
    "I2P_NOVELTY_AUTO_BUILD_INDEX",
    False,
    cast=bool,
    cfg_path=["novelty", "auto_build_index"],
)
# Index-build batching / retry knobs.
NOVELTY_INDEX_BUILD_BATCH_SIZE = _get(
    "I2P_NOVELTY_INDEX_BUILD_BATCH_SIZE",
    32,
    cast=int,
    cfg_path=["novelty", "index_batch_size"],
)
NOVELTY_INDEX_BUILD_RESUME = _get(
    "I2P_NOVELTY_INDEX_BUILD_RESUME",
    True,
    cast=bool,
    cfg_path=["novelty", "index_resume"],
)
NOVELTY_INDEX_BUILD_MAX_RETRIES = _get(
    "I2P_NOVELTY_INDEX_BUILD_MAX_RETRIES",
    3,
    cast=int,
    cfg_path=["novelty", "index_max_retries"],
)
NOVELTY_INDEX_BUILD_SLEEP_SEC = _get(
    "I2P_NOVELTY_INDEX_BUILD_SLEEP_SEC",
    1.0,
    cast=float,
    cfg_path=["novelty", "index_sleep_sec"],
)
# What to do when novelty is too low — default "pivot" the idea.
NOVELTY_ACTION = _get(
    "I2P_NOVELTY_ACTION",
    "pivot",
    cast=str,
    cfg_path=["novelty", "action"],
)
NOVELTY_MAX_PIVOTS = _get(
    "I2P_NOVELTY_MAX_PIVOTS",
    2,
    cast=int,
    cfg_path=["novelty", "max_pivots"],
)
NOVELTY_REQUIRE_EMBEDDING = _get(
    "I2P_NOVELTY_REQUIRE_EMBEDDING",
    False,
    cast=bool,
    cfg_path=["novelty", "require_embedding"],
)
NOVELTY_REPORT_IN_OUTPUT = _get(
    "I2P_NOVELTY_REPORT_IN_OUTPUT",
    False,
    cast=bool,
    cfg_path=["novelty", "report_in_output"],
)
# ===================== Pipeline configuration =====================
class PipelineConfig:
    """Pipeline configuration parameters (mix of constants and env/config-driven values)."""
    # Pattern selection
    SELECT_PATTERN_COUNT = 3  # pick 3 patterns with different strategies
    CONSERVATIVE_RANK_RANGE = (0, 2)  # conservative: rank 1-3
    INNOVATIVE_CLUSTER_SIZE_THRESHOLD = 10  # innovative: cluster size < 10
    # Critic thresholds
    PASS_SCORE = _get(
        "I2P_PASS_SCORE",
        7.0,
        cast=float,
        cfg_path=["pass", "fixed_score"],
    )  # score >= 7 counts as a pass
    MAX_REFINE_ITERATIONS = 3  # at most 3 refinement rounds
    # Pass mode (pattern-aware)
    PASS_MODE = _get(
        "I2P_PASS_MODE",
        "two_of_three_q75_and_avg_ge_q50",
        cast=str,
        cfg_path=["pass", "mode"],
    )
    # Minimum papers a pattern needs before its own quantiles are trusted.
    PASS_MIN_PATTERN_PAPERS = _get(
        "I2P_PASS_MIN_PATTERN_PAPERS",
        20,
        cast=int,
        cfg_path=["pass", "min_pattern_papers"],
    )
    PASS_FALLBACK = _get(
        "I2P_PASS_FALLBACK",
        "global",
        cast=str,
        cfg_path=["pass", "fallback"],
    )  # global|fixed
    # LLM Temperature (per stage; defaults preserve current behavior)
    LLM_TEMPERATURE_DEFAULT = _get(
        "I2P_LLM_TEMPERATURE_DEFAULT",
        0.7,
        cast=float,
        cfg_path=["llm", "temperature", "default"],
    )
    LLM_TEMPERATURE_STORY_GENERATOR = _get(
        "I2P_LLM_TEMPERATURE_STORY_GENERATOR",
        0.7,
        cast=float,
        cfg_path=["llm", "temperature", "story_generator"],
    )
    LLM_TEMPERATURE_STORY_GENERATOR_REWRITE = _get(
        "I2P_LLM_TEMPERATURE_STORY_GENERATOR_REWRITE",
        0.3,
        cast=float,
        cfg_path=["llm", "temperature", "story_generator_rewrite"],
    )
    LLM_TEMPERATURE_STORY_REFLECTOR = _get(
        "I2P_LLM_TEMPERATURE_STORY_REFLECTOR",
        0.5,
        cast=float,
        cfg_path=["llm", "temperature", "story_reflector"],
    )
    LLM_TEMPERATURE_PATTERN_SELECTOR = _get(
        "I2P_LLM_TEMPERATURE_PATTERN_SELECTOR",
        0.3,
        cast=float,
        cfg_path=["llm", "temperature", "pattern_selector"],
    )
    LLM_TEMPERATURE_IDEA_FUSION = _get(
        "I2P_LLM_TEMPERATURE_IDEA_FUSION",
        0.7,
        cast=float,
        cfg_path=["llm", "temperature", "idea_fusion"],
    )
    LLM_TEMPERATURE_IDEA_FUSION_STAGE2 = _get(
        "I2P_LLM_TEMPERATURE_IDEA_FUSION_STAGE2",
        0.8,
        cast=float,
        cfg_path=["llm", "temperature", "idea_fusion_stage2"],
    )
    LLM_TEMPERATURE_IDEA_FUSION_STAGE3 = _get(
        "I2P_LLM_TEMPERATURE_IDEA_FUSION_STAGE3",
        0.9,
        cast=float,
        cfg_path=["llm", "temperature", "idea_fusion_stage3"],
    )
    LLM_TEMPERATURE_CRITIC_MAIN = _get(
        "I2P_LLM_TEMPERATURE_CRITIC_MAIN",
        0.0,
        cast=float,
        cfg_path=["llm", "temperature", "critic_main"],
    )
    LLM_TEMPERATURE_CRITIC_REPAIR = _get(
        "I2P_LLM_TEMPERATURE_CRITIC_REPAIR",
        0.0,
        cast=float,
        cfg_path=["llm", "temperature", "critic_repair"],
    )
    LLM_TEMPERATURE_CRITIC_ANCHORED = _get(
        "I2P_LLM_TEMPERATURE_CRITIC_ANCHORED",
        0.3,
        cast=float,
        cfg_path=["llm", "temperature", "critic_anchored"],
    )
    # Idea Packaging (optional; defaults preserve current behavior)
    IDEA_PACKAGING_ENABLE = _get(
        "I2P_IDEA_PACKAGING_ENABLE",
        False,
        cast=bool,
        cfg_path=["idea", "packaging_enable"],
    )
    IDEA_PACKAGING_TOPN_PATTERNS = _get(
        "I2P_IDEA_PACKAGING_TOPN_PATTERNS",
        5,
        cast=int,
        cfg_path=["idea", "packaging_topn_patterns"],
    )
    IDEA_PACKAGING_MAX_EXEMPLAR_PAPERS = _get(
        "I2P_IDEA_PACKAGING_MAX_EXEMPLAR_PAPERS",
        8,
        cast=int,
        cfg_path=["idea", "packaging_max_exemplar_papers"],
    )
    IDEA_PACKAGING_CANDIDATE_K = _get(
        "I2P_IDEA_PACKAGING_CANDIDATE_K",
        3,
        cast=int,
        cfg_path=["idea", "packaging_candidate_k"],
    )
    IDEA_PACKAGING_SELECT_MODE = _get(
        "I2P_IDEA_PACKAGING_SELECT_MODE",
        "llm_then_recall",
        cast=str,
        cfg_path=["idea", "packaging_select_mode"],
    )
    IDEA_PACKAGING_FORCE_EN_QUERY = _get(
        "I2P_IDEA_PACKAGING_FORCE_EN_QUERY",
        True,
        cast=bool,
        cfg_path=["idea", "packaging_force_en_query"],
    )
    # Idea Packaging LLM temperatures
    LLM_TEMPERATURE_IDEA_PACKAGING_PARSE = _get(
        "I2P_LLM_TEMPERATURE_IDEA_PACKAGING_PARSE",
        0.0,
        cast=float,
        cfg_path=["llm", "temperature", "idea_packaging_parse"],
    )
    LLM_TEMPERATURE_IDEA_PACKAGING_PATTERN_GUIDED = _get(
        "I2P_LLM_TEMPERATURE_IDEA_PACKAGING_PATTERN_GUIDED",
        0.3,
        cast=float,
        cfg_path=["llm", "temperature", "idea_packaging_pattern_guided"],
    )
    LLM_TEMPERATURE_IDEA_PACKAGING_JUDGE = _get(
        "I2P_LLM_TEMPERATURE_IDEA_PACKAGING_JUDGE",
        0.0,
        cast=float,
        cfg_path=["llm", "temperature", "idea_packaging_judge"],
    )
    # Novelty-mode configuration
    NOVELTY_MODE_MAX_PATTERNS = 3  # max number of patterns tried in novelty mode
    NOVELTY_SCORE_THRESHOLD = 6.0  # novelty score threshold
    # Recall audit configuration (persist recall candidates and their scores)
    RECALL_AUDIT_ENABLE = _get(
        "I2P_RECALL_AUDIT_ENABLE",
        True,
        cast=bool,
        cfg_path=["recall", "audit_enable"],
    )
    RECALL_AUDIT_TOPN = _get(
        "I2P_RECALL_AUDIT_TOPN",
        50,
        cast=int,
        cfg_path=["recall", "audit_topn"],
    )
    RECALL_AUDIT_SNIPPET_CHARS = _get(
        "I2P_RECALL_AUDIT_SNIPPET_CHARS",
        240,
        cast=int,
        cfg_path=["recall", "audit_snippet_chars"],
    )
    RECALL_AUDIT_IN_EVENTS = _get(
        "I2P_RECALL_AUDIT_IN_EVENTS",
        True,
        cast=bool,
        cfg_path=["recall", "audit_in_events"],
    )
    # Recall embedding batching / retry knobs.
    RECALL_EMBED_BATCH_SIZE = _get(
        "I2P_RECALL_EMBED_BATCH_SIZE",
        32,
        cast=int,
        cfg_path=["recall", "embed_batch_size"],
    )
    RECALL_EMBED_MAX_RETRIES = _get(
        "I2P_RECALL_EMBED_MAX_RETRIES",
        3,
        cast=int,
        cfg_path=["recall", "embed_max_retries"],
    )
    RECALL_EMBED_SLEEP_SEC = _get(
        "I2P_RECALL_EMBED_SLEEP_SEC",
        0.5,
        cast=float,
        cfg_path=["recall", "embed_sleep_sec"],
    )
    RECALL_USE_OFFLINE_INDEX = _get(
        "I2P_RECALL_USE_OFFLINE_INDEX",
        False,
        cast=bool,
        cfg_path=["recall", "use_offline_index"],
    )
    SUBDOMAIN_TAXONOMY_ENABLE = _get(
        "I2P_SUBDOMAIN_TAXONOMY_ENABLE",
        False,
        cast=bool,
        cfg_path=["recall", "subdomain_taxonomy_enable"],
    )
    SUBDOMAIN_TAXONOMY_PATH = _get(
        "I2P_SUBDOMAIN_TAXONOMY_PATH",
        "",
        cast=str,
        cfg_path=["recall", "subdomain_taxonomy_path"],
    )
    SUBDOMAIN_TAXONOMY_STOPLIST_MODE = _get(
        "I2P_SUBDOMAIN_TAXONOMY_STOPLIST_MODE",
        "drop",
        cast=str,
        cfg_path=["recall", "subdomain_taxonomy_stoplist_mode"],
    )
    RECALL_INDEX_DIR = _get(
        "I2P_RECALL_INDEX_DIR",
        _DEFAULT_RECALL_INDEX_DIR,
        cast=Path,
        cfg_path=["recall", "index_dir"],
    )
    # Index preflight (auto-prepare before run)
    INDEX_AUTO_PREPARE = _get(
        "I2P_INDEX_AUTO_PREPARE",
        True,
        cast=bool,
        cfg_path=["index", "auto_prepare"],
    )
    INDEX_ALLOW_BUILD = _get(
        "I2P_INDEX_ALLOW_BUILD",
        True,
        cast=bool,
        cfg_path=["index", "allow_build"],
    )
    # Phase 4 duplicate-check switch
    VERIFICATION_ENABLE = _get(
        "I2P_VERIFICATION_ENABLE",
        True,
        cast=bool,
        cfg_path=["verification", "enable"],
    )
    # RAG duplicate-check threshold
    COLLISION_THRESHOLD = _get(
        "I2P_COLLISION_THRESHOLD",
        0.75,
        cast=float,
        cfg_path=["verification", "collision_threshold"],
    )  # similarity > threshold counts as a collision
    # Refinement strategy
    TAIL_INJECTION_RANK_RANGE = (4, 9)  # tail injection: rank 5-10
    HEAD_INJECTION_RANK_RANGE = (0, 2)  # head injection: rank 1-3
    HEAD_INJECTION_CLUSTER_THRESHOLD = 15  # head injection: cluster size > 15
    # Anchored Critic configuration
    ANCHOR_QUANTILES = _get(
        "I2P_ANCHOR_QUANTILES",
        [0.05, 0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.85, 0.95],
        cast=_cast_list_float,
        cfg_path=["anchors", "quantiles"],
    )
    ANCHOR_MAX_INITIAL = _get(
        "I2P_ANCHOR_MAX_INITIAL",
        11,
        cast=int,
        cfg_path=["anchors", "max_initial"],
    )
    ANCHOR_MAX_TOTAL = _get(
        "I2P_ANCHOR_MAX_TOTAL",
        13,
        cast=int,
        cfg_path=["anchors", "max_total"],
    )
    ANCHOR_MAX_EXEMPLARS = _get(
        "I2P_ANCHOR_MAX_EXEMPLARS",
        2,
        cast=int,
        cfg_path=["anchors", "max_exemplars"],
    )
    DENSIFY_OFFSETS = _get(
        "I2P_DENSIFY_OFFSETS",
        [-0.6, -0.4, -0.2, 0.2, 0.4, 0.6],
        cast=_cast_list_float,
        cfg_path=["anchors", "densify_offsets"],
    )
    ANCHOR_BUCKET_SIZE = _get(
        "I2P_ANCHOR_BUCKET_SIZE",
        1.0,
        cast=float,
        cfg_path=["anchors", "bucket_size"],
    )
    ANCHOR_BUCKET_COUNT = _get(
        "I2P_ANCHOR_BUCKET_COUNT",
        3,
        cast=int,
        cfg_path=["anchors", "bucket_count"],
    )
    SIGMOID_K = _get(
        "I2P_SIGMOID_K",
        1.2,
        cast=float,
        cfg_path=["anchors", "sigmoid_k"],
    )
    GRID_STEP = _get(
        "I2P_GRID_STEP",
        0.01,
        cast=float,
        cfg_path=["anchors", "grid_step"],
    )
    DENSIFY_LOSS_THRESHOLD = _get(
        "I2P_DENSIFY_LOSS_THRESHOLD",
        0.05,
        cast=float,
        cfg_path=["anchors", "densify_loss_threshold"],
    )
    DENSIFY_MIN_AVG_CONF = _get(
        "I2P_DENSIFY_MIN_AVG_CONF",
        0.35,
        cast=float,
        cfg_path=["anchors", "densify_min_avg_conf"],
    )
    ANCHOR_DENSIFY_ENABLE = _get(
        "I2P_ANCHOR_DENSIFY_ENABLE",
        True,
        cast=bool,
        cfg_path=["anchors", "densify_enable"],
    )
    # Critic JSON reliability (quality-first)
    CRITIC_STRICT_JSON = _get(
        "I2P_CRITIC_STRICT_JSON",
        True,
        cast=bool,
        cfg_path=["critic", "strict_json"],
    )
    CRITIC_JSON_RETRIES = _get(
        "I2P_CRITIC_JSON_RETRIES",
        2,
        cast=int,
        cfg_path=["critic", "json_retries"],
    )
    # Blind Judge tau config
    JUDGE_TAU_PATH = _get(
        "I2P_JUDGE_TAU_PATH",
        str(OUTPUT_DIR / "judge_tau.json"),
        cast=Path,
        cfg_path=["critic", "tau_path"],
    )
    JUDGE_TAU_DEFAULT = _get(
        "I2P_JUDGE_TAU_DEFAULT",
        1.0,
        cast=float,
        cfg_path=["critic", "tau_default"],
    )
    # Per-dimension tau values (fall back to JUDGE_TAU_DEFAULT semantics = 1.0).
    TAU_METHODOLOGY = _get(
        "I2P_TAU_METHODOLOGY",
        1.0,
        cast=float,
        cfg_path=["critic", "tau_methodology"],
    )
    TAU_NOVELTY = _get(
        "I2P_TAU_NOVELTY",
        1.0,
        cast=float,
        cfg_path=["critic", "tau_novelty"],
    )
    TAU_STORYTELLER = _get(
        "I2P_TAU_STORYTELLER",
        1.0,
        cast=float,
        cfg_path=["critic", "tau_storyteller"],
    )
    # Critic "coach" pass settings.
    CRITIC_COACH_ENABLE = _get(
        "I2P_CRITIC_COACH_ENABLE",
        True,
        cast=bool,
        cfg_path=["critic", "coach_enable"],
    )
    CRITIC_COACH_TEMPERATURE = _get(
        "I2P_CRITIC_COACH_TEMPERATURE",
        0.3,
        cast=float,
        cfg_path=["critic", "coach_temperature"],
    )
    CRITIC_COACH_MAX_TOKENS = _get(
        "I2P_CRITIC_COACH_MAX_TOKENS",
        4096,
        cast=int,
        cfg_path=["critic", "coach_max_tokens"],
    )