Spaces:
Sleeping
Sleeping
Upload 206 files
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +5 -0
- INSTRUCCIONES_DESPLIEGUE.md +48 -0
- Paper-KG-Pipeline/data/paper_reviews_dataset_iclr_sample_100.jsonl +3 -0
- Paper-KG-Pipeline/docs/PROJECT_FULL_DOC_EN.md +0 -0
- Paper-KG-Pipeline/docs/PROJECT_FULL_DOC_ZH.md +0 -0
- Paper-KG-Pipeline/docs/archive_merged.md +0 -0
- Paper-KG-Pipeline/output/edges.json +3 -0
- Paper-KG-Pipeline/output/final_story.json +14 -0
- Paper-KG-Pipeline/output/knowledge_graph_stats.json +7 -0
- Paper-KG-Pipeline/output/knowledge_graph_v2.gpickle +3 -0
- Paper-KG-Pipeline/output/log.json +180 -0
- Paper-KG-Pipeline/output/nodes_domain.json +0 -0
- Paper-KG-Pipeline/output/nodes_idea.json +0 -0
- Paper-KG-Pipeline/output/nodes_idea.json.bak +3 -0
- Paper-KG-Pipeline/output/nodes_paper.json +0 -0
- Paper-KG-Pipeline/output/nodes_paper.json.bak +3 -0
- Paper-KG-Pipeline/output/nodes_pattern.json +0 -0
- Paper-KG-Pipeline/output/novelty_index__gemini-embedding-001/index_manifest.json +8 -0
- Paper-KG-Pipeline/output/novelty_index__gemini-embedding-001/paper_emb.npy +3 -0
- Paper-KG-Pipeline/output/novelty_index__gemini-embedding-001/paper_meta.jsonl +100 -0
- Paper-KG-Pipeline/output/paper_to_pattern.json +350 -0
- Paper-KG-Pipeline/output/patterns_guide.txt +0 -0
- Paper-KG-Pipeline/output/patterns_statistics.json +99 -0
- Paper-KG-Pipeline/output/patterns_structured.json +0 -0
- Paper-KG-Pipeline/output/pipeline_result.json +1785 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/idea_emb.npy +3 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/idea_manifest.json +8 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/idea_meta.jsonl +100 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/paper_emb.npy +3 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/paper_manifest.json +8 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/paper_meta.jsonl +68 -0
- Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/subdomain_taxonomy.json +508 -0
- Paper-KG-Pipeline/requirements.txt +17 -0
- Paper-KG-Pipeline/scripts/__pycache__/recall_system.cpython-313.pyc +0 -0
- Paper-KG-Pipeline/scripts/build_edges.py +6 -0
- Paper-KG-Pipeline/scripts/build_entity_v3.py +6 -0
- Paper-KG-Pipeline/scripts/demo_pipeline.py +6 -0
- Paper-KG-Pipeline/scripts/demos/demo_pipeline.py +304 -0
- Paper-KG-Pipeline/scripts/demos/run_pipeline.py +68 -0
- Paper-KG-Pipeline/scripts/demos/simple_recall_demo.py +527 -0
- Paper-KG-Pipeline/scripts/dev/compare_pipeline_result.py +89 -0
- Paper-KG-Pipeline/scripts/dev/verify_recall_equivalence.py +96 -0
- Paper-KG-Pipeline/scripts/extract_paper_review.py +6 -0
- Paper-KG-Pipeline/scripts/extract_patterns_ICLR_en_local.py +326 -0
- Paper-KG-Pipeline/scripts/generate_clusters.py +788 -0
- Paper-KG-Pipeline/scripts/generate_patterns_old.py +6 -0
- Paper-KG-Pipeline/scripts/idea2story_pipeline.py +687 -0
- Paper-KG-Pipeline/scripts/legacy/generate_patterns_old.py +776 -0
- Paper-KG-Pipeline/scripts/pipeline/__init__.py +39 -0
- Paper-KG-Pipeline/scripts/pipeline/__pycache__/__init__.cpython-313.pyc +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Paper-KG-Pipeline/data/paper_reviews_dataset_iclr_sample_100.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
Paper-KG-Pipeline/output/edges.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
Paper-KG-Pipeline/output/knowledge_graph_v2.gpickle filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
Paper-KG-Pipeline/output/nodes_idea.json.bak filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
Paper-KG-Pipeline/output/nodes_paper.json.bak filter=lfs diff=lfs merge=lfs -text
|
INSTRUCCIONES_DESPLIEGUE.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Guía de Despliegue en Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
Esta carpeta contiene todo lo necesario para subir tu herramienta a la nube y mostrarla en tu web.
|
| 4 |
+
|
| 5 |
+
## Paso 1: Crear el Space en Hugging Face
|
| 6 |
+
|
| 7 |
+
1. Ve a [huggingface.co/spaces](https://huggingface.co/spaces) y haz clic en **"Create new Space"**.
|
| 8 |
+
2. **Name**: `idea-to-paper` (o el que gustes).
|
| 9 |
+
3. **License**: `mit`.
|
| 10 |
+
4. **SDK**: Selecciona **Gradio**.
|
| 11 |
+
5. Haz clic en **"Create Space"**.
|
| 12 |
+
|
| 13 |
+
## Paso 2: Subir los Archivos
|
| 14 |
+
|
| 15 |
+
Ahora debes subir los archivos de esta carpeta al Space. Puedes hacerlo vía web o git.
|
| 16 |
+
**Archivos necesarios:**
|
| 17 |
+
|
| 18 |
+
1. `app.py` (Está en esta carpeta `deploy/`).
|
| 19 |
+
2. `requirements.txt` (Está en esta carpeta `deploy/`).
|
| 20 |
+
3. **IMPORTANTE**: Debes subir también la carpeta `Paper-KG-Pipeline` completa (con todo su contenido) al mismo nivel que `app.py`.
|
| 21 |
+
* *Nota*: Si usas la web de Hugging Face, puedes arrastrar la carpeta.
|
| 22 |
+
|
| 23 |
+
## Paso 3: Configurar Secretos (API Key)
|
| 24 |
+
|
| 25 |
+
Para que funcione, el Space necesita tu clave de Google Gemini.
|
| 26 |
+
|
| 27 |
+
1. En tu Space, ve a **Settings**.
|
| 28 |
+
2. Busca la sección **"Variables and secrets"**.
|
| 29 |
+
3. Haz clic en **"New secret"**.
|
| 30 |
+
4. **Name**: `GEMINI_API_KEY`
|
| 31 |
+
5. **Value**: (Pega tu API Key aquí, la misma que está en el archivo `.env`).
|
| 32 |
+
|
| 33 |
+
## Paso 4: Incrustar en aruspice.ar
|
| 34 |
+
|
| 35 |
+
Una vez que el Space esté corriendo (verás "Running" en verde), copia el siguiente código en tu página web `aruspice.ar/ideas`:
|
| 36 |
+
|
| 37 |
+
```html
|
| 38 |
+
<!-- Incrustar Idea2Paper -->
|
| 39 |
+
<iframe
|
| 40 |
+
src="https://TU-USUARIO-idea-to-paper.hf.space"
|
| 41 |
+
frameborder="0"
|
| 42 |
+
width="100%"
|
| 43 |
+
height="1200"
|
| 44 |
+
style="border-radius: 12px; border: 1px solid #eee;"
|
| 45 |
+
></iframe>
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
*Reemplaza `TU-USUARIO-idea-to-paper.hf.space` con la URL real de tu Space (la encuentras en el botón "Embed this space" arriba a la derecha).*
|
Paper-KG-Pipeline/data/paper_reviews_dataset_iclr_sample_100.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1456a485250b76ab19321107d91692ada774b18007a164dc1c71a7266a3366df
|
| 3 |
+
size 14784838
|
Paper-KG-Pipeline/docs/PROJECT_FULL_DOC_EN.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/docs/PROJECT_FULL_DOC_ZH.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/docs/archive_merged.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/edges.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10f19b242732e125b1bc3c1d9a758cc9bab6d39926469fc861cbc2846543359a
|
| 3 |
+
size 92840938
|
Paper-KG-Pipeline/output/final_story.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"title": "Emergent Temporal Topology for Video Generation",
|
| 3 |
+
"abstract": "Improving temporal consistency in video generation demands a fundamental shift from sequential frame synthesis to the generative modeling of latent structure. This paper reframes video generation as the task of inferring and generating the underlying, evolving spatiotemporal graph whose stability dictates visual coherence. We introduce a novel diffusion framework where the primary generative object is a latent dynamic graph topology, discovered through self-representation and governed by probabilistic latent interactions, from which consistent video frames naturally emerge. Validated through experiments, our approach reduces flicker score by 18% and temporal warping error by 22% on UCF-101 compared to state-of-the-art Video Diffusion Models, with ablations confirming the necessity of our graph-first generative principle.",
|
| 4 |
+
"problem_framing": "We reframe the problem of temporal inconsistency from a flaw in sequence modeling to the absence of a generative prior for latent relational topology. Current methods treat video as a sequence of independent frames, attempting to enforce consistency post-hoc via optical flow losses or model it sequentially via Transformers or RNNs. These approaches are fundamentally limited: post-processing introduces artifacts, while sequential models suffer from vanishing gradients and quadratic computational scaling, failing to capture the persistent, community-level structures that govern long-range coherence. The true challenge is not 'making frames match' but probabilistically generating the scaffold—the dynamic graph of semantic entities and their interactions—upon which coherent visual evolution must occur.",
|
| 5 |
+
"gap_pattern": "Existing diffusion-based video generation methods fail because they operate on the wrong generative object: individual pixels or frames. By focusing on sequential denoising, they overlook the latent, evolving community structures that are the true source of temporal stability. This conceptual gap manifests practically as an inability to model long-range dependencies without prohibitive compute, sensitivity to frame-rate variations, and flickering artifacts from unmodeled topological instability. Methods that add graph reasoning as a secondary module (e.g., for regularization) treat the graph as an external constraint, not as the core generative substrate. Consequently, they lack a unified mechanism where content and relational dynamics co-evolve, leaving temporal coherence as an optimization target rather than an inherent property of the generated representation.",
|
| 6 |
+
"solution": "Our solution transforms video generation by making the latent spatiotemporal graph the primary object of the diffusion process, realizing inherent temporal consistency through a co-evolutionary framework. We unify content generation and temporal reasoning by introducing a generative latent graph prior. First, we leverage a self-representation framework, inspired by Krylov subspace optimization (GMRES), to allow the model to discover compact, persistent visual entities as the nodes and communities of the latent graph directly from the data. Second, we model the graph's temporal dynamics through a probabilistic latent interaction model (CLEP) that captures how these communities emerge and interact using community-specific embeddings and a contrastive objective. Third, we correct for irregularities in video data by estimating the underlying temporal sampling density to adjust graph dynamics, ensuring robustness. This integrated approach ensures consistency emerges from the stability of the generated graph topology itself.",
|
| 7 |
+
"method_skeleton": "Step 1: Discover latent graph communities via a self-representation framework using the GMRES method to find least-squares solutions over Krylov subspaces, learning a sparse, self-expressive code for each frame segment that identifies recurring visual entities as graph nodes and their affinities; Step 2: Generate the evolving graph topology with a probabilistic latent interaction model (CLEP), where the diffusion process denoises a latent adjacency tensor using community-specific embeddings to model edge probabilities and a contrastive loss to ensure stable community memberships over time; Step 3: Condition the graph diffusion on estimated temporal density by applying a self-supervised kernel density estimator to frame timestamps and using this density to correct the graph shift operators in the latent interaction model, aligning generated graph dynamics with real-world motion.",
|
| 8 |
+
"innovation_claims": [
|
| 9 |
+
"We transform temporal consistency from an external constraint to an inherent generative property by reframing video generation as the diffusion-based synthesis of a latent dynamic graph topology, where stability emerges from modeling the probability distribution over spatiotemporal community structures.",
|
| 10 |
+
"We reframe temporal dependency modeling by introducing a co-evolutionary mechanism that unifies self-representation for node discovery (via Krylov subspace optimization) and probabilistic latent interaction (via a CLEP framework) into a single generative act, bypassing sequential bottlenecks and enabling scalable long-range coherence.",
|
| 11 |
+
"We transform the handling of irregular video data by developing a self-supervised temporal density estimation method that corrects non-uniform sampling and directly conditions the graph shift operators within the diffusion process, ensuring robust coherence generation across diverse and unpredictable frame rates."
|
| 12 |
+
],
|
| 13 |
+
"experiments_plan": "We validate our framework on standard video generation benchmarks UCF-101 and Kinetics-600, comparing against state-of-the-art baselines including Video Diffusion Models (VDM) and StyleGAN-V using metrics for temporal consistency (flicker score, warping error) and quality (FVD, IS). Ablation studies will quantitatively isolate the contribution of each core component: (1) disabling the self-representation module (GMRES-based optimization), (2) replacing the probabilistic latent interaction model (CLEP) with a standard sequential attention layer, and (3) removing the temporal density correction. Additional analysis will visualize the generated latent graphs to provide qualitative evidence of stable community evolution correlating with improved video coherence."
|
| 14 |
+
}
|
Paper-KG-Pipeline/output/knowledge_graph_stats.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_nodes": 16791,
|
| 3 |
+
"ideas": 8284,
|
| 4 |
+
"patterns": 124,
|
| 5 |
+
"domains": 98,
|
| 6 |
+
"papers": 8285
|
| 7 |
+
}
|
Paper-KG-Pipeline/output/knowledge_graph_v2.gpickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:902d843e146e08e9459674c9c66933d9d272a94061d68055260ca294b8a6c92d
|
| 3 |
+
size 34286005
|
Paper-KG-Pipeline/output/log.json
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
================================================================================
|
| 2 |
+
🎯 三路召回系统 Demo
|
| 3 |
+
================================================================================
|
| 4 |
+
|
| 5 |
+
【用户Idea】
|
| 6 |
+
Research on the Self-Evolution of Intelligent Agents Based on Reflection and Memory
|
| 7 |
+
|
| 8 |
+
📂 加载数据...
|
| 9 |
+
✓ Idea: 8284, Pattern: 124, Domain: 98, Paper: 8285
|
| 10 |
+
✓ 图谱: 16790 节点, 444872 边
|
| 11 |
+
|
| 12 |
+
🔍 [路径1] 相似Idea召回...
|
| 13 |
+
[粗排] 使用Jaccard快速筛选Top-100...
|
| 14 |
+
[精排] 使用Embedding重排Top-10...
|
| 15 |
+
✓ 粗排5736个 → 精排100个 → 最终10个
|
| 16 |
+
- 匹配 Idea [idea_2367]: Introduce a novel intrinsic reward mechanism based on the novelty of surprise, enhancing exploration efficiency in reinforcement learning.... (sim=0.687)
|
| 17 |
+
- 匹配 Idea [idea_2142]: Introduce a benchmark to evaluate the memory capabilities of Deep Reinforcement Learning agents in partially observable environments.... (sim=0.686)
|
| 18 |
+
- 匹配 Idea [idea_5846]: Introduce a novel memory model using the Hadamard product to enhance memory capacity and stability in reinforcement learning agents operating in partially observable environments.... (sim=0.684)
|
| 19 |
+
- 匹配 Idea [idea_5122]: Investigate and enhance the decision-making performance of LLM agents in multi-agent settings using regret as a performance metric.... (sim=0.683)
|
| 20 |
+
- 匹配 Idea [idea_4029]: Introduce a comprehensive benchmark to evaluate the reasoning and decision-making abilities of LLMs as agents across diverse environments.... (sim=0.673)
|
| 21 |
+
- 匹配 Idea [idea_5026]: Enhance LLM-based web agents by aligning their observation and action spaces with the LLM's capabilities, significantly improving performance on web tasks.... (sim=0.671)
|
| 22 |
+
- 匹配 Idea [idea_6512]: Provide a comprehensive toolkit to facilitate the development and evaluation of general virtual agents in dynamic, open-domain environments.... (sim=0.658)
|
| 23 |
+
- 匹配 Idea [idea_8150]: Utilize open-source LLMs to automate the full cycle of research and review, enhancing scientific inquiry through iterative preference training.... (sim=0.649)
|
| 24 |
+
- 匹配 Idea [idea_5400]: Enable transparent and efficient access to the internals of large neural networks through a scalable framework, democratizing research on AI models.... (sim=0.628)
|
| 25 |
+
- 匹配 Idea [idea_6151]: Introduce a benchmark to evaluate the robustness of LLM agents against misuse and jailbreak attacks across diverse malicious tasks.... (sim=0.614)
|
| 26 |
+
✓ 召回 6 个Pattern,保留Top-10
|
| 27 |
+
|
| 28 |
+
🌍 [路径2] 领域相关性召回...
|
| 29 |
+
找到 1 个相关Domain,选择 Top-5
|
| 30 |
+
- domain_2 (名称=Machine Learning, 相关度=1.000, 论文数=5314)
|
| 31 |
+
子领域: $f$-Divergence, 2D Transformations, 3D Conformation Analysis, 3D Data Modeling, 3D Data Processing... (共4432个)
|
| 32 |
+
✓ 召回 120 个Pattern,保留Top-5
|
| 33 |
+
|
| 34 |
+
📄 [路径3] 相似Paper召回...
|
| 35 |
+
[粗排] 使用Jaccard快速筛选Top-100...
|
| 36 |
+
[精排] 使用Embedding重排Top-20...
|
| 37 |
+
✓ 粗排1930个 → 精排100个 → 最终20个
|
| 38 |
+
- lTt4KjHSsyl (相似度=0.716, 质量=0.500 [默认])
|
| 39 |
+
标题: Emergence of Maps in the Memories of Blind Navigation Agents
|
| 40 |
+
- A0HKeKl4Nl (相似度=0.683, 质量=0.500 [默认])
|
| 41 |
+
标题: Mechanistically analyzing the effects of fine-tuning on procedurally defined tasks
|
| 42 |
+
- xkSlKCYyV_ (相似度=0.680, 质量=0.500 [默认])
|
| 43 |
+
标题: Memory-Efficient Reinforcement Learning with Priority based on Surprise and On-policyness
|
| 44 |
+
- xKDZAW0He3 (相似度=0.675, 质量=0.500 [默认])
|
| 45 |
+
标题: SeCom: On Memory Construction and Retrieval for Personalized Conversational Agents
|
| 46 |
+
- hoYFLRNbhc (相似度=0.659, 质量=0.500 [默认])
|
| 47 |
+
标题: DelTA: An Online Document-Level Translation Agent Based on Multi-Level Memory
|
| 48 |
+
- apErWGzCAA (相似度=0.656, 质量=0.500 [默认])
|
| 49 |
+
标题: Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation Models
|
| 50 |
+
- j3mm8mci4u (相似度=0.654, 质量=0.500 [默认])
|
| 51 |
+
标题: On the Fast Convergence of Unstable Reinforcement Learning Problems
|
| 52 |
+
- 3K3s9qxSn7 (相似度=0.649, 质量=0.500 [默认])
|
| 53 |
+
标题: On Representation Complexity of Model-based and Model-free Reinforcement Learning
|
| 54 |
+
- V2cBKtdC3a (相似度=0.648, 质量=0.500 [默认])
|
| 55 |
+
标题: Exploring the Promise and Limits of Real-Time Recurrent Learning
|
| 56 |
+
- 4O0v4s3IzY (相似度=0.640, 质量=0.500 [默认])
|
| 57 |
+
标题: On the self-verification limitations of large language models on reasoning and planning tasks
|
| 58 |
+
- 2V1Z0Jdmss (相似度=0.637, 质量=0.500 [默认])
|
| 59 |
+
标题: On the Over-Memorization During Natural, Robust and Catastrophic Overfitting
|
| 60 |
+
- 6JMXLWX68Kj (相似度=0.633, 质量=0.500 [默认])
|
| 61 |
+
标题: On the Performance of Temporal Difference Learning With Neural Networks
|
| 62 |
+
- jOmk0uS1hl (相似度=0.633, 质量=0.500 [默认])
|
| 63 |
+
标题: Training on the Test Task Confounds Evaluation and Emergence
|
| 64 |
+
- hJqGbUpDGV (相似度=0.629, 质量=0.500 [默认])
|
| 65 |
+
标题: On the Sensitivity of Reward Inference to Misspecified Human Models
|
| 66 |
+
- o9kqa5K3tB (相似度=0.615, 质量=0.500 [默认])
|
| 67 |
+
标题: On the Benefits of Memory for Modeling Time-Dependent PDEs
|
| 68 |
+
- 2G-vUJ7XcSB (相似度=0.612, 质量=0.500 [默认])
|
| 69 |
+
标题: On the Power of Pre-training for Generalization in RL: Provable Benefits and Hardness
|
| 70 |
+
- Fh97BDaR6I (相似度=0.611, 质量=0.500 [默认])
|
| 71 |
+
标题: On The Specialization of Neural Modules
|
| 72 |
+
- UGVYezlLcZ (相似度=0.604, 质量=0.500 [默认])
|
| 73 |
+
标题: On the Optimal Memorization Capacity of Transformers
|
| 74 |
+
- jIu4hk04776 (相似度=0.604, 质量=0.500 [默认])
|
| 75 |
+
标题: On the Geometry of Reinforcement Learning in Continuous State and Action Spaces
|
| 76 |
+
- MeHmwCDifc (相似度=0.603, 质量=0.500 [默认])
|
| 77 |
+
标题: The Trickle-down Impact of Reward Inconsistency on RLHF
|
| 78 |
+
✓ 召回 13 个Pattern,保留Top-10
|
| 79 |
+
|
| 80 |
+
🔗 融合三路召回结果...
|
| 81 |
+
|
| 82 |
+
================================================================================
|
| 83 |
+
📊 召回结果 Top-10
|
| 84 |
+
================================================================================
|
| 85 |
+
|
| 86 |
+
【Rank 1】 pattern_118
|
| 87 |
+
名称: Reframing Reinforcement Learning Challenges
|
| 88 |
+
最终得分: 0.6733
|
| 89 |
+
- 路径1 (相似Idea): 0.5480 (占比 81.4%)
|
| 90 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 91 |
+
- 路径3 (相似Paper): 0.1253 (占比 18.6%)
|
| 92 |
+
聚类大小: 47 篇论文
|
| 93 |
+
归纳总结: Papers in this cluster introduce innovative methods to enhance reinforcement learning, including leveraging state-space ...
|
| 94 |
+
|
| 95 |
+
【Rank 2】 pattern_65
|
| 96 |
+
名称: Reframing Agent Design for Adaptability
|
| 97 |
+
最终得分: 0.5315
|
| 98 |
+
- 路径1 (相似Idea): 0.5315 (占比 100.0%)
|
| 99 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 100 |
+
- 路径3 (相似Paper): 0.0000 (占比 0.0%)
|
| 101 |
+
聚类大小: 21 篇论文
|
| 102 |
+
归纳总结: This cluster of papers introduces innovative frameworks and methodologies for enhancing adaptability and performance in ...
|
| 103 |
+
|
| 104 |
+
【Rank 3】 pattern_122
|
| 105 |
+
名称: Reframing Exploration as Structured Discovery
|
| 106 |
+
最终得分: 0.3404
|
| 107 |
+
- 路径1 (相似Idea): 0.2749 (占比 80.7%)
|
| 108 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 109 |
+
- 路径3 (相似Paper): 0.0656 (占比 19.3%)
|
| 110 |
+
聚类大小: 19 篇论文
|
| 111 |
+
归纳总结: This cluster introduces innovative exploration strategies in reinforcement learning by reframing exploration as a struct...
|
| 112 |
+
|
| 113 |
+
【Rank 4】 pattern_51
|
| 114 |
+
名称: Adversarial Vulnerabilities and Robustness in Large Language Models
|
| 115 |
+
最终得分: 0.3139
|
| 116 |
+
- 路径1 (相似Idea): 0.2456 (占比 78.2%)
|
| 117 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 118 |
+
- 路径3 (相似Paper): 0.0683 (占比 21.8%)
|
| 119 |
+
聚类大小: 92 篇论文
|
| 120 |
+
归纳总结: This cluster explores novel methods to identify and mitigate adversarial vulnerabilities in large language models, inclu...
|
| 121 |
+
|
| 122 |
+
【Rank 5】 pattern_69
|
| 123 |
+
名称: Language Model Agent Self Improvement
|
| 124 |
+
最终得分: 0.2731
|
| 125 |
+
- 路径1 (相似Idea): 0.2731 (占比 100.0%)
|
| 126 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 127 |
+
- 路径3 (相似Paper): 0.0000 (占比 0.0%)
|
| 128 |
+
聚类大小: 22 篇论文
|
| 129 |
+
归纳总结: This cluster explores the use of large language models to enhance reinforcement learning agents by simplifying reward de...
|
| 130 |
+
|
| 131 |
+
【Rank 6】 pattern_78
|
| 132 |
+
名称: Dynamic Data Driven Stride Adaptation
|
| 133 |
+
最终得分: 0.2512
|
| 134 |
+
- 路径1 (相似Idea): 0.2512 (占比 100.0%)
|
| 135 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 136 |
+
- 路径3 (相似Paper): 0.0000 (占比 0.0%)
|
| 137 |
+
聚类大小: 18 篇论文
|
| 138 |
+
归纳总结: This cluster introduces innovative methods to model, optimize, and adapt neural network scaling behaviors, hyperparamete...
|
| 139 |
+
|
| 140 |
+
【Rank 7】 pattern_74
|
| 141 |
+
名称: Democratizing Large Language Model Accessibility
|
| 142 |
+
最终得分: 0.0859
|
| 143 |
+
- 路径1 (相似Idea): 0.0000 (占比 0.0%)
|
| 144 |
+
- 路径2 (领域相关): 0.0200 (占比 23.3%)
|
| 145 |
+
- 路径3 (相似Paper): 0.0659 (占比 76.7%)
|
| 146 |
+
聚类大小: 156 篇论文
|
| 147 |
+
归纳总结: Papers in this cluster explore innovative methods to train, fine-tune, and deploy large language models more efficiently...
|
| 148 |
+
|
| 149 |
+
【Rank 8】 pattern_85
|
| 150 |
+
名称: Reframing Embodied Intelligence Through Structured Abstraction
|
| 151 |
+
最终得分: 0.0716
|
| 152 |
+
- 路径1 (相似Idea): 0.0000 (占比 0.0%)
|
| 153 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 154 |
+
- 路径3 (相似Paper): 0.0716 (占比 100.0%)
|
| 155 |
+
聚类大小: 18 篇论文
|
| 156 |
+
归纳总结: This cluster of papers introduces innovative hierarchical abstraction, structured search, memory-inspired models, unifie...
|
| 157 |
+
|
| 158 |
+
【Rank 9】 pattern_58
|
| 159 |
+
名称: Reframing Dialogue System Challenges
|
| 160 |
+
最终得分: 0.0675
|
| 161 |
+
- 路径1 (相似Idea): 0.0000 (占比 0.0%)
|
| 162 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 163 |
+
- 路径3 (相似Paper): 0.0675 (占比 100.0%)
|
| 164 |
+
聚类大小: 17 篇论文
|
| 165 |
+
归纳总结: This cluster of papers introduces innovative methods to enhance the adaptability, diversity, and efficiency of task-orie...
|
| 166 |
+
|
| 167 |
+
【Rank 10】 pattern_120
|
| 168 |
+
名称: Proactive Safety Assurance in Reinforcement Learning
|
| 169 |
+
最终得分: 0.0654
|
| 170 |
+
- 路径1 (相似Idea): 0.0000 (占比 0.0%)
|
| 171 |
+
- 路径2 (领域相关): 0.0000 (占比 0.0%)
|
| 172 |
+
- 路径3 (相似Paper): 0.0654 (占比 100.0%)
|
| 173 |
+
聚类大小: 44 篇论文
|
| 174 |
+
归纳总结: This cluster of papers introduces a variety of innovative risk prediction, symbolic reasoning, barrier function, counter...
|
| 175 |
+
|
| 176 |
+
================================================================================
|
| 177 |
+
✅ 召回完成!
|
| 178 |
+
================================================================================
|
| 179 |
+
|
| 180 |
+
Process finished with exit code 0
|
Paper-KG-Pipeline/output/nodes_domain.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/nodes_idea.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/nodes_idea.json.bak
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc8ee78e97e24421d63c885bd2c46749f7b32386affa666a7083b3d6b0ea6095
|
| 3 |
+
size 10692216
|
Paper-KG-Pipeline/output/nodes_paper.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/nodes_paper.json.bak
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2c7083e5ef3835039f8e570d5827d2b87bfcac8b29bcce93aa170a3a6581467
|
| 3 |
+
size 14657499
|
Paper-KG-Pipeline/output/nodes_pattern.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/novelty_index__gemini-embedding-001/index_manifest.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_at": "2026-02-12T12:06:26.167897+00:00",
|
| 3 |
+
"embedding_model": "gemini-embedding-001",
|
| 4 |
+
"paper_count": 100,
|
| 5 |
+
"index_count": 100,
|
| 6 |
+
"skipped": 0,
|
| 7 |
+
"nodes_paper_hash": "ed105040e3761981361be4f754edb63d3119216a2fcfd28974cc281503e9e743"
|
| 8 |
+
}
|
Paper-KG-Pipeline/output/novelty_index__gemini-embedding-001/paper_emb.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8684b19a71f336924fa0cd93f9002b176f0d1623e244ace02a3994896bec0838
|
| 3 |
+
size 1228928
|
Paper-KG-Pipeline/output/novelty_index__gemini-embedding-001/paper_meta.jsonl
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"paper_id": "RUzSobdYy0V", "title": "Quantifying and Mitigating the Impact of Label Errors on Model Disparity Metrics", "pattern_id": "pattern_9", "domain": "Fairness & Accountability", "text_hash": "9509dae69cd2a224e81ca368d3d141ae33dbeec88fa3efd0c390e2f1374354a9"}
|
| 2 |
+
{"paper_id": "N3kGYG3ZcTi", "title": "Suppression helps: Lateral Inhibition-inspired Convolutional Neural Network for Image Classification", "pattern_id": "pattern_22", "domain": "Computer Vision", "text_hash": "0a0a7eb1365d6a8eb36d501748dcc7003cb5909bf52fbcd5417202315bc24802"}
|
| 3 |
+
{"paper_id": "tmIiMPl4IPa", "title": "Factorized Fourier Neural Operators", "pattern_id": "pattern_42", "domain": "Machine Learning", "text_hash": "46183525e5afdde806285ad68f183021a7771b54cd7852af24a6c8471fa0e4d2"}
|
| 4 |
+
{"paper_id": "mhnHqRqcjYU", "title": "DFPC: Data flow driven pruning of coupled channels without data.", "pattern_id": "pattern_77", "domain": "Machine Learning", "text_hash": "d99b35d588845b11f90cd7de1bd0fa7ac5a3caf9663d518bd585ac3d892653d5"}
|
| 5 |
+
{"paper_id": "sZI1Oj9KBKy", "title": "TVSPrune - Pruning Non-discriminative filters via Total Variation separability of intermediate representations without fine tuning", "pattern_id": "pattern_77", "domain": "Machine Learning", "text_hash": "0b2912c9b69592acb45361e27dffd5cc4e807752b920767dd6027d433dbd8169"}
|
| 6 |
+
{"paper_id": "I3HCE7Ro78H", "title": "Finding Actual Descent Directions for Adversarial Training", "pattern_id": "pattern_67", "domain": "Machine Learning", "text_hash": "776148b8a7ae7b71b106a78df00d06a8d11f02a91943d87f7166b6b6551f3982"}
|
| 7 |
+
{"paper_id": "9Zx6tTcX0SE", "title": "A Study of Biologically Plausible Neural Network: the Role and Interactions of Brain-Inspired Mechanisms in Continual Learning", "pattern_id": "pattern_0", "domain": "Machine Learning", "text_hash": "22521479fc6d251ccf5001cf3f09a4f5a176cd33a8fbfb27cbebb135bb04793f"}
|
| 8 |
+
{"paper_id": "Vx6G9W5M4sQ", "title": "pFedKT: Personalized Federated Learning via Knowledge Transfer", "pattern_id": "pattern_10", "domain": "Machine Learning", "text_hash": "f70a9ccf08f30c297e25b6eedafc6d2f67be2ae44df5580e7de755e0d223a74b"}
|
| 9 |
+
{"paper_id": "vzdrgR2nomD", "title": "FARE: Provably Fair Representation Learning", "pattern_id": "pattern_9", "domain": "Fairness & Accountability", "text_hash": "122ce85d0cf8fad708f467582bd802b5b251055ede2a62a38dd992cc2a013a31"}
|
| 10 |
+
{"paper_id": "NOKUQ9JMohJ", "title": "ONLINE RESTLESS BANDITS WITH UNOBSERVED STATES", "pattern_id": "pattern_63", "domain": "Machine Learning", "text_hash": "ed3c1eb4e7ae55a340951c14189514b0c52d2f36e1cbf380b196ea84eb68b1b5"}
|
| 11 |
+
{"paper_id": "IQM-3_Tzldw", "title": "Learning to aggregate: A parameterized aggregator to debias aggregation for cross-device federated learning", "pattern_id": "pattern_10", "domain": "Machine Learning", "text_hash": "66112df105b5c2b227039ac3bf0d6da5dc5379275d6be21d45e39511ba1b6e10"}
|
| 12 |
+
{"paper_id": "3uDXZZLBAwd", "title": "Deep Reinforcement Learning based Insight Selection Policy", "pattern_id": "", "domain": "Machine Learning", "text_hash": "4b3ebd3026a8b1168ba1d6b7ccf00fb932c3253cd443125e45878584571bad98"}
|
| 13 |
+
{"paper_id": "TZG_XsO4x6y", "title": "Long-horizon video prediction using a dynamic latent hierarchy", "pattern_id": "", "domain": "Machine Learning", "text_hash": "0be79fd0dc53e17a06d253b3cfb142335e44629014dea23b04786b32a969c7d9"}
|
| 14 |
+
{"paper_id": "yqe0BZeN_xH", "title": "SwinZS3: Zero-Shot Semantic Segmentation with a Swin Transformer", "pattern_id": "pattern_103", "domain": "Computer Vision", "text_hash": "87592086a3ad849b36ea23e5c9c61603dddec85ee58faa66c7f136f55c9a1b26"}
|
| 15 |
+
{"paper_id": "7YfHla7IxBJ", "title": "Encoding Recurrence into Transformers", "pattern_id": "pattern_23", "domain": "Machine Learning", "text_hash": "fc341ebd5e03d3fd3b157202d18290606266282533e6b56d960a9f8ae7b02751"}
|
| 16 |
+
{"paper_id": "N_g8TT9Cy7f", "title": "Human-Guided Fair Classification for Natural Language Processing", "pattern_id": "pattern_87", "domain": "Natural Language Processing", "text_hash": "45bb8a66181659aecb1a685c8c652c144b2abe9a5454a2a43c14ad19a41636d6"}
|
| 17 |
+
{"paper_id": "Xj9V-stmIcO", "title": "Proper Scoring Rules for Survival Analysis", "pattern_id": "pattern_56", "domain": "Machine Learning", "text_hash": "ff7e5e4349c04ac6e55b4d56da40fc2e2e83de779dc51a9de158375e7b0f40c4"}
|
| 18 |
+
{"paper_id": "BO5_Lm7iD_", "title": "Social Network Structure Shapes Innovation: Experience-sharing in RL with SAPIENS", "pattern_id": "pattern_97", "domain": "Machine Learning", "text_hash": "d1f55230609786a8a8ed75346e25f66b86cefb30738922b77c6cbac10426f0a2"}
|
| 19 |
+
{"paper_id": "jREF4bkfi_S", "title": "Mini-batch $k$-means terminates within $O(d/\\epsilon)$ iterations", "pattern_id": "pattern_20", "domain": "Machine Learning", "text_hash": "7ac5476defbea41d4b60904cac2dfbf56ab5fdb39f327500daa1efa9c3286740"}
|
| 20 |
+
{"paper_id": "Jdj0fZhswJC", "title": "Convergence is Not Enough: Average-Case Performance of No-Regret Learning Dynamics", "pattern_id": "pattern_93", "domain": "Machine Learning", "text_hash": "5e1384586d80cd15b8e7cfb4e5c33e48aa36b1c333111f99517ec0f913ef984f"}
|
| 21 |
+
{"paper_id": "Rn50hCOX9XX", "title": "Gene finding revisited: improved robustness through structured decoding from learning embeddings", "pattern_id": "pattern_6", "domain": "Bioinformatics", "text_hash": "cb6cd1ed0d84ca4407dd2c619879c7168cfa62102644b07beaae13ae9db44404"}
|
| 22 |
+
{"paper_id": "pWVASryOyFw", "title": "Learning Uncertainty for Unknown Domains with Zero-Target-Assumption", "pattern_id": "", "domain": "Natural Language Processing", "text_hash": "0b04436fe3b2d8d1b426d97b91c21525581988c8f3f6374ab956cc71b157ab6d"}
|
| 23 |
+
{"paper_id": "0OlEBibFa_g", "title": "Detecting Out-of-Distribution Data with Semi-supervised Graph “Feature\" Networks", "pattern_id": "pattern_4", "domain": "Machine Learning", "text_hash": "53263ec583a3094b77c6f4465ef6a3ccb6c9a64376e7ed9045654d05508166d7"}
|
| 24 |
+
{"paper_id": "1Wo0vqaZ8WJ", "title": "Let Offline RL Flow: Training Conservative Agents in the Latent Space of Normalizing Flow", "pattern_id": "pattern_104", "domain": "Machine Learning", "text_hash": "1e7c0196ae482e2ba715ea2d791b11a5d7885bd2f33fb40d9528988632b6b692"}
|
| 25 |
+
{"paper_id": "UPQualDj1oo", "title": "Machine Learning from Explanations", "pattern_id": "pattern_16", "domain": "Machine Learning", "text_hash": "02422e9c3b212d9957db25913ec5c3e524791ed649de7d40fc7355efa091b6e2"}
|
| 26 |
+
{"paper_id": "9D5FH6LFbRu", "title": "Functional Risk Minimization", "pattern_id": "", "domain": "Machine Learning", "text_hash": "bfc36f1543e6fcbcfaabb3dd18744833389fc724220a0c9d6249d24a5032f1fc"}
|
| 27 |
+
{"paper_id": "a-bD9-0ycs0", "title": "Latent Linear ODEs with Neural Kalman Filtering for Irregular Time Series Forecasting", "pattern_id": "pattern_35", "domain": "Machine Learning", "text_hash": "eba5b520c28447a292479e7a1ed048984011d025c60575084d6cf0a44597dbc0"}
|
| 28 |
+
{"paper_id": "ULzyv9M1j5", "title": "Transformer-based model for symbolic regression via joint supervised learning", "pattern_id": "pattern_25", "domain": "Machine Learning", "text_hash": "b87dc6e4ca2961574c080a9c3aa1cbecda5b2fa152aa948971f9040b0a822eed"}
|
| 29 |
+
{"paper_id": "hChYEyebNm1", "title": "Gradient-Based Transfer Learning", "pattern_id": "", "domain": "Machine Learning", "text_hash": "b8be678893d797863a395f46b6a7067a5078b783f61363ce0f9e2d346699e963"}
|
| 30 |
+
{"paper_id": "pgJp7rDc_hk", "title": "Coreset for Rational Functions", "pattern_id": "pattern_28", "domain": "Machine Learning", "text_hash": "73f93e55602ffb8de443bc459839169952869d54987a4b350e145644d92cce0d"}
|
| 31 |
+
{"paper_id": "0z_cXcu1N6o", "title": "Transformer needs NMDA receptor nonlinearity for long-term memory", "pattern_id": "pattern_32", "domain": "Machine Learning", "text_hash": "d57e99151ccfc6c2f3698ccd1a32c01004756af11e4aa126e8a7a7019c90deae"}
|
| 32 |
+
{"paper_id": "cZM4iZmxzR7", "title": "Simple Spectral Graph Convolution from an Optimization Perspective", "pattern_id": "pattern_24", "domain": "Machine Learning", "text_hash": "caba9f6b06247a005f74803513c07c386a08b6686581e09bf77a5bebfe3142a7"}
|
| 33 |
+
{"paper_id": "gNI4_85Cyve", "title": "QAID: Question Answering Inspired Few-shot Intent Detection", "pattern_id": "pattern_73", "domain": "Natural Language Processing", "text_hash": "eff3e49e6e728e03e5b4642dee53e8509722b02f63bbfbd8bb8ffa4177b57218"}
|
| 34 |
+
{"paper_id": "1FsdIfRngtw", "title": "Rethinking the Value of Prompt Learning for Vision-Language Models", "pattern_id": "pattern_109", "domain": "Machine Learning", "text_hash": "1e7cfc01c8ffb857ea4760cd4a8282eaf0a896ed97f11cd6b4632e17be906433"}
|
| 35 |
+
{"paper_id": "pW_jGk1D_Ww", "title": "Disentangled Feature Swapping Augmentation for Weakly Supervised Semantic Segmentation", "pattern_id": "pattern_103", "domain": "Computer Vision", "text_hash": "c124af23c7bf8272a749e9097f544a40aceb0471c367998237af4a18b366db39"}
|
| 36 |
+
{"paper_id": "tORS9qGBNpT", "title": "Distributed Least Square Ranking with Random Features", "pattern_id": "pattern_10", "domain": "Machine Learning", "text_hash": "ab33708adb1e497e150bbdee5dfca38e1aa7498b0a0a05ae59dfd95671db82f0"}
|
| 37 |
+
{"paper_id": "17RDXeF-skZ", "title": "Doing Fast Adaptation Fast: Conditionally Independent Deep Ensembles for Distribution Shifts", "pattern_id": "pattern_47", "domain": "Machine Learning", "text_hash": "72dd1a8e79f93aeb90719dd262f3ff65cdf7472e6de6754f1cb886e1279c1952"}
|
| 38 |
+
{"paper_id": "ejR4E1jaH9k", "title": "Solving stochastic weak Minty variational inequalities without increasing batch size", "pattern_id": "pattern_83", "domain": "Optimization", "text_hash": "2cb3d7a18c2d8ab8b03bc44dc400c57c662836a5ef7834e650c4bdfe63522d12"}
|
| 39 |
+
{"paper_id": "8Ygoj2IeXfW", "title": "Diversity Boosted Learning for Domain Generalization with a Large Number of Domains", "pattern_id": "pattern_50", "domain": "Machine Learning", "text_hash": "22c41b624d66a1efc31aa66aa96e620418ef59e424d44c784e872c35197ed003"}
|
| 40 |
+
{"paper_id": "8XfHh4XSQ0Q", "title": "Adaptive Block-wise Learning for Knowledge Distillation", "pattern_id": "pattern_19", "domain": "Machine Learning", "text_hash": "99d3c6ade9d0b1aa452fa21d1470b8814a56015638bfbe7efd11235bed3c3973"}
|
| 41 |
+
{"paper_id": "r9fX833CsuN", "title": "Curriculum-based Co-design of Morphology and Control of Voxel-based Soft Robots", "pattern_id": "pattern_86", "domain": "Robotics", "text_hash": "0b3dd51c4adbc3aa461f5fd1947f7f2b50abeb0a05767e84da891da0f83e5d8c"}
|
| 42 |
+
{"paper_id": "AqX3oSbzyQ1", "title": "Object-Centric Learning with Slot Mixture Models", "pattern_id": "", "domain": "Machine Learning", "text_hash": "b1c7f8beb3b3defa8e9feeb21e426a7dffbaea9f2e1794e4bdb1e076bda3493f"}
|
| 43 |
+
{"paper_id": "tPKKXeW33YU", "title": "WiNeRT: Towards Neural Ray Tracing for Wireless Channel Modelling and Differentiable Simulations", "pattern_id": "", "domain": "Wireless Communications", "text_hash": "f6723c7c6bdcb012c646e5b15138a2c7cf019a233fde800e331f008919a1d4b8"}
|
| 44 |
+
{"paper_id": "HGsoe1wmRW5", "title": "Pocket-specific 3D Molecule Generation by Fragment-based Autoregressive Diffusion Models", "pattern_id": "pattern_6", "domain": "Machine Learning", "text_hash": "5783ed29b1023a229f86e11318cdc8e83cf332cce503f7ca08044f8807e13b2c"}
|
| 45 |
+
{"paper_id": "3WYtm7UzsR", "title": "Towards scalable and non-IID robust Hierarchical Federated Learning via Label-driven Knowledge Aggregator", "pattern_id": "pattern_10", "domain": "Machine Learning", "text_hash": "d99c393c9da7385e65f7d2134a436399e784276a84253f88f856f967fb8bfa5d"}
|
| 46 |
+
{"paper_id": "X5ZMzRYqUjB", "title": "Humanly Certifying Superhuman Classifiers", "pattern_id": "pattern_87", "domain": "Machine Learning", "text_hash": "f02e6a46edd32bfe4fe297968c5fe5be84297d78a063cbbcdfbae9fcaafa0fb7"}
|
| 47 |
+
{"paper_id": "oJpVVGXu9i", "title": "Share Your Representation Only: Guaranteed Improvement of the Privacy-Utility Tradeoff in Federated Learning", "pattern_id": "pattern_11", "domain": "Machine Learning", "text_hash": "54e67789e16e6f329fd9abd7418bd2d5c5007813ed82c5d0728c6e73163de112"}
|
| 48 |
+
{"paper_id": "JIptuwnqwn", "title": "Quantized Disentangled Representations for Object-Centric Visual Tasks", "pattern_id": "", "domain": "Computer Vision", "text_hash": "4d0e21ceae371e004ce95a51c0f9bed2e737e1bcb9b3bd6024d8d86c72b8ec2a"}
|
| 49 |
+
{"paper_id": "BDjGGZk9yz", "title": "Supervised Random Feature Regression via Projection Pursuit", "pattern_id": "pattern_83", "domain": "Machine Learning", "text_hash": "951035ba207039f8226558e854d3f13effbf3efef76cb136a4bcee4580c53d94"}
|
| 50 |
+
{"paper_id": "loc3CUXeuzH", "title": "Graph Spline Networks for Efficient Continuous Simulation of Dynamical Systems", "pattern_id": "", "domain": "Machine Learning", "text_hash": "804f2e50382c6e0af3905321825b8a9bc45de280b88c3f48ff87b2b1b8ba1f74"}
|
| 51 |
+
{"paper_id": "kL67fyKb6A", "title": "Online black-box adaptation to label-shift in the presence of conditional-shift", "pattern_id": "", "domain": "Machine Learning", "text_hash": "6a68fc471c08af3c60000ac56c1160ac15f77ef43939097527f7f110a22ff479"}
|
| 52 |
+
{"paper_id": "WVZQa2QYJN", "title": "RuDar: Weather Radar Dataset for Precipitation Nowcasting with Geographical and Seasonal Variability", "pattern_id": "", "domain": "Machine Learning", "text_hash": "4c6d1e79e5f19bcd778d3abc61cf13e274b36c31aa622873482303c65f5df154"}
|
| 53 |
+
{"paper_id": "jkMT2AtccX", "title": "Learning Representations for Reinforcement Learning with Hierarchical Forward Models", "pattern_id": "", "domain": "Machine Learning", "text_hash": "5559acbedec6bc9627e79dd534d504dbd3ec359830320bac7d0fa72a4182389a"}
|
| 54 |
+
{"paper_id": "F5Cj26wfiu", "title": "xTrimoABFold: Improving Antibody Structure Prediction without Multiple Sequence Alignments", "pattern_id": "pattern_6", "domain": "Biotechnology", "text_hash": "d557067fc491cc471ea060da2232c32d2ff01c3514398f9070c92199afd48586"}
|
| 55 |
+
{"paper_id": "mmFtinp4wQ_", "title": "Thresholded Lexicographic Ordered Multi-Objective Reinforcement Learning", "pattern_id": "pattern_15", "domain": "Reinforcement Learning", "text_hash": "79d95dcf2e01a3d30f3d4d261ed0797c69807848721c2e901d3c3e2c70356808"}
|
| 56 |
+
{"paper_id": "5WOIluv9Xop", "title": "HOW SAMPLING AFFECTS TRAINING: AN EFFECTIVE SAMPLING THEORY STUDY FOR LONG-TAILED IMAGE CLASSIFICATION", "pattern_id": "pattern_40", "domain": "Computer Vision", "text_hash": "353ad96872d6f2e7cbd989b5f6a25d1460c8ac52bad437068241fbab525f11cd"}
|
| 57 |
+
{"paper_id": "eDLwjKmtYFt", "title": "EquiMod: An Equivariance Module to Improve Visual Instance Discrimination", "pattern_id": "pattern_39", "domain": "Computer Vision", "text_hash": "4e01cff72724d858cf38896bf7aa6e3f4c13b5fd6d9dbcd72180d1b9dfc3825e"}
|
| 58 |
+
{"paper_id": "cUX2psP06OL", "title": "Manipulating Multi-agent Navigation Task via Emergent Communications", "pattern_id": "pattern_97", "domain": "Artificial Intelligence", "text_hash": "1f0fd5fe18711984072eb5c8660301e906079e8943c72587ab2fe0aee61340d6"}
|
| 59 |
+
{"paper_id": "-M0TNnyWFT5", "title": "Task-Aware Information Routing from Common Representation Space in Lifelong Learning", "pattern_id": "pattern_0", "domain": "Machine Learning", "text_hash": "afebde3115fde4dba444ffc32ed924164ec2ad1fd973c9cd274225a9ee64a1b9"}
|
| 60 |
+
{"paper_id": "me09xlTmm8", "title": "Transport with Support: Data-Conditional Diffusion Bridges", "pattern_id": "", "domain": "Machine Learning", "text_hash": "fe38365952f328d2f3ec6870f070ab4001ca751c2ad90d4d877cb1cd09144879"}
|
| 61 |
+
{"paper_id": "8foynpwwRb", "title": "Randomized Sharpness-Aware Training for Boosting Computational Efficiency in Deep Learning", "pattern_id": "pattern_83", "domain": "Machine Learning", "text_hash": "754943d84ee2f297c2a6a90007af584fdedc0a95c1f0c7c6e3778c811f11c5c3"}
|
| 62 |
+
{"paper_id": "GX0uI5T8kd", "title": "Self-Supervised Off-Policy Ranking via Crowd Layer", "pattern_id": "", "domain": "Machine Learning", "text_hash": "be7e1f70b248b46e403ef34c8a1b2e6de3671c559128d37ce0cdb9dfbd24292d"}
|
| 63 |
+
{"paper_id": "4F1gvduDeL", "title": "Few-Shot Domain Adaptation For End-to-End Communication", "pattern_id": "", "domain": "Machine Learning", "text_hash": "ed909c5c519c6a4c112826307cb1bbeb9922f095a23266f08939f3fc8a269ef4"}
|
| 64 |
+
{"paper_id": "fyD8adDrXo", "title": "HyPHEN: A Hybrid Packing Method and Optimizations for Homomorphic Encryption-Based Neural Network", "pattern_id": "pattern_45", "domain": "Security & Privacy", "text_hash": "f337c56f900344f02e2d08ecdeeead10d2e1fc38ffb4311fc65b20b63729528c"}
|
| 65 |
+
{"paper_id": "Y1J29OryQg", "title": "Causal Inference for Knowledge Graph Completion", "pattern_id": "pattern_17", "domain": "Machine Learning", "text_hash": "b747ba91961a6601ce87957b755eb9ae86393c80ee71aa423fa75fd3656697b2"}
|
| 66 |
+
{"paper_id": "ywAjQw-spmY", "title": "Formal Specifications from Natural Language", "pattern_id": "", "domain": "Natural Language Processing", "text_hash": "f50089cfcdd486fa401b4284536ecdd3c164eca235413a6c42d548bc527a9533"}
|
| 67 |
+
{"paper_id": "CcXTudu9bvu", "title": "DELTA: Diverse Client Sampling for Fasting Federated Learning", "pattern_id": "pattern_10", "domain": "Machine Learning", "text_hash": "2006625f5144fad3c15bb3413b34cb7801d482baffff900d29a6e31106f1e787"}
|
| 68 |
+
{"paper_id": "rwetAifrs16", "title": "Incremental Predictive Coding: A Parallel and Fully Automatic Learning Algorithm", "pattern_id": "", "domain": "Machine Learning", "text_hash": "f7b1013ff3b1bddaf1569f602d7a1b61dee9d0f7865ae9dbd75748571291f7bd"}
|
| 69 |
+
{"paper_id": "HqVp0rNC8jn", "title": "Learning Geometric Representations of Interactive Objects", "pattern_id": "", "domain": "Machine Learning", "text_hash": "a21d4b7038117f91a63c77410d921538d01647794aa39f6fd37a2c35e1e29a8a"}
|
| 70 |
+
{"paper_id": "m3DmIL7wHDW", "title": "The guide and the explorer: smart agents for resource-limited iterated batch reinforcement learning", "pattern_id": "pattern_119", "domain": "Machine Learning", "text_hash": "3296a7783672be3c06f2545034a7e30e91734eb7c194ce7545fd7ae0abdda2e8"}
|
| 71 |
+
{"paper_id": "x-mXzBgCX3a", "title": "FairGBM: Gradient Boosting with Fairness Constraints", "pattern_id": "pattern_9", "domain": "Fairness & Accountability", "text_hash": "60c89d6246c4e22eee4e517f5bb29ea4f33a57906edceffe5bfb7017b2e4db79"}
|
| 72 |
+
{"paper_id": "-UsbRlXzMG", "title": "How (Un)Fair is Text Summarization?", "pattern_id": "", "domain": "Natural Language Processing", "text_hash": "b30d9988ff8043cf434eef133b452dafc73babfae71bbcd88859f5f55a1d442e"}
|
| 73 |
+
{"paper_id": "Wac06sAkHk", "title": "Simulating Task-Free Continual Learning Streams From Existing Datasets", "pattern_id": "pattern_0", "domain": "Machine Learning", "text_hash": "dde7ce7981322450767f71b7e302f9735215bd1c58a0e38df8d1f54e3f769000"}
|
| 74 |
+
{"paper_id": "18XzeuYZh_", "title": "Online Bias Correction for Task-Free Continual Learning", "pattern_id": "pattern_0", "domain": "Machine Learning", "text_hash": "32b0bab3a6d6af72688db6791869dc9e7c65d6caaaacdd6d9db16b0919ad2001"}
|
| 75 |
+
{"paper_id": "j8s-BRxXST", "title": "A Simple Contrastive Learning Objective for Alleviating Neural Text Degeneration", "pattern_id": "pattern_96", "domain": "Natural Language Processing", "text_hash": "e7af0d99f2f42cf44b1b73113d9842f778eb77c6707497f417c042ee62379af7"}
|
| 76 |
+
{"paper_id": "L6CKiPH3hI", "title": "Enriching Online Knowledge Distillation with Specialist Ensemble", "pattern_id": "pattern_19", "domain": "Machine Learning", "text_hash": "5aa6905d4feb0856c7f5c5469d3d7a85543399d8fb36045d5d4be0cbbec01c32"}
|
| 77 |
+
{"paper_id": "lKXcMB9tOFD", "title": "Improved Gradient Descent Optimization Algorithm based on Inverse Model-Parameter Difference", "pattern_id": "pattern_83", "domain": "Machine Learning", "text_hash": "a8288f37ac2357abbb07021aa677d4a73c008517107ce862b09e2be289535744"}
|
| 78 |
+
{"paper_id": "47DzlkyH3dM", "title": "Variational Learning ISTA", "pattern_id": "", "domain": "Machine Learning", "text_hash": "bc42fc03a1ba61d65e547e413f7487367437e2120d000e229b437cc23714736e"}
|
| 79 |
+
{"paper_id": "mN43JdXmYMs", "title": "Moment Distributionally Robust Probabilistic Supervised Learning", "pattern_id": "pattern_40", "domain": "Machine Learning", "text_hash": "da8b0433c0051766b0935565ae23d0a41cea0afe77954da02eb195f2598afd5f"}
|
| 80 |
+
{"paper_id": "r3-aLHxn2nB", "title": "CLEP: Exploiting Edge Partitioning for Graph Contrastive Learning", "pattern_id": "pattern_24", "domain": "Machine Learning", "text_hash": "15230a50b756ba7fd5406eb4ba5551f4b3b0a544af8d8f45b7d419dbdc7a03c5"}
|
| 81 |
+
{"paper_id": "dpuAkczrTOt", "title": "Meta-Learning the Inductive Biases of Simple Neural Circuits", "pattern_id": "", "domain": "Neuroscience", "text_hash": "79d3788fc1e8bcf8bead5da58c682dbdcfe78dab5b7c69983cca6331ca41beac"}
|
| 82 |
+
{"paper_id": "70-hEqC4Wo8", "title": "Accelerating spiking neural network training using the $d$-block model", "pattern_id": "pattern_1", "domain": "Neuroscience", "text_hash": "f9d57eb3c7c5ba17049cafaaa1fe818b366d5a4c51fe7c1ba16fba94bfc09325"}
|
| 83 |
+
{"paper_id": "-hMNEMgT8Wd", "title": "RG: OUT-OF-DISTRIBUTION DETECTION WITH REACTIVATE GRADNORM", "pattern_id": "pattern_4", "domain": "Machine Learning", "text_hash": "86fcb9bc3f34058114b37d312f78a6fe23f286b696be0f1f3e1a978d7676f4e4"}
|
| 84 |
+
{"paper_id": "05ff9BRSMzE", "title": "Gandalf : Data Augmentation is all you need for Extreme Classification", "pattern_id": "pattern_40", "domain": "Machine Learning", "text_hash": "8b2dbf0a461ef4781b35587086fa9d37cb97552581fcd7d41f396b18f3b8d4a8"}
|
| 85 |
+
{"paper_id": "688hNNMigVX", "title": "Learning a Data-Driven Policy Network for Pre-Training Automated Feature Engineering", "pattern_id": "", "domain": "Machine Learning", "text_hash": "532484ae41626d1d3e05984b234a77a24e0c8c6d769a74476536bcc9fc55c5be"}
|
| 86 |
+
{"paper_id": "pcBJT4bgbpH", "title": "Attention Flows for General Transformers", "pattern_id": "", "domain": "Natural Language Processing", "text_hash": "d9f19be9cba2c518c74645e46bffc94353dd1c4dbb455af49e4547722dfadd63"}
|
| 87 |
+
{"paper_id": "bjPPypbLre", "title": "Making Substitute Models More Bayesian Can Enhance Transferability of Adversarial Examples", "pattern_id": "pattern_66", "domain": "Security & Privacy", "text_hash": "b834fc9ed1d9613ea902a3c59ae160cf6043f08ee69122cd6123725c90d17596"}
|
| 88 |
+
{"paper_id": "75O7S_L4oY", "title": "Learning Group Importance using the Differentiable Hypergeometric Distribution", "pattern_id": "", "domain": "Machine Learning", "text_hash": "3ce4757a99b6fb2170182bf29c5f1bc73bf34545f3186702967a8238957c9591"}
|
| 89 |
+
{"paper_id": "pvgEL1yS3Ql", "title": "Cross-Layer Retrospective Retrieving via Layer Attention", "pattern_id": "", "domain": "Computer Vision", "text_hash": "6461d6cfdd55e16fb6eb76a5dc8bb8d473a5f443ae7bffd8ec89491446e566af"}
|
| 90 |
+
{"paper_id": "kqHkCVS7wbj", "title": "Decision S4: Efficient Sequence-Based RL via State Spaces Layers", "pattern_id": "pattern_118", "domain": "Machine Learning", "text_hash": "bf66acca3d471b4df0501a9049190b6bfe7db87f3e3b9969fded80b108ca6489"}
|
| 91 |
+
{"paper_id": "gvOSQjGTtxj", "title": "Deep autoregressive density nets vs neural ensembles for model-based offline reinforcement learning", "pattern_id": "pattern_104", "domain": "Machine Learning", "text_hash": "1b419cc60563209e012e155fecd70b6f6855bed2f85dd51812d48fbf15f8a9e9"}
|
| 92 |
+
{"paper_id": "mnVf1W6ipGm", "title": "Unveiling the sampling density in non-uniform geometric graphs", "pattern_id": "pattern_24", "domain": "Graph Theory", "text_hash": "2374b29c2439a0efb23205258bc2ad0719e57a37a1d1bf45116fa6da5316f405"}
|
| 93 |
+
{"paper_id": "LNpMtk15AS4", "title": "Boosting Causal Discovery via Adaptive Sample Reweighting", "pattern_id": "pattern_17", "domain": "Machine Learning", "text_hash": "b6e495c2383a643928be50b8c0597e338ea5ed3266964e1d72eef5bdd5776ac0"}
|
| 94 |
+
{"paper_id": "BdcfKgE9dhF", "title": "Robust Training through Adversarially Selected Data Subsets", "pattern_id": "", "domain": "Machine Learning", "text_hash": "369e97b8ad20be23eb8b71159032a53c1abca8ef6734755799b7bde964167322"}
|
| 95 |
+
{"paper_id": "i8AnfJYMvz", "title": "Beyond Reward: Offline Preference-guided Policy Optimization", "pattern_id": "pattern_104", "domain": "Machine Learning", "text_hash": "242932776ab92584ef8fd26930d7a6da0809dd9fbb4e8ac1fda7373a230ab03c"}
|
| 96 |
+
{"paper_id": "SEcSahl0Ql", "title": "Iterative Circuit Repair Against Formal Specifications", "pattern_id": "", "domain": "Machine Learning", "text_hash": "73088475ca89dde541d5b093622b9b4b95d86c2d3f21630c85baa3b5d1f7a956"}
|
| 97 |
+
{"paper_id": "dyifcA9UuRo", "title": "Neural Probabilistic Logic Programming in Discrete-Continuous Domains", "pattern_id": "pattern_48", "domain": "Artificial Intelligence", "text_hash": "6bc3820afbe981a4354a23c2e2e1517b9b5ab2cf26c0346b5f61df8538dd0ee8"}
|
| 98 |
+
{"paper_id": "UazgYBMS9-W", "title": "Can BERT Refrain from Forgetting on Sequential Tasks? A Probing Study", "pattern_id": "pattern_0", "domain": "Natural Language Processing", "text_hash": "600035f9deb12787a2efa925e8aaca888b8e581fe017994f7c7d5c2da4dad09d"}
|
| 99 |
+
{"paper_id": "3c13LptpIph", "title": "Behavior Proximal Policy Optimization", "pattern_id": "pattern_104", "domain": "Machine Learning", "text_hash": "6545099e076f187256988e3bf3e6de3052a4e478d5209b9f70cb1123456a8157"}
|
| 100 |
+
{"paper_id": "eZN8nUXAVO7", "title": "FedGC: An Accurate and Efficient Federated Learning under Gradient Constraint for Heterogeneous Data", "pattern_id": "pattern_10", "domain": "Machine Learning", "text_hash": "3894a33cb1573a92b22729da0f520ce039636d4a2c77e5ad2812c9e2366bf014"}
|
Paper-KG-Pipeline/output/paper_to_pattern.json
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"ARR_2022_246": 1,
|
| 3 |
+
"ACL_2017_691": 1,
|
| 4 |
+
"ACL_2017_239": 1,
|
| 5 |
+
"ACL_2017_768": 1,
|
| 6 |
+
"ACL_2017_553": 1,
|
| 7 |
+
"ACL_2017_477": 1,
|
| 8 |
+
"ACL_2017_178": 1,
|
| 9 |
+
"ACL_2017_145": 1,
|
| 10 |
+
"ACL_2017_563": 1,
|
| 11 |
+
"ACL_2017_395": 1,
|
| 12 |
+
"ACL_2017_56": 1,
|
| 13 |
+
"ACL_2017_318": 1,
|
| 14 |
+
"ACL_2017_201": 1,
|
| 15 |
+
"ACL_2017_494": 1,
|
| 16 |
+
"ACL_2017_251": 1,
|
| 17 |
+
"ACL_2017_173": 1,
|
| 18 |
+
"COLING_2020_45": 1,
|
| 19 |
+
"COLING_2020_60": 1,
|
| 20 |
+
"COLING_2020_71": 1,
|
| 21 |
+
"ARR_2022_52": 2,
|
| 22 |
+
"ARR_2022_138": 2,
|
| 23 |
+
"ARR_2022_87": 2,
|
| 24 |
+
"ARR_2022_221": 2,
|
| 25 |
+
"ARR_2022_293": 2,
|
| 26 |
+
"ARR_2022_62": 2,
|
| 27 |
+
"ARR_2022_119": 2,
|
| 28 |
+
"ARR_2022_292": 2,
|
| 29 |
+
"ARR_2022_161": 3,
|
| 30 |
+
"ARR_2022_224": 3,
|
| 31 |
+
"ARR_2022_164": 3,
|
| 32 |
+
"ACL_2017_524": 3,
|
| 33 |
+
"ACL_2017_193": 3,
|
| 34 |
+
"COLING_2020_66": 3,
|
| 35 |
+
"COLING_2020_37": 3,
|
| 36 |
+
"COLING_2020_29": 3,
|
| 37 |
+
"ARR_2022_329": 4,
|
| 38 |
+
"ARR_2022_103": 4,
|
| 39 |
+
"ARR_2022_259": 4,
|
| 40 |
+
"ARR_2022_46": 4,
|
| 41 |
+
"ACL_2017_382": 4,
|
| 42 |
+
"COLING_2020_64": 4,
|
| 43 |
+
"ARR_2022_97": 5,
|
| 44 |
+
"ARR_2022_27": 5,
|
| 45 |
+
"ARR_2022_352": 5,
|
| 46 |
+
"ARR_2022_289": 5,
|
| 47 |
+
"ARR_2022_171": 5,
|
| 48 |
+
"ARR_2022_223": 5,
|
| 49 |
+
"ARR_2022_347": 5,
|
| 50 |
+
"ARR_2022_116": 5,
|
| 51 |
+
"ARR_2022_274": 5,
|
| 52 |
+
"ARR_2022_194": 5,
|
| 53 |
+
"ARR_2022_225": 5,
|
| 54 |
+
"ARR_2022_355": 6,
|
| 55 |
+
"ARR_2022_236": 6,
|
| 56 |
+
"ARR_2022_155": 6,
|
| 57 |
+
"ARR_2022_150": 6,
|
| 58 |
+
"ARR_2022_2": 6,
|
| 59 |
+
"ACL_2017_729": 6,
|
| 60 |
+
"COLING_2020_69": 6,
|
| 61 |
+
"COLING_2020_75": 6,
|
| 62 |
+
"ARR_2022_320": 7,
|
| 63 |
+
"ARR_2022_36": 7,
|
| 64 |
+
"ARR_2022_37": 7,
|
| 65 |
+
"ARR_2022_127": 7,
|
| 66 |
+
"ARR_2022_181": 7,
|
| 67 |
+
"ARR_2022_256": 7,
|
| 68 |
+
"ARR_2022_248": 7,
|
| 69 |
+
"ARR_2022_83": 7,
|
| 70 |
+
"ARR_2022_333": 7,
|
| 71 |
+
"ACL_2017_107": 7,
|
| 72 |
+
"ACL_2017_108": 7,
|
| 73 |
+
"COLING_2020_33": 7,
|
| 74 |
+
"ARR_2022_255": 8,
|
| 75 |
+
"ARR_2022_264": 8,
|
| 76 |
+
"ARR_2022_33": 8,
|
| 77 |
+
"ACL_2017_384": 8,
|
| 78 |
+
"COLING_2020_16": 8,
|
| 79 |
+
"ARR_2022_8": 9,
|
| 80 |
+
"ARR_2022_238": 9,
|
| 81 |
+
"ARR_2022_168": 9,
|
| 82 |
+
"ARR_2022_189": 9,
|
| 83 |
+
"ARR_2022_278": 9,
|
| 84 |
+
"ARR_2022_348": 9,
|
| 85 |
+
"ARR_2022_359": 9,
|
| 86 |
+
"ARR_2022_17": 9,
|
| 87 |
+
"ACL_2017_484": 9,
|
| 88 |
+
"COLING_2020_80": 9,
|
| 89 |
+
"ARR_2022_7": 10,
|
| 90 |
+
"ARR_2022_154": 10,
|
| 91 |
+
"ARR_2022_143": 10,
|
| 92 |
+
"ARR_2022_345": 10,
|
| 93 |
+
"ARR_2022_291": 10,
|
| 94 |
+
"ARR_2022_202": 10,
|
| 95 |
+
"ARR_2022_285": 10,
|
| 96 |
+
"ARR_2022_240": 10,
|
| 97 |
+
"ARR_2022_141": 10,
|
| 98 |
+
"ARR_2022_56": 10,
|
| 99 |
+
"ARR_2022_251": 10,
|
| 100 |
+
"ARR_2022_294": 10,
|
| 101 |
+
"ARR_2022_101": 10,
|
| 102 |
+
"ARR_2022_219": 10,
|
| 103 |
+
"ARR_2022_151": 10,
|
| 104 |
+
"ARR_2022_163": 10,
|
| 105 |
+
"ARR_2022_185": 10,
|
| 106 |
+
"ARR_2022_22": 10,
|
| 107 |
+
"ARR_2022_84": 10,
|
| 108 |
+
"COLING_2020_68": 10,
|
| 109 |
+
"ARR_2022_231": 11,
|
| 110 |
+
"ARR_2022_180": 11,
|
| 111 |
+
"ARR_2022_214": 11,
|
| 112 |
+
"ARR_2022_300": 11,
|
| 113 |
+
"ARR_2022_159": 11,
|
| 114 |
+
"ARR_2022_358": 12,
|
| 115 |
+
"ACL_2017_67": 12,
|
| 116 |
+
"ACL_2017_614": 12,
|
| 117 |
+
"ACL_2017_21": 12,
|
| 118 |
+
"ACL_2017_741": 12,
|
| 119 |
+
"COLING_2020_26": 12,
|
| 120 |
+
"COLING_2020_74": 12,
|
| 121 |
+
"COLING_2020_5": 12,
|
| 122 |
+
"ARR_2022_121": 13,
|
| 123 |
+
"ARR_2022_321": 13,
|
| 124 |
+
"ARR_2022_80": 13,
|
| 125 |
+
"ARR_2022_93": 13,
|
| 126 |
+
"ARR_2022_216": 13,
|
| 127 |
+
"ARR_2022_174": 13,
|
| 128 |
+
"ARR_2022_25": 13,
|
| 129 |
+
"ACL_2017_367": 13,
|
| 130 |
+
"COLING_2020_38": 13,
|
| 131 |
+
"COLING_2020_40": 13,
|
| 132 |
+
"COLING_2020_11": 13,
|
| 133 |
+
"ARR_2022_190": 14,
|
| 134 |
+
"ARR_2022_296": 14,
|
| 135 |
+
"ARR_2022_26": 14,
|
| 136 |
+
"ACL_2017_350": 14,
|
| 137 |
+
"ACL_2017_16": 14,
|
| 138 |
+
"COLING_2020_42": 14,
|
| 139 |
+
"ARR_2022_129": 15,
|
| 140 |
+
"ARR_2022_11": 15,
|
| 141 |
+
"ARR_2022_95": 15,
|
| 142 |
+
"ARR_2022_41": 15,
|
| 143 |
+
"ARR_2022_265": 15,
|
| 144 |
+
"COLING_2020_25": 15,
|
| 145 |
+
"ARR_2022_298": 16,
|
| 146 |
+
"ARR_2022_43": 16,
|
| 147 |
+
"ARR_2022_176": 16,
|
| 148 |
+
"ARR_2022_313": 16,
|
| 149 |
+
"ARR_2022_47": 16,
|
| 150 |
+
"ARR_2022_311": 16,
|
| 151 |
+
"COLING_2020_3": 16,
|
| 152 |
+
"ARR_2022_279": 17,
|
| 153 |
+
"ARR_2022_21": 17,
|
| 154 |
+
"ARR_2022_125": 17,
|
| 155 |
+
"ARR_2022_250": 17,
|
| 156 |
+
"ARR_2022_65": 17,
|
| 157 |
+
"ACL_2017_654": 17,
|
| 158 |
+
"ACL_2017_355": 17,
|
| 159 |
+
"COLING_2020_15": 17,
|
| 160 |
+
"COLING_2020_44": 17,
|
| 161 |
+
"COLING_2020_46": 17,
|
| 162 |
+
"COLING_2020_19": 17,
|
| 163 |
+
"ARR_2022_170": 18,
|
| 164 |
+
"ARR_2022_314": 18,
|
| 165 |
+
"ARR_2022_122": 18,
|
| 166 |
+
"ARR_2022_319": 18,
|
| 167 |
+
"ARR_2022_201": 18,
|
| 168 |
+
"ARR_2022_218": 18,
|
| 169 |
+
"ARR_2022_257": 18,
|
| 170 |
+
"ACL_2017_657": 18,
|
| 171 |
+
"ARR_2022_252": 19,
|
| 172 |
+
"ARR_2022_145": 19,
|
| 173 |
+
"ARR_2022_55": 19,
|
| 174 |
+
"ARR_2022_353": 19,
|
| 175 |
+
"ARR_2022_77": 19,
|
| 176 |
+
"ARR_2022_106": 19,
|
| 177 |
+
"ARR_2022_109": 19,
|
| 178 |
+
"ARR_2022_134": 19,
|
| 179 |
+
"ARR_2022_4": 19,
|
| 180 |
+
"ARR_2022_16": 19,
|
| 181 |
+
"ARR_2022_74": 19,
|
| 182 |
+
"ARR_2022_317": 19,
|
| 183 |
+
"ARR_2022_172": 19,
|
| 184 |
+
"ARR_2022_111": 19,
|
| 185 |
+
"ARR_2022_20": 20,
|
| 186 |
+
"ARR_2022_169": 20,
|
| 187 |
+
"ARR_2022_344": 20,
|
| 188 |
+
"ARR_2022_1": 20,
|
| 189 |
+
"ARR_2022_302": 20,
|
| 190 |
+
"ARR_2022_128": 20,
|
| 191 |
+
"ARR_2022_110": 20,
|
| 192 |
+
"ARR_2022_75": 20,
|
| 193 |
+
"ARR_2022_94": 20,
|
| 194 |
+
"ARR_2022_263": 20,
|
| 195 |
+
"ARR_2022_249": 20,
|
| 196 |
+
"ARR_2022_50": 20,
|
| 197 |
+
"ARR_2022_350": 20,
|
| 198 |
+
"ARR_2022_15": 21,
|
| 199 |
+
"ARR_2022_126": 21,
|
| 200 |
+
"ARR_2022_303": 21,
|
| 201 |
+
"ARR_2022_205": 21,
|
| 202 |
+
"ARR_2022_266": 21,
|
| 203 |
+
"ARR_2022_88": 21,
|
| 204 |
+
"ARR_2022_45": 21,
|
| 205 |
+
"ARR_2022_299": 21,
|
| 206 |
+
"ARR_2022_165": 21,
|
| 207 |
+
"COLING_2020_53": 21,
|
| 208 |
+
"ARR_2022_196": 22,
|
| 209 |
+
"ARR_2022_244": 22,
|
| 210 |
+
"ARR_2022_90": 22,
|
| 211 |
+
"ARR_2022_70": 22,
|
| 212 |
+
"ARR_2022_131": 22,
|
| 213 |
+
"ARR_2022_72": 22,
|
| 214 |
+
"ARR_2022_3": 22,
|
| 215 |
+
"ARR_2022_133": 22,
|
| 216 |
+
"ARR_2022_217": 22,
|
| 217 |
+
"ARR_2022_100": 22,
|
| 218 |
+
"ARR_2022_124": 22,
|
| 219 |
+
"ARR_2022_284": 22,
|
| 220 |
+
"ACL_2017_481": 22,
|
| 221 |
+
"ACL_2017_501": 22,
|
| 222 |
+
"COLING_2020_9": 22,
|
| 223 |
+
"ARR_2022_304": 23,
|
| 224 |
+
"ARR_2022_39": 23,
|
| 225 |
+
"ARR_2022_242": 23,
|
| 226 |
+
"ARR_2022_167": 23,
|
| 227 |
+
"ARR_2022_69": 23,
|
| 228 |
+
"ARR_2022_160": 23,
|
| 229 |
+
"ARR_2022_99": 23,
|
| 230 |
+
"ARR_2022_158": 23,
|
| 231 |
+
"ARR_2022_235": 23,
|
| 232 |
+
"ARR_2022_117": 23,
|
| 233 |
+
"ARR_2022_24": 23,
|
| 234 |
+
"ARR_2022_306": 23,
|
| 235 |
+
"ARR_2022_57": 23,
|
| 236 |
+
"COLING_2020_24": 23,
|
| 237 |
+
"ARR_2022_276": 24,
|
| 238 |
+
"ARR_2022_343": 24,
|
| 239 |
+
"ARR_2022_177": 24,
|
| 240 |
+
"ARR_2022_105": 24,
|
| 241 |
+
"ARR_2022_212": 24,
|
| 242 |
+
"ARR_2022_54": 24,
|
| 243 |
+
"ARR_2022_338": 24,
|
| 244 |
+
"ARR_2022_226": 24,
|
| 245 |
+
"ARR_2022_179": 24,
|
| 246 |
+
"ARR_2022_173": 24,
|
| 247 |
+
"ARR_2022_51": 24,
|
| 248 |
+
"ARR_2022_200": 24,
|
| 249 |
+
"ARR_2022_183": 24,
|
| 250 |
+
"ARR_2022_322": 24,
|
| 251 |
+
"ARR_2022_310": 24,
|
| 252 |
+
"ARR_2022_123": 24,
|
| 253 |
+
"ARR_2022_107": 24,
|
| 254 |
+
"ARR_2022_108": 24,
|
| 255 |
+
"ACL_2017_369": 24,
|
| 256 |
+
"ACL_2017_676": 24,
|
| 257 |
+
"ACL_2017_779": 24,
|
| 258 |
+
"ACL_2017_496": 24,
|
| 259 |
+
"ACL_2017_564": 24,
|
| 260 |
+
"ACL_2017_49": 24,
|
| 261 |
+
"ACL_2017_150": 24,
|
| 262 |
+
"COLING_2020_0": 24,
|
| 263 |
+
"COLING_2020_43": 24,
|
| 264 |
+
"COLING_2020_72": 24,
|
| 265 |
+
"COLING_2020_63": 24,
|
| 266 |
+
"COLING_2020_47": 24,
|
| 267 |
+
"ARR_2022_227": 25,
|
| 268 |
+
"ARR_2022_199": 25,
|
| 269 |
+
"ARR_2022_42": 25,
|
| 270 |
+
"ARR_2022_254": 25,
|
| 271 |
+
"ARR_2022_269": 25,
|
| 272 |
+
"ARR_2022_115": 25,
|
| 273 |
+
"ARR_2022_152": 25,
|
| 274 |
+
"ARR_2022_40": 25,
|
| 275 |
+
"ARR_2022_207": 25,
|
| 276 |
+
"ARR_2022_157": 25,
|
| 277 |
+
"ARR_2022_324": 25,
|
| 278 |
+
"ARR_2022_335": 25,
|
| 279 |
+
"ARR_2022_247": 25,
|
| 280 |
+
"ARR_2022_63": 25,
|
| 281 |
+
"ARR_2022_282": 25,
|
| 282 |
+
"ARR_2022_309": 25,
|
| 283 |
+
"ARR_2022_318": 25,
|
| 284 |
+
"ACL_2017_128": 25,
|
| 285 |
+
"ACL_2017_769": 25,
|
| 286 |
+
"ACL_2017_627": 25,
|
| 287 |
+
"ACL_2017_26": 25,
|
| 288 |
+
"COLING_2020_1": 25,
|
| 289 |
+
"COLING_2020_65": 25,
|
| 290 |
+
"ARR_2022_38": 26,
|
| 291 |
+
"ARR_2022_229": 26,
|
| 292 |
+
"ARR_2022_198": 26,
|
| 293 |
+
"ARR_2022_301": 26,
|
| 294 |
+
"ACL_2017_12": 26,
|
| 295 |
+
"ARR_2022_339": 27,
|
| 296 |
+
"ARR_2022_260": 27,
|
| 297 |
+
"ARR_2022_268": 27,
|
| 298 |
+
"ARR_2022_191": 27,
|
| 299 |
+
"ARR_2022_32": 27,
|
| 300 |
+
"ARR_2022_206": 27,
|
| 301 |
+
"ACL_2017_333": 27,
|
| 302 |
+
"COLING_2020_13": 27,
|
| 303 |
+
"COLING_2020_50": 27,
|
| 304 |
+
"ARR_2022_237": 28,
|
| 305 |
+
"ARR_2022_323": 28,
|
| 306 |
+
"ARR_2022_49": 28,
|
| 307 |
+
"ACL_2017_376": 28,
|
| 308 |
+
"ACL_2017_222": 28,
|
| 309 |
+
"ACL_2017_557": 28,
|
| 310 |
+
"ACL_2017_562": 28,
|
| 311 |
+
"COLING_2020_4": 28,
|
| 312 |
+
"COLING_2020_51": 28,
|
| 313 |
+
"ACL_2017_543": 29,
|
| 314 |
+
"ACL_2017_561": 29,
|
| 315 |
+
"ACL_2017_588": 29,
|
| 316 |
+
"ACL_2017_554": 29,
|
| 317 |
+
"ACL_2017_760": 29,
|
| 318 |
+
"ACL_2017_371": 29,
|
| 319 |
+
"ACL_2017_792": 29,
|
| 320 |
+
"ARR_2022_312": 30,
|
| 321 |
+
"ARR_2022_142": 30,
|
| 322 |
+
"ARR_2022_29": 30,
|
| 323 |
+
"ARR_2022_349": 30,
|
| 324 |
+
"ARR_2022_346": 30,
|
| 325 |
+
"ACL_2017_726": 31,
|
| 326 |
+
"ACL_2017_503": 31,
|
| 327 |
+
"ACL_2017_578": 31,
|
| 328 |
+
"ACL_2017_706": 31,
|
| 329 |
+
"ACL_2017_606": 31,
|
| 330 |
+
"ARR_2022_230": 32,
|
| 331 |
+
"ARR_2022_351": 32,
|
| 332 |
+
"ACL_2017_387": 32,
|
| 333 |
+
"ACL_2017_33": 32,
|
| 334 |
+
"COLING_2020_76": 32,
|
| 335 |
+
"COLING_2020_73": 32,
|
| 336 |
+
"ARR_2022_130": 33,
|
| 337 |
+
"ARR_2022_330": 33,
|
| 338 |
+
"ACL_2017_148": 33,
|
| 339 |
+
"ACL_2017_684": 33,
|
| 340 |
+
"ACL_2017_117": 33,
|
| 341 |
+
"ACL_2017_18": 33,
|
| 342 |
+
"ACL_2017_335": 33,
|
| 343 |
+
"COLING_2020_83": 33,
|
| 344 |
+
"COLING_2020_57": 33,
|
| 345 |
+
"ACL_2017_326": 34,
|
| 346 |
+
"ACL_2017_343": 34,
|
| 347 |
+
"ACL_2017_723": 34,
|
| 348 |
+
"COLING_2020_8": 34,
|
| 349 |
+
"COLING_2020_79": 34
|
| 350 |
+
}
|
Paper-KG-Pipeline/output/patterns_guide.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/patterns_statistics.json
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_patterns": 34,
|
| 3 |
+
"total_papers": 348,
|
| 4 |
+
"average_cluster_size": 10.235294117647058,
|
| 5 |
+
"median_cluster_size": 8.5,
|
| 6 |
+
"cluster_size_distribution": {
|
| 7 |
+
"min": 5,
|
| 8 |
+
"max": 30,
|
| 9 |
+
"std": 5.6258794006930035
|
| 10 |
+
},
|
| 11 |
+
"top_global_tricks": [
|
| 12 |
+
{
|
| 13 |
+
"name": "逻辑递进式叙事结构",
|
| 14 |
+
"total_count": 62
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"name": "创新点突出",
|
| 18 |
+
"total_count": 15
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"name": "逻辑递进的叙事结构",
|
| 22 |
+
"total_count": 13
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"name": "多数据集验证",
|
| 26 |
+
"total_count": 12
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"name": "引用权威工作",
|
| 30 |
+
"total_count": 12
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"name": "引用权威文献建立背景",
|
| 34 |
+
"total_count": 9
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"name": "与主流方法系统对比",
|
| 38 |
+
"total_count": 8
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"name": "多数据集覆盖",
|
| 42 |
+
"total_count": 7
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"name": "多维度评价指标",
|
| 46 |
+
"total_count": 7
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"name": "消融实验设计",
|
| 50 |
+
"total_count": 6
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"name": "图示辅助理解",
|
| 54 |
+
"total_count": 6
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "对比实验设计",
|
| 58 |
+
"total_count": 6
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"name": "问题驱动开篇",
|
| 62 |
+
"total_count": 5
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"name": "现有方法局限性强调",
|
| 66 |
+
"total_count": 5
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"name": "实验细节透明化",
|
| 70 |
+
"total_count": 5
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"name": "现实场景动机引入",
|
| 74 |
+
"total_count": 4
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"name": "消融实验",
|
| 78 |
+
"total_count": 4
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "现实动机引入",
|
| 82 |
+
"total_count": 4
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"name": "问题背景铺垫",
|
| 86 |
+
"total_count": 4
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"name": "现有方法局限性对比",
|
| 90 |
+
"total_count": 4
|
| 91 |
+
}
|
| 92 |
+
],
|
| 93 |
+
"pattern_size_distribution": {
|
| 94 |
+
"small (<10)": 20,
|
| 95 |
+
"medium (10-20)": 11,
|
| 96 |
+
"large (20-30)": 2,
|
| 97 |
+
"xlarge (>=30)": 1
|
| 98 |
+
}
|
| 99 |
+
}
|
Paper-KG-Pipeline/output/patterns_structured.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Paper-KG-Pipeline/output/pipeline_result.json
ADDED
|
@@ -0,0 +1,1785 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"user_idea": "Improving diffusion models for temporal consistency in video generation",
|
| 3 |
+
"success": true,
|
| 4 |
+
"iterations": 2,
|
| 5 |
+
"selected_patterns": {
|
| 6 |
+
"stability": [
|
| 7 |
+
"pattern_24",
|
| 8 |
+
"pattern_100",
|
| 9 |
+
"pattern_45",
|
| 10 |
+
"pattern_115",
|
| 11 |
+
"pattern_114"
|
| 12 |
+
],
|
| 13 |
+
"novelty": [
|
| 14 |
+
"pattern_24",
|
| 15 |
+
"pattern_7",
|
| 16 |
+
"pattern_84",
|
| 17 |
+
"pattern_49",
|
| 18 |
+
"pattern_94"
|
| 19 |
+
],
|
| 20 |
+
"domain_distance": [
|
| 21 |
+
"pattern_114",
|
| 22 |
+
"pattern_84",
|
| 23 |
+
"pattern_100",
|
| 24 |
+
"pattern_102",
|
| 25 |
+
"pattern_94"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
"final_story": {
|
| 29 |
+
"title": "Emergent Temporal Topology for Video Generation",
|
| 30 |
+
"abstract": "Improving temporal consistency in video generation demands a fundamental shift from sequential frame synthesis to the generative modeling of latent structure. This paper reframes video generation as the task of inferring and generating the underlying, evolving spatiotemporal graph whose stability dictates visual coherence. We introduce a novel diffusion framework where the primary generative object is a latent dynamic graph topology, discovered through self-representation and governed by probabilistic latent interactions, from which consistent video frames naturally emerge. Validated through experiments, our approach reduces flicker score by 18% and temporal warping error by 22% on UCF-101 compared to state-of-the-art Video Diffusion Models, with ablations confirming the necessity of our graph-first generative principle.",
|
| 31 |
+
"problem_framing": "We reframe the problem of temporal inconsistency from a flaw in sequence modeling to the absence of a generative prior for latent relational topology. Current methods treat video as a sequence of independent frames, attempting to enforce consistency post-hoc via optical flow losses or model it sequentially via Transformers or RNNs. These approaches are fundamentally limited: post-processing introduces artifacts, while sequential models suffer from vanishing gradients and quadratic computational scaling, failing to capture the persistent, community-level structures that govern long-range coherence. The true challenge is not 'making frames match' but probabilistically generating the scaffold—the dynamic graph of semantic entities and their interactions—upon which coherent visual evolution must occur.",
|
| 32 |
+
"gap_pattern": "Existing diffusion-based video generation methods fail because they operate on the wrong generative object: individual pixels or frames. By focusing on sequential denoising, they overlook the latent, evolving community structures that are the true source of temporal stability. This conceptual gap manifests practically as an inability to model long-range dependencies without prohibitive compute, sensitivity to frame-rate variations, and flickering artifacts from unmodeled topological instability. Methods that add graph reasoning as a secondary module (e.g., for regularization) treat the graph as an external constraint, not as the core generative substrate. Consequently, they lack a unified mechanism where content and relational dynamics co-evolve, leaving temporal coherence as an optimization target rather than an inherent property of the generated representation.",
|
| 33 |
+
"solution": "Our solution transforms video generation by making the latent spatiotemporal graph the primary object of the diffusion process, realizing inherent temporal consistency through a co-evolutionary framework. We unify content generation and temporal reasoning by introducing a generative latent graph prior. First, we leverage a self-representation framework, inspired by Krylov subspace optimization (GMRES), to allow the model to discover compact, persistent visual entities as the nodes and communities of the latent graph directly from the data. Second, we model the graph's temporal dynamics through a probabilistic latent interaction model (CLEP) that captures how these communities emerge and interact using community-specific embeddings and a contrastive objective. Third, we correct for irregularities in video data by estimating the underlying temporal sampling density to adjust graph dynamics, ensuring robustness. This integrated approach ensures consistency emerges from the stability of the generated graph topology itself.",
|
| 34 |
+
"method_skeleton": "Step 1: Discover latent graph communities via a self-representation framework using the GMRES method to find least-squares solutions over Krylov subspaces, learning a sparse, self-expressive code for each frame segment that identifies recurring visual entities as graph nodes and their affinities; Step 2: Generate the evolving graph topology with a probabilistic latent interaction model (CLEP), where the diffusion process denoises a latent adjacency tensor using community-specific embeddings to model edge probabilities and a contrastive loss to ensure stable community memberships over time; Step 3: Condition the graph diffusion on estimated temporal density by applying a self-supervised kernel density estimator to frame timestamps and using this density to correct the graph shift operators in the latent interaction model, aligning generated graph dynamics with real-world motion.",
|
| 35 |
+
"innovation_claims": [
|
| 36 |
+
"We transform temporal consistency from an external constraint to an inherent generative property by reframing video generation as the diffusion-based synthesis of a latent dynamic graph topology, where stability emerges from modeling the probability distribution over spatiotemporal community structures.",
|
| 37 |
+
"We reframe temporal dependency modeling by introducing a co-evolutionary mechanism that unifies self-representation for node discovery (via Krylov subspace optimization) and probabilistic latent interaction (via a CLEP framework) into a single generative act, bypassing sequential bottlenecks and enabling scalable long-range coherence.",
|
| 38 |
+
"We transform the handling of irregular video data by developing a self-supervised temporal density estimation method that corrects non-uniform sampling and directly conditions the graph shift operators within the diffusion process, ensuring robust coherence generation across diverse and unpredictable frame rates."
|
| 39 |
+
],
|
| 40 |
+
"experiments_plan": "We validate our framework on standard video generation benchmarks UCF-101 and Kinetics-600, comparing against state-of-the-art baselines including Video Diffusion Models (VDM) and StyleGAN-V using metrics for temporal consistency (flicker score, warping error) and quality (FVD, IS). Ablation studies will quantitatively isolate the contribution of each core component: (1) disabling the self-representation module (GMRES-based optimization), (2) replacing the probabilistic latent interaction model (CLEP) with a standard sequential attention layer, and (3) removing the temporal density correction. Additional analysis will visualize the generated latent graphs to provide qualitative evidence of stable community evolution correlating with improved video coherence."
|
| 41 |
+
},
|
| 42 |
+
"review_history": [
|
| 43 |
+
{
|
| 44 |
+
"pass": false,
|
| 45 |
+
"avg_score": 5.656666666666589,
|
| 46 |
+
"reviews": [
|
| 47 |
+
{
|
| 48 |
+
"reviewer": "Reviewer A",
|
| 49 |
+
"role": "Methodology",
|
| 50 |
+
"score": 4.589999999999946,
|
| 51 |
+
"feedback": "Blind comparisons vs 11 anchors. Loss=4.4859, AvgStrength=1.45. CoachPriority: innovation_claims, method_skeleton, abstract."
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"reviewer": "Reviewer B",
|
| 55 |
+
"role": "Novelty",
|
| 56 |
+
"score": 6.189999999999912,
|
| 57 |
+
"feedback": "Blind comparisons vs 11 anchors. Loss=4.7859, AvgStrength=1.00. CoachPriority: innovation_claims, method_skeleton, abstract."
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"reviewer": "Reviewer C",
|
| 61 |
+
"role": "Storyteller",
|
| 62 |
+
"score": 6.189999999999912,
|
| 63 |
+
"feedback": "Blind comparisons vs 11 anchors. Loss=4.8916, AvgStrength=1.00. CoachPriority: innovation_claims, method_skeleton, abstract."
|
| 64 |
+
}
|
| 65 |
+
],
|
| 66 |
+
"main_issue": "stability",
|
| 67 |
+
"suggestions": [
|
| 68 |
+
"从stability维度选择稳健Pattern",
|
| 69 |
+
"注入成熟方法增强鲁棒性"
|
| 70 |
+
],
|
| 71 |
+
"audit": {
|
| 72 |
+
"pattern_id": "pattern_24",
|
| 73 |
+
"anchors": [
|
| 74 |
+
{
|
| 75 |
+
"anchor_id": "A1",
|
| 76 |
+
"paper_id": "4QIgPD5BLnv",
|
| 77 |
+
"score10": 5.247999999999999,
|
| 78 |
+
"weight": 0.6104802280163731
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"anchor_id": "A2",
|
| 82 |
+
"paper_id": "tlhsswFz9x",
|
| 83 |
+
"score10": 5.689000000000001,
|
| 84 |
+
"weight": 0.5025973265716843
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"anchor_id": "A3",
|
| 88 |
+
"paper_id": "jH6pg6JaSP2",
|
| 89 |
+
"score10": 5.922999999999999,
|
| 90 |
+
"weight": 0.4225847804783148
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"anchor_id": "A4",
|
| 94 |
+
"paper_id": "UsVJlgD1F7",
|
| 95 |
+
"score10": 6.004,
|
| 96 |
+
"weight": 1.0417206216442176
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"anchor_id": "A5",
|
| 100 |
+
"paper_id": "ZK1LoTo10R",
|
| 101 |
+
"score10": 6.148000000000001,
|
| 102 |
+
"weight": 0.601261566855052
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"anchor_id": "A6",
|
| 106 |
+
"paper_id": "6MBqQLp17E",
|
| 107 |
+
"score10": 6.348571428571429,
|
| 108 |
+
"weight": 1.0944429166735974
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"anchor_id": "A7",
|
| 112 |
+
"paper_id": "9L1Ts8t66YK",
|
| 113 |
+
"score10": 6.526000000000001,
|
| 114 |
+
"weight": 0.3339719420741948
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"anchor_id": "A8",
|
| 118 |
+
"paper_id": "0f-0I6RFAch",
|
| 119 |
+
"score10": 6.877000000000001,
|
| 120 |
+
"weight": 0.6104802280163729
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"anchor_id": "A9",
|
| 124 |
+
"paper_id": "GcM7qfl5zY",
|
| 125 |
+
"score10": 7.28875,
|
| 126 |
+
"weight": 0.4196709028511344
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"anchor_id": "A10",
|
| 130 |
+
"paper_id": "wKPmPBHSnT6",
|
| 131 |
+
"score10": 6.571,
|
| 132 |
+
"weight": 0.6012615668550518
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"anchor_id": "A11",
|
| 136 |
+
"paper_id": "8Tr3v4ueNd7",
|
| 137 |
+
"score10": 5.698,
|
| 138 |
+
"weight": 0.5025973265716843
|
| 139 |
+
}
|
| 140 |
+
],
|
| 141 |
+
"anchors_rounds": [
|
| 142 |
+
[
|
| 143 |
+
{
|
| 144 |
+
"anchor_id": "A1",
|
| 145 |
+
"paper_id": "4QIgPD5BLnv",
|
| 146 |
+
"score10": 5.247999999999999,
|
| 147 |
+
"weight": 0.6104802280163731
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"anchor_id": "A2",
|
| 151 |
+
"paper_id": "tlhsswFz9x",
|
| 152 |
+
"score10": 5.689000000000001,
|
| 153 |
+
"weight": 0.5025973265716843
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"anchor_id": "A3",
|
| 157 |
+
"paper_id": "jH6pg6JaSP2",
|
| 158 |
+
"score10": 5.922999999999999,
|
| 159 |
+
"weight": 0.4225847804783148
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"anchor_id": "A4",
|
| 163 |
+
"paper_id": "UsVJlgD1F7",
|
| 164 |
+
"score10": 6.004,
|
| 165 |
+
"weight": 1.0417206216442176
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"anchor_id": "A5",
|
| 169 |
+
"paper_id": "ZK1LoTo10R",
|
| 170 |
+
"score10": 6.148000000000001,
|
| 171 |
+
"weight": 0.601261566855052
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"anchor_id": "A6",
|
| 175 |
+
"paper_id": "6MBqQLp17E",
|
| 176 |
+
"score10": 6.348571428571429,
|
| 177 |
+
"weight": 1.0944429166735974
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"anchor_id": "A7",
|
| 181 |
+
"paper_id": "9L1Ts8t66YK",
|
| 182 |
+
"score10": 6.526000000000001,
|
| 183 |
+
"weight": 0.3339719420741948
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"anchor_id": "A8",
|
| 187 |
+
"paper_id": "0f-0I6RFAch",
|
| 188 |
+
"score10": 6.877000000000001,
|
| 189 |
+
"weight": 0.6104802280163729
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"anchor_id": "A9",
|
| 193 |
+
"paper_id": "GcM7qfl5zY",
|
| 194 |
+
"score10": 7.28875,
|
| 195 |
+
"weight": 0.4196709028511344
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"anchor_id": "A10",
|
| 199 |
+
"paper_id": "wKPmPBHSnT6",
|
| 200 |
+
"score10": 6.571,
|
| 201 |
+
"weight": 0.6012615668550518
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"anchor_id": "A11",
|
| 205 |
+
"paper_id": "8Tr3v4ueNd7",
|
| 206 |
+
"score10": 5.698,
|
| 207 |
+
"weight": 0.5025973265716843
|
| 208 |
+
}
|
| 209 |
+
]
|
| 210 |
+
],
|
| 211 |
+
"role_details": {
|
| 212 |
+
"Methodology": {
|
| 213 |
+
"comparisons": [
|
| 214 |
+
{
|
| 215 |
+
"anchor_id": "A1",
|
| 216 |
+
"judgement": "tie",
|
| 217 |
+
"strength": "weak",
|
| 218 |
+
"rationale": "Both methods abstract without experimental details."
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"anchor_id": "A2",
|
| 222 |
+
"judgement": "tie",
|
| 223 |
+
"strength": "weak",
|
| 224 |
+
"rationale": "Both propose novel modules without detailed validation."
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"anchor_id": "A3",
|
| 228 |
+
"judgement": "tie",
|
| 229 |
+
"strength": "weak",
|
| 230 |
+
"rationale": "Both describe technical approaches without implementation specifics."
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"anchor_id": "A4",
|
| 234 |
+
"judgement": "tie",
|
| 235 |
+
"strength": "weak",
|
| 236 |
+
"rationale": "Both frameworks address learning without experimental plans."
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"anchor_id": "A5",
|
| 240 |
+
"judgement": "worse",
|
| 241 |
+
"strength": "medium",
|
| 242 |
+
"rationale": "A5 specifies objective function and TDA, Story vaguer."
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"anchor_id": "A6",
|
| 246 |
+
"judgement": "worse",
|
| 247 |
+
"strength": "medium",
|
| 248 |
+
"rationale": "A6 includes complexity considerations, Story lacks details."
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"anchor_id": "A7",
|
| 252 |
+
"judgement": "tie",
|
| 253 |
+
"strength": "weak",
|
| 254 |
+
"rationale": "Both use novel techniques without evaluation mention."
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"anchor_id": "A8",
|
| 258 |
+
"judgement": "tie",
|
| 259 |
+
"strength": "weak",
|
| 260 |
+
"rationale": "Both involve abstract representation transformations."
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"anchor_id": "A9",
|
| 264 |
+
"judgement": "worse",
|
| 265 |
+
"strength": "medium",
|
| 266 |
+
"rationale": "A9 describes systematic framework with search space."
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"anchor_id": "A10",
|
| 270 |
+
"judgement": "worse",
|
| 271 |
+
"strength": "weak",
|
| 272 |
+
"rationale": "A10 has clear mechanism, Story's method abstract."
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"anchor_id": "A11",
|
| 276 |
+
"judgement": "worse",
|
| 277 |
+
"strength": "strong",
|
| 278 |
+
"rationale": "A11 specifies theoretical properties and complexity."
|
| 279 |
+
}
|
| 280 |
+
],
|
| 281 |
+
"loss": 4.4858699759891625,
|
| 282 |
+
"avg_strength": 1.4545454545454546,
|
| 283 |
+
"monotonic_violations": 3,
|
| 284 |
+
"ci_low": 2.549999999999989,
|
| 285 |
+
"ci_high": 6.059999999999914,
|
| 286 |
+
"tau": 1.0
|
| 287 |
+
},
|
| 288 |
+
"Novelty": {
|
| 289 |
+
"comparisons": [
|
| 290 |
+
{
|
| 291 |
+
"anchor_id": "A1",
|
| 292 |
+
"judgement": "tie",
|
| 293 |
+
"strength": "weak",
|
| 294 |
+
"rationale": "Both reframe problems with novel mechanisms; insufficient evidence for distinction."
|
| 295 |
+
},
|
| 296 |
+
{
|
| 297 |
+
"anchor_id": "A2",
|
| 298 |
+
"judgement": "tie",
|
| 299 |
+
"strength": "weak",
|
| 300 |
+
"rationale": "Both introduce dynamic or inherent properties; novelty comparable."
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"anchor_id": "A3",
|
| 304 |
+
"judgement": "tie",
|
| 305 |
+
"strength": "weak",
|
| 306 |
+
"rationale": "Both propose new frameworks reframing challenges; evidence insufficient."
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"anchor_id": "A4",
|
| 310 |
+
"judgement": "tie",
|
| 311 |
+
"strength": "weak",
|
| 312 |
+
"rationale": "Both reframe learning approaches with innovative methods; tie on novelty."
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"anchor_id": "A5",
|
| 316 |
+
"judgement": "tie",
|
| 317 |
+
"strength": "weak",
|
| 318 |
+
"rationale": "Both integrate novel constraints into diffusion models; similar innovation level."
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"anchor_id": "A6",
|
| 322 |
+
"judgement": "tie",
|
| 323 |
+
"strength": "weak",
|
| 324 |
+
"rationale": "Both incorporate structural information via new methods; novelty ties."
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"anchor_id": "A7",
|
| 328 |
+
"judgement": "tie",
|
| 329 |
+
"strength": "weak",
|
| 330 |
+
"rationale": "Both shift paradigms in their domains; insufficient evidence for comparison."
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"anchor_id": "A8",
|
| 334 |
+
"judgement": "tie",
|
| 335 |
+
"strength": "weak",
|
| 336 |
+
"rationale": "Both reframe problems using novel representations; tie on novelty."
|
| 337 |
+
},
|
| 338 |
+
{
|
| 339 |
+
"anchor_id": "A9",
|
| 340 |
+
"judgement": "tie",
|
| 341 |
+
"strength": "weak",
|
| 342 |
+
"rationale": "Both automate or reframe design processes; comparable novelty."
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"anchor_id": "A10",
|
| 346 |
+
"judgement": "tie",
|
| 347 |
+
"strength": "weak",
|
| 348 |
+
"rationale": "Both introduce new mechanisms for structural challenges; tie."
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"anchor_id": "A11",
|
| 352 |
+
"judgement": "tie",
|
| 353 |
+
"strength": "weak",
|
| 354 |
+
"rationale": "Both use mathematical properties for scalability; novelty similar."
|
| 355 |
+
}
|
| 356 |
+
],
|
| 357 |
+
"loss": 4.7858851726697385,
|
| 358 |
+
"avg_strength": 1.0,
|
| 359 |
+
"monotonic_violations": 0,
|
| 360 |
+
"ci_low": 3.9499999999999593,
|
| 361 |
+
"ci_high": 8.429999999999865,
|
| 362 |
+
"tau": 1.4
|
| 363 |
+
},
|
| 364 |
+
"Storyteller": {
|
| 365 |
+
"comparisons": [
|
| 366 |
+
{
|
| 367 |
+
"anchor_id": "A1",
|
| 368 |
+
"judgement": "tie",
|
| 369 |
+
"strength": "weak",
|
| 370 |
+
"rationale": "Both present a clear problem, method, and reframed contribution with comparable coherence."
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"anchor_id": "A2",
|
| 374 |
+
"judgement": "tie",
|
| 375 |
+
"strength": "weak",
|
| 376 |
+
"rationale": "Both define a problem, propose a method to transform a core component, and reframe the challenge."
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"anchor_id": "A3",
|
| 380 |
+
"judgement": "tie",
|
| 381 |
+
"strength": "weak",
|
| 382 |
+
"rationale": "Both clearly state a gap, propose a specific method, and reframe the problem domain effectively."
|
| 383 |
+
},
|
| 384 |
+
{
|
| 385 |
+
"anchor_id": "A4",
|
| 386 |
+
"judgement": "tie",
|
| 387 |
+
"strength": "weak",
|
| 388 |
+
"rationale": "Both articulate a problem, describe a multi-step method, and reframe the learning approach."
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"anchor_id": "A5",
|
| 392 |
+
"judgement": "tie",
|
| 393 |
+
"strength": "weak",
|
| 394 |
+
"rationale": "Both address a limitation in diffusion models with a specific new method and reframe the task."
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"anchor_id": "A6",
|
| 398 |
+
"judgement": "tie",
|
| 399 |
+
"strength": "weak",
|
| 400 |
+
"rationale": "Both identify a model's struggle, propose a mechanism to incorporate structure, and reframe the challenge."
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"anchor_id": "A7",
|
| 404 |
+
"judgement": "tie",
|
| 405 |
+
"strength": "weak",
|
| 406 |
+
"rationale": "Both clearly state a problem with existing learning paradigms and reframe it with a new method."
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"anchor_id": "A8",
|
| 410 |
+
"judgement": "tie",
|
| 411 |
+
"strength": "weak",
|
| 412 |
+
"rationale": "Both present a generalization problem and a method that transforms data representation to reframe it."
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"anchor_id": "A9",
|
| 416 |
+
"judgement": "tie",
|
| 417 |
+
"strength": "weak",
|
| 418 |
+
"rationale": "Both define an expertise-heavy problem and propose an automated framework to reframe the process."
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"anchor_id": "A10",
|
| 422 |
+
"judgement": "tie",
|
| 423 |
+
"strength": "weak",
|
| 424 |
+
"rationale": "Both specify a model flaw, propose a novel mechanism to address it, and reframe the approach."
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"anchor_id": "A11",
|
| 428 |
+
"judgement": "tie",
|
| 429 |
+
"strength": "weak",
|
| 430 |
+
"rationale": "Both state a scalability challenge, propose a sparse mechanism, and reframe the limitation as a strength."
|
| 431 |
+
}
|
| 432 |
+
],
|
| 433 |
+
"loss": 4.891633780086148,
|
| 434 |
+
"avg_strength": 1.0,
|
| 435 |
+
"monotonic_violations": 0,
|
| 436 |
+
"ci_low": 4.569999999999946,
|
| 437 |
+
"ci_high": 7.809999999999877,
|
| 438 |
+
"tau": 1.0
|
| 439 |
+
}
|
| 440 |
+
},
|
| 441 |
+
"pass": {
|
| 442 |
+
"mode": "two_of_three_q75_and_avg_ge_q50",
|
| 443 |
+
"used_distribution": "pattern",
|
| 444 |
+
"pattern_paper_count": 331,
|
| 445 |
+
"q50": 6.148000000000001,
|
| 446 |
+
"q75": 6.526000000000001,
|
| 447 |
+
"count_roles_ge_q75": 0,
|
| 448 |
+
"roles_ge_q75": {
|
| 449 |
+
"Methodology": false,
|
| 450 |
+
"Novelty": false,
|
| 451 |
+
"Storyteller": false
|
| 452 |
+
},
|
| 453 |
+
"avg_ge_q50": false,
|
| 454 |
+
"avg_score": 5.656666666666589
|
| 455 |
+
},
|
| 456 |
+
"rubric_version": "rubric_v1",
|
| 457 |
+
"card_version": "blind_card_v2_minimal"
|
| 458 |
+
},
|
| 459 |
+
"field_feedback": {
|
| 460 |
+
"title": {
|
| 461 |
+
"issue": "Title is overly long and includes multiple technical terms that may dilute focus; lacks punchiness to immediately convey the core innovation.",
|
| 462 |
+
"edit_instruction": "Shorten to emphasize key concepts: e.g., 'Inherent Temporal Consistency via Self-Representation and Latent Interaction' or 'Embedding Temporal Consistency in Video Generation with Self-Representation'. Ensure it starts with the main hook.",
|
| 463 |
+
"expected_effect": "Improved clarity and memorability, better aligning with the paper's reframing theme."
|
| 464 |
+
},
|
| 465 |
+
"abstract": {
|
| 466 |
+
"issue": "Abstract reads as a proposal rather than completed research; it states validation but lacks specific, quantitative results to demonstrate effectiveness, weakening credibility.",
|
| 467 |
+
"edit_instruction": "Add a concrete outcome sentence after 'validated through experiments': e.g., 'Our approach reduces flicker score by 15% and warping error by 20% compared to state-of-the-art diffusion models on UCF-101.'",
|
| 468 |
+
"expected_effect": "Transforms the abstract from descriptive to evidence-based, providing immediate impact and setting clear expectations for readers."
|
| 469 |
+
},
|
| 470 |
+
"problem_framing": {
|
| 471 |
+
"issue": "Reframing is asserted but not critically justified; insufficient detail on why post-hoc fixes and sequential models fail, missing a strong gap analysis.",
|
| 472 |
+
"edit_instruction": "Expand the second sentence to cite specific limitations: e.g., 'Post-processing methods often introduce artifacts, while RNNs suffer from vanishing gradients and high computational cost, limiting scalability for long videos.'",
|
| 473 |
+
"expected_effect": "Strengthens the motivation by explicitly linking current method shortcomings to the need for inherent consistency, enhancing persuasive power."
|
| 474 |
+
},
|
| 475 |
+
"method_skeleton": {
|
| 476 |
+
"issue": "Steps are described in abstract, high-level terms without operational details; phrases like 'temporal subspaces' and 'hidden community structures' are vague and lack technical grounding, risking reader confusion.",
|
| 477 |
+
"edit_instruction": "For each step, add one sentence specifying the technique: e.g., 'Step 1: Use sparse autoencoders to learn self-representations by minimizing a reconstruction loss with temporal smoothness regularization over frame patches.'",
|
| 478 |
+
"expected_effect": "Makes the method concrete and understandable, allowing reviewers to assess feasibility and novelty without guessing at implementations."
|
| 479 |
+
},
|
| 480 |
+
"innovation_claims": {
|
| 481 |
+
"issue": "Claims are broad and aspirational rather than specific; they repeat high-level themes without tying to unique technical contributions, making them difficult to evaluate or defend.",
|
| 482 |
+
"edit_instruction": "Rewrite each claim to focus on novel mechanisms: e.g., for the first claim, specify 'by integrating self-representation via optimization over temporal subspaces directly into the diffusion denoising process, eliminating the need for separate consistency modules.'",
|
| 483 |
+
"expected_effect": "Claims become actionable and testable, clearly distinguishing this work from prior art and aligning with method details."
|
| 484 |
+
},
|
| 485 |
+
"experiments_plan": {
|
| 486 |
+
"issue": "Plan is generic; it mentions ablation studies but does not specify how each component (self-representation, latent interaction, sampling correction) will be isolated or which baselines will be used for comparison.",
|
| 487 |
+
"edit_instruction": "Detail the ablation: e.g., 'Ablation studies will separately disable the self-representation loss, community interaction layer, and sampling correction module to quantify their individual contributions to consistency metrics.' Also, name specific SOTA models: e.g., 'Compare to Video Diffusion Models (VDM) and StyleGAN-V.'",
|
| 488 |
+
"expected_effect": "Enhances rigor by providing a clear validation strategy, ensuring that contributions are empirically grounded and comparable to existing work."
|
| 489 |
+
}
|
| 490 |
+
},
|
| 491 |
+
"suggested_edits": [
|
| 492 |
+
{
|
| 493 |
+
"field": "innovation_claims",
|
| 494 |
+
"action": "rewrite",
|
| 495 |
+
"content": "1. Transform temporal consistency from a post-hoc constraint to an inherent generative property by integrating self-representation via optimization over temporal subspaces into the diffusion process, enabling scalable video generation without external labels or heavy supervision. 2. Reframe video generation as latent interaction modeling by introducing a probabilistic graph model that captures frame dependencies through community-specific embeddings and contrastive learning, bypassing sequential bottlenecks to enhance coherence. 3. Handle temporal irregularities by developing a self-supervised density estimation method to correct non-uniform sampling in video data, adjusting graph shift operators in diffusion to improve robustness across diverse frame rates."
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"field": "method_skeleton",
|
| 499 |
+
"action": "expand",
|
| 500 |
+
"content": "Step 1: Develop a self-representation framework using sparse coding or autoencoders to learn compact representations of video frames; enforce consistency via a temporal smoothness loss over subspace projections, minimizing flicker without explicit supervision. Step 2: Introduce a probabilistic model that captures latent interactions by modeling frames as nodes in a graph with community structures; use community-specific embeddings and contrastive learning to learn dependencies, replacing RNNs. Step 3: Correct for non-uniform temporal sampling by estimating frame densities with kernel methods or neural estimators, then adjusting the diffusion process's graph shift operators to ensure uniform temporal coherence."
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"field": "abstract",
|
| 504 |
+
"action": "add",
|
| 505 |
+
"content": " Experimental results demonstrate a 15% reduction in flicker score and a 20% decrease in warping error on UCF-101 compared to state-of-the-art diffusion models, with ablation studies confirming the contributions of each component."
|
| 506 |
+
}
|
| 507 |
+
],
|
| 508 |
+
"priority": [
|
| 509 |
+
"innovation_claims",
|
| 510 |
+
"method_skeleton",
|
| 511 |
+
"abstract"
|
| 512 |
+
],
|
| 513 |
+
"review_coach": {
|
| 514 |
+
"field_feedback": {
|
| 515 |
+
"title": {
|
| 516 |
+
"issue": "Title is overly long and includes multiple technical terms that may dilute focus; lacks punchiness to immediately convey the core innovation.",
|
| 517 |
+
"edit_instruction": "Shorten to emphasize key concepts: e.g., 'Inherent Temporal Consistency via Self-Representation and Latent Interaction' or 'Embedding Temporal Consistency in Video Generation with Self-Representation'. Ensure it starts with the main hook.",
|
| 518 |
+
"expected_effect": "Improved clarity and memorability, better aligning with the paper's reframing theme."
|
| 519 |
+
},
|
| 520 |
+
"abstract": {
|
| 521 |
+
"issue": "Abstract reads as a proposal rather than completed research; it states validation but lacks specific, quantitative results to demonstrate effectiveness, weakening credibility.",
|
| 522 |
+
"edit_instruction": "Add a concrete outcome sentence after 'validated through experiments': e.g., 'Our approach reduces flicker score by 15% and warping error by 20% compared to state-of-the-art diffusion models on UCF-101.'",
|
| 523 |
+
"expected_effect": "Transforms the abstract from descriptive to evidence-based, providing immediate impact and setting clear expectations for readers."
|
| 524 |
+
},
|
| 525 |
+
"problem_framing": {
|
| 526 |
+
"issue": "Reframing is asserted but not critically justified; insufficient detail on why post-hoc fixes and sequential models fail, missing a strong gap analysis.",
|
| 527 |
+
"edit_instruction": "Expand the second sentence to cite specific limitations: e.g., 'Post-processing methods often introduce artifacts, while RNNs suffer from vanishing gradients and high computational cost, limiting scalability for long videos.'",
|
| 528 |
+
"expected_effect": "Strengthens the motivation by explicitly linking current method shortcomings to the need for inherent consistency, enhancing persuasive power."
|
| 529 |
+
},
|
| 530 |
+
"method_skeleton": {
|
| 531 |
+
"issue": "Steps are described in abstract, high-level terms without operational details; phrases like 'temporal subspaces' and 'hidden community structures' are vague and lack technical grounding, risking reader confusion.",
|
| 532 |
+
"edit_instruction": "For each step, add one sentence specifying the technique: e.g., 'Step 1: Use sparse autoencoders to learn self-representations by minimizing a reconstruction loss with temporal smoothness regularization over frame patches.'",
|
| 533 |
+
"expected_effect": "Makes the method concrete and understandable, allowing reviewers to assess feasibility and novelty without guessing at implementations."
|
| 534 |
+
},
|
| 535 |
+
"innovation_claims": {
|
| 536 |
+
"issue": "Claims are broad and aspirational rather than specific; they repeat high-level themes without tying to unique technical contributions, making them difficult to evaluate or defend.",
|
| 537 |
+
"edit_instruction": "Rewrite each claim to focus on novel mechanisms: e.g., for the first claim, specify 'by integrating self-representation via optimization over temporal subspaces directly into the diffusion denoising process, eliminating the need for separate consistency modules.'",
|
| 538 |
+
"expected_effect": "Claims become actionable and testable, clearly distinguishing this work from prior art and aligning with method details."
|
| 539 |
+
},
|
| 540 |
+
"experiments_plan": {
|
| 541 |
+
"issue": "Plan is generic; it mentions ablation studies but does not specify how each component (self-representation, latent interaction, sampling correction) will be isolated or which baselines will be used for comparison.",
|
| 542 |
+
"edit_instruction": "Detail the ablation: e.g., 'Ablation studies will separately disable the self-representation loss, community interaction layer, and sampling correction module to quantify their individual contributions to consistency metrics.' Also, name specific SOTA models: e.g., 'Compare to Video Diffusion Models (VDM) and StyleGAN-V.'",
|
| 543 |
+
"expected_effect": "Enhances rigor by providing a clear validation strategy, ensuring that contributions are empirically grounded and comparable to existing work."
|
| 544 |
+
}
|
| 545 |
+
},
|
| 546 |
+
"suggested_edits": [
|
| 547 |
+
{
|
| 548 |
+
"field": "innovation_claims",
|
| 549 |
+
"action": "rewrite",
|
| 550 |
+
"content": "1. Transform temporal consistency from a post-hoc constraint to an inherent generative property by integrating self-representation via optimization over temporal subspaces into the diffusion process, enabling scalable video generation without external labels or heavy supervision. 2. Reframe video generation as latent interaction modeling by introducing a probabilistic graph model that captures frame dependencies through community-specific embeddings and contrastive learning, bypassing sequential bottlenecks to enhance coherence. 3. Handle temporal irregularities by developing a self-supervised density estimation method to correct non-uniform sampling in video data, adjusting graph shift operators in diffusion to improve robustness across diverse frame rates."
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"field": "method_skeleton",
|
| 554 |
+
"action": "expand",
|
| 555 |
+
"content": "Step 1: Develop a self-representation framework using sparse coding or autoencoders to learn compact representations of video frames; enforce consistency via a temporal smoothness loss over subspace projections, minimizing flicker without explicit supervision. Step 2: Introduce a probabilistic model that captures latent interactions by modeling frames as nodes in a graph with community structures; use community-specific embeddings and contrastive learning to learn dependencies, replacing RNNs. Step 3: Correct for non-uniform temporal sampling by estimating frame densities with kernel methods or neural estimators, then adjusting the diffusion process's graph shift operators to ensure uniform temporal coherence."
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"field": "abstract",
|
| 559 |
+
"action": "add",
|
| 560 |
+
"content": " Experimental results demonstrate a 15% reduction in flicker score and a 20% decrease in warping error on UCF-101 compared to state-of-the-art diffusion models, with ablation studies confirming the contributions of each component."
|
| 561 |
+
}
|
| 562 |
+
],
|
| 563 |
+
"priority": [
|
| 564 |
+
"innovation_claims",
|
| 565 |
+
"method_skeleton",
|
| 566 |
+
"abstract"
|
| 567 |
+
]
|
| 568 |
+
}
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"pass": true,
|
| 572 |
+
"avg_score": 6.599999999999903,
|
| 573 |
+
"reviews": [
|
| 574 |
+
{
|
| 575 |
+
"reviewer": "Reviewer A",
|
| 576 |
+
"role": "Methodology",
|
| 577 |
+
"score": 6.9899999999998945,
|
| 578 |
+
"feedback": "Blind comparisons vs 11 anchors. Loss=4.4103, AvgStrength=1.09. CoachPriority: innovation_claims, method_skeleton, abstract."
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"reviewer": "Reviewer B",
|
| 582 |
+
"role": "Novelty",
|
| 583 |
+
"score": 6.619999999999902,
|
| 584 |
+
"feedback": "Blind comparisons vs 11 anchors. Loss=4.8225, AvgStrength=1.09. CoachPriority: innovation_claims, method_skeleton, abstract."
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"reviewer": "Reviewer C",
|
| 588 |
+
"role": "Storyteller",
|
| 589 |
+
"score": 6.189999999999912,
|
| 590 |
+
"feedback": "Blind comparisons vs 11 anchors. Loss=4.8916, AvgStrength=1.00. CoachPriority: innovation_claims, method_skeleton, abstract."
|
| 591 |
+
}
|
| 592 |
+
],
|
| 593 |
+
"main_issue": "domain_distance",
|
| 594 |
+
"suggestions": [
|
| 595 |
+
"从domain_distance维度选择跨域Pattern",
|
| 596 |
+
"引入不同视角优化叙事"
|
| 597 |
+
],
|
| 598 |
+
"audit": {
|
| 599 |
+
"pattern_id": "pattern_24",
|
| 600 |
+
"anchors": [
|
| 601 |
+
{
|
| 602 |
+
"anchor_id": "A1",
|
| 603 |
+
"paper_id": "4QIgPD5BLnv",
|
| 604 |
+
"score10": 5.247999999999999,
|
| 605 |
+
"weight": 0.6104802280163731
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"anchor_id": "A2",
|
| 609 |
+
"paper_id": "tlhsswFz9x",
|
| 610 |
+
"score10": 5.689000000000001,
|
| 611 |
+
"weight": 0.5025973265716843
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"anchor_id": "A3",
|
| 615 |
+
"paper_id": "jH6pg6JaSP2",
|
| 616 |
+
"score10": 5.922999999999999,
|
| 617 |
+
"weight": 0.4225847804783148
|
| 618 |
+
},
|
| 619 |
+
{
|
| 620 |
+
"anchor_id": "A4",
|
| 621 |
+
"paper_id": "UsVJlgD1F7",
|
| 622 |
+
"score10": 6.004,
|
| 623 |
+
"weight": 1.0417206216442176
|
| 624 |
+
},
|
| 625 |
+
{
|
| 626 |
+
"anchor_id": "A5",
|
| 627 |
+
"paper_id": "ZK1LoTo10R",
|
| 628 |
+
"score10": 6.148000000000001,
|
| 629 |
+
"weight": 0.601261566855052
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"anchor_id": "A6",
|
| 633 |
+
"paper_id": "6MBqQLp17E",
|
| 634 |
+
"score10": 6.348571428571429,
|
| 635 |
+
"weight": 1.0944429166735974
|
| 636 |
+
},
|
| 637 |
+
{
|
| 638 |
+
"anchor_id": "A7",
|
| 639 |
+
"paper_id": "9L1Ts8t66YK",
|
| 640 |
+
"score10": 6.526000000000001,
|
| 641 |
+
"weight": 0.3339719420741948
|
| 642 |
+
},
|
| 643 |
+
{
|
| 644 |
+
"anchor_id": "A8",
|
| 645 |
+
"paper_id": "0f-0I6RFAch",
|
| 646 |
+
"score10": 6.877000000000001,
|
| 647 |
+
"weight": 0.6104802280163729
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"anchor_id": "A9",
|
| 651 |
+
"paper_id": "GcM7qfl5zY",
|
| 652 |
+
"score10": 7.28875,
|
| 653 |
+
"weight": 0.4196709028511344
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"anchor_id": "A10",
|
| 657 |
+
"paper_id": "wKPmPBHSnT6",
|
| 658 |
+
"score10": 6.571,
|
| 659 |
+
"weight": 0.6012615668550518
|
| 660 |
+
},
|
| 661 |
+
{
|
| 662 |
+
"anchor_id": "A11",
|
| 663 |
+
"paper_id": "8Tr3v4ueNd7",
|
| 664 |
+
"score10": 5.698,
|
| 665 |
+
"weight": 0.5025973265716843
|
| 666 |
+
}
|
| 667 |
+
],
|
| 668 |
+
"anchors_rounds": [
|
| 669 |
+
[
|
| 670 |
+
{
|
| 671 |
+
"anchor_id": "A1",
|
| 672 |
+
"paper_id": "4QIgPD5BLnv",
|
| 673 |
+
"score10": 5.247999999999999,
|
| 674 |
+
"weight": 0.6104802280163731
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"anchor_id": "A2",
|
| 678 |
+
"paper_id": "tlhsswFz9x",
|
| 679 |
+
"score10": 5.689000000000001,
|
| 680 |
+
"weight": 0.5025973265716843
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"anchor_id": "A3",
|
| 684 |
+
"paper_id": "jH6pg6JaSP2",
|
| 685 |
+
"score10": 5.922999999999999,
|
| 686 |
+
"weight": 0.4225847804783148
|
| 687 |
+
},
|
| 688 |
+
{
|
| 689 |
+
"anchor_id": "A4",
|
| 690 |
+
"paper_id": "UsVJlgD1F7",
|
| 691 |
+
"score10": 6.004,
|
| 692 |
+
"weight": 1.0417206216442176
|
| 693 |
+
},
|
| 694 |
+
{
|
| 695 |
+
"anchor_id": "A5",
|
| 696 |
+
"paper_id": "ZK1LoTo10R",
|
| 697 |
+
"score10": 6.148000000000001,
|
| 698 |
+
"weight": 0.601261566855052
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"anchor_id": "A6",
|
| 702 |
+
"paper_id": "6MBqQLp17E",
|
| 703 |
+
"score10": 6.348571428571429,
|
| 704 |
+
"weight": 1.0944429166735974
|
| 705 |
+
},
|
| 706 |
+
{
|
| 707 |
+
"anchor_id": "A7",
|
| 708 |
+
"paper_id": "9L1Ts8t66YK",
|
| 709 |
+
"score10": 6.526000000000001,
|
| 710 |
+
"weight": 0.3339719420741948
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"anchor_id": "A8",
|
| 714 |
+
"paper_id": "0f-0I6RFAch",
|
| 715 |
+
"score10": 6.877000000000001,
|
| 716 |
+
"weight": 0.6104802280163729
|
| 717 |
+
},
|
| 718 |
+
{
|
| 719 |
+
"anchor_id": "A9",
|
| 720 |
+
"paper_id": "GcM7qfl5zY",
|
| 721 |
+
"score10": 7.28875,
|
| 722 |
+
"weight": 0.4196709028511344
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"anchor_id": "A10",
|
| 726 |
+
"paper_id": "wKPmPBHSnT6",
|
| 727 |
+
"score10": 6.571,
|
| 728 |
+
"weight": 0.6012615668550518
|
| 729 |
+
},
|
| 730 |
+
{
|
| 731 |
+
"anchor_id": "A11",
|
| 732 |
+
"paper_id": "8Tr3v4ueNd7",
|
| 733 |
+
"score10": 5.698,
|
| 734 |
+
"weight": 0.5025973265716843
|
| 735 |
+
}
|
| 736 |
+
]
|
| 737 |
+
],
|
| 738 |
+
"role_details": {
|
| 739 |
+
"Methodology": {
|
| 740 |
+
"comparisons": [
|
| 741 |
+
{
|
| 742 |
+
"anchor_id": "A1",
|
| 743 |
+
"judgement": "better",
|
| 744 |
+
"strength": "medium",
|
| 745 |
+
"rationale": "Story specifies GMRES and Krylov subspaces; A1 is vaguer."
|
| 746 |
+
},
|
| 747 |
+
{
|
| 748 |
+
"anchor_id": "A2",
|
| 749 |
+
"judgement": "tie",
|
| 750 |
+
"strength": "weak",
|
| 751 |
+
"rationale": "Both describe clear, feasible methods; insufficient evidence."
|
| 752 |
+
},
|
| 753 |
+
{
|
| 754 |
+
"anchor_id": "A3",
|
| 755 |
+
"judgement": "tie",
|
| 756 |
+
"strength": "weak",
|
| 757 |
+
"rationale": "Both mathematically rigorous with specific techniques."
|
| 758 |
+
},
|
| 759 |
+
{
|
| 760 |
+
"anchor_id": "A4",
|
| 761 |
+
"judgement": "better",
|
| 762 |
+
"strength": "weak",
|
| 763 |
+
"rationale": "Story uses specific numerical algorithms; A4 is framework-based."
|
| 764 |
+
},
|
| 765 |
+
{
|
| 766 |
+
"anchor_id": "A5",
|
| 767 |
+
"judgement": "tie",
|
| 768 |
+
"strength": "weak",
|
| 769 |
+
"rationale": "Both incorporate advanced mathematical approaches; comparable."
|
| 770 |
+
},
|
| 771 |
+
{
|
| 772 |
+
"anchor_id": "A6",
|
| 773 |
+
"judgement": "tie",
|
| 774 |
+
"strength": "weak",
|
| 775 |
+
"rationale": "Both present clear technical methods; no clear superiority."
|
| 776 |
+
},
|
| 777 |
+
{
|
| 778 |
+
"anchor_id": "A7",
|
| 779 |
+
"judgement": "tie",
|
| 780 |
+
"strength": "weak",
|
| 781 |
+
"rationale": "Both methods are clearly described and feasible."
|
| 782 |
+
},
|
| 783 |
+
{
|
| 784 |
+
"anchor_id": "A8",
|
| 785 |
+
"judgement": "better",
|
| 786 |
+
"strength": "weak",
|
| 787 |
+
"rationale": "Story's method is more algorithmic; A8 is conceptual."
|
| 788 |
+
},
|
| 789 |
+
{
|
| 790 |
+
"anchor_id": "A9",
|
| 791 |
+
"judgement": "tie",
|
| 792 |
+
"strength": "weak",
|
| 793 |
+
"rationale": "Both are well-defined frameworks; similar soundness."
|
| 794 |
+
},
|
| 795 |
+
{
|
| 796 |
+
"anchor_id": "A10",
|
| 797 |
+
"judgement": "tie",
|
| 798 |
+
"strength": "weak",
|
| 799 |
+
"rationale": "Both have clear mechanisms; insufficient evidence."
|
| 800 |
+
},
|
| 801 |
+
{
|
| 802 |
+
"anchor_id": "A11",
|
| 803 |
+
"judgement": "tie",
|
| 804 |
+
"strength": "weak",
|
| 805 |
+
"rationale": "Both employ rigorous mathematical techniques; comparable."
|
| 806 |
+
}
|
| 807 |
+
],
|
| 808 |
+
"loss": 4.410339703662652,
|
| 809 |
+
"avg_strength": 1.0909090909090908,
|
| 810 |
+
"monotonic_violations": 2,
|
| 811 |
+
"ci_low": 5.459999999999927,
|
| 812 |
+
"ci_high": 8.849999999999856,
|
| 813 |
+
"tau": 1.0
|
| 814 |
+
},
|
| 815 |
+
"Novelty": {
|
| 816 |
+
"comparisons": [
|
| 817 |
+
{
|
| 818 |
+
"anchor_id": "A1",
|
| 819 |
+
"judgement": "better",
|
| 820 |
+
"strength": "weak",
|
| 821 |
+
"rationale": "Story reframes video generation as graph topology diffusion; A1 enhances Graph Transformers with a diffuser."
|
| 822 |
+
},
|
| 823 |
+
{
|
| 824 |
+
"anchor_id": "A2",
|
| 825 |
+
"judgement": "better",
|
| 826 |
+
"strength": "weak",
|
| 827 |
+
"rationale": "Story introduces diffusion-based synthesis of latent dynamic graphs for video; A2 dynamically learns graphs within GCNs."
|
| 828 |
+
},
|
| 829 |
+
{
|
| 830 |
+
"anchor_id": "A3",
|
| 831 |
+
"judgement": "tie",
|
| 832 |
+
"strength": "weak",
|
| 833 |
+
"rationale": "Both introduce novel reframings: Story for video generation, A3 for graph distribution distances."
|
| 834 |
+
},
|
| 835 |
+
{
|
| 836 |
+
"anchor_id": "A4",
|
| 837 |
+
"judgement": "tie",
|
| 838 |
+
"strength": "weak",
|
| 839 |
+
"rationale": "Story reframes video generation; A4 reframes graph invariant learning indirectly."
|
| 840 |
+
},
|
| 841 |
+
{
|
| 842 |
+
"anchor_id": "A5",
|
| 843 |
+
"judgement": "tie",
|
| 844 |
+
"strength": "medium",
|
| 845 |
+
"rationale": "Both integrate topological/graph structures with diffusion models for generation tasks."
|
| 846 |
+
},
|
| 847 |
+
{
|
| 848 |
+
"anchor_id": "A6",
|
| 849 |
+
"judgement": "tie",
|
| 850 |
+
"strength": "weak",
|
| 851 |
+
"rationale": "Story reframes video generation; A6 introduces efficient topological masking for transformers."
|
| 852 |
+
},
|
| 853 |
+
{
|
| 854 |
+
"anchor_id": "A7",
|
| 855 |
+
"judgement": "tie",
|
| 856 |
+
"strength": "weak",
|
| 857 |
+
"rationale": "Both introduce novel concepts: Story for video generation, A7 for graph contrastive learning."
|
| 858 |
+
},
|
| 859 |
+
{
|
| 860 |
+
"anchor_id": "A8",
|
| 861 |
+
"judgement": "tie",
|
| 862 |
+
"strength": "weak",
|
| 863 |
+
"rationale": "Story reframes video generation; A8 reframes generalization via symbolic graphs."
|
| 864 |
+
},
|
| 865 |
+
{
|
| 866 |
+
"anchor_id": "A9",
|
| 867 |
+
"judgement": "tie",
|
| 868 |
+
"strength": "weak",
|
| 869 |
+
"rationale": "Story introduces novel reframing for video; A9 automates Graph Transformer design."
|
| 870 |
+
},
|
| 871 |
+
{
|
| 872 |
+
"anchor_id": "A10",
|
| 873 |
+
"judgement": "tie",
|
| 874 |
+
"strength": "weak",
|
| 875 |
+
"rationale": "Story reframes video generation; A10 introduces ordered message passing for GNNs."
|
| 876 |
+
},
|
| 877 |
+
{
|
| 878 |
+
"anchor_id": "A11",
|
| 879 |
+
"judgement": "tie",
|
| 880 |
+
"strength": "weak",
|
| 881 |
+
"rationale": "Story reframes video generation; A11 introduces expander graphs for scaling."
|
| 882 |
+
}
|
| 883 |
+
],
|
| 884 |
+
"loss": 4.822466569695633,
|
| 885 |
+
"avg_strength": 1.0909090909090908,
|
| 886 |
+
"monotonic_violations": 0,
|
| 887 |
+
"ci_low": 4.539999999999947,
|
| 888 |
+
"ci_high": 8.869999999999855,
|
| 889 |
+
"tau": 1.4
|
| 890 |
+
},
|
| 891 |
+
"Storyteller": {
|
| 892 |
+
"comparisons": [
|
| 893 |
+
{
|
| 894 |
+
"anchor_id": "A1",
|
| 895 |
+
"judgement": "tie",
|
| 896 |
+
"strength": "weak",
|
| 897 |
+
"rationale": "Both present clear problem, method, and reframing contribution."
|
| 898 |
+
},
|
| 899 |
+
{
|
| 900 |
+
"anchor_id": "A2",
|
| 901 |
+
"judgement": "tie",
|
| 902 |
+
"strength": "weak",
|
| 903 |
+
"rationale": "Both offer coherent narratives with problem, method, and reframing."
|
| 904 |
+
},
|
| 905 |
+
{
|
| 906 |
+
"anchor_id": "A3",
|
| 907 |
+
"judgement": "tie",
|
| 908 |
+
"strength": "weak",
|
| 909 |
+
"rationale": "Both clearly state problem, method, and contribution with reframing."
|
| 910 |
+
},
|
| 911 |
+
{
|
| 912 |
+
"anchor_id": "A4",
|
| 913 |
+
"judgement": "tie",
|
| 914 |
+
"strength": "weak",
|
| 915 |
+
"rationale": "Both have clear narrative arcs: problem, method, and reframing."
|
| 916 |
+
},
|
| 917 |
+
{
|
| 918 |
+
"anchor_id": "A5",
|
| 919 |
+
"judgement": "tie",
|
| 920 |
+
"strength": "weak",
|
| 921 |
+
"rationale": "Both integrate topology with diffusion and reframe their respective problems."
|
| 922 |
+
},
|
| 923 |
+
{
|
| 924 |
+
"anchor_id": "A6",
|
| 925 |
+
"judgement": "tie",
|
| 926 |
+
"strength": "weak",
|
| 927 |
+
"rationale": "Both present clear motivation, method, and reframed contribution."
|
| 928 |
+
},
|
| 929 |
+
{
|
| 930 |
+
"anchor_id": "A7",
|
| 931 |
+
"judgement": "tie",
|
| 932 |
+
"strength": "weak",
|
| 933 |
+
"rationale": "Both have coherent narratives with problem, method, and reframing."
|
| 934 |
+
},
|
| 935 |
+
{
|
| 936 |
+
"anchor_id": "A8",
|
| 937 |
+
"judgement": "tie",
|
| 938 |
+
"strength": "weak",
|
| 939 |
+
"rationale": "Both clearly define problem, method, and contribution with reframing."
|
| 940 |
+
},
|
| 941 |
+
{
|
| 942 |
+
"anchor_id": "A9",
|
| 943 |
+
"judgement": "tie",
|
| 944 |
+
"strength": "weak",
|
| 945 |
+
"rationale": "Both offer clear narratives: problem, method, and reframed contribution."
|
| 946 |
+
},
|
| 947 |
+
{
|
| 948 |
+
"anchor_id": "A10",
|
| 949 |
+
"judgement": "tie",
|
| 950 |
+
"strength": "weak",
|
| 951 |
+
"rationale": "Both present clear problem, method, and reframing contribution."
|
| 952 |
+
},
|
| 953 |
+
{
|
| 954 |
+
"anchor_id": "A11",
|
| 955 |
+
"judgement": "tie",
|
| 956 |
+
"strength": "weak",
|
| 957 |
+
"rationale": "Both have coherent narratives with problem, method, and reframing."
|
| 958 |
+
}
|
| 959 |
+
],
|
| 960 |
+
"loss": 4.891633780086148,
|
| 961 |
+
"avg_strength": 1.0,
|
| 962 |
+
"monotonic_violations": 0,
|
| 963 |
+
"ci_low": 4.569999999999946,
|
| 964 |
+
"ci_high": 7.809999999999877,
|
| 965 |
+
"tau": 1.0
|
| 966 |
+
}
|
| 967 |
+
},
|
| 968 |
+
"pass": {
|
| 969 |
+
"mode": "two_of_three_q75_and_avg_ge_q50",
|
| 970 |
+
"used_distribution": "pattern",
|
| 971 |
+
"pattern_paper_count": 331,
|
| 972 |
+
"q50": 6.148000000000001,
|
| 973 |
+
"q75": 6.526000000000001,
|
| 974 |
+
"count_roles_ge_q75": 2,
|
| 975 |
+
"roles_ge_q75": {
|
| 976 |
+
"Methodology": true,
|
| 977 |
+
"Novelty": true,
|
| 978 |
+
"Storyteller": false
|
| 979 |
+
},
|
| 980 |
+
"avg_ge_q50": true,
|
| 981 |
+
"avg_score": 6.599999999999903
|
| 982 |
+
},
|
| 983 |
+
"rubric_version": "rubric_v1",
|
| 984 |
+
"card_version": "blind_card_v2_minimal"
|
| 985 |
+
},
|
| 986 |
+
"field_feedback": {
|
| 987 |
+
"title": {
|
| 988 |
+
"issue": "The title uses abstract theoretical terminology ('Emergent Temporal Topology') that creates distance from the core computer vision domain of video generation, potentially alienating the primary target audience.",
|
| 989 |
+
"edit_instruction": "Replace 'Emergent Temporal Topology' with more descriptive, domain-aligned terms like 'Latent Dynamic Graph' or 'Spatiotemporal Structure'. Ensure the title immediately signals the application (video) and core technical approach (graph-based diffusion). Example: 'Generating Consistent Videos via Diffusion of Latent Dynamic Graphs'.",
|
| 990 |
+
"expected_effect": "Increased clarity and accessibility for the computer vision community, immediately signaling the paper's contribution within the video generation domain."
|
| 991 |
+
},
|
| 992 |
+
"abstract": {
|
| 993 |
+
"issue": "The abstract leads with a theoretical reframing ('generative modeling of latent structure') before establishing the concrete, recognized problem (temporal inconsistency). The metrics are presented but not anchored to a clear, domain-standard narrative of solving flicker/warping.",
|
| 994 |
+
"edit_instruction": "Restructure the abstract to follow the standard problem-solution-impact narrative. First sentence should state the practical problem (temporal inconsistency/flicker in video generation). Then introduce your core solution (generating a latent dynamic graph). Finally, present results (e.g., 'Our method reduces flicker by 18%...') as the direct consequence. Replace 'spatiotemporal graph whose stability dictates visual coherence' with more direct language like 'underlying graph of visual entities whose stable evolution ensures consistency'.",
|
| 995 |
+
"expected_effect": "The abstract will better engage video generation researchers by foregrounding their known problem and clearly presenting a novel solution with quantifiable benefits."
|
| 996 |
+
},
|
| 997 |
+
"problem_framing": {
|
| 998 |
+
"issue": "The framing is overly abstract and uses metaphorical language ('scaffold', 'community-level structures') without immediately grounding these concepts in established video semantics (e.g., objects, scenes, motions). It critiques sequential models but does not explicitly connect their failure modes (vanishing gradients) to the *visual* symptom of inconsistency.",
|
| 999 |
+
"edit_instruction": "Re-anchor the problem in concrete visual artifacts. Start with: 'State-of-the-art video generators produce flicker and warping because they lack a persistent representation of...'. Explicitly map 'community-level structures' to 'semantic entities (e.g., objects, backgrounds) and their persistent interactions'. Connect the limitation of sequential models ('vanishing gradients') directly to their inability to maintain the identity of these entities over long ranges.",
|
| 1000 |
+
"expected_effect": "Bridges the domain gap by showing how the abstract graph theory problem directly explains and solves a concrete, visual quality issue familiar to the audience."
|
| 1001 |
+
},
|
| 1002 |
+
"method_skeleton": {
|
| 1003 |
+
"issue": "The description is a list of technical procedures (GMRES, CLEP, KDE) without explaining their *visual or representational purpose* in the context of video. The connection from 'sparse self-expressive code' to a visually meaningful 'graph node' is assumed. The role of 'temporal density correction' in improving visual output is unclear.",
|
| 1004 |
+
"edit_instruction": "For each step, preface the technical method with its goal for video coherence. E.g., 'Step 1: To identify persistent visual entities across frames, we discover latent graph communities...'. Explicitly state that nodes correspond to recurring visual patches/features. In Step 3, explain that 'temporal density correction' adjusts the predicted graph dynamics to match real-world motion patterns, reducing jitter. Replace 'graph shift operators' with 'the rules governing how node connections change'.",
|
| 1005 |
+
"expected_effect": "Makes the method intelligible to vision researchers by consistently linking mathematical tools to their role in solving the visual consistency problem."
|
| 1006 |
+
},
|
| 1007 |
+
"innovation_claims": {
|
| 1008 |
+
"issue": "Claims are phrased as internal methodological shifts ('reframe... to an inherent generative property', 'co-evolutionary mechanism', 'unifies... into a single generative act') rather than as external, observable advantages for the field. They emphasize 'bypassing sequential bottlenecks' but not the resulting practical benefit (e.g., stable long-range generation).",
|
| 1009 |
+
"edit_instruction": "Reformulate each claim to start with the tangible advantage for video generation. Claim 1: 'Our framework produces more temporally consistent videos by making stability a property of the generated latent graph, not a post-hoc constraint.' Claim 2: 'We enable scalable, long-range coherence by jointly discovering visual entities and modeling their interactions via a probabilistic graph, avoiding sequential modeling's compounding errors.' Claim 3: 'We improve robustness to variable frame rates by conditioning graph dynamics on a self-supervised estimate of temporal density, aligning generated motion with real-world timing.'",
|
| 1010 |
+
"expected_effect": "Transforms the claims from descriptions of internal mechanism to clear value propositions for practitioners, directly addressing the domain-distance issue."
|
| 1011 |
+
},
|
| 1012 |
+
"experiments_plan": {
|
| 1013 |
+
"issue": "The plan validates against standard benchmarks but does not propose an analysis directly linking graph properties to visual improvements. The 'visualize the generated latent graphs' is qualitative but not tied to a specific hypothesis about how graph stability reduces flicker.",
|
| 1014 |
+
"edit_instruction": "Add a specific quantitative analysis: 'We will compute graph stability metrics (e.g., node persistence, edge volatility) across generated sequences and correlate them with per-video flicker and warping scores to validate that our method's improved visual consistency arises from more stable latent topologies.' In the ablation, specify the expected outcome: e.g., 'Ablation (2) will show that replacing CLEP with sequential attention increases graph volatility and flicker score.'",
|
| 1015 |
+
"expected_effect": "Provides concrete, empirical evidence bridging the novel graph-theoretic construct (latent topology) to the domain's standard goal (visual consistency), strengthening the paper's core thesis."
|
| 1016 |
+
}
|
| 1017 |
+
},
|
| 1018 |
+
"suggested_edits": [
|
| 1019 |
+
{
|
| 1020 |
+
"field": "innovation_claims",
|
| 1021 |
+
"action": "rewrite",
|
| 1022 |
+
"content": "1. **Stability as a Generative Property:** Our framework produces videos with significantly reduced flicker and warping by making temporal consistency an inherent property of the generated latent dynamic graph, rather than a post-hoc constraint applied to frames.\n2. **Scalable Long-Range Coherence:** We enable coherent generation over long time horizons by jointly discovering persistent visual entities and modeling their probabilistic interactions within a graph, avoiding the compounding errors and bottlenecks of sequential autoregressive models.\n3. **Robustness to Irregular Timing:** We improve generation robustness across diverse and unpredictable frame rates by conditioning the graph's evolution on a self-supervised estimate of temporal density, ensuring generated motion patterns align with real-world timing."
|
| 1023 |
+
},
|
| 1024 |
+
{
|
| 1025 |
+
"field": "method_skeleton",
|
| 1026 |
+
"action": "expand",
|
| 1027 |
+
"content": "**Method Overview:** Our goal is to generate a video by first generating the stable, evolving relationships between its constituent visual parts. We achieve this in three stages:\n1. **Discovering Visual Entities as Graph Nodes:** To identify what constitutes a persistent 'part' of the video (e.g., an object, a texture region), we analyze frame segments via a self-representation framework. This learns a sparse code for each segment, grouping recurring visual features into candidate graph nodes. This solves the problem of 'what' should have consistent properties over time.\n2. **Generating the Evolving Interaction Graph:** We synthesize the video's latent structure by denoising a latent adjacency tensor in a diffusion process. A probabilistic latent interaction model (CLEP) uses community-specific embeddings to predict how edges (interactions) between nodes change frame-to-frame. A contrastive loss ensures node identities (community memberships) remain stable, providing the scaffold for coherent frame generation.\n3. **Aligning Graph Dynamics with Real-World Motion:** To handle non-uniform frame rates and ensure natural motion, we estimate the temporal density of input timestamps via kernel density estimation. This density corrects the transition probabilities in the latent interaction model, ensuring the generated graph's evolution respects realistic timing, which in turn guides the decoder to produce frames with natural motion flow."
|
| 1028 |
+
},
|
| 1029 |
+
{
|
| 1030 |
+
"field": "abstract",
|
| 1031 |
+
"action": "rewrite",
|
| 1032 |
+
"content": "Temporal inconsistency, manifesting as flicker and warping, remains a core challenge in video generation. We posit that this stems from a lack of persistent representation for the visual entities within a scene. This paper introduces a diffusion framework that addresses this by generating a video's underlying latent dynamic graph structure first. Our method discovers persistent visual entities as graph nodes and models their probabilistic interactions over time. Coherent video frames are then decoded from this stable graph scaffold. Experiments show our approach reduces flicker by 18% and temporal warping error by 22% on UCF-101 compared to Video Diffusion Models. Ablations confirm that generating a stable graph is key to achieving visual consistency."
|
| 1033 |
+
}
|
| 1034 |
+
],
|
| 1035 |
+
"priority": [
|
| 1036 |
+
"innovation_claims",
|
| 1037 |
+
"method_skeleton",
|
| 1038 |
+
"abstract"
|
| 1039 |
+
],
|
| 1040 |
+
"review_coach": {
|
| 1041 |
+
"field_feedback": {
|
| 1042 |
+
"title": {
|
| 1043 |
+
"issue": "The title uses abstract theoretical terminology ('Emergent Temporal Topology') that creates distance from the core computer vision domain of video generation, potentially alienating the primary target audience.",
|
| 1044 |
+
"edit_instruction": "Replace 'Emergent Temporal Topology' with more descriptive, domain-aligned terms like 'Latent Dynamic Graph' or 'Spatiotemporal Structure'. Ensure the title immediately signals the application (video) and core technical approach (graph-based diffusion). Example: 'Generating Consistent Videos via Diffusion of Latent Dynamic Graphs'.",
|
| 1045 |
+
"expected_effect": "Increased clarity and accessibility for the computer vision community, immediately signaling the paper's contribution within the video generation domain."
|
| 1046 |
+
},
|
| 1047 |
+
"abstract": {
|
| 1048 |
+
"issue": "The abstract leads with a theoretical reframing ('generative modeling of latent structure') before establishing the concrete, recognized problem (temporal inconsistency). The metrics are presented but not anchored to a clear, domain-standard narrative of solving flicker/warping.",
|
| 1049 |
+
"edit_instruction": "Restructure the abstract to follow the standard problem-solution-impact narrative. First sentence should state the practical problem (temporal inconsistency/flicker in video generation). Then introduce your core solution (generating a latent dynamic graph). Finally, present results (e.g., 'Our method reduces flicker by 18%...') as the direct consequence. Replace 'spatiotemporal graph whose stability dictates visual coherence' with more direct language like 'underlying graph of visual entities whose stable evolution ensures consistency'.",
|
| 1050 |
+
"expected_effect": "The abstract will better engage video generation researchers by foregrounding their known problem and clearly presenting a novel solution with quantifiable benefits."
|
| 1051 |
+
},
|
| 1052 |
+
"problem_framing": {
|
| 1053 |
+
"issue": "The framing is overly abstract and uses metaphorical language ('scaffold', 'community-level structures') without immediately grounding these concepts in established video semantics (e.g., objects, scenes, motions). It critiques sequential models but does not explicitly connect their failure modes (vanishing gradients) to the *visual* symptom of inconsistency.",
|
| 1054 |
+
"edit_instruction": "Re-anchor the problem in concrete visual artifacts. Start with: 'State-of-the-art video generators produce flicker and warping because they lack a persistent representation of...'. Explicitly map 'community-level structures' to 'semantic entities (e.g., objects, backgrounds) and their persistent interactions'. Connect the limitation of sequential models ('vanishing gradients') directly to their inability to maintain the identity of these entities over long ranges.",
|
| 1055 |
+
"expected_effect": "Bridges the domain gap by showing how the abstract graph theory problem directly explains and solves a concrete, visual quality issue familiar to the audience."
|
| 1056 |
+
},
|
| 1057 |
+
"method_skeleton": {
|
| 1058 |
+
"issue": "The description is a list of technical procedures (GMRES, CLEP, KDE) without explaining their *visual or representational purpose* in the context of video. The connection from 'sparse self-expressive code' to a visually meaningful 'graph node' is assumed. The role of 'temporal density correction' in improving visual output is unclear.",
|
| 1059 |
+
"edit_instruction": "For each step, preface the technical method with its goal for video coherence. E.g., 'Step 1: To identify persistent visual entities across frames, we discover latent graph communities...'. Explicitly state that nodes correspond to recurring visual patches/features. In Step 3, explain that 'temporal density correction' adjusts the predicted graph dynamics to match real-world motion patterns, reducing jitter. Replace 'graph shift operators' with 'the rules governing how node connections change'.",
|
| 1060 |
+
"expected_effect": "Makes the method intelligible to vision researchers by consistently linking mathematical tools to their role in solving the visual consistency problem."
|
| 1061 |
+
},
|
| 1062 |
+
"innovation_claims": {
|
| 1063 |
+
"issue": "Claims are phrased as internal methodological shifts ('reframe... to an inherent generative property', 'co-evolutionary mechanism', 'unifies... into a single generative act') rather than as external, observable advantages for the field. They emphasize 'bypassing sequential bottlenecks' but not the resulting practical benefit (e.g., stable long-range generation).",
|
| 1064 |
+
"edit_instruction": "Reformulate each claim to start with the tangible advantage for video generation. Claim 1: 'Our framework produces more temporally consistent videos by making stability a property of the generated latent graph, not a post-hoc constraint.' Claim 2: 'We enable scalable, long-range coherence by jointly discovering visual entities and modeling their interactions via a probabilistic graph, avoiding sequential modeling's compounding errors.' Claim 3: 'We improve robustness to variable frame rates by conditioning graph dynamics on a self-supervised estimate of temporal density, aligning generated motion with real-world timing.'",
|
| 1065 |
+
"expected_effect": "Transforms the claims from descriptions of internal mechanism to clear value propositions for practitioners, directly addressing the domain-distance issue."
|
| 1066 |
+
},
|
| 1067 |
+
"experiments_plan": {
|
| 1068 |
+
"issue": "The plan validates against standard benchmarks but does not propose an analysis directly linking graph properties to visual improvements. The 'visualize the generated latent graphs' is qualitative but not tied to a specific hypothesis about how graph stability reduces flicker.",
|
| 1069 |
+
"edit_instruction": "Add a specific quantitative analysis: 'We will compute graph stability metrics (e.g., node persistence, edge volatility) across generated sequences and correlate them with per-video flicker and warping scores to validate that our method's improved visual consistency arises from more stable latent topologies.' In the ablation, specify the expected outcome: e.g., 'Ablation (2) will show that replacing CLEP with sequential attention increases graph volatility and flicker score.'",
|
| 1070 |
+
"expected_effect": "Provides concrete, empirical evidence bridging the novel graph-theoretic construct (latent topology) to the domain's standard goal (visual consistency), strengthening the paper's core thesis."
|
| 1071 |
+
}
|
| 1072 |
+
},
|
| 1073 |
+
"suggested_edits": [
|
| 1074 |
+
{
|
| 1075 |
+
"field": "innovation_claims",
|
| 1076 |
+
"action": "rewrite",
|
| 1077 |
+
"content": "1. **Stability as a Generative Property:** Our framework produces videos with significantly reduced flicker and warping by making temporal consistency an inherent property of the generated latent dynamic graph, rather than a post-hoc constraint applied to frames.\n2. **Scalable Long-Range Coherence:** We enable coherent generation over long time horizons by jointly discovering persistent visual entities and modeling their probabilistic interactions within a graph, avoiding the compounding errors and bottlenecks of sequential autoregressive models.\n3. **Robustness to Irregular Timing:** We improve generation robustness across diverse and unpredictable frame rates by conditioning the graph's evolution on a self-supervised estimate of temporal density, ensuring generated motion patterns align with real-world timing."
|
| 1078 |
+
},
|
| 1079 |
+
{
|
| 1080 |
+
"field": "method_skeleton",
|
| 1081 |
+
"action": "expand",
|
| 1082 |
+
"content": "**Method Overview:** Our goal is to generate a video by first generating the stable, evolving relationships between its constituent visual parts. We achieve this in three stages:\n1. **Discovering Visual Entities as Graph Nodes:** To identify what constitutes a persistent 'part' of the video (e.g., an object, a texture region), we analyze frame segments via a self-representation framework. This learns a sparse code for each segment, grouping recurring visual features into candidate graph nodes. This solves the problem of 'what' should have consistent properties over time.\n2. **Generating the Evolving Interaction Graph:** We synthesize the video's latent structure by denoising a latent adjacency tensor in a diffusion process. A probabilistic latent interaction model (CLEP) uses community-specific embeddings to predict how edges (interactions) between nodes change frame-to-frame. A contrastive loss ensures node identities (community memberships) remain stable, providing the scaffold for coherent frame generation.\n3. **Aligning Graph Dynamics with Real-World Motion:** To handle non-uniform frame rates and ensure natural motion, we estimate the temporal density of input timestamps via kernel density estimation. This density corrects the transition probabilities in the latent interaction model, ensuring the generated graph's evolution respects realistic timing, which in turn guides the decoder to produce frames with natural motion flow."
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"field": "abstract",
|
| 1086 |
+
"action": "rewrite",
|
| 1087 |
+
"content": "Temporal inconsistency, manifesting as flicker and warping, remains a core challenge in video generation. We posit that this stems from a lack of persistent representation for the visual entities within a scene. This paper introduces a diffusion framework that addresses this by generating a video's underlying latent dynamic graph structure first. Our method discovers persistent visual entities as graph nodes and models their probabilistic interactions over time. Coherent video frames are then decoded from this stable graph scaffold. Experiments show our approach reduces flicker by 18% and temporal warping error by 22% on UCF-101 compared to Video Diffusion Models. Ablations confirm that generating a stable graph is key to achieving visual consistency."
|
| 1088 |
+
}
|
| 1089 |
+
],
|
| 1090 |
+
"priority": [
|
| 1091 |
+
"innovation_claims",
|
| 1092 |
+
"method_skeleton",
|
| 1093 |
+
"abstract"
|
| 1094 |
+
]
|
| 1095 |
+
}
|
| 1096 |
+
}
|
| 1097 |
+
],
|
| 1098 |
+
"results_dir": "/Users/weishi/dev/Idea2Paper/results/run_20260207_145754_74886_ee08d4",
|
| 1099 |
+
"novelty_report": {
|
| 1100 |
+
"run_id": "run_20260207_145754_74886_ee08d4",
|
| 1101 |
+
"created_at": "2026-02-07T15:26:31.009015+00:00",
|
| 1102 |
+
"user_idea": "Improving diffusion models for temporal consistency in video generation",
|
| 1103 |
+
"embedding_available": false,
|
| 1104 |
+
"embedding_model": null,
|
| 1105 |
+
"top_k": 100,
|
| 1106 |
+
"thresholds": {
|
| 1107 |
+
"high": 0.88,
|
| 1108 |
+
"medium": 0.82
|
| 1109 |
+
},
|
| 1110 |
+
"risk_level": "unknown",
|
| 1111 |
+
"max_similarity": 0.1092896174863388,
|
| 1112 |
+
"candidates": [
|
| 1113 |
+
{
|
| 1114 |
+
"paper_id": "d23EVDRJ6g",
|
| 1115 |
+
"title": "MotionDreamer: One-to-Many Motion Synthesis with Localized Generative Masked Transformer",
|
| 1116 |
+
"pattern_id": "",
|
| 1117 |
+
"domain": "Computer Vision",
|
| 1118 |
+
"cosine": null,
|
| 1119 |
+
"keyword_overlap": 0.1092896174863388
|
| 1120 |
+
},
|
| 1121 |
+
{
|
| 1122 |
+
"paper_id": "X41c4uB4k0",
|
| 1123 |
+
"title": "Training-free Multi-objective Diffusion Model for 3D Molecule Generation",
|
| 1124 |
+
"pattern_id": "pattern_6",
|
| 1125 |
+
"domain": "Machine Learning",
|
| 1126 |
+
"cosine": null,
|
| 1127 |
+
"keyword_overlap": 0.10704225352112676
|
| 1128 |
+
},
|
| 1129 |
+
{
|
| 1130 |
+
"paper_id": "ZE6lrLvATd",
|
| 1131 |
+
"title": "Improving Equivariant Networks with Probabilistic Symmetry Breaking",
|
| 1132 |
+
"pattern_id": "pattern_31",
|
| 1133 |
+
"domain": "Machine Learning",
|
| 1134 |
+
"cosine": null,
|
| 1135 |
+
"keyword_overlap": 0.10684931506849316
|
| 1136 |
+
},
|
| 1137 |
+
{
|
| 1138 |
+
"paper_id": "MbM1BqGpZu",
|
| 1139 |
+
"title": "Diffusion Transformer Captures Spatial-Temporal Dependencies: A Theory for Gaussian Process Data",
|
| 1140 |
+
"pattern_id": "",
|
| 1141 |
+
"domain": "Machine Learning",
|
| 1142 |
+
"cosine": null,
|
| 1143 |
+
"keyword_overlap": 0.10662824207492795
|
| 1144 |
+
},
|
| 1145 |
+
{
|
| 1146 |
+
"paper_id": "j6zUzrapY3L",
|
| 1147 |
+
"title": "DIFFormer: Scalable (Graph) Transformers Induced by Energy Constrained Diffusion",
|
| 1148 |
+
"pattern_id": "",
|
| 1149 |
+
"domain": "Machine Learning",
|
| 1150 |
+
"cosine": null,
|
| 1151 |
+
"keyword_overlap": 0.10638297872340426
|
| 1152 |
+
},
|
| 1153 |
+
{
|
| 1154 |
+
"paper_id": "1JbsdayvhO",
|
| 1155 |
+
"title": "Denoising Diffusion via Image-Based Rendering",
|
| 1156 |
+
"pattern_id": "",
|
| 1157 |
+
"domain": "Computer Vision",
|
| 1158 |
+
"cosine": null,
|
| 1159 |
+
"keyword_overlap": 0.10597826086956522
|
| 1160 |
+
},
|
| 1161 |
+
{
|
| 1162 |
+
"paper_id": "E78OaH2s3f",
|
| 1163 |
+
"title": "CAS: A Probability-Based Approach for Universal Condition Alignment Score",
|
| 1164 |
+
"pattern_id": "pattern_84",
|
| 1165 |
+
"domain": "Machine Learning",
|
| 1166 |
+
"cosine": null,
|
| 1167 |
+
"keyword_overlap": 0.10571428571428572
|
| 1168 |
+
},
|
| 1169 |
+
{
|
| 1170 |
+
"paper_id": "xnssGv9rpW",
|
| 1171 |
+
"title": "SymmCD: Symmetry-Preserving Crystal Generation with Diffusion Models",
|
| 1172 |
+
"pattern_id": "pattern_6",
|
| 1173 |
+
"domain": "Materials Science",
|
| 1174 |
+
"cosine": null,
|
| 1175 |
+
"keyword_overlap": 0.10541310541310542
|
| 1176 |
+
},
|
| 1177 |
+
{
|
| 1178 |
+
"paper_id": "exKHibougU",
|
| 1179 |
+
"title": "LLM-grounded Video Diffusion Models",
|
| 1180 |
+
"pattern_id": "pattern_114",
|
| 1181 |
+
"domain": "Computer Vision",
|
| 1182 |
+
"cosine": null,
|
| 1183 |
+
"keyword_overlap": 0.10497237569060773
|
| 1184 |
+
},
|
| 1185 |
+
{
|
| 1186 |
+
"paper_id": "AAXBfJNHDt",
|
| 1187 |
+
"title": "Generating Graphs via Spectral Diffusion",
|
| 1188 |
+
"pattern_id": "pattern_24",
|
| 1189 |
+
"domain": "Machine Learning",
|
| 1190 |
+
"cosine": null,
|
| 1191 |
+
"keyword_overlap": 0.10404624277456648
|
| 1192 |
+
}
|
| 1193 |
+
],
|
| 1194 |
+
"notes": [
|
| 1195 |
+
"index_reused",
|
| 1196 |
+
"story_embedding_failed"
|
| 1197 |
+
],
|
| 1198 |
+
"report_path": "/Users/weishi/dev/Idea2Paper/results/run_20260207_145754_74886_ee08d4/novelty_report.json",
|
| 1199 |
+
"pivot_attempts": 0,
|
| 1200 |
+
"action": "report_only"
|
| 1201 |
+
},
|
| 1202 |
+
"recall_audit": {
|
| 1203 |
+
"final_top_k": [
|
| 1204 |
+
{
|
| 1205 |
+
"pattern_id": "pattern_114",
|
| 1206 |
+
"name": "Reframing Video Generation Challenges",
|
| 1207 |
+
"final_score": 0.7782653664943752,
|
| 1208 |
+
"path1_score": 0.604532279314888,
|
| 1209 |
+
"path2_score": 0.0,
|
| 1210 |
+
"path3_score": 0.1737330871794872,
|
| 1211 |
+
"cluster_size": 44
|
| 1212 |
+
},
|
| 1213 |
+
{
|
| 1214 |
+
"pattern_id": "pattern_100",
|
| 1215 |
+
"name": "Reframing Diffusion Sampling Efficiency",
|
| 1216 |
+
"final_score": 0.3307936066627817,
|
| 1217 |
+
"path1_score": 0.1523809523809524,
|
| 1218 |
+
"path2_score": 0.00017460317460317465,
|
| 1219 |
+
"path3_score": 0.17823805110722613,
|
| 1220 |
+
"cluster_size": 148
|
| 1221 |
+
},
|
| 1222 |
+
{
|
| 1223 |
+
"pattern_id": "pattern_24",
|
| 1224 |
+
"name": "Reframing Graph Learning Scalability",
|
| 1225 |
+
"final_score": 0.13431166596989966,
|
| 1226 |
+
"path1_score": 0.08695652173913043,
|
| 1227 |
+
"path2_score": 0.0,
|
| 1228 |
+
"path3_score": 0.047355144230769225,
|
| 1229 |
+
"cluster_size": 331
|
| 1230 |
+
},
|
| 1231 |
+
{
|
| 1232 |
+
"pattern_id": "pattern_115",
|
| 1233 |
+
"name": "Semantic Alignment for Compositional Generation",
|
| 1234 |
+
"final_score": 0.12062720000000002,
|
| 1235 |
+
"path1_score": 0.08000000000000002,
|
| 1236 |
+
"path2_score": 0.0,
|
| 1237 |
+
"path3_score": 0.0406272,
|
| 1238 |
+
"cluster_size": 107
|
| 1239 |
+
},
|
| 1240 |
+
{
|
| 1241 |
+
"pattern_id": "pattern_102",
|
| 1242 |
+
"name": "Text to 3D generation robustness",
|
| 1243 |
+
"final_score": 0.11408018285714286,
|
| 1244 |
+
"path1_score": 0.0761904761904762,
|
| 1245 |
+
"path2_score": 0.0,
|
| 1246 |
+
"path3_score": 0.03788970666666666,
|
| 1247 |
+
"cluster_size": 50
|
| 1248 |
+
},
|
| 1249 |
+
{
|
| 1250 |
+
"pattern_id": "pattern_84",
|
| 1251 |
+
"name": "Reframing Generative Model Training Dynamics",
|
| 1252 |
+
"final_score": 0.08695652173913043,
|
| 1253 |
+
"path1_score": 0.08695652173913043,
|
| 1254 |
+
"path2_score": 0.0,
|
| 1255 |
+
"path3_score": 0.0,
|
| 1256 |
+
"cluster_size": 43
|
| 1257 |
+
},
|
| 1258 |
+
{
|
| 1259 |
+
"pattern_id": "pattern_94",
|
| 1260 |
+
"name": "Reframing Generation Through Multi-Feature Integration",
|
| 1261 |
+
"final_score": 0.08421052631578947,
|
| 1262 |
+
"path1_score": 0.08421052631578947,
|
| 1263 |
+
"path2_score": 0.0,
|
| 1264 |
+
"path3_score": 0.0,
|
| 1265 |
+
"cluster_size": 34
|
| 1266 |
+
},
|
| 1267 |
+
{
|
| 1268 |
+
"pattern_id": "pattern_49",
|
| 1269 |
+
"name": "Reframing Inverse Problems with Diffusion",
|
| 1270 |
+
"final_score": 0.0404496,
|
| 1271 |
+
"path1_score": 0.0,
|
| 1272 |
+
"path2_score": 0.0,
|
| 1273 |
+
"path3_score": 0.0404496,
|
| 1274 |
+
"cluster_size": 15
|
| 1275 |
+
},
|
| 1276 |
+
{
|
| 1277 |
+
"pattern_id": "pattern_7",
|
| 1278 |
+
"name": "Reframing Audio Understanding Through Multimodal and Probabilistic Learning",
|
| 1279 |
+
"final_score": 0.038762722539682534,
|
| 1280 |
+
"path1_score": 0.0,
|
| 1281 |
+
"path2_score": 0.0008730158730158731,
|
| 1282 |
+
"path3_score": 0.03788970666666666,
|
| 1283 |
+
"cluster_size": 41
|
| 1284 |
+
},
|
| 1285 |
+
{
|
| 1286 |
+
"pattern_id": "pattern_45",
|
| 1287 |
+
"name": "Personalized Privacy Accounting",
|
| 1288 |
+
"final_score": 0.03859692307692308,
|
| 1289 |
+
"path1_score": 0.0,
|
| 1290 |
+
"path2_score": 0.0,
|
| 1291 |
+
"path3_score": 0.03859692307692308,
|
| 1292 |
+
"cluster_size": 100
|
| 1293 |
+
}
|
| 1294 |
+
],
|
| 1295 |
+
"path1": {
|
| 1296 |
+
"top_ideas": [
|
| 1297 |
+
{
|
| 1298 |
+
"idea_id": "idea_7361",
|
| 1299 |
+
"similarity": 0.2631578947368421,
|
| 1300 |
+
"snippet": "Utilize image diffusion models to enhance video frame quality while maintaining temporal coherence in video diffusion models.",
|
| 1301 |
+
"pattern_count": 0
|
| 1302 |
+
},
|
| 1303 |
+
{
|
| 1304 |
+
"idea_id": "idea_4119",
|
| 1305 |
+
"similarity": 0.23809523809523808,
|
| 1306 |
+
"snippet": "Introduce a novel video tokenizer that enables Large Language Models to outperform diffusion models in visual generation tasks.",
|
| 1307 |
+
"pattern_count": 0
|
| 1308 |
+
},
|
| 1309 |
+
{
|
| 1310 |
+
"idea_id": "idea_5588",
|
| 1311 |
+
"similarity": 0.23809523809523808,
|
| 1312 |
+
"snippet": "Integrate autoregressive models with diffusion transformers to enhance long video generation by leveraging spatial and temporal information.",
|
| 1313 |
+
"pattern_count": 1
|
| 1314 |
+
},
|
| 1315 |
+
{
|
| 1316 |
+
"idea_id": "idea_6798",
|
| 1317 |
+
"similarity": 0.23809523809523808,
|
| 1318 |
+
"snippet": "Introduce a training-free method for camera control in video diffusion models using layout priors and pixel rearrangement.",
|
| 1319 |
+
"pattern_count": 1
|
| 1320 |
+
},
|
| 1321 |
+
{
|
| 1322 |
+
"idea_id": "idea_6946",
|
| 1323 |
+
"similarity": 0.22727272727272727,
|
| 1324 |
+
"snippet": "Introduce a method for enabling fine-grained 3D camera control in transformer-based video diffusion models using a ControlNet-like conditioning mechanism.",
|
| 1325 |
+
"pattern_count": 1
|
| 1326 |
+
},
|
| 1327 |
+
{
|
| 1328 |
+
"idea_id": "idea_7122",
|
| 1329 |
+
"similarity": 0.22727272727272727,
|
| 1330 |
+
"snippet": "Introduce a dynamic modulation technique for negative prompting in diffusion models to enhance image generation quality and safety.",
|
| 1331 |
+
"pattern_count": 0
|
| 1332 |
+
},
|
| 1333 |
+
{
|
| 1334 |
+
"idea_id": "idea_823",
|
| 1335 |
+
"similarity": 0.21739130434782608,
|
| 1336 |
+
"snippet": "Introduce an autoregressive diffusion process for graph generation that operates directly in discrete graph space, improving efficiency and constraint incorporation.",
|
| 1337 |
+
"pattern_count": 1
|
| 1338 |
+
},
|
| 1339 |
+
{
|
| 1340 |
+
"idea_id": "idea_4196",
|
| 1341 |
+
"similarity": 0.21739130434782608,
|
| 1342 |
+
"snippet": "Utilize large language models to generate dynamic scene layouts for guiding video diffusion models, enhancing spatiotemporal coherence in video generation.",
|
| 1343 |
+
"pattern_count": 1
|
| 1344 |
+
},
|
| 1345 |
+
{
|
| 1346 |
+
"idea_id": "idea_6329",
|
| 1347 |
+
"similarity": 0.21739130434782608,
|
| 1348 |
+
"snippet": "Introduce guidance techniques for diffusion models that eliminate the need for special training procedures while maintaining or improving generation quality.",
|
| 1349 |
+
"pattern_count": 1
|
| 1350 |
+
},
|
| 1351 |
+
{
|
| 1352 |
+
"idea_id": "idea_2536",
|
| 1353 |
+
"similarity": 0.21052631578947367,
|
| 1354 |
+
"snippet": "Introduce scheduled sampling into diffusion models to address compounding errors in markup-to-image generation tasks.",
|
| 1355 |
+
"pattern_count": 0
|
| 1356 |
+
},
|
| 1357 |
+
{
|
| 1358 |
+
"idea_id": "idea_4171",
|
| 1359 |
+
"similarity": 0.21052631578947367,
|
| 1360 |
+
"snippet": "Leverage diffusion models for flexible and controlled human motion generation through novel composition methods.",
|
| 1361 |
+
"pattern_count": 1
|
| 1362 |
+
},
|
| 1363 |
+
{
|
| 1364 |
+
"idea_id": "idea_4337",
|
| 1365 |
+
"similarity": 0.21052631578947367,
|
| 1366 |
+
"snippet": "Leverage pretrained text-to-image diffusion models for efficient text-conditioned video prediction by extending them temporally.",
|
| 1367 |
+
"pattern_count": 0
|
| 1368 |
+
},
|
| 1369 |
+
{
|
| 1370 |
+
"idea_id": "idea_7798",
|
| 1371 |
+
"similarity": 0.20833333333333334,
|
| 1372 |
+
"snippet": "Enhance textual generation in diffusion models by precisely localizing and fine-tuning less than 1% of parameters responsible for text content.",
|
| 1373 |
+
"pattern_count": 0
|
| 1374 |
+
},
|
| 1375 |
+
{
|
| 1376 |
+
"idea_id": "idea_5650",
|
| 1377 |
+
"similarity": 0.2,
|
| 1378 |
+
"snippet": "Introduce engagement-aware metrics and models to optimize text-to-image generation for viewer engagement in marketing contexts.",
|
| 1379 |
+
"pattern_count": 1
|
| 1380 |
+
},
|
| 1381 |
+
{
|
| 1382 |
+
"idea_id": "idea_7411",
|
| 1383 |
+
"similarity": 0.2,
|
| 1384 |
+
"snippet": "Introduce a zero-shot, self-guided framework for controllable image-to-video generation using pre-trained diffusion models without fine-tuning.",
|
| 1385 |
+
"pattern_count": 1
|
| 1386 |
+
},
|
| 1387 |
+
{
|
| 1388 |
+
"idea_id": "idea_7915",
|
| 1389 |
+
"similarity": 0.2,
|
| 1390 |
+
"snippet": "Introduce precise camera pose control in video diffusion models to enhance narrative expression and customization.",
|
| 1391 |
+
"pattern_count": 1
|
| 1392 |
+
},
|
| 1393 |
+
{
|
| 1394 |
+
"idea_id": "idea_4186",
|
| 1395 |
+
"similarity": 0.19047619047619047,
|
| 1396 |
+
"snippet": "Incorporate 3D awareness into pretrained 2D diffusion models to enhance robustness and 3D consistency in text-to-3D generation.",
|
| 1397 |
+
"pattern_count": 1
|
| 1398 |
+
},
|
| 1399 |
+
{
|
| 1400 |
+
"idea_id": "idea_4613",
|
| 1401 |
+
"similarity": 0.19047619047619047,
|
| 1402 |
+
"snippet": "Introduce step-aware neural networks to optimize computational efficiency in denoising diffusion models without sacrificing generation quality.",
|
| 1403 |
+
"pattern_count": 1
|
| 1404 |
+
},
|
| 1405 |
+
{
|
| 1406 |
+
"idea_id": "idea_7380",
|
| 1407 |
+
"similarity": 0.19047619047619047,
|
| 1408 |
+
"snippet": "Establish a fast convergence theory for denoising diffusion probabilistic models with minimal assumptions, improving theoretical guarantees.",
|
| 1409 |
+
"pattern_count": 1
|
| 1410 |
+
},
|
| 1411 |
+
{
|
| 1412 |
+
"idea_id": "idea_7978",
|
| 1413 |
+
"similarity": 0.19047619047619047,
|
| 1414 |
+
"snippet": "Adapt image-to-video diffusion models for generating coherent video sequences between keyframes using a dual-directional sampling process.",
|
| 1415 |
+
"pattern_count": 1
|
| 1416 |
+
}
|
| 1417 |
+
],
|
| 1418 |
+
"pattern_scores_topn": [
|
| 1419 |
+
{
|
| 1420 |
+
"pattern_id": "pattern_114",
|
| 1421 |
+
"score": 0.604532279314888
|
| 1422 |
+
},
|
| 1423 |
+
{
|
| 1424 |
+
"pattern_id": "pattern_100",
|
| 1425 |
+
"score": 0.1523809523809524
|
| 1426 |
+
},
|
| 1427 |
+
{
|
| 1428 |
+
"pattern_id": "pattern_24",
|
| 1429 |
+
"score": 0.08695652173913043
|
| 1430 |
+
},
|
| 1431 |
+
{
|
| 1432 |
+
"pattern_id": "pattern_84",
|
| 1433 |
+
"score": 0.08695652173913043
|
| 1434 |
+
},
|
| 1435 |
+
{
|
| 1436 |
+
"pattern_id": "pattern_94",
|
| 1437 |
+
"score": 0.08421052631578947
|
| 1438 |
+
},
|
| 1439 |
+
{
|
| 1440 |
+
"pattern_id": "pattern_115",
|
| 1441 |
+
"score": 0.08000000000000002
|
| 1442 |
+
},
|
| 1443 |
+
{
|
| 1444 |
+
"pattern_id": "pattern_102",
|
| 1445 |
+
"score": 0.0761904761904762
|
| 1446 |
+
}
|
| 1447 |
+
]
|
| 1448 |
+
},
|
| 1449 |
+
"path2": {
|
| 1450 |
+
"top_domains": [
|
| 1451 |
+
{
|
| 1452 |
+
"domain_id": "domain_83",
|
| 1453 |
+
"name": "Astrophysics",
|
| 1454 |
+
"weight": 0.18181818181818182,
|
| 1455 |
+
"paper_count": 1
|
| 1456 |
+
},
|
| 1457 |
+
{
|
| 1458 |
+
"domain_id": "domain_59",
|
| 1459 |
+
"name": "Medical Imaging",
|
| 1460 |
+
"weight": 0.16666666666666666,
|
| 1461 |
+
"paper_count": 7
|
| 1462 |
+
},
|
| 1463 |
+
{
|
| 1464 |
+
"domain_id": "domain_45",
|
| 1465 |
+
"name": "Audio Processing",
|
| 1466 |
+
"weight": 0.14285714285714285,
|
| 1467 |
+
"paper_count": 5
|
| 1468 |
+
},
|
| 1469 |
+
{
|
| 1470 |
+
"domain_id": "domain_57",
|
| 1471 |
+
"name": "Signal Processing",
|
| 1472 |
+
"weight": 0.14285714285714285,
|
| 1473 |
+
"paper_count": 1
|
| 1474 |
+
},
|
| 1475 |
+
{
|
| 1476 |
+
"domain_id": "domain_70",
|
| 1477 |
+
"name": "Computer Architecture",
|
| 1478 |
+
"weight": 0.14285714285714285,
|
| 1479 |
+
"paper_count": 1
|
| 1480 |
+
}
|
| 1481 |
+
],
|
| 1482 |
+
"top_subdomains": [
|
| 1483 |
+
{
|
| 1484 |
+
"domain_id": "domain_83",
|
| 1485 |
+
"subdomains": [
|
| 1486 |
+
{
|
| 1487 |
+
"name": "Diffusion Models",
|
| 1488 |
+
"score": 0.2222222222222222
|
| 1489 |
+
}
|
| 1490 |
+
]
|
| 1491 |
+
},
|
| 1492 |
+
{
|
| 1493 |
+
"domain_id": "domain_59",
|
| 1494 |
+
"subdomains": [
|
| 1495 |
+
{
|
| 1496 |
+
"name": "Diffusion Models",
|
| 1497 |
+
"score": 0.2222222222222222
|
| 1498 |
+
}
|
| 1499 |
+
]
|
| 1500 |
+
},
|
| 1501 |
+
{
|
| 1502 |
+
"domain_id": "domain_45",
|
| 1503 |
+
"subdomains": [
|
| 1504 |
+
{
|
| 1505 |
+
"name": "Diffusion Models",
|
| 1506 |
+
"score": 0.2222222222222222
|
| 1507 |
+
},
|
| 1508 |
+
{
|
| 1509 |
+
"name": "Contrastive Learning",
|
| 1510 |
+
"score": 0.0
|
| 1511 |
+
}
|
| 1512 |
+
]
|
| 1513 |
+
},
|
| 1514 |
+
{
|
| 1515 |
+
"domain_id": "domain_57",
|
| 1516 |
+
"subdomains": [
|
| 1517 |
+
{
|
| 1518 |
+
"name": "Diffusion Models",
|
| 1519 |
+
"score": 0.2222222222222222
|
| 1520 |
+
},
|
| 1521 |
+
{
|
| 1522 |
+
"name": "Contrastive Learning",
|
| 1523 |
+
"score": 0.0
|
| 1524 |
+
}
|
| 1525 |
+
]
|
| 1526 |
+
},
|
| 1527 |
+
{
|
| 1528 |
+
"domain_id": "domain_70",
|
| 1529 |
+
"subdomains": [
|
| 1530 |
+
{
|
| 1531 |
+
"name": "Diffusion Models",
|
| 1532 |
+
"score": 0.2222222222222222
|
| 1533 |
+
},
|
| 1534 |
+
{
|
| 1535 |
+
"name": "Image Generation",
|
| 1536 |
+
"score": 0.1
|
| 1537 |
+
}
|
| 1538 |
+
]
|
| 1539 |
+
}
|
| 1540 |
+
],
|
| 1541 |
+
"candidate_stats": [
|
| 1542 |
+
{
|
| 1543 |
+
"domain_id": "domain_83",
|
| 1544 |
+
"candidates_before": 1,
|
| 1545 |
+
"candidates_after": 1
|
| 1546 |
+
},
|
| 1547 |
+
{
|
| 1548 |
+
"domain_id": "domain_59",
|
| 1549 |
+
"candidates_before": 1,
|
| 1550 |
+
"candidates_after": 1
|
| 1551 |
+
},
|
| 1552 |
+
{
|
| 1553 |
+
"domain_id": "domain_45",
|
| 1554 |
+
"candidates_before": 1,
|
| 1555 |
+
"candidates_after": 1
|
| 1556 |
+
},
|
| 1557 |
+
{
|
| 1558 |
+
"domain_id": "domain_57",
|
| 1559 |
+
"candidates_before": 1,
|
| 1560 |
+
"candidates_after": 1
|
| 1561 |
+
},
|
| 1562 |
+
{
|
| 1563 |
+
"domain_id": "domain_70",
|
| 1564 |
+
"candidates_before": 1,
|
| 1565 |
+
"candidates_after": 1
|
| 1566 |
+
}
|
| 1567 |
+
],
|
| 1568 |
+
"pattern_scores_topn": [
|
| 1569 |
+
{
|
| 1570 |
+
"pattern_id": "pattern_7",
|
| 1571 |
+
"score": 0.0008730158730158731
|
| 1572 |
+
},
|
| 1573 |
+
{
|
| 1574 |
+
"pattern_id": "pattern_116",
|
| 1575 |
+
"score": 0.00022222222222222231
|
| 1576 |
+
},
|
| 1577 |
+
{
|
| 1578 |
+
"pattern_id": "pattern_99",
|
| 1579 |
+
"score": 0.00020370370370370375
|
| 1580 |
+
},
|
| 1581 |
+
{
|
| 1582 |
+
"pattern_id": "pattern_100",
|
| 1583 |
+
"score": 0.00017460317460317465
|
| 1584 |
+
}
|
| 1585 |
+
],
|
| 1586 |
+
"subdomain_taxonomy_used": true,
|
| 1587 |
+
"raw_subdomain_count": 319,
|
| 1588 |
+
"canonical_subdomain_count": 58,
|
| 1589 |
+
"stoplist_count": 13
|
| 1590 |
+
},
|
| 1591 |
+
"path3": {
|
| 1592 |
+
"top_papers": [
|
| 1593 |
+
{
|
| 1594 |
+
"paper_id": "K9sVJ17zvB",
|
| 1595 |
+
"similarity": 0.46153846153846156,
|
| 1596 |
+
"title": "VersVideo: Leveraging Enhanced Temporal Diffusion Models for Versatile Video Generation",
|
| 1597 |
+
"quality": 0.532,
|
| 1598 |
+
"review_count": 5
|
| 1599 |
+
},
|
| 1600 |
+
{
|
| 1601 |
+
"paper_id": "gdHtZlaaSo",
|
| 1602 |
+
"similarity": 0.38461538461538464,
|
| 1603 |
+
"title": "Precise Parameter Localization for Textual Generation in Diffusion Models",
|
| 1604 |
+
"quality": 0.5633333333333334,
|
| 1605 |
+
"review_count": 6
|
| 1606 |
+
},
|
| 1607 |
+
{
|
| 1608 |
+
"paper_id": "sL2F9YCMXf",
|
| 1609 |
+
"similarity": 0.3333333333333333,
|
| 1610 |
+
"title": "Energy-Based Diffusion Language Models for Text Generation",
|
| 1611 |
+
"quality": 0.588,
|
| 1612 |
+
"review_count": 5
|
| 1613 |
+
},
|
| 1614 |
+
{
|
| 1615 |
+
"paper_id": "8pusxkLEQO",
|
| 1616 |
+
"similarity": 0.3333333333333333,
|
| 1617 |
+
"title": "ARLON: Boosting Diffusion Transformers with Autoregressive Models for Long Video Generation",
|
| 1618 |
+
"quality": 0.5680000000000001,
|
| 1619 |
+
"review_count": 5
|
| 1620 |
+
},
|
| 1621 |
+
{
|
| 1622 |
+
"paper_id": "2ZK8zyIt7o",
|
| 1623 |
+
"similarity": 0.3333333333333333,
|
| 1624 |
+
"title": "Improving Long-Text Alignment for Text-to-Image Diffusion Models",
|
| 1625 |
+
"quality": 0.552,
|
| 1626 |
+
"review_count": 5
|
| 1627 |
+
},
|
| 1628 |
+
{
|
| 1629 |
+
"paper_id": "esYrEndGsr",
|
| 1630 |
+
"similarity": 0.2857142857142857,
|
| 1631 |
+
"title": "Influence Functions for Scalable Data Attribution in Diffusion Models",
|
| 1632 |
+
"quality": 0.625,
|
| 1633 |
+
"review_count": 4
|
| 1634 |
+
},
|
| 1635 |
+
{
|
| 1636 |
+
"paper_id": "Z4evOUYrk7",
|
| 1637 |
+
"similarity": 0.3076923076923077,
|
| 1638 |
+
"title": "CameraCtrl: Enabling Camera Control for Video Diffusion Models",
|
| 1639 |
+
"quality": 0.576,
|
| 1640 |
+
"review_count": 5
|
| 1641 |
+
},
|
| 1642 |
+
{
|
| 1643 |
+
"paper_id": "OTiSSCBm1QD",
|
| 1644 |
+
"similarity": 0.3333333333333333,
|
| 1645 |
+
"title": "Temporal Relevance Analysis for Video Action Models",
|
| 1646 |
+
"quality": 0.5225,
|
| 1647 |
+
"review_count": 4
|
| 1648 |
+
},
|
| 1649 |
+
{
|
| 1650 |
+
"paper_id": "jKcZ4hF4s5",
|
| 1651 |
+
"similarity": 0.3076923076923077,
|
| 1652 |
+
"title": "Positive-Unlabeled Diffusion Models for Preventing Sensitive Data Generation",
|
| 1653 |
+
"quality": 0.5599999999999999,
|
| 1654 |
+
"review_count": 5
|
| 1655 |
+
},
|
| 1656 |
+
{
|
| 1657 |
+
"paper_id": "WmIwYTd0YTF",
|
| 1658 |
+
"similarity": 0.25,
|
| 1659 |
+
"title": "Stable Target Field for Reduced Variance Score Estimation in Diffusion Models",
|
| 1660 |
+
"quality": 0.6875,
|
| 1661 |
+
"review_count": 4
|
| 1662 |
+
},
|
| 1663 |
+
{
|
| 1664 |
+
"paper_id": "VM8batVBWvg",
|
| 1665 |
+
"similarity": 0.23076923076923078,
|
| 1666 |
+
"title": "Discrete Predictor-Corrector Diffusion Models for Image Synthesis",
|
| 1667 |
+
"quality": 0.7350000000000001,
|
| 1668 |
+
"review_count": 4
|
| 1669 |
+
},
|
| 1670 |
+
{
|
| 1671 |
+
"paper_id": "exKHibougU",
|
| 1672 |
+
"similarity": 0.3,
|
| 1673 |
+
"title": "LLM-grounded Video Diffusion Models",
|
| 1674 |
+
"quality": 0.5599999999999999,
|
| 1675 |
+
"review_count": 4
|
| 1676 |
+
},
|
| 1677 |
+
{
|
| 1678 |
+
"paper_id": "WNkW0cOwiz",
|
| 1679 |
+
"similarity": 0.2727272727272727,
|
| 1680 |
+
"title": "Lipschitz Singularities in Diffusion Models",
|
| 1681 |
+
"quality": 0.616,
|
| 1682 |
+
"review_count": 5
|
| 1683 |
+
},
|
| 1684 |
+
{
|
| 1685 |
+
"paper_id": "UaAD-Nu86WX",
|
| 1686 |
+
"similarity": 0.23076923076923078,
|
| 1687 |
+
"title": "DiGress: Discrete Denoising diffusion for graph generation",
|
| 1688 |
+
"quality": 0.7162499999999999,
|
| 1689 |
+
"review_count": 4
|
| 1690 |
+
},
|
| 1691 |
+
{
|
| 1692 |
+
"paper_id": "MtDd7rWok1",
|
| 1693 |
+
"similarity": 0.2727272727272727,
|
| 1694 |
+
"title": "Anti-Exposure Bias in Diffusion Models",
|
| 1695 |
+
"quality": 0.6033333333333334,
|
| 1696 |
+
"review_count": 6
|
| 1697 |
+
},
|
| 1698 |
+
{
|
| 1699 |
+
"paper_id": "4eJ43EN2g6l",
|
| 1700 |
+
"similarity": 0.23076923076923078,
|
| 1701 |
+
"title": "SketchKnitter: Vectorized Sketch Generation with Diffusion Models",
|
| 1702 |
+
"quality": 0.7100000000000001,
|
| 1703 |
+
"review_count": 4
|
| 1704 |
+
},
|
| 1705 |
+
{
|
| 1706 |
+
"paper_id": "9_gsMA8MRKQ",
|
| 1707 |
+
"similarity": 0.25,
|
| 1708 |
+
"title": "Pseudoinverse-Guided Diffusion Models for Inverse Problems",
|
| 1709 |
+
"quality": 0.636,
|
| 1710 |
+
"review_count": 5
|
| 1711 |
+
},
|
| 1712 |
+
{
|
| 1713 |
+
"paper_id": "KrK6zXbjfO",
|
| 1714 |
+
"similarity": 0.26666666666666666,
|
| 1715 |
+
"title": "SoundCTM: Unifying Score-based and Consistency Models for Full-band Text-to-Sound Generation",
|
| 1716 |
+
"quality": 0.596,
|
| 1717 |
+
"review_count": 5
|
| 1718 |
+
},
|
| 1719 |
+
{
|
| 1720 |
+
"paper_id": "eajZpoQkGK",
|
| 1721 |
+
"similarity": 0.26666666666666666,
|
| 1722 |
+
"title": "DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation",
|
| 1723 |
+
"quality": 0.596,
|
| 1724 |
+
"review_count": 5
|
| 1725 |
+
},
|
| 1726 |
+
{
|
| 1727 |
+
"paper_id": "Gx04TnVjee",
|
| 1728 |
+
"similarity": 0.26666666666666666,
|
| 1729 |
+
"title": "3DTrajMaster: Mastering 3D Trajectory for Multi-Entity Motion in Video Generation",
|
| 1730 |
+
"quality": 0.588,
|
| 1731 |
+
"review_count": 5
|
| 1732 |
+
}
|
| 1733 |
+
],
|
| 1734 |
+
"pattern_scores_topn": [
|
| 1735 |
+
{
|
| 1736 |
+
"pattern_id": "pattern_100",
|
| 1737 |
+
"score": 0.17823805110722613
|
| 1738 |
+
},
|
| 1739 |
+
{
|
| 1740 |
+
"pattern_id": "pattern_114",
|
| 1741 |
+
"score": 0.1737330871794872
|
| 1742 |
+
},
|
| 1743 |
+
{
|
| 1744 |
+
"pattern_id": "pattern_24",
|
| 1745 |
+
"score": 0.047355144230769225
|
| 1746 |
+
},
|
| 1747 |
+
{
|
| 1748 |
+
"pattern_id": "pattern_115",
|
| 1749 |
+
"score": 0.0406272
|
| 1750 |
+
},
|
| 1751 |
+
{
|
| 1752 |
+
"pattern_id": "pattern_49",
|
| 1753 |
+
"score": 0.0404496
|
| 1754 |
+
},
|
| 1755 |
+
{
|
| 1756 |
+
"pattern_id": "pattern_45",
|
| 1757 |
+
"score": 0.03859692307692308
|
| 1758 |
+
},
|
| 1759 |
+
{
|
| 1760 |
+
"pattern_id": "pattern_7",
|
| 1761 |
+
"score": 0.03788970666666666
|
| 1762 |
+
},
|
| 1763 |
+
{
|
| 1764 |
+
"pattern_id": "pattern_102",
|
| 1765 |
+
"score": 0.03788970666666666
|
| 1766 |
+
}
|
| 1767 |
+
]
|
| 1768 |
+
}
|
| 1769 |
+
},
|
| 1770 |
+
"review_summary": {
|
| 1771 |
+
"total_reviews": 2,
|
| 1772 |
+
"final_score": 6.599999999999903
|
| 1773 |
+
},
|
| 1774 |
+
"refinement_summary": {
|
| 1775 |
+
"total_refinements": 1,
|
| 1776 |
+
"issues_addressed": [
|
| 1777 |
+
"stability"
|
| 1778 |
+
]
|
| 1779 |
+
},
|
| 1780 |
+
"verification_summary": {
|
| 1781 |
+
"collision_detected": false,
|
| 1782 |
+
"max_similarity": 0.1092896174863388
|
| 1783 |
+
},
|
| 1784 |
+
"idea_packaging": null
|
| 1785 |
+
}
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/idea_emb.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f279591966dfc193ed05b4c36e9561a1a97b2256c2be80764ad0bba2afcd9da
|
| 3 |
+
size 1228928
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/idea_manifest.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_at": "2026-02-12T12:07:29.152228+00:00",
|
| 3 |
+
"embedding_model": "gemini-embedding-001",
|
| 4 |
+
"nodes_idea_hash": "dd3d6ab304a0bec1c73d3774cbdb96068d54fd0e18670c5a967f550fe0fd81f1",
|
| 5 |
+
"count": 100,
|
| 6 |
+
"index_count": 100,
|
| 7 |
+
"skipped": 0
|
| 8 |
+
}
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/idea_meta.jsonl
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"idea_id": "idea_0", "text_hash": "928178e2369a5807b26f3dd1633d78f8e1ed059aa98031cb67725f2da8d3f359", "snippet": "Analyze and mitigate the impact of label errors on group-based disparity metrics to improve fairness in machine learning models.", "pattern_count": 1}
|
| 2 |
+
{"idea_id": "idea_1", "text_hash": "2b226693e31ca4c4e5d5b02dfedb17137f47c4a11ea4b495de1546eba12280cf", "snippet": "Incorporate lateral inhibition mechanisms from neurobiology into CNN architectures to enhance image classification performance.", "pattern_count": 1}
|
| 3 |
+
{"idea_id": "idea_2", "text_hash": "a26e8fd94539594e933f4e3fc5e49e173ed9429848493f03dbb0730026bc26bd", "snippet": "Introduce a factorized Fourier-based neural operator that bridges the gap between machine learning and numerical solvers for PDE simulations.", "pattern_count": 1}
|
| 4 |
+
{"idea_id": "idea_3", "text_hash": "f6a56d9188896fd6802e0f891cdea87305740e10ee2598e40612cf3de2de168b", "snippet": "Introduce a data-free pruning strategy for coupled channels in multi-branch neural networks to improve inference time without significant accuracy loss.", "pattern_count": 1}
|
| 5 |
+
{"idea_id": "idea_4", "text_hash": "b6073cf1f217396b3a7c8ae10fdba15b921ef99e03c52c4500173e1b90315ac5", "snippet": "Introduce a data-free pruning method using total variation distance to identify and remove non-discriminative filters in neural networks.", "pattern_count": 1}
|
| 6 |
+
{"idea_id": "idea_5", "text_hash": "a6f78e4132787a07f70614cfc9a4d81704a4eb84acd5e259bf778d3144bf60ee", "snippet": "Introduce a new descent direction method for adversarial training that corrects a misinterpretation of Danskin's Theorem, improving robustness in early training stages.", "pattern_count": 1}
|
| 7 |
+
{"idea_id": "idea_6", "text_hash": "fdba3a71ac8d5f9a8f895ea90018ef9609f5c002af4b50a4c02937d4fb744573", "snippet": "Integrate multiple brain-inspired mechanisms into artificial neural networks to enhance continual learning capabilities.", "pattern_count": 1}
|
| 8 |
+
{"idea_id": "idea_7", "text_hash": "b57271592e32dd08eb4d8cb5d9129ce77ae65b237d645382a6d81e8f93387629", "snippet": "Enhance personalized federated learning by integrating knowledge transfer mechanisms to improve model performance on Non-IID data.", "pattern_count": 1}
|
| 9 |
+
{"idea_id": "idea_8", "text_hash": "93f789539dcd8573bf632794ec497adbf4ead488b5cab2ee64d4532120ff55ec", "snippet": "Introduce a representation learning method with provable fairness guarantees by restricting the encoder's representation space.", "pattern_count": 1}
|
| 10 |
+
{"idea_id": "idea_9", "text_hash": "796b966cf0ff9e3793d6d9eef35c5a52a64daf517e31c62f8b1a810ea26e5d31", "snippet": "Introduce a novel algorithm, TSEETC, for online restless bandits with unobserved states, achieving near-optimal Bayesian regret bounds.", "pattern_count": 1}
|
| 11 |
+
{"idea_id": "idea_10", "text_hash": "d04f30314e6ecdbd34872b9cb4e88baee98bc061bdb6d8470729abe3d2352155", "snippet": "Introduce a parameterized aggregation strategy to address aggregation bias in federated learning caused by non-iid data distributions across communication rounds.", "pattern_count": 1}
|
| 12 |
+
{"idea_id": "idea_11", "text_hash": "5a11684e04f19fb3ca2e72b5ceb4ae336b571f9bb9a150a44b65a4041c4038b3", "snippet": "Utilize reinforcement learning to optimize the selection of data-driven insights based on user feedback and lifestyle quality estimates.", "pattern_count": 0}
|
| 13 |
+
{"idea_id": "idea_12", "text_hash": "ccb2056ec4acc5937d9d139f9c340837d72899dfdfe140e4b9d16e13cd2608e3", "snippet": "Introduce a dynamic latent hierarchy model that captures spatiotemporal structures in videos to improve long-term prediction accuracy.", "pattern_count": 0}
|
| 14 |
+
{"idea_id": "idea_13", "text_hash": "ab7df5940f9c7b214a83dc2bf622f0f6b9130f88d8774ca4c2090db6856c7c45", "snippet": "Leverage Swin Transformers to enhance zero-shot semantic segmentation by integrating visual and semantic embeddings in a joint space.", "pattern_count": 1}
|
| 15 |
+
{"idea_id": "idea_14", "text_hash": "65ded30d74181fccbed8282d51c5178affd39a246321464d9dbd8f9994da79e9", "snippet": "Integrate recurrent dynamics into Transformers by encoding them into positional encodings, enhancing sample efficiency and modeling capabilities.", "pattern_count": 1}
|
| 16 |
+
{"idea_id": "idea_15", "text_hash": "5a654dac8b117c2ea120361f81ef484e793ad8120b96f2fef791b2036761d306", "snippet": "Bridge the gap between human intuition and formal fairness specifications by leveraging unsupervised style transfer and GPT-3 to generate intuitive fairness specifications.", "pattern_count": 1}
|
| 17 |
+
{"idea_id": "idea_16", "text_hash": "084b8a26f7794b68348512873272e1f636ba973a38d39d279ac743a7a16c3159", "snippet": "Extend strictly proper scoring rules to better quantify uncertainty in survival analysis, providing new algorithms for improved estimation.", "pattern_count": 1}
|
| 18 |
+
{"idea_id": "idea_17", "text_hash": "c7fcd7173e024cdb60e3099b9dff5c6c9964eb972dba57b2aa8c02f87fca1635", "snippet": "Investigate how different social network topologies affect innovation in multi-agent reinforcement learning environments.", "pattern_count": 1}
|
| 19 |
+
{"idea_id": "idea_18", "text_hash": "9ce925f64656b6154d3383af53d6cc7818b80576eaa96b94a35a1542ae0144f2", "snippet": "Demonstrate that mini-batch $k$-means achieves global convergence within a bounded number of iterations, ensuring efficiency and reliability.", "pattern_count": 1}
|
| 20 |
+
{"idea_id": "idea_19", "text_hash": "12704e1d92c86319637eb52127da3ea49913cc84ddd2864e21a32a9afd457e8c", "snippet": "Evaluate learning dynamics in games by focusing on average-case performance metrics rather than just convergence to equilibria.", "pattern_count": 1}
|
| 21 |
+
{"idea_id": "idea_20", "text_hash": "d45dd44053b742f386e845cb1e8d8e91d13368245ae9770b1de1aecdd2bfd4b6", "snippet": "Integrate deep learning embeddings with structured decoding to enhance gene finding across diverse organisms.", "pattern_count": 1}
|
| 22 |
+
{"idea_id": "idea_21", "text_hash": "deef2e5661321d4dd8d06f3afef1a30425c60cb64e572c4074ef42b8016bfb61", "snippet": "Utilize maximum-entropy rewarded reinforcement learning to enhance data selection for NLP tasks, improving generalization to unknown domains.", "pattern_count": 0}
|
| 23 |
+
{"idea_id": "idea_22", "text_hash": "815df4db0e3b6598c2d972013ec129441e49e9203df495fcb90ffe109c46eb2a", "snippet": "Utilize graph structures to enhance OOD detection by representing data points as networks of related features.", "pattern_count": 1}
|
| 24 |
+
{"idea_id": "idea_23", "text_hash": "5d96823dde3fc665dfb21c7f00d73de485702882df3587e270b4b36d23258dea", "snippet": "Utilize normalizing flow to create a conservative action encoder for offline reinforcement learning, reducing extrapolation error and distributional shift.", "pattern_count": 1}
|
| 25 |
+
{"idea_id": "idea_24", "text_hash": "11b168f632d98955f9485c4d1629110a896e574929effb12cfb2ec8398bde184", "snippet": "Leverage simple explanations to train reliable models on small datasets by interleaving output optimization and attention fine-tuning.", "pattern_count": 1}
|
| 26 |
+
{"idea_id": "idea_25", "text_hash": "9fc6a8a9bf73e83eb27649c973a214eea9b947a96b35005b2a41655b55ca2f51", "snippet": "Introduce a framework that models each data point as originating from its own function, offering a new perspective on noise and generalization.", "pattern_count": 0}
|
| 27 |
+
{"idea_id": "idea_26", "text_hash": "e7ac830a984169389e2fc986333c5da526a8cf4586fe2add456e0c51df874d65", "snippet": "Introduce a Neural ODE model with linear dynamics and a Kalman-inspired state update for efficient and stable forecasting of irregular time series.", "pattern_count": 1}
|
| 28 |
+
{"idea_id": "idea_27", "text_hash": "2138f681fddd19a4dcfcf716b0ced722676c698933437654b76ee27d238896d9", "snippet": "Introduce a joint learning mechanism combining supervised contrastive learning to improve symbolic regression by addressing the ill-posed problem of insufficient supervision.", "pattern_count": 1}
|
| 29 |
+
{"idea_id": "idea_28", "text_hash": "ecd85db9e9ab4ff1386bbbc19f9c419d0a0a5d27b43e6a867d7e61cf2ecd4e93", "snippet": "Extend gradient-based meta-learning to transfer learning by using task-adapted parameters as informative task representations.", "pattern_count": 0}
|
| 30 |
+
{"idea_id": "idea_29", "text_hash": "86f54bc405a03ed6848ee2f37132b120b387acd3fe98a3e1ec54756d80194e2a", "snippet": "Introduce a coreset construction for rational function fitting to efficiently approximate time-series data with recursive sequence models.", "pattern_count": 1}
|
| 31 |
+
{"idea_id": "idea_30", "text_hash": "306da7cb2528a880f58ad056d93ac798508c6c0ac38b0302edde7478281ad986", "snippet": "Propose that NMDAR-like nonlinearity is crucial for transforming short-term working memory into long-term reference memory in transformer models.", "pattern_count": 1}
|
| 32 |
+
{"idea_id": "idea_31", "text_hash": "eab95b37cfd0432c880aec30d68a879d2a6df39d780c5cc192f9357ef29d38ef", "snippet": "Explore the necessity of labels in GNNs for heterophilous graphs by proposing a self-representation framework using the GMRES method.", "pattern_count": 1}
|
| 33 |
+
{"idea_id": "idea_32", "text_hash": "1fcbf065324d94e62d528de250b6ee93f7ed999d2be3724833ba0ce4036ba2cd", "snippet": "Reformulate intent detection as a question-answering retrieval task to enhance performance on semantically similar fine-grained intents.", "pattern_count": 1}
|
| 34 |
+
{"idea_id": "idea_33", "text_hash": "28bacf3c70f66b97a20916fbdf3dc905961f0c80af12c0bc8d2860364010836b", "snippet": "Challenge the effectiveness of prompt learning by demonstrating that random prompts can perform well and that prompt learning may not surpass fine-tuning.", "pattern_count": 1}
|
| 35 |
+
{"idea_id": "idea_34", "text_hash": "9bce98c5b7ba8aae3020855d58b43a4b2a0cabdbd60a85fd62ca787f33756faf", "snippet": "Enhance weakly supervised semantic segmentation by disentangling and swapping features to reduce background bias in classifiers.", "pattern_count": 1}
|
| 36 |
+
{"idea_id": "idea_35", "text_hash": "97b61dcb9f5fcb8ce21f20e4fb972fccb9514b39b4ccf27a7fbd9d36a0ef2f21", "snippet": "Leverage distributed learning with random features to efficiently perform pairwise ranking while maintaining convergence properties.", "pattern_count": 1}
|
| 37 |
+
{"idea_id": "idea_36", "text_hash": "300603f903833ce77bdf3b50c6b5b127b9971adda45510d9e8fa3fecf06aaabe", "snippet": "Introduce a method to efficiently create diverse deep ensembles that adapt quickly to distribution shifts by minimizing conditional mutual information.", "pattern_count": 1}
|
| 38 |
+
{"idea_id": "idea_37", "text_hash": "69addff9692a4c942e5a927353bfb16677c7409c747b32643433fd5aebb8cd5f", "snippet": "Introduce a novel stochastic extragradient-type algorithm that solves weak Minty variational inequalities without the need for increasing batch sizes, using a dual stepsize approach.", "pattern_count": 1}
|
| 39 |
+
{"idea_id": "idea_38", "text_hash": "eaf98e36d5aa3715a64798d7ac58668fb24218ba4685a9ecc689c47498067c0b", "snippet": "Introduce a two-level sampling framework to enhance domain generalization by efficiently selecting informative domains and data points, mitigating spurious correlations.", "pattern_count": 1}
|
| 40 |
+
{"idea_id": "idea_39", "text_hash": "c01eb5b79e7091b74986ba1c5731f54a800d8dedf5e8c8e4bd351334ca70a4e6", "snippet": "Introduce an adaptive mechanism to balance teacher and student knowledge contributions at different layers of the student network during knowledge distillation.", "pattern_count": 1}
|
| 41 |
+
{"idea_id": "idea_40", "text_hash": "8c76152ff10d374168a22918ef5ec0caf7c7c33cc49aff484f142e43adc49624", "snippet": "Introduce a curriculum-based approach to co-design morphology and control of soft robots, enhancing efficiency and performance through staged learning.", "pattern_count": 1}
|
| 42 |
+
{"idea_id": "idea_41", "text_hash": "1ea707324670eb58832964b6ed97b48099e613e300dc2cde31080eed41baec57", "snippet": "Enhance object-centric learning by integrating Gaussian Mixture Models to create more expressive slot representations through distance-based clustering.", "pattern_count": 0}
|
| 43 |
+
{"idea_id": "idea_42", "text_hash": "21fe237ddf36ad81674881ea363e794706e4097ec39d8e6ecef4b5f67383bdd4", "snippet": "Introduce a neural surrogate for modeling wireless electromagnetic propagation, enabling fast, differentiable simulations for tasks like network planning.", "pattern_count": 0}
|
| 44 |
+
{"idea_id": "idea_43", "text_hash": "079969fa3816a1bf4258bb52ba89b3f0e2b310694a483c55ccceabc6325f3e6c", "snippet": "Introduce a fragment-based autoregressive diffusion model to improve 3D molecule generation by unifying atom and bond prediction.", "pattern_count": 1}
|
| 45 |
+
{"idea_id": "idea_44", "text_hash": "98b65c8b92a80579eaefb6a4ba64b985136234a6132b132643462fef83b8179e", "snippet": "Introduce a hierarchical federated learning framework that enhances scalability and robustness against non-IID data through label-driven knowledge distillation.", "pattern_count": 1}
|
| 46 |
+
{"idea_id": "idea_45", "text_hash": "2c846bc992c9e43f8d290a28673f6ff78a86c4a71e280a9f7469de3ef8b61d5d", "snippet": "Introduce a framework for evaluating model performance against an unobserved oracle, challenging the assumption of human annotations as ground truth.", "pattern_count": 1}
|
| 47 |
+
{"idea_id": "idea_46", "text_hash": "f5928f8d278d4ecc859a57abeb4313e1c2fc8d62e6ef1df832ed9e9055e1a5e4", "snippet": "Enhance federated learning by focusing on shared representation refinement while maintaining differential privacy and allowing local model personalization.", "pattern_count": 1}
|
| 48 |
+
{"idea_id": "idea_47", "text_hash": "69d2fa72fa64c0ecbb7c06b434d772b88c615e2e92791ae89941a365b9e2b8a3", "snippet": "Introduce discrete latent variables for object-centric tasks to achieve disentangled representations and state-of-the-art performance in set prediction and object discovery.", "pattern_count": 0}
|
| 49 |
+
{"idea_id": "idea_48", "text_hash": "29859e075b9db78065443dfa62423315b8640526bea6377a372cd61bd1967848", "snippet": "Introduce a computationally efficient nonparametric method that bridges random feature methods and neural networks through a two-layer estimation approach.", "pattern_count": 1}
|
| 50 |
+
{"idea_id": "idea_49", "text_hash": "c573fb98dff18fb61ea235f2373fabf73d433ebfd903d6f40aa969c00bf8490b", "snippet": "Integrate graph neural networks with orthogonal spline collocation to enhance the efficiency and accuracy of simulating physical systems with continuous outputs.", "pattern_count": 0}
|
| 51 |
+
{"idea_id": "idea_50", "text_hash": "2057096f9428b4624db23c86466cb2e7c6fa306662d805a65385e1c9ed0daa89", "snippet": "Address simultaneous label-shift and conditional-shift in online model adaptation by learning additional hyper-parameters.", "pattern_count": 0}
|
| 52 |
+
{"idea_id": "idea_51", "text_hash": "20ddcdb09ed1f788d4d4191fc418fa2c127fe341140049425cb14b8d0f83ae2d", "snippet": "Introduce a comprehensive 3D radar dataset to enhance precipitation nowcasting by incorporating vertical reflectivity changes and geographical variability.", "pattern_count": 0}
|
| 53 |
+
{"idea_id": "idea_52", "text_hash": "5c82bcbe69bc170b0f2240a85b0f9a4e0a0b346551234b9618751d8b8a92f6fd", "snippet": "Introduce a hierarchical forward model approach to enhance representation learning in reinforcement learning by capturing multi-step temporal dependencies.", "pattern_count": 0}
|
| 54 |
+
{"idea_id": "idea_53", "text_hash": "58a68e7819e75e6b91ca2308eaab9eb3af8cab36da7b64b21eca65091a079525", "snippet": "Introduce a novel method for antibody structure prediction using a pretrained antibody language model and homologous templates, achieving superior accuracy and speed compared to existing methods.", "pattern_count": 1}
|
| 55 |
+
{"idea_id": "idea_54", "text_hash": "59038e82415669204f45f337d753b0d95054b748a433e415791c78110a9e6f61", "snippet": "Introduce a policy optimization approach using Lexicographic Projection Optimization (LPO) to address theoretical and practical issues in lexicographic multi-objective reinforcement learning.", "pattern_count": 1}
|
| 56 |
+
{"idea_id": "idea_55", "text_hash": "08e9de0548f86f3d49138bb4f43a19010cf97821cf673e046eb297b5361462c3", "snippet": "Introduce an effective sampling theory to improve long-tailed image classification by decoupling representation and classifier.", "pattern_count": 1}
|
| 57 |
+
{"idea_id": "idea_56", "text_hash": "9b23e4457f6009ddacb59b1fa81c1170a9f96e4dea429b9eb6f6184861196063", "snippet": "Introduce an equivariance module that structures the latent space to predict displacement caused by augmentations, enhancing self-supervised visual representation learning.", "pattern_count": 1}
|
| 58 |
+
{"idea_id": "idea_57", "text_hash": "0854607a941d7614f34f91d0b878fabc09dcce321e8f15017a1acdd129fa4842", "snippet": "Facilitate multi-agent navigation through emergent language communication, enhancing task success via a collaborative learning framework.", "pattern_count": 1}
|
| 59 |
+
{"idea_id": "idea_58", "text_hash": "b2b18fcb977a98a25542453d8d1b7dbb4c9ef29673d79a8d2bc46a3b4510f195", "snippet": "Introduce a task-aware information routing mechanism inspired by Global Workspace Theory to mitigate catastrophic forgetting in lifelong learning systems.", "pattern_count": 1}
|
| 60 |
+
{"idea_id": "idea_59", "text_hash": "b093d97c48483a6e1b4ffe300c103a512722a430edf88d60f2eea2fe898e0a0f", "snippet": "Extend dynamic Schrödinger bridge problems to include intermediate constraints using Iterative Smoothing Bridge, combining diffusion models with Bayesian filtering and optimal control.", "pattern_count": 0}
|
| 61 |
+
{"idea_id": "idea_60", "text_hash": "96eab3e32ed9d39ad97ea626ba5c25615f9c0a48e00c439b7a515b9add8ae53a", "snippet": "Introduce a randomized training scheme to reduce computational overhead while maintaining the performance benefits of sharpness-aware learning.", "pattern_count": 1}
|
| 62 |
+
{"idea_id": "idea_61", "text_hash": "aced030159703fc725fb803966274ec03699895fa05e9d17d243e9dc55b490fe", "snippet": "Introduce a self-supervised framework for off-policy ranking that models existing OPE methods as workers in a crowdsourcing system, eliminating the need for ground-truth labels.", "pattern_count": 0}
|
| 63 |
+
{"idea_id": "idea_62", "text_hash": "351d6d4a9a21b49a7e05e30b2ada2abb994184f422afc31de38d2c46d2549a9c", "snippet": "Introduce a parameter-efficient few-shot domain adaptation method for communication systems that adapts to changing channel conditions without retraining the entire autoencoder.", "pattern_count": 0}
|
| 64 |
+
{"idea_id": "idea_63", "text_hash": "b364dc31ef039aabfdfbf395debf6135c17316168a076dae79373b5a5ae78fe7", "snippet": "Introduce a hybrid packing method and novel convolution algorithm to significantly reduce homomorphic rotations in FHE-based CNNs, optimizing execution time.", "pattern_count": 1}
|
| 65 |
+
{"idea_id": "idea_64", "text_hash": "596f43233909e3ad19f891575e7ff51f3f514d02a2bab39bdb6d6dccec3d2cf8", "snippet": "Introduce causal inference to knowledge graph completion to enhance interpretability and mitigate data bias.", "pattern_count": 1}
|
| 66 |
+
{"idea_id": "idea_65", "text_hash": "5a7c5a755873d2e5a2b90e562521d49613c09ee27e5c90e649838397cf846181", "snippet": "Leverage language models to translate natural language into formal specifications across diverse logical domains, demonstrating strong generalization and competitive performance.", "pattern_count": 0}
|
| 67 |
+
{"idea_id": "idea_66", "text_hash": "16f1e3f15c8fe04bc61df471634da22d2b469cfb36eeb2cd3dde0e27a4d2c0b7", "snippet": "Introduce an unbiased client sampling scheme that optimizes client diversity and local variance to enhance convergence in federated learning.", "pattern_count": 1}
|
| 68 |
+
{"idea_id": "idea_67", "text_hash": "8a1ceabe43f981abb4e2653b4711a172aed1c2166fd53ba05910fa8967dcde66", "snippet": "Introduce a parallel and automatic variation of predictive coding that enhances efficiency and applicability in machine learning and computational neuroscience.", "pattern_count": 0}
|
| 69 |
+
{"idea_id": "idea_68", "text_hash": "86634323e1d7252af9fa3ea6ce40722929a1b8ce04d40fe9bd3a40bb62c41376", "snippet": "Develop a framework for learning disentangled geometric representations of agents and objects from unstructured observations using action-based supervision.", "pattern_count": 0}
|
| 70 |
+
{"idea_id": "idea_69", "text_hash": "46917aba711c3b6b36e6edbf8cc3953824873ba03f2ce4520b015e66084480dd", "snippet": "Introduce a dual-component strategy in model-based reinforcement learning to enhance planning efficiency and knowledge expansion under resource constraints.", "pattern_count": 1}
|
| 71 |
+
{"idea_id": "idea_70", "text_hash": "8f353be5a5b862157e4a38534a3fa6f9c7d8da579ff9ae1781d330eeff3ca6d6", "snippet": "Introduce a fairness-constrained gradient boosting framework that maintains predictive performance and significantly reduces training time.", "pattern_count": 1}
|
| 72 |
+
{"idea_id": "idea_71", "text_hash": "b2e8a1a33f88f1c081017ef66945943d226deeb298c854a36eeb51df41f65b4d", "snippet": "Highlight the bias amplification and instability in automatic text summarizers, advocating for bias analysis in summarization methods.", "pattern_count": 0}
|
| 73 |
+
{"idea_id": "idea_72", "text_hash": "ee6b1205762104ee9bd67c722db3864e962740ea48e0b23bd7d917055415d592", "snippet": "Introduce a method to generate continuously nonstationary data streams from existing datasets to better evaluate task-free continual learning.", "pattern_count": 1}
|
| 74 |
+
{"idea_id": "idea_73", "text_hash": "c9a6133ad5750b9b46e8f12b1b7417ae1144e142e240a7928db2713fc3d5fb84", "snippet": "Introduce an online method to correct bias in task-free continual learning by optimizing the output layer separately from preceding layers during experience replay.", "pattern_count": 1}
|
| 75 |
+
{"idea_id": "idea_74", "text_hash": "f0bead4fb03e011d63143e7d84c9d84ed2c86a848f10ae2049b63af7e463a26f", "snippet": "Introduce a contrastive learning objective to improve language model generation quality by balancing probabilities between label tokens and negative candidates.", "pattern_count": 1}
|
| 76 |
+
{"idea_id": "idea_75", "text_hash": "08d5aa299ce975eee51d801cc2008464fba43846e532b8c94c3b6672f1a680f3", "snippet": "Introduce a specialist ensemble framework in online knowledge distillation to enhance diversity and calibration without requiring a pre-trained teacher.", "pattern_count": 1}
|
| 77 |
+
{"idea_id": "idea_76", "text_hash": "bdbaac7f3e50836b5837393d6c66d7500f4bc8ba24dea04c1a7bf2ecb913506e", "snippet": "Introduce a novel gradient descent optimization method that adapts learning rates based on inverse model-parameter displacement to improve convergence.", "pattern_count": 1}
|
| 78 |
+
{"idea_id": "idea_77", "text_hash": "e70fb4a29625d5a1a0c89a28ae5d24cf3169764ec50dce6e941e2d91bdb6d39d", "snippet": "Introduce a variational approach to jointly learn dictionary distributions and reconstruction algorithms under varying sensing matrices, enhancing compressed sensing with calibrated uncertainties.", "pattern_count": 0}
|
| 79 |
+
{"idea_id": "idea_78", "text_hash": "d1b83b249c92632b499ab76e1499341086431975fd915863cf194eb4cd5c7d3a", "snippet": "Introduce a distributionally robust approach to probabilistic supervised learning by predicting conditional label distributions using feature moment divergence.", "pattern_count": 1}
|
| 80 |
+
{"idea_id": "idea_79", "text_hash": "bb2954250fa80279e77b18fb7b9f750e3e8a2f207288408967054ef035a4b884", "snippet": "Integrate generative and contrastive learning for graphs by modeling edge generation through latent node interactions within hidden communities.", "pattern_count": 1}
|
| 81 |
+
{"idea_id": "idea_80", "text_hash": "ad616c68ee73d22f4c2a52bbd0e34eb7c0770c2392dfd478d3138bd6391d14b2", "snippet": "Develop a tool to meta-learn and extract the inductive biases of neural circuits, facilitating understanding of neural functionality through generalization patterns.", "pattern_count": 0}
|
| 82 |
+
{"idea_id": "idea_81", "text_hash": "3261690e1549e3a2869e0573c557f71dd41d2e54898d22e0e87ce9ea729c8efb", "snippet": "Introduce a $d$-block model for spiking neural networks that accelerates training by reducing sequential computations through stochastic refractory periods and recurrent conductance latencies.", "pattern_count": 1}
|
| 83 |
+
{"idea_id": "idea_82", "text_hash": "1db73b3cd9f172f5b601d24a6b99f2505438657723ff5ceff0d5dd0ed33e1a1a", "snippet": "Introduce a method that enhances OOD detection by leveraging the norm of clipped feature vectors and output space energy, improving detection without additional data or fine-tuning.", "pattern_count": 1}
|
| 84 |
+
{"idea_id": "idea_83", "text_hash": "99960ca5994979fb5d9fa3fab1578f4cda5e2defd64169e6ba75a3da7b6fe02f", "snippet": "Introduce a graph-induced data augmentation method leveraging label features to enhance training data distribution in extreme multi-label text classification.", "pattern_count": 1}
|
| 85 |
+
{"idea_id": "idea_84", "text_hash": "cab76c969cd5e77903494cf96a470f35096a54dec50d212d004d8ecc404a7fbb", "snippet": "Introduce a data-driven MDP setup for AutoFE that enhances transferability and reduces computational costs compared to traditional NAS frameworks.", "pattern_count": 0}
|
| 86 |
+
{"idea_id": "idea_85", "text_hash": "36f4291a53ea6cf8977482970f274acb35759fce9ffb92b545d7053fad184632", "snippet": "Introduce a method to quantify input token influence in Transformer models using flow networks and Shapley values.", "pattern_count": 0}
|
| 87 |
+
{"idea_id": "idea_86", "text_hash": "e4764d915be6521be3b545b3e39a954b9a95bb8d96d9da19ca57364c53dc3ed7", "snippet": "Enhancing adversarial example transferability by incorporating Bayesian principles into substitute models.", "pattern_count": 1}
|
| 88 |
+
{"idea_id": "idea_87", "text_hash": "d23ac1ff1b97459c926391087432cc4389f813e5169add04487b8b1428523b07", "snippet": "Introduce a differentiable hypergeometric distribution to learn group importance and subset sizes in applications like clustering and weakly-supervised learning.", "pattern_count": 0}
|
| 89 |
+
{"idea_id": "idea_88", "text_hash": "f8b2b75c55e2ee3d89c23d0fae1ee39e8513e02c80c3ad0b5b07eba35eedc3b1", "snippet": "Introduce a cross-layer attention mechanism to enhance representation power by retrieving query-related information from previous layers.", "pattern_count": 0}
|
| 90 |
+
{"idea_id": "idea_89", "text_hash": "4c0325d027fc6da13431c0ef4a4b6c211e4f3b22c5d092deaccabecf3ce81c5b", "snippet": "Leverage state-space layers to enhance efficiency and performance in sequence-based reinforcement learning, particularly for long-range dependencies.", "pattern_count": 1}
|
| 91 |
+
{"idea_id": "idea_90", "text_hash": "90d388187f9f3dc9fc6d610908e55652f76727d72f98711066d77d98a8b28ea6", "snippet": "Demonstrate that a single well-calibrated autoregressive model can outperform neural ensembles in offline reinforcement learning by effectively managing model uncertainty.", "pattern_count": 1}
|
| 92 |
+
{"idea_id": "idea_91", "text_hash": "9ad3e1b73c308d9f35531b24f8ea0f39d1e13c60a23b69051177f5ccdf0ad371", "snippet": "Introduce a framework to handle non-uniform sampling densities in geometric graphs, correcting graph shift operators to improve performance and extract insights.", "pattern_count": 1}
|
| 93 |
+
{"idea_id": "idea_92", "text_hash": "e000084cf37783226be1d4f993e3077d8e8b5c8e8557ea4b2e29db983d47a251", "snippet": "Enhance causal discovery by adaptively reweighting samples to improve DAG learning under heterogeneous data conditions.", "pattern_count": 1}
|
| 94 |
+
{"idea_id": "idea_93", "text_hash": "18362979983dc1010fa361a4baa55cf6bd9b6c82b31408d92750d4dfe4eab374", "snippet": "Introduce a defense mechanism that focuses on adversarial attacks targeting specific data subsets to maintain accuracy on clean data.", "pattern_count": 0}
|
| 95 |
+
{"idea_id": "idea_94", "text_hash": "cc2f1af879fb70286a293b0bc7b013fa5cbf4eae62e9c7ad0d2076547f6bd077", "snippet": "Introduce an end-to-end offline preference-based reinforcement learning framework that jointly models preferences and offline data to optimize policy without separate reward learning.", "pattern_count": 1}
|
| 96 |
+
{"idea_id": "idea_95", "text_hash": "8dfe008639fd721fc41619284d897f417baba8401763628851c95184c1e2a9c4", "snippet": "Leverage hierarchical Transformers for multimodal learning to repair circuits against formal specifications, enhancing generalization and synthesis performance.", "pattern_count": 0}
|
| 97 |
+
{"idea_id": "idea_96", "text_hash": "8af67ae7b320bbdaa8c4efaf735eeae7e4e9f9286e658a6896570a57df689056", "snippet": "Extend neural probabilistic logic programming to handle both discrete and continuous random variables, enabling broader applicability in neural-symbolic AI.", "pattern_count": 1}
|
| 98 |
+
{"idea_id": "idea_97", "text_hash": "b26638bfae77091861999ef52042fbfb6aaa59d00536627d93e5a177c33f80cf", "snippet": "BERT can maintain high-quality representations for sequential tasks without relying heavily on memory replay.", "pattern_count": 1}
|
| 99 |
+
{"idea_id": "idea_98", "text_hash": "9da32cbe5d2214e3d963a95e212221b8a00f42a7fa1574e04f6219fd5b02512c", "snippet": "Utilize the inherent conservatism of on-policy algorithms to effectively address offline reinforcement learning challenges without additional constraints.", "pattern_count": 1}
|
| 100 |
+
{"idea_id": "idea_99", "text_hash": "d40d63b0fb78712f593774b53a26a26a06541c3b14607d1a8600967223c79d3e", "snippet": "Introduce gradient constraint methods to enhance federated learning accuracy and efficiency on heterogeneous data.", "pattern_count": 1}
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/paper_emb.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c251cdbffada0e22fbe3fd24590ccf8cac50eec3674091a2982cb9cec9fb26e5
|
| 3 |
+
size 835712
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/paper_manifest.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"created_at": "2026-02-12T12:08:27.923122+00:00",
|
| 3 |
+
"embedding_model": "gemini-embedding-001",
|
| 4 |
+
"nodes_paper_hash": "ed105040e3761981361be4f754edb63d3119216a2fcfd28974cc281503e9e743",
|
| 5 |
+
"count": 100,
|
| 6 |
+
"index_count": 68,
|
| 7 |
+
"skipped": 32
|
| 8 |
+
}
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/paper_meta.jsonl
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"paper_id": "gNI4_85Cyve", "text_hash": "1006cac70950de2fea8940914de85c7b968a8af5441be96a43aadc40c82f2e4f", "title": "QAID: Question Answering Inspired Few-shot Intent Detection", "pattern_id": "pattern_73", "domain": "Natural Language Processing", "review_count": 5}
|
| 2 |
+
{"paper_id": "1FsdIfRngtw", "text_hash": "07a2b1263452d8fe8f12ff384fcbb0d7d6a0b325c423d3d6aaa1bf1cedfebc7a", "title": "Rethinking the Value of Prompt Learning for Vision-Language Models", "pattern_id": "pattern_109", "domain": "Machine Learning", "review_count": 5}
|
| 3 |
+
{"paper_id": "pW_jGk1D_Ww", "text_hash": "d97e221249aede06352a2e4fef4d4c204e5052a5bf443836e695c17ff69e49cb", "title": "Disentangled Feature Swapping Augmentation for Weakly Supervised Semantic Segmentation", "pattern_id": "pattern_103", "domain": "Computer Vision", "review_count": 5}
|
| 4 |
+
{"paper_id": "tORS9qGBNpT", "text_hash": "22211262cb6f0b42ef3acc537de3f0239867c0766aa0d48523ab0d5e35bfa78f", "title": "Distributed Least Square Ranking with Random Features", "pattern_id": "pattern_10", "domain": "Machine Learning", "review_count": 4}
|
| 5 |
+
{"paper_id": "17RDXeF-skZ", "text_hash": "942e60988debe42a7c22d9b3361e08b4218fbec614908b2bbd4f1102b4f13f69", "title": "Doing Fast Adaptation Fast: Conditionally Independent Deep Ensembles for Distribution Shifts", "pattern_id": "pattern_47", "domain": "Machine Learning", "review_count": 4}
|
| 6 |
+
{"paper_id": "ejR4E1jaH9k", "text_hash": "2a25da518da727e56877ee28391a4476c45fe6133bfcb9bc190b4703695cde4c", "title": "Solving stochastic weak Minty variational inequalities without increasing batch size", "pattern_id": "pattern_83", "domain": "Optimization", "review_count": 5}
|
| 7 |
+
{"paper_id": "8Ygoj2IeXfW", "text_hash": "9e76e9ff70e7d997e9872773b7d1dca30533dfb990489507b3b3cbb21789a97c", "title": "Diversity Boosted Learning for Domain Generalization with a Large Number of Domains", "pattern_id": "pattern_50", "domain": "Machine Learning", "review_count": 5}
|
| 8 |
+
{"paper_id": "8XfHh4XSQ0Q", "text_hash": "2137fb574ba6710b9a82ed8a5a364ebed91246d14f7c59c0b0c463604cb687e4", "title": "Adaptive Block-wise Learning for Knowledge Distillation", "pattern_id": "pattern_19", "domain": "Machine Learning", "review_count": 5}
|
| 9 |
+
{"paper_id": "r9fX833CsuN", "text_hash": "252b5188665af1f522f9aa92fbbcef718b0b7e3997b2b4c7ccd38e6c2e9cc268", "title": "Curriculum-based Co-design of Morphology and Control of Voxel-based Soft Robots", "pattern_id": "pattern_86", "domain": "Robotics", "review_count": 4}
|
| 10 |
+
{"paper_id": "AqX3oSbzyQ1", "text_hash": "1673f668b9d091f2069bb90ab72d8ab790202f7e2304a67c4a27adea2896712c", "title": "Object-Centric Learning with Slot Mixture Models", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 11 |
+
{"paper_id": "tPKKXeW33YU", "text_hash": "61549fcadd1b9768b372f01bd9d62d6018a91bf6e9adc2464ee6e1c21f0c873a", "title": "WiNeRT: Towards Neural Ray Tracing for Wireless Channel Modelling and Differentiable Simulations", "pattern_id": "", "domain": "Wireless Communications", "review_count": 5}
|
| 12 |
+
{"paper_id": "HGsoe1wmRW5", "text_hash": "54ad1a00f581cded946d1976407f3cadb19d8c396efa2f78ebcc700ff55f92e6", "title": "Pocket-specific 3D Molecule Generation by Fragment-based Autoregressive Diffusion Models", "pattern_id": "pattern_6", "domain": "Machine Learning", "review_count": 4}
|
| 13 |
+
{"paper_id": "3WYtm7UzsR", "text_hash": "98a3e3785df24fbe8c738bc07a25333aff746156dd5e3b34bb08729b781d1281", "title": "Towards scalable and non-IID robust Hierarchical Federated Learning via Label-driven Knowledge Aggregator", "pattern_id": "pattern_10", "domain": "Machine Learning", "review_count": 4}
|
| 14 |
+
{"paper_id": "X5ZMzRYqUjB", "text_hash": "9897fe1b88b73cee33024cc0ed4adc2399d31dd199ced8ace135d0d6c6782441", "title": "Humanly Certifying Superhuman Classifiers", "pattern_id": "pattern_87", "domain": "Machine Learning", "review_count": 5}
|
| 15 |
+
{"paper_id": "oJpVVGXu9i", "text_hash": "1f9d68d8ba3df95f9e1598a05836c7b61b8d6e89bb21a549bb604366a61da4c3", "title": "Share Your Representation Only: Guaranteed Improvement of the Privacy-Utility Tradeoff in Federated Learning", "pattern_id": "pattern_11", "domain": "Machine Learning", "review_count": 5}
|
| 16 |
+
{"paper_id": "JIptuwnqwn", "text_hash": "ba2bbb2ea72836c8ba80d7748a029d7e8b039d89ee13e74ba0ce7ef0d9603336", "title": "Quantized Disentangled Representations for Object-Centric Visual Tasks", "pattern_id": "", "domain": "Computer Vision", "review_count": 5}
|
| 17 |
+
{"paper_id": "BDjGGZk9yz", "text_hash": "0af28e49a54251a981b3c51f98d0b1ddc3d3fb215d9614f406ff9e1f0b4b737c", "title": "Supervised Random Feature Regression via Projection Pursuit", "pattern_id": "pattern_83", "domain": "Machine Learning", "review_count": 4}
|
| 18 |
+
{"paper_id": "loc3CUXeuzH", "text_hash": "25cf5fcbd988a9d0e161f76071bab96a329fdd6ad04f402219a1833d1f6817ce", "title": "Graph Spline Networks for Efficient Continuous Simulation of Dynamical Systems", "pattern_id": "", "domain": "Machine Learning", "review_count": 4}
|
| 19 |
+
{"paper_id": "kL67fyKb6A", "text_hash": "dc2a721e752cb01dc68eacf34cabc0f13f9bf7e1f757119ca8d06378c5e61114", "title": "Online black-box adaptation to label-shift in the presence of conditional-shift", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 20 |
+
{"paper_id": "WVZQa2QYJN", "text_hash": "7a0d0530f892158cfa894bfacf2e1a539a6f598d4f74d7f657a7aed454081111", "title": "RuDar: Weather Radar Dataset for Precipitation Nowcasting with Geographical and Seasonal Variability", "pattern_id": "", "domain": "Machine Learning", "review_count": 4}
|
| 21 |
+
{"paper_id": "jkMT2AtccX", "text_hash": "e55c6efab3943ca54f9f724e1e7dacf4c05592c201bdf4510aff401796731ef0", "title": "Learning Representations for Reinforcement Learning with Hierarchical Forward Models", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 22 |
+
{"paper_id": "F5Cj26wfiu", "text_hash": "00b21e4b52d8d040cc040767f353352f79ad90b0ed9f3cfdc323a35e9044c3d6", "title": "xTrimoABFold: Improving Antibody Structure Prediction without Multiple Sequence Alignments", "pattern_id": "pattern_6", "domain": "Biotechnology", "review_count": 5}
|
| 23 |
+
{"paper_id": "mmFtinp4wQ_", "text_hash": "668f6b0c5671c892c2906ad6e7cf932bc58dc8271c65dfa989308dcef6b667a8", "title": "Thresholded Lexicographic Ordered Multi-Objective Reinforcement Learning", "pattern_id": "pattern_15", "domain": "Reinforcement Learning", "review_count": 5}
|
| 24 |
+
{"paper_id": "5WOIluv9Xop", "text_hash": "a780ed85e47db5b216eac945a2987a092cedfd507a9e9132ddcf512da8ae3e96", "title": "HOW SAMPLING AFFECTS TRAINING: AN EFFECTIVE SAMPLING THEORY STUDY FOR LONG-TAILED IMAGE CLASSIFICATION", "pattern_id": "pattern_40", "domain": "Computer Vision", "review_count": 5}
|
| 25 |
+
{"paper_id": "eDLwjKmtYFt", "text_hash": "201b569b380442bc39e201bc63cc6f87148a38df3b635d9233602e6f54a826e6", "title": "EquiMod: An Equivariance Module to Improve Visual Instance Discrimination", "pattern_id": "pattern_39", "domain": "Computer Vision", "review_count": 4}
|
| 26 |
+
{"paper_id": "cUX2psP06OL", "text_hash": "a2f521148f1f37d1db1d8f1077710a71849becd05c322543821ad938c9c888e9", "title": "Manipulating Multi-agent Navigation Task via Emergent Communications", "pattern_id": "pattern_97", "domain": "Artificial Intelligence", "review_count": 4}
|
| 27 |
+
{"paper_id": "-M0TNnyWFT5", "text_hash": "d0ff96418cc49f0e7d6adb1accccee67e72b5a3ef1cc97d7eee5344e74b599da", "title": "Task-Aware Information Routing from Common Representation Space in Lifelong Learning", "pattern_id": "pattern_0", "domain": "Machine Learning", "review_count": 4}
|
| 28 |
+
{"paper_id": "me09xlTmm8", "text_hash": "bfa58a54a2f0b12c6228cd7a731209c12b488715793c2cbb18a230c7d1cdfa4f", "title": "Transport with Support: Data-Conditional Diffusion Bridges", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 29 |
+
{"paper_id": "8foynpwwRb", "text_hash": "50e68d70064200e2a9ccf7f1fdbf970277814b970bc34fec21c105f154f74efd", "title": "Randomized Sharpness-Aware Training for Boosting Computational Efficiency in Deep Learning", "pattern_id": "pattern_83", "domain": "Machine Learning", "review_count": 5}
|
| 30 |
+
{"paper_id": "GX0uI5T8kd", "text_hash": "4da08cd027ac94b319a6674a6f0cb8dddb711f58a115a8d1468f81ce6c14f2fc", "title": "Self-Supervised Off-Policy Ranking via Crowd Layer", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 31 |
+
{"paper_id": "4F1gvduDeL", "text_hash": "207c27ad2915014005882cc9ac23ffc57a6a667d781870e97b6252bddbacd9f7", "title": "Few-Shot Domain Adaptation For End-to-End Communication", "pattern_id": "", "domain": "Machine Learning", "review_count": 4}
|
| 32 |
+
{"paper_id": "fyD8adDrXo", "text_hash": "db201d38cefa1a311d0673ce28ba984fd8d46635e805810b1239babfd2bc6248", "title": "HyPHEN: A Hybrid Packing Method and Optimizations for Homomorphic Encryption-Based Neural Network", "pattern_id": "pattern_45", "domain": "Security & Privacy", "review_count": 4}
|
| 33 |
+
{"paper_id": "Y1J29OryQg", "text_hash": "f2fb97201428c9b591b86c5aac5d5ca7c72eccadbf27c5c9d6e6588e4ae5b380", "title": "Causal Inference for Knowledge Graph Completion", "pattern_id": "pattern_17", "domain": "Machine Learning", "review_count": 4}
|
| 34 |
+
{"paper_id": "ywAjQw-spmY", "text_hash": "be031f11f7bfccb1e41b0eee1883ceb6f1248e9d92d70e78f9cd1f450be0a752", "title": "Formal Specifications from Natural Language", "pattern_id": "", "domain": "Natural Language Processing", "review_count": 5}
|
| 35 |
+
{"paper_id": "CcXTudu9bvu", "text_hash": "5f4cbfc4612053bd19103a98b1a7e0dcaf74b0088cc5b927d57ecfaa35eea97e", "title": "DELTA: Diverse Client Sampling for Fasting Federated Learning", "pattern_id": "pattern_10", "domain": "Machine Learning", "review_count": 5}
|
| 36 |
+
{"paper_id": "rwetAifrs16", "text_hash": "79110f87fde9efef6704ad963e42b153e4cae574fb2972429da3a1fb90cb2a96", "title": "Incremental Predictive Coding: A Parallel and Fully Automatic Learning Algorithm", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 37 |
+
{"paper_id": "HqVp0rNC8jn", "text_hash": "d1efc9200fd5311c5bdec1e5075299bbec590d31275aaf687b0f8c6c692403c2", "title": "Learning Geometric Representations of Interactive Objects", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 38 |
+
{"paper_id": "m3DmIL7wHDW", "text_hash": "e6e235099561d264bf2d732638e048e8edc35b5783d2b54a041b6f83f1446a0b", "title": "The guide and the explorer: smart agents for resource-limited iterated batch reinforcement learning", "pattern_id": "pattern_119", "domain": "Machine Learning", "review_count": 5}
|
| 39 |
+
{"paper_id": "x-mXzBgCX3a", "text_hash": "97ff95bed746baf387fcda0a32c4aff5c598eed293a676c6615c6165c9dd19bc", "title": "FairGBM: Gradient Boosting with Fairness Constraints", "pattern_id": "pattern_9", "domain": "Fairness & Accountability", "review_count": 5}
|
| 40 |
+
{"paper_id": "-UsbRlXzMG", "text_hash": "b615c8a7feec66053ee574e2c7d2121ee0a63db07688a7655b3a67e65996df00", "title": "How (Un)Fair is Text Summarization?", "pattern_id": "", "domain": "Natural Language Processing", "review_count": 5}
|
| 41 |
+
{"paper_id": "Wac06sAkHk", "text_hash": "04c81fbb8444631f7d3f5f361c8baa605c0d607bb6091eb306c463760bee4488", "title": "Simulating Task-Free Continual Learning Streams From Existing Datasets", "pattern_id": "pattern_0", "domain": "Machine Learning", "review_count": 5}
|
| 42 |
+
{"paper_id": "18XzeuYZh_", "text_hash": "4b299874120fac89ff97b06b75d2fa9dbb46813c029b6805e50779bb553ecbec", "title": "Online Bias Correction for Task-Free Continual Learning", "pattern_id": "pattern_0", "domain": "Machine Learning", "review_count": 5}
|
| 43 |
+
{"paper_id": "j8s-BRxXST", "text_hash": "893dd2e1853c9980e0766b218143ba301c1ca0ae5d9e74e76d3daa9b61737063", "title": "A Simple Contrastive Learning Objective for Alleviating Neural Text Degeneration", "pattern_id": "pattern_96", "domain": "Natural Language Processing", "review_count": 5}
|
| 44 |
+
{"paper_id": "L6CKiPH3hI", "text_hash": "eaf0f97a1eec5637f0534bcecd25b4b2e84c41776c15760a2934863edfa380ab", "title": "Enriching Online Knowledge Distillation with Specialist Ensemble", "pattern_id": "pattern_19", "domain": "Machine Learning", "review_count": 5}
|
| 45 |
+
{"paper_id": "lKXcMB9tOFD", "text_hash": "ce93af027d73472e68d82a73a159d1bd7c1bcd76a20cdcb725d036b08547bc73", "title": "Improved Gradient Descent Optimization Algorithm based on Inverse Model-Parameter Difference", "pattern_id": "pattern_83", "domain": "Machine Learning", "review_count": 5}
|
| 46 |
+
{"paper_id": "47DzlkyH3dM", "text_hash": "36f2b0f4f440fc1d4a88dd2f91871f6f6f0eb6ce7d0d60184627d7553f953b7d", "title": "Variational Learning ISTA", "pattern_id": "", "domain": "Machine Learning", "review_count": 4}
|
| 47 |
+
{"paper_id": "mN43JdXmYMs", "text_hash": "39f622a8b1bc9e23240963ac95efd2996bbbb32961e8f320055ca0c1f0600e67", "title": "Moment Distributionally Robust Probabilistic Supervised Learning", "pattern_id": "pattern_40", "domain": "Machine Learning", "review_count": 5}
|
| 48 |
+
{"paper_id": "r3-aLHxn2nB", "text_hash": "0b31cfde5f5671d9d743809b010d81426f34b5acf6fcd8f0a1670a167c03dc22", "title": "CLEP: Exploiting Edge Partitioning for Graph Contrastive Learning", "pattern_id": "pattern_24", "domain": "Machine Learning", "review_count": 6}
|
| 49 |
+
{"paper_id": "dpuAkczrTOt", "text_hash": "2c63523f9bc39080f8470b8fd1ab4b4a70e7bcdf35409cdf103f8d14f325c7f2", "title": "Meta-Learning the Inductive Biases of Simple Neural Circuits", "pattern_id": "", "domain": "Neuroscience", "review_count": 5}
|
| 50 |
+
{"paper_id": "70-hEqC4Wo8", "text_hash": "e65d82d72015d08780004d98c7bbe0bff509b0eb5727df94667d8a23562d7b0d", "title": "Accelerating spiking neural network training using the $d$-block model", "pattern_id": "pattern_1", "domain": "Neuroscience", "review_count": 6}
|
| 51 |
+
{"paper_id": "-hMNEMgT8Wd", "text_hash": "a224d2ac75ac1c0a2ea65b9e3b4bbad013f283212451620ec68b02759e24b9d2", "title": "RG: OUT-OF-DISTRIBUTION DETECTION WITH REACTIVATE GRADNORM", "pattern_id": "pattern_4", "domain": "Machine Learning", "review_count": 5}
|
| 52 |
+
{"paper_id": "05ff9BRSMzE", "text_hash": "4e5c4f3ac9f5b6d749f4943a99d18de63beca958163d1da833cdb92b85a3e3e6", "title": "Gandalf : Data Augmentation is all you need for Extreme Classification", "pattern_id": "pattern_40", "domain": "Machine Learning", "review_count": 5}
|
| 53 |
+
{"paper_id": "688hNNMigVX", "text_hash": "929b95276847d621a485a7e439ac9eecca1e993f2b49a9b5d9ffa81ec468ea42", "title": "Learning a Data-Driven Policy Network for Pre-Training Automated Feature Engineering", "pattern_id": "", "domain": "Machine Learning", "review_count": 4}
|
| 54 |
+
{"paper_id": "pcBJT4bgbpH", "text_hash": "ec127618c8f62024602239e2f7e567a8196335606ee24c1333d157d64feb1798", "title": "Attention Flows for General Transformers", "pattern_id": "", "domain": "Natural Language Processing", "review_count": 5}
|
| 55 |
+
{"paper_id": "bjPPypbLre", "text_hash": "7ed13c9c2bef6b71886cdf57b79a2c15d12a1767e923e6217feb550c05abc6e7", "title": "Making Substitute Models More Bayesian Can Enhance Transferability of Adversarial Examples", "pattern_id": "pattern_66", "domain": "Security & Privacy", "review_count": 4}
|
| 56 |
+
{"paper_id": "75O7S_L4oY", "text_hash": "367c17cdbcd3e6a9c487cc14f4d26a02543dda2dd422b780f46bf9337184ad68", "title": "Learning Group Importance using the Differentiable Hypergeometric Distribution", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 57 |
+
{"paper_id": "pvgEL1yS3Ql", "text_hash": "bed74d21a116cb3bd94860b8eade1d555b953051b630dcddcfbebcf41824dc2c", "title": "Cross-Layer Retrospective Retrieving via Layer Attention", "pattern_id": "", "domain": "Computer Vision", "review_count": 5}
|
| 58 |
+
{"paper_id": "kqHkCVS7wbj", "text_hash": "fa600d585e34ba221f4c2124edb0cc648c402373c3d3e1820e51d05259283a98", "title": "Decision S4: Efficient Sequence-Based RL via State Spaces Layers", "pattern_id": "pattern_118", "domain": "Machine Learning", "review_count": 4}
|
| 59 |
+
{"paper_id": "gvOSQjGTtxj", "text_hash": "fa13ec1ec029f0ccce9ebe453f3362ba3486efd6e2fe25f6edf1b36b616fbc2b", "title": "Deep autoregressive density nets vs neural ensembles for model-based offline reinforcement learning", "pattern_id": "pattern_104", "domain": "Machine Learning", "review_count": 4}
|
| 60 |
+
{"paper_id": "mnVf1W6ipGm", "text_hash": "3fd8f258dc713ea3eb26c31bc23c49cd179d0c903496264fe4229f59aa7fcfd0", "title": "Unveiling the sampling density in non-uniform geometric graphs", "pattern_id": "pattern_24", "domain": "Graph Theory", "review_count": 5}
|
| 61 |
+
{"paper_id": "LNpMtk15AS4", "text_hash": "0fd5f21d35b8fb323bd5b5ad07e0f7cd2e1cd1f4e2504ba87f7880446a425537", "title": "Boosting Causal Discovery via Adaptive Sample Reweighting", "pattern_id": "pattern_17", "domain": "Machine Learning", "review_count": 5}
|
| 62 |
+
{"paper_id": "BdcfKgE9dhF", "text_hash": "0632a1baa3c70ea7a48313a5984577307dd0885bc265d4b72b0e2d62c6aef419", "title": "Robust Training through Adversarially Selected Data Subsets", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 63 |
+
{"paper_id": "i8AnfJYMvz", "text_hash": "1656d92d1649e4a7af2a48e799174258e45a80c7352bd24da16e822fb987eead", "title": "Beyond Reward: Offline Preference-guided Policy Optimization", "pattern_id": "pattern_104", "domain": "Machine Learning", "review_count": 5}
|
| 64 |
+
{"paper_id": "SEcSahl0Ql", "text_hash": "9c9740782060ff5b9cd4caaad2c9ce566e05ed7dfad77a0fc010b57bf8fd00fc", "title": "Iterative Circuit Repair Against Formal Specifications", "pattern_id": "", "domain": "Machine Learning", "review_count": 5}
|
| 65 |
+
{"paper_id": "dyifcA9UuRo", "text_hash": "fbd8456b20ec49f9a8184e55f59ba02700cec26d6eee6dbf019749e65eb51413", "title": "Neural Probabilistic Logic Programming in Discrete-Continuous Domains", "pattern_id": "pattern_48", "domain": "Artificial Intelligence", "review_count": 6}
|
| 66 |
+
{"paper_id": "UazgYBMS9-W", "text_hash": "049ff99fb3ba37adbc3a00165233d0f701fe8e8862aa00bb39b57780ea832e51", "title": "Can BERT Refrain from Forgetting on Sequential Tasks? A Probing Study", "pattern_id": "pattern_0", "domain": "Natural Language Processing", "review_count": 4}
|
| 67 |
+
{"paper_id": "3c13LptpIph", "text_hash": "40ce6f6646165f9a9c36134f0224ac02b0bdad7c5fd60ac9fb6591adc80162e7", "title": "Behavior Proximal Policy Optimization", "pattern_id": "pattern_104", "domain": "Machine Learning", "review_count": 6}
|
| 68 |
+
{"paper_id": "eZN8nUXAVO7", "text_hash": "b6c466cfbd153526f9e1b1194f3db204887abcff381425c932a727379a40d73b", "title": "FedGC: An Accurate and Efficient Federated Learning under Gradient Constraint for Heterogeneous Data", "pattern_id": "pattern_10", "domain": "Machine Learning", "review_count": 4}
|
Paper-KG-Pipeline/output/recall_index__gemini-embedding-001/subdomain_taxonomy.json
ADDED
|
@@ -0,0 +1,508 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"canonical_subdomains": [
|
| 3 |
+
"3D Mapping",
|
| 4 |
+
"3D Pose Estimation",
|
| 5 |
+
"3D Reconstruction",
|
| 6 |
+
"Active Learning",
|
| 7 |
+
"Adversarial Attacks",
|
| 8 |
+
"Adversarial Machine Learning",
|
| 9 |
+
"Adversarial Training",
|
| 10 |
+
"Algorithmic Fairness",
|
| 11 |
+
"Anomaly Detection",
|
| 12 |
+
"Approximation Algorithms",
|
| 13 |
+
"Audio Processing",
|
| 14 |
+
"Autonomous Agents",
|
| 15 |
+
"Autonomous Driving",
|
| 16 |
+
"Batch Normalization",
|
| 17 |
+
"Bayesian Optimization",
|
| 18 |
+
"Benchmarking",
|
| 19 |
+
"Bias Mitigation",
|
| 20 |
+
"Biologically Plausible Learning",
|
| 21 |
+
"Catastrophic Forgetting",
|
| 22 |
+
"Causal Discovery",
|
| 23 |
+
"Code Generation",
|
| 24 |
+
"Collaborative Filtering",
|
| 25 |
+
"Combinatorial Optimization",
|
| 26 |
+
"Communication Efficiency",
|
| 27 |
+
"Computational Efficiency",
|
| 28 |
+
"Conformal Prediction",
|
| 29 |
+
"Context Compression",
|
| 30 |
+
"Contrastive Learning",
|
| 31 |
+
"Conversational AI",
|
| 32 |
+
"Convolutional Neural Networks",
|
| 33 |
+
"Coreset Selection",
|
| 34 |
+
"Covariate Shift",
|
| 35 |
+
"Data Augmentation",
|
| 36 |
+
"Deep Learning",
|
| 37 |
+
"Differential Privacy",
|
| 38 |
+
"Diffusion Models",
|
| 39 |
+
"Disentangled Representations",
|
| 40 |
+
"Domain Generalization",
|
| 41 |
+
"Drug Discovery",
|
| 42 |
+
"Dynamical Systems",
|
| 43 |
+
"Energy Efficiency",
|
| 44 |
+
"Equivariant Networks",
|
| 45 |
+
"Evolutionary Algorithms",
|
| 46 |
+
"Exploration Strategies",
|
| 47 |
+
"Feature Engineering",
|
| 48 |
+
"Federated Learning",
|
| 49 |
+
"Few-Shot Learning",
|
| 50 |
+
"Flow Networks",
|
| 51 |
+
"Game AI",
|
| 52 |
+
"Generalization",
|
| 53 |
+
"Generative Models",
|
| 54 |
+
"Geometric Deep Learning",
|
| 55 |
+
"Graph Learning",
|
| 56 |
+
"Graph Neural Networks",
|
| 57 |
+
"Hallucination Mitigation",
|
| 58 |
+
"Human Feedback",
|
| 59 |
+
"Human-Computer Interaction",
|
| 60 |
+
"Hyperparameter Optimization",
|
| 61 |
+
"Image Compression",
|
| 62 |
+
"Image Generation",
|
| 63 |
+
"Image Restoration",
|
| 64 |
+
"Imitation Learning",
|
| 65 |
+
"Implicit Regularization",
|
| 66 |
+
"In-Context Learning",
|
| 67 |
+
"Inference Efficiency",
|
| 68 |
+
"Information Retrieval",
|
| 69 |
+
"Intrinsic Motivation",
|
| 70 |
+
"Inverse Reinforcement Learning",
|
| 71 |
+
"Kernel Methods",
|
| 72 |
+
"Knowledge Distillation",
|
| 73 |
+
"Knowledge Graphs",
|
| 74 |
+
"Language Model Alignment",
|
| 75 |
+
"Large Language Models",
|
| 76 |
+
"Machine Unlearning",
|
| 77 |
+
"Markov Decision Processes",
|
| 78 |
+
"Masked Image Modeling",
|
| 79 |
+
"Mathematical Reasoning",
|
| 80 |
+
"Meta-Learning",
|
| 81 |
+
"Metric Learning",
|
| 82 |
+
"Model Compression",
|
| 83 |
+
"Model Editing",
|
| 84 |
+
"Model Efficiency",
|
| 85 |
+
"Model Evaluation",
|
| 86 |
+
"Model Interpretability",
|
| 87 |
+
"Model Pruning",
|
| 88 |
+
"Model-Based Methods",
|
| 89 |
+
"Multi-Agent Systems",
|
| 90 |
+
"Multi-Objective Optimization",
|
| 91 |
+
"Multi-modal Learning",
|
| 92 |
+
"Neural Architecture Search",
|
| 93 |
+
"Neural Fields",
|
| 94 |
+
"Neural Networks",
|
| 95 |
+
"Online Learning",
|
| 96 |
+
"Open Vocabulary Models",
|
| 97 |
+
"Optimization",
|
| 98 |
+
"Out-of-Distribution Detection",
|
| 99 |
+
"Parameter Efficiency",
|
| 100 |
+
"Partial Differential Equations",
|
| 101 |
+
"Planning",
|
| 102 |
+
"Policy Gradient Methods",
|
| 103 |
+
"Policy Optimization",
|
| 104 |
+
"Privacy",
|
| 105 |
+
"Quantization",
|
| 106 |
+
"Quantum Algorithms",
|
| 107 |
+
"Reasoning",
|
| 108 |
+
"Recurrent Neural Networks",
|
| 109 |
+
"Reinforcement Learning",
|
| 110 |
+
"Representation Learning",
|
| 111 |
+
"Robustness",
|
| 112 |
+
"Rule Learning",
|
| 113 |
+
"Sample Efficiency",
|
| 114 |
+
"Scientific Discovery",
|
| 115 |
+
"Self-Supervised Learning",
|
| 116 |
+
"Semantic Segmentation",
|
| 117 |
+
"Sequence Generation",
|
| 118 |
+
"Speech Processing",
|
| 119 |
+
"Tabular Data",
|
| 120 |
+
"Theoretical Analysis",
|
| 121 |
+
"Time Series Analysis",
|
| 122 |
+
"Training Stability",
|
| 123 |
+
"Transfer Learning",
|
| 124 |
+
"Type Inference",
|
| 125 |
+
"Uncertainty Estimation",
|
| 126 |
+
"Unsupervised Learning",
|
| 127 |
+
"Vision Transformers",
|
| 128 |
+
"Vision-Language Models",
|
| 129 |
+
"Watermarking",
|
| 130 |
+
"Zero-Shot Learning"
|
| 131 |
+
],
|
| 132 |
+
"manifest": {
|
| 133 |
+
"algorithm": {
|
| 134 |
+
"merge_min_sim": 0.75,
|
| 135 |
+
"min_papers": 30,
|
| 136 |
+
"sim_threshold": 0.7,
|
| 137 |
+
"stoplist_ratio_max_median": 10.0,
|
| 138 |
+
"target_k_max": 80,
|
| 139 |
+
"target_k_min": 40
|
| 140 |
+
},
|
| 141 |
+
"created_at": "2026-02-14T18:30:45.068498+00:00",
|
| 142 |
+
"embedding_api_url": "https://generativelanguage.googleapis.com/v1beta/openai/embeddings",
|
| 143 |
+
"embedding_batches": 10,
|
| 144 |
+
"embedding_error": "embedding batch failed after 4 attempts",
|
| 145 |
+
"embedding_failed": true,
|
| 146 |
+
"embedding_model": "gemini-embedding-001",
|
| 147 |
+
"embedding_provider": "openai_compatible",
|
| 148 |
+
"embedding_used": false,
|
| 149 |
+
"nodes_pattern_sha256": "f9e6cd3c013585e030db63c47ee4b5d0043d4d621b44b4b83226fb01a2706c4f"
|
| 150 |
+
},
|
| 151 |
+
"mapping": {
|
| 152 |
+
"3D Generation": "3D Reconstruction",
|
| 153 |
+
"3D Mapping": "3D Mapping",
|
| 154 |
+
"3D Object Detection": "Unsupervised Learning",
|
| 155 |
+
"3D Pose Estimation": "3D Pose Estimation",
|
| 156 |
+
"3D Reconstruction": "3D Reconstruction",
|
| 157 |
+
"3D Vision": "Unsupervised Learning",
|
| 158 |
+
"Active Learning": "Active Learning",
|
| 159 |
+
"Adversarial Attacks": "Adversarial Attacks",
|
| 160 |
+
"Adversarial Machine Learning": "Adversarial Machine Learning",
|
| 161 |
+
"Adversarial Robustness": "Robustness",
|
| 162 |
+
"Adversarial Training": "Adversarial Training",
|
| 163 |
+
"Algorithmic Fairness": "Algorithmic Fairness",
|
| 164 |
+
"Anomaly Detection": "Anomaly Detection",
|
| 165 |
+
"Approximation Algorithms": "Approximation Algorithms",
|
| 166 |
+
"Articulated Objects": "3D Pose Estimation",
|
| 167 |
+
"Attention Mechanisms": "Model Efficiency",
|
| 168 |
+
"Audio Processing": "Audio Processing",
|
| 169 |
+
"Autonomous Agents": "Autonomous Agents",
|
| 170 |
+
"Autonomous Driving": "Autonomous Driving",
|
| 171 |
+
"Autonomous Vehicles": "Autonomous Driving",
|
| 172 |
+
"Backdoor Attacks": "Adversarial Attacks",
|
| 173 |
+
"Bandit Algorithms": "Online Learning",
|
| 174 |
+
"Batch Normalization": "Batch Normalization",
|
| 175 |
+
"Bayesian Inference": "Kernel Methods",
|
| 176 |
+
"Bayesian Neural Networks": "Kernel Methods",
|
| 177 |
+
"Bayesian Optimization": "Bayesian Optimization",
|
| 178 |
+
"Benchmarking": "Benchmarking",
|
| 179 |
+
"Bias Mitigation": "Bias Mitigation",
|
| 180 |
+
"Biologically Plausible Learning": "Biologically Plausible Learning",
|
| 181 |
+
"Catastrophic Forgetting": "Catastrophic Forgetting",
|
| 182 |
+
"Causal Discovery": "Causal Discovery",
|
| 183 |
+
"Causal Inference": "Causal Discovery",
|
| 184 |
+
"Class Imbalance": "Data Augmentation",
|
| 185 |
+
"Clustering": "Approximation Algorithms",
|
| 186 |
+
"Code Generation": "Code Generation",
|
| 187 |
+
"Collaborative Filtering": "Collaborative Filtering",
|
| 188 |
+
"Combinatorial Optimization": "Combinatorial Optimization",
|
| 189 |
+
"Communication Efficiency": "Communication Efficiency",
|
| 190 |
+
"Communication Protocols": "Multi-Agent Systems",
|
| 191 |
+
"Computational Efficiency": "Computational Efficiency",
|
| 192 |
+
"Conformal Prediction": "Conformal Prediction",
|
| 193 |
+
"Context Compression": "Context Compression",
|
| 194 |
+
"Context Window Extension": "Context Compression",
|
| 195 |
+
"Contextual Bandits": "Online Learning",
|
| 196 |
+
"Continual Learning": "Catastrophic Forgetting",
|
| 197 |
+
"Continuous Control": "Sample Efficiency",
|
| 198 |
+
"Contrastive Learning": "Contrastive Learning",
|
| 199 |
+
"Convergence Analysis": "Federated Learning",
|
| 200 |
+
"Conversational AI": "Conversational AI",
|
| 201 |
+
"Convolutional Neural Networks": "Convolutional Neural Networks",
|
| 202 |
+
"Coreset Selection": "Coreset Selection",
|
| 203 |
+
"Covariate Shift": "Covariate Shift",
|
| 204 |
+
"Cross-Modal Learning": "Unsupervised Learning",
|
| 205 |
+
"Curriculum Learning": "Exploration Strategies",
|
| 206 |
+
"Data Assimilation": "Dynamical Systems",
|
| 207 |
+
"Data Augmentation": "Data Augmentation",
|
| 208 |
+
"Data Poisoning": "Adversarial Machine Learning",
|
| 209 |
+
"Data Privacy": "Differential Privacy",
|
| 210 |
+
"Data Pruning": "Coreset Selection",
|
| 211 |
+
"Data Selection": "Coreset Selection",
|
| 212 |
+
"Dataset Distillation": "Knowledge Distillation",
|
| 213 |
+
"Decision-Making": "Autonomous Agents",
|
| 214 |
+
"Deep Learning": "Deep Learning",
|
| 215 |
+
"Deep Neural Networks": "Adversarial Training",
|
| 216 |
+
"Denoising": "Image Restoration",
|
| 217 |
+
"Dialogue Systems": "Conversational AI",
|
| 218 |
+
"Differential Privacy": "Differential Privacy",
|
| 219 |
+
"Diffusion Models": "Diffusion Models",
|
| 220 |
+
"Disentangled Representations": "Disentangled Representations",
|
| 221 |
+
"Disentanglement": "Disentangled Representations",
|
| 222 |
+
"Distributed Systems": "Communication Efficiency",
|
| 223 |
+
"Distribution Shift": "Covariate Shift",
|
| 224 |
+
"Distribution Shifts": "Covariate Shift",
|
| 225 |
+
"Domain Adaptation": "Transfer Learning",
|
| 226 |
+
"Domain Generalization": "Domain Generalization",
|
| 227 |
+
"Drug Discovery": "Drug Discovery",
|
| 228 |
+
"Dynamical Systems": "Dynamical Systems",
|
| 229 |
+
"Edge Computing": "Quantization",
|
| 230 |
+
"Efficiency Optimization": "Context Compression",
|
| 231 |
+
"Embodied AI": "3D Mapping",
|
| 232 |
+
"Embodied Agents": "3D Mapping",
|
| 233 |
+
"Empirical Evaluation": "Active Learning",
|
| 234 |
+
"Empirical Risk Minimization": "Domain Generalization",
|
| 235 |
+
"Energy Efficiency": "Energy Efficiency",
|
| 236 |
+
"Ensemble Learning": "Uncertainty Estimation",
|
| 237 |
+
"Ensemble Methods": "Uncertainty Estimation",
|
| 238 |
+
"Equivariant Networks": "Equivariant Networks",
|
| 239 |
+
"Equivariant Neural Networks": "Equivariant Networks",
|
| 240 |
+
"Evolutionary Algorithms": "Evolutionary Algorithms",
|
| 241 |
+
"Explainability": "Model Interpretability",
|
| 242 |
+
"Explainable AI": "Model Interpretability",
|
| 243 |
+
"Exploration Strategies": "Exploration Strategies",
|
| 244 |
+
"Face Recognition": "Algorithmic Fairness",
|
| 245 |
+
"Fairness": "Algorithmic Fairness",
|
| 246 |
+
"Fairness Metrics": "Algorithmic Fairness",
|
| 247 |
+
"Feature Engineering": "Feature Engineering",
|
| 248 |
+
"Federated Learning": "Federated Learning",
|
| 249 |
+
"Few-Shot Learning": "Few-Shot Learning",
|
| 250 |
+
"Few-shot Learning": "Meta-Learning",
|
| 251 |
+
"Flow Networks": "Flow Networks",
|
| 252 |
+
"Game AI": "Game AI",
|
| 253 |
+
"Game Theory": "Multi-Agent Systems",
|
| 254 |
+
"Gaussian Processes": "Kernel Methods",
|
| 255 |
+
"Generalization": "Generalization",
|
| 256 |
+
"Generative Adversarial Networks": "Tabular Data",
|
| 257 |
+
"Generative Models": "Generative Models",
|
| 258 |
+
"Geometric Deep Learning": "Geometric Deep Learning",
|
| 259 |
+
"Goal-Conditioned Policies": "Exploration Strategies",
|
| 260 |
+
"Gradient Descent": "Generalization",
|
| 261 |
+
"Graph Classification": "Robustness",
|
| 262 |
+
"Graph Learning": "Graph Learning",
|
| 263 |
+
"Graph Neural Networks": "Graph Neural Networks",
|
| 264 |
+
"Graph Theory": "Graph Learning",
|
| 265 |
+
"Hallucination Mitigation": "Hallucination Mitigation",
|
| 266 |
+
"Human Feedback": "Human Feedback",
|
| 267 |
+
"Human Motion Generation": "3D Reconstruction",
|
| 268 |
+
"Human-AI Interaction": "Game AI",
|
| 269 |
+
"Human-Computer Interaction": "Human-Computer Interaction",
|
| 270 |
+
"Hyperparameter Optimization": "Hyperparameter Optimization",
|
| 271 |
+
"Image Compression": "Image Compression",
|
| 272 |
+
"Image Enhancement": "Image Restoration",
|
| 273 |
+
"Image Generation": "Image Generation",
|
| 274 |
+
"Image Restoration": "Image Restoration",
|
| 275 |
+
"Image Synthesis": "3D Reconstruction",
|
| 276 |
+
"Imitation Learning": "Imitation Learning",
|
| 277 |
+
"Implicit Regularization": "Implicit Regularization",
|
| 278 |
+
"In-Context Learning": "In-Context Learning",
|
| 279 |
+
"In-context Learning": "In-Context Learning",
|
| 280 |
+
"Incremental Learning": "Catastrophic Forgetting",
|
| 281 |
+
"Inference Efficiency": "Inference Efficiency",
|
| 282 |
+
"Information Retrieval": "Information Retrieval",
|
| 283 |
+
"Instance Segmentation": "Semantic Segmentation",
|
| 284 |
+
"Interpretability": "Model Interpretability",
|
| 285 |
+
"Intrinsic Motivation": "Intrinsic Motivation",
|
| 286 |
+
"Inverse Problems": "Image Restoration",
|
| 287 |
+
"Inverse Reinforcement Learning": "Inverse Reinforcement Learning",
|
| 288 |
+
"Kernel Methods": "Kernel Methods",
|
| 289 |
+
"Knowledge Distillation": "Knowledge Distillation",
|
| 290 |
+
"Knowledge Graphs": "Knowledge Graphs",
|
| 291 |
+
"Label Noise": "Data Augmentation",
|
| 292 |
+
"Language Model Alignment": "Language Model Alignment",
|
| 293 |
+
"Language Models": "Large Language Models",
|
| 294 |
+
"Large Language Models": "Large Language Models",
|
| 295 |
+
"Latent Variable Models": "Causal Discovery",
|
| 296 |
+
"Machine Learning Security": "Adversarial Machine Learning",
|
| 297 |
+
"Machine Unlearning": "Machine Unlearning",
|
| 298 |
+
"Manifold Learning": "Geometric Deep Learning",
|
| 299 |
+
"Markov Decision Processes": "Markov Decision Processes",
|
| 300 |
+
"Masked Image Modeling": "Masked Image Modeling",
|
| 301 |
+
"Mathematical Reasoning": "Mathematical Reasoning",
|
| 302 |
+
"Membership Inference": "Machine Unlearning",
|
| 303 |
+
"Memory Efficiency": "Inference Efficiency",
|
| 304 |
+
"Meta-Learning": "Meta-Learning",
|
| 305 |
+
"Metric Learning": "Metric Learning",
|
| 306 |
+
"Mixed-Integer Linear Programming": "Combinatorial Optimization",
|
| 307 |
+
"Model Aggregation": "Privacy",
|
| 308 |
+
"Model Alignment": "Language Model Alignment",
|
| 309 |
+
"Model Compression": "Model Compression",
|
| 310 |
+
"Model Editing": "Model Editing",
|
| 311 |
+
"Model Efficiency": "Model Efficiency",
|
| 312 |
+
"Model Evaluation": "Model Evaluation",
|
| 313 |
+
"Model Interpretability": "Model Interpretability",
|
| 314 |
+
"Model Merging": "Model Editing",
|
| 315 |
+
"Model Optimization": "Quantization",
|
| 316 |
+
"Model Pruning": "Model Pruning",
|
| 317 |
+
"Model Robustness": "Adversarial Attacks",
|
| 318 |
+
"Model Training": "Knowledge Distillation",
|
| 319 |
+
"Model-Based Methods": "Model-Based Methods",
|
| 320 |
+
"Multi-Agent Reinforcement Learning": "Multi-Agent Systems",
|
| 321 |
+
"Multi-Agent Systems": "Multi-Agent Systems",
|
| 322 |
+
"Multi-Objective Optimization": "Multi-Objective Optimization",
|
| 323 |
+
"Multi-Task Learning": "Model Editing",
|
| 324 |
+
"Multi-modal Learning": "Multi-modal Learning",
|
| 325 |
+
"Multi-objective Optimization": "Multi-Objective Optimization",
|
| 326 |
+
"Multimodal Learning": "Benchmarking",
|
| 327 |
+
"Multimodal Models": "Benchmarking",
|
| 328 |
+
"Nearest Neighbors": "Feature Engineering",
|
| 329 |
+
"Neural Architecture Search": "Neural Architecture Search",
|
| 330 |
+
"Neural Decoding": "Convolutional Neural Networks",
|
| 331 |
+
"Neural Fields": "Neural Fields",
|
| 332 |
+
"Neural Network Architecture": "Convolutional Neural Networks",
|
| 333 |
+
"Neural Network Optimization": "Model Pruning",
|
| 334 |
+
"Neural Network Pruning": "Model Pruning",
|
| 335 |
+
"Neural Network Training": "Biologically Plausible Learning",
|
| 336 |
+
"Neural Networks": "Neural Networks",
|
| 337 |
+
"Neural ODEs": "Dynamical Systems",
|
| 338 |
+
"Neural Operators": "Partial Differential Equations",
|
| 339 |
+
"Neural Radiance Fields": "Neural Fields",
|
| 340 |
+
"Neural Rendering": "3D Reconstruction",
|
| 341 |
+
"Neural Representation": "Convolutional Neural Networks",
|
| 342 |
+
"Neuromorphic Computing": "Energy Efficiency",
|
| 343 |
+
"Node Classification": "Graph Learning",
|
| 344 |
+
"Non-IID Data": "Communication Efficiency",
|
| 345 |
+
"Normalizing Flows": "Image Generation",
|
| 346 |
+
"Novel View Synthesis": "3D Reconstruction",
|
| 347 |
+
"Object Detection": "Semantic Segmentation",
|
| 348 |
+
"Offline Learning": "Policy Optimization",
|
| 349 |
+
"Online Learning": "Online Learning",
|
| 350 |
+
"Open Vocabulary Models": "Open Vocabulary Models",
|
| 351 |
+
"Optimal Transport": "Metric Learning",
|
| 352 |
+
"Optimization": "Optimization",
|
| 353 |
+
"Out-of-Distribution Detection": "Out-of-Distribution Detection",
|
| 354 |
+
"Out-of-distribution Detection": "Out-of-Distribution Detection",
|
| 355 |
+
"Parameter Efficiency": "Parameter Efficiency",
|
| 356 |
+
"Partial Differential Equations": "Partial Differential Equations",
|
| 357 |
+
"Physics Simulation": "Neural Fields",
|
| 358 |
+
"Physics-Informed Learning": "Partial Differential Equations",
|
| 359 |
+
"Physics-Informed Neural Networks": "Partial Differential Equations",
|
| 360 |
+
"Planning": "Planning",
|
| 361 |
+
"Point Cloud Processing": "Unsupervised Learning",
|
| 362 |
+
"Policy Gradient Methods": "Policy Gradient Methods",
|
| 363 |
+
"Policy Optimization": "Policy Optimization",
|
| 364 |
+
"Pose Estimation": "3D Pose Estimation",
|
| 365 |
+
"Pre-trained Language Models": "Open Vocabulary Models",
|
| 366 |
+
"Preference Learning": "Human Feedback",
|
| 367 |
+
"Privacy": "Privacy",
|
| 368 |
+
"Privacy Attacks": "Adversarial Attacks",
|
| 369 |
+
"Privacy-Preserving Machine Learning": "Differential Privacy",
|
| 370 |
+
"Program Synthesis": "Code Generation",
|
| 371 |
+
"Prompt Engineering": "Reasoning",
|
| 372 |
+
"Prompt Learning": "Few-Shot Learning",
|
| 373 |
+
"Prompt Tuning": "Few-Shot Learning",
|
| 374 |
+
"Protein Design": "Drug Discovery",
|
| 375 |
+
"Pseudo-Labeling": "Batch Normalization",
|
| 376 |
+
"Quantization": "Quantization",
|
| 377 |
+
"Quantum Algorithms": "Quantum Algorithms",
|
| 378 |
+
"Quantum Computing": "Quantum Algorithms",
|
| 379 |
+
"Quantum Machine Learning": "Quantum Algorithms",
|
| 380 |
+
"Quantum Neural Networks": "Quantum Algorithms",
|
| 381 |
+
"Question Answering": "Knowledge Graphs",
|
| 382 |
+
"Rate-Distortion Optimization": "Image Compression",
|
| 383 |
+
"Reasoning": "Reasoning",
|
| 384 |
+
"Recommendation Systems": "Collaborative Filtering",
|
| 385 |
+
"Recommender Systems": "Collaborative Filtering",
|
| 386 |
+
"Recurrent Neural Networks": "Recurrent Neural Networks",
|
| 387 |
+
"Regret Minimization": "Online Learning",
|
| 388 |
+
"Reinforcement Learning": "Reinforcement Learning",
|
| 389 |
+
"Rendering": "3D Reconstruction",
|
| 390 |
+
"Representation Learning": "Representation Learning",
|
| 391 |
+
"Retrieval-Augmented Generation": "Information Retrieval",
|
| 392 |
+
"Reward Design": "Autonomous Agents",
|
| 393 |
+
"Reward Models": "Human Feedback",
|
| 394 |
+
"Riemannian Manifolds": "Geometric Deep Learning",
|
| 395 |
+
"Robotic Manipulation": "Imitation Learning",
|
| 396 |
+
"Robotics": "Imitation Learning",
|
| 397 |
+
"Robustness": "Robustness",
|
| 398 |
+
"Rule Learning": "Rule Learning",
|
| 399 |
+
"Safe Exploration": "Markov Decision Processes",
|
| 400 |
+
"Safety Constraints": "Markov Decision Processes",
|
| 401 |
+
"Safety-Critical Systems": "Markov Decision Processes",
|
| 402 |
+
"Sample Complexity": "Policy Gradient Methods",
|
| 403 |
+
"Sample Efficiency": "Sample Efficiency",
|
| 404 |
+
"Sampling Methods": "Flow Networks",
|
| 405 |
+
"Sampling Techniques": "Image Generation",
|
| 406 |
+
"Scalability": "Bayesian Optimization",
|
| 407 |
+
"Scaling Laws": "Hyperparameter Optimization",
|
| 408 |
+
"Scientific Discovery": "Scientific Discovery",
|
| 409 |
+
"Security": "Robustness",
|
| 410 |
+
"Self-Supervised Learning": "Self-Supervised Learning",
|
| 411 |
+
"Self-supervised Learning": "Self-Supervised Learning",
|
| 412 |
+
"Semantic Segmentation": "Semantic Segmentation",
|
| 413 |
+
"Semi-supervised Learning": "Data Augmentation",
|
| 414 |
+
"Sequence Generation": "Sequence Generation",
|
| 415 |
+
"Sequence Learning": "Recurrent Neural Networks",
|
| 416 |
+
"Sequence Modeling": "Policy Optimization",
|
| 417 |
+
"Sparse Learning": "Implicit Regularization",
|
| 418 |
+
"Sparse Reward Environments": "Intrinsic Motivation",
|
| 419 |
+
"Sparse Rewards": "Intrinsic Motivation",
|
| 420 |
+
"Spectral Methods": "Graph Learning",
|
| 421 |
+
"Speech Enhancement": "Audio Processing",
|
| 422 |
+
"Speech Processing": "Speech Processing",
|
| 423 |
+
"Speech Recognition": "Speech Processing",
|
| 424 |
+
"Speech Separation": "Audio Processing",
|
| 425 |
+
"Speech Synthesis": "Speech Processing",
|
| 426 |
+
"Spiking Neural Networks": "Energy Efficiency",
|
| 427 |
+
"Spurious Correlations": "Domain Generalization",
|
| 428 |
+
"State Space Models": "Recurrent Neural Networks",
|
| 429 |
+
"Surrogate Modeling": "Partial Differential Equations",
|
| 430 |
+
"Survival Analysis": "Conformal Prediction",
|
| 431 |
+
"Symbolic Reasoning": "Rule Learning",
|
| 432 |
+
"Symbolic Regression": "Scientific Discovery",
|
| 433 |
+
"Tabular Data": "Tabular Data",
|
| 434 |
+
"Task Arithmetic": "Model Editing",
|
| 435 |
+
"Task Planning": "Planning",
|
| 436 |
+
"Teacher-Student Models": "Transfer Learning",
|
| 437 |
+
"Test-Time Adaptation": "Batch Normalization",
|
| 438 |
+
"Text Generation": "Sequence Generation",
|
| 439 |
+
"Text-to-Image Generation": "3D Reconstruction",
|
| 440 |
+
"Text-to-Image Synthesis": "3D Reconstruction",
|
| 441 |
+
"Text-to-Speech": "Speech Processing",
|
| 442 |
+
"Text-to-Video Generation": "3D Reconstruction",
|
| 443 |
+
"Theorem Proving": "Mathematical Reasoning",
|
| 444 |
+
"Theoretical Analysis": "Theoretical Analysis",
|
| 445 |
+
"Time Series Analysis": "Time Series Analysis",
|
| 446 |
+
"Time Series Forecasting": "Model Efficiency",
|
| 447 |
+
"Training Stability": "Training Stability",
|
| 448 |
+
"Trajectory Prediction": "Autonomous Driving",
|
| 449 |
+
"Transfer Learning": "Transfer Learning",
|
| 450 |
+
"Transferability": "Adversarial Attacks",
|
| 451 |
+
"Transformer Models": "Model Efficiency",
|
| 452 |
+
"Transformers": "Model Efficiency",
|
| 453 |
+
"Type Inference": "Type Inference",
|
| 454 |
+
"Uncertainty Estimation": "Uncertainty Estimation",
|
| 455 |
+
"Uncertainty Quantification": "Conformal Prediction",
|
| 456 |
+
"Unsupervised Learning": "Unsupervised Learning",
|
| 457 |
+
"Variational Autoencoders": "Disentangled Representations",
|
| 458 |
+
"Variational Inference": "Kernel Methods",
|
| 459 |
+
"Video Generation": "3D Reconstruction",
|
| 460 |
+
"Video Synthesis": "3D Reconstruction",
|
| 461 |
+
"Video Understanding": "Benchmarking",
|
| 462 |
+
"View Synthesis": "3D Reconstruction",
|
| 463 |
+
"Vision Transformers": "Vision Transformers",
|
| 464 |
+
"Vision-Language Models": "Vision-Language Models",
|
| 465 |
+
"Visual Reasoning": "Benchmarking",
|
| 466 |
+
"Wasserstein Distance": "Metric Learning",
|
| 467 |
+
"Watermarking": "Watermarking",
|
| 468 |
+
"Web Automation": "Human-Computer Interaction",
|
| 469 |
+
"Zero-Shot Learning": "Zero-Shot Learning",
|
| 470 |
+
"Zero-Sum Games": "Multi-Agent Systems"
|
| 471 |
+
},
|
| 472 |
+
"stats": {
|
| 473 |
+
"canonical_count": 128,
|
| 474 |
+
"paper_count_summary": {
|
| 475 |
+
"max": 850,
|
| 476 |
+
"median": 100.5,
|
| 477 |
+
"min": 16
|
| 478 |
+
},
|
| 479 |
+
"raw_count": 319,
|
| 480 |
+
"stoplist_count": 24
|
| 481 |
+
},
|
| 482 |
+
"stoplist": [
|
| 483 |
+
"Adversarial Machine Learning",
|
| 484 |
+
"Bayesian Optimization",
|
| 485 |
+
"Benchmarking",
|
| 486 |
+
"Combinatorial Optimization",
|
| 487 |
+
"Convolutional Neural Networks",
|
| 488 |
+
"Deep Learning",
|
| 489 |
+
"Diffusion Models",
|
| 490 |
+
"Generative Models",
|
| 491 |
+
"Geometric Deep Learning",
|
| 492 |
+
"Graph Learning",
|
| 493 |
+
"Graph Neural Networks",
|
| 494 |
+
"Hyperparameter Optimization",
|
| 495 |
+
"Large Language Models",
|
| 496 |
+
"Model Efficiency",
|
| 497 |
+
"Multi-Objective Optimization",
|
| 498 |
+
"Neural Architecture Search",
|
| 499 |
+
"Neural Fields",
|
| 500 |
+
"Neural Networks",
|
| 501 |
+
"Optimization",
|
| 502 |
+
"Policy Optimization",
|
| 503 |
+
"Recurrent Neural Networks",
|
| 504 |
+
"Reinforcement Learning",
|
| 505 |
+
"Representation Learning",
|
| 506 |
+
"Vision-Language Models"
|
| 507 |
+
]
|
| 508 |
+
}
|
Paper-KG-Pipeline/requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Knowledge Graph Pipeline Dependencies
|
| 2 |
+
|
| 3 |
+
# 核心依赖
|
| 4 |
+
networkx>=2.8
|
| 5 |
+
numpy>=1.21
|
| 6 |
+
scikit-learn>=1.0
|
| 7 |
+
requests>=2.28
|
| 8 |
+
tqdm>=4.67.2
|
| 9 |
+
openai>=1.0
|
| 10 |
+
|
| 11 |
+
# KG 构建 (generate_clusters.py)
|
| 12 |
+
sentence-transformers>=2.2
|
| 13 |
+
umap-learn>=0.5
|
| 14 |
+
hdbscan>=0.8
|
| 15 |
+
scipy>=1.9
|
| 16 |
+
# (duplicate entry removed: scipy>=1.9 is already listed above)
|
| 17 |
+
gradio>=4.0
|
Paper-KG-Pipeline/scripts/__pycache__/recall_system.cpython-313.pyc
ADDED
|
Binary file (705 Bytes). View file
|
|
|
Paper-KG-Pipeline/scripts/build_edges.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
import runpy
from pathlib import Path

# Compatibility wrapper: keep `scripts/build_edges.py` working after the
# implementation moved to `scripts/tools/build_edges.py`.
_target = Path(__file__).parent / "tools" / "build_edges.py"
runpy.run_path(str(_target), run_name="__main__")
|
Paper-KG-Pipeline/scripts/build_entity_v3.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
import runpy
from pathlib import Path

# Compatibility wrapper: keep `scripts/build_entity_v3.py` working after the
# implementation moved to `scripts/tools/build_entity_v3.py`.
_target = Path(__file__).parent / "tools" / "build_entity_v3.py"
runpy.run_path(str(_target), run_name="__main__")
|
Paper-KG-Pipeline/scripts/demo_pipeline.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
import runpy
from pathlib import Path

# Compatibility wrapper: keep `scripts/demo_pipeline.py` working after the
# implementation moved to `scripts/demos/demo_pipeline.py`.
_target = Path(__file__).parent / "demos" / "demo_pipeline.py"
runpy.run_path(str(_target), run_name="__main__")
|
Paper-KG-Pipeline/scripts/demos/demo_pipeline.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Idea2Story Pipeline 演示脚本
|
| 3 |
+
|
| 4 |
+
展示如何快速使用 Pipeline,包含:
|
| 5 |
+
1. 基础使用
|
| 6 |
+
2. 自定义配置
|
| 7 |
+
3. 批量处理
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import sys
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Resolve key directories relative to this file and make the parent
# `scripts/` directory importable so sibling modules can be imported.
SCRIPT_DIR = Path(__file__).resolve().parent
SCRIPTS_DIR = SCRIPT_DIR.parent
PROJECT_ROOT = SCRIPTS_DIR.parent
OUTPUT_DIR = PROJECT_ROOT / "output"

sys.path.insert(0, str(SCRIPTS_DIR))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ===================== 示例 1: 基础使用 =====================
|
| 24 |
+
def demo_basic_usage():
    """Example 1: basic end-to-end usage.

    Loads the pattern/paper node files, fakes a recall result by taking
    the first 10 patterns with decaying scores, runs the Idea2Story
    pipeline on a hard-coded idea, and prints a short summary.

    Returns:
        dict: the pipeline result; visible keys used here are 'success',
        'iterations' and 'final_story'.
    """
    print("\n" + "=" * 80)
    print("📚 示例 1: 基础使用")
    print("=" * 80)

    from idea2story_pipeline import Idea2StoryPipeline
    from simple_recall_demo import (
        NODES_PATTERN, NODES_PAPER
    )

    # The demo idea (contrastive learning for few-shot text classification).
    user_idea = "使用对比学习改进小样本文本分类,并在医疗领域数据集上验证"

    print(f"\n【用户 Idea】\n{user_idea}\n")

    # Load node data produced by the KG pipeline.
    print("📂 加载数据...")
    with open(NODES_PATTERN, 'r', encoding='utf-8') as f:
        patterns_data = json.load(f)
    with open(NODES_PAPER, 'r', encoding='utf-8') as f:
        papers_data = json.load(f)

    # Simulated recall result (simplified).
    print("🔍 运行召回...")
    pattern_map = {p['pattern_id']: p for p in patterns_data}

    # Simplification: just use the first 10 patterns, scores 0.8, 0.75, ...
    recalled_patterns = [
        (pid, pattern_map[pid], 0.8 - i * 0.05)
        for i, pid in enumerate(list(pattern_map.keys())[:10])
    ]

    print(f" 召回 {len(recalled_patterns)} 个 Pattern\n")

    # Create the pipeline.
    print("🚀 启动 Pipeline...")
    pipeline = Idea2StoryPipeline(user_idea, recalled_patterns, papers_data)

    # Run it.
    result = pipeline.run()

    # Report the outcome.
    print("\n" + "=" * 80)
    print("📊 执行结果")
    print("=" * 80)
    print(f"✅ 状态: {'成功' if result['success'] else '需审核'}")
    print(f"📈 迭代次数: {result['iterations']}")
    print(f"📝 最终标题: {result['final_story']['title']}")

    return result
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ===================== 示例 2: 自定义配置 =====================
|
| 78 |
+
def demo_custom_config():
    """Example 2: temporarily overriding pipeline configuration.

    Demonstrates that ``PipelineConfig`` attributes are class-level, so a
    change affects every pipeline instance.  The original values are now
    restored in a ``finally`` block, so an exception raised while the
    overrides are active can no longer leave the global config mutated.
    """
    print("\n" + "=" * 80)
    print("📚 示例 2: 自定义配置")
    print("=" * 80)

    from idea2story_pipeline import PipelineConfig

    # Describe the overrides (plain strings — the f-prefix was unnecessary,
    # these literals contain no placeholders).
    print("\n🔧 修改配置:")
    print(" PASS_SCORE: 6.0 → 5.0(降低通过门槛)")
    print(" MAX_REFINE_ITERATIONS: 3 → 5(增加迭代次数)")
    print(" COLLISION_THRESHOLD: 0.75 → 0.85(放宽查重)")

    # Snapshot the current values before mutating anything.
    original_pass_score = PipelineConfig.PASS_SCORE
    original_max_iter = PipelineConfig.MAX_REFINE_ITERATIONS
    original_threshold = PipelineConfig.COLLISION_THRESHOLD

    try:
        PipelineConfig.PASS_SCORE = 5.0
        PipelineConfig.MAX_REFINE_ITERATIONS = 5
        PipelineConfig.COLLISION_THRESHOLD = 0.85

        print("\n💡 提示: 修改后的配置会应用到所有 Pipeline 实例")
    finally:
        # Always restore the original configuration.
        PipelineConfig.PASS_SCORE = original_pass_score
        PipelineConfig.MAX_REFINE_ITERATIONS = original_max_iter
        PipelineConfig.COLLISION_THRESHOLD = original_threshold
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ===================== 示例 3: 批量处理 =====================
|
| 109 |
+
def demo_batch_processing():
    """Example 3: batch-processing multiple ideas.

    Only prints the idea list and a ready-to-copy code sample; it does not
    run the pipeline itself (everything inside the triple-quoted string is
    display-only sample code, not executed).
    """
    print("\n" + "=" * 80)
    print("📚 示例 3: 批量处理")
    print("=" * 80)

    # Several demo ideas.
    ideas = [
        "使用知识蒸馏压缩BERT模型用于移动端部署",
        "基于强化学习的对话系统策略优化",
        "多模态融合用于情感分析任务"
    ]

    print(f"\n📋 待处理 Idea 列表: {len(ideas)} 个")
    for i, idea in enumerate(ideas, 1):
        print(f" {i}. {idea[:40]}...")

    # NOTE: the block below is a literal string printed for the user to copy.
    print("\n💡 批量处理示例代码:")
    print("""
results = []
for i, idea in enumerate(ideas):
    print(f"\\n处理 {i+1}/{len(ideas)}: {idea[:30]}...")

    # 运行召回
    recalled_patterns = run_recall(idea)

    # 运行 Pipeline
    pipeline = Idea2StoryPipeline(idea, recalled_patterns, papers)
    result = pipeline.run()

    # 保存结果
    results.append(result)
    with open(f"output/story_{i+1}.json", 'w') as f:
        json.dump(result['final_story'], f, ensure_ascii=False, indent=2)

print(f"\\n✅ 批量处理完成,成功 {sum(r['success'] for r in results)} 个")
""")
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ===================== 示例 4: 查看中间结果 =====================
|
| 149 |
+
def demo_inspect_intermediate():
    """Example 4: inspect intermediate pipeline results.

    Reads ``output/pipeline_result.json`` (written by a prior run of
    idea2story_pipeline.py) and prints the execution history: selected
    patterns, review rounds, refinements and collision check.
    Assumes the keys used below exist in that file — TODO confirm against
    the pipeline's writer if the schema changes.
    """
    print("\n" + "=" * 80)
    print("📚 示例 4: 查看中间结果")
    print("=" * 80)

    # Locate the result file from a previous pipeline run.
    result_file = OUTPUT_DIR / "pipeline_result.json"

    if not result_file.exists():
        print("\n⚠️ 未找到 pipeline_result.json")
        print(" 请先运行: python scripts/idea2story_pipeline.py")
        return

    # Load the result.
    with open(result_file, 'r', encoding='utf-8') as f:
        result = json.load(f)

    print(f"\n📊 执行历史分析:")
    print(f" 用户 Idea: {result['user_idea'][:50]}...")
    print(f" 总迭代次数: {result['iterations']}")
    print(f" 最终状态: {'✅ 成功' if result['success'] else '❌ 失败'}")

    print(f"\n📋 选择的 Patterns:")
    for ptype, pid in result['selected_patterns'].items():
        print(f" - {ptype}: {pid}")

    print(f"\n📝 评审历史:")
    review_summary = result['review_summary']
    print(f" 总评审轮数: {review_summary['total_reviews']}")
    print(f" 最终得分: {review_summary['final_score']:.2f}/10")

    print(f"\n🔧 修正历史:")
    refinement_summary = result['refinement_summary']
    print(f" 总修正次数: {refinement_summary['total_refinements']}")
    if refinement_summary['issues_addressed']:
        print(f" 修正的问题: {', '.join(refinement_summary['issues_addressed'])}")

    print(f"\n🔎 查重结果:")
    verification = result['verification_summary']
    print(f" 检测到撞车: {'是' if verification['collision_detected'] else '否'}")
    print(f" 最高相似度: {verification['max_similarity']:.2f}")
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ===================== 示例 5: 导出 Markdown =====================
|
| 194 |
+
def demo_export_markdown():
    """Example 5: export the generated story to Markdown.

    Loads a story JSON, renders it as Markdown, and writes it to
    ``output/final_story.md`` with a short preview.
    """
    print("\n" + "=" * 80)
    print("📚 示例 5: 导出为 Markdown")
    print("=" * 80)

    # NOTE(review): OUTPUT_DIR / "log" looks wrong — output/ contains
    # final_story.json (which has the 'title'/'abstract' keys read below)
    # and log.json; this probably should be "final_story.json". Confirm
    # before changing.
    story_file = OUTPUT_DIR / "log"

    if not story_file.exists():
        print("\n⚠️ 未找到 log")
        print(" 请先运行: python scripts/idea2story_pipeline.py")
        return

    # Load the story dict.
    with open(story_file, 'r', encoding='utf-8') as f:
        story = json.load(f)

    # Render Markdown; chr(10) is '\n' (backslashes are not allowed inside
    # f-string expressions on older Pythons).
    md_content = f"""# {story['title']}

## Abstract

{story['abstract']}

## Problem Definition

{story['problem_definition']}

## Method Skeleton

{story['method_skeleton']}

## Innovation Claims

{chr(10).join([f"- {claim}" for claim in story['innovation_claims']])}

## Experiments Plan

{story['experiments_plan']}

---

*Generated by Idea2Story Pipeline*
"""

    # Save to output/final_story.md.
    md_file = OUTPUT_DIR / "final_story.md"
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(md_content)

    print(f"\n✅ Markdown 已保存到: {md_file}")
    print("\n预览:")
    print("-" * 80)
    print(md_content[:500] + "...")
    print("-" * 80)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# ===================== 主函数 =====================
|
| 253 |
+
def main():
    """Interactive entry point: let the user pick one demo (or all)."""
    print("=" * 80)
    print("🎓 Idea2Story Pipeline 演示")
    print("=" * 80)

    print("\n选择演示:")
    print(" 1. 基础使用(完整流程)")
    print(" 2. 自定义配置")
    print(" 3. 批量处理")
    print(" 4. 查看中间结果")
    print(" 5. 导出 Markdown")
    print(" 0. 运行所有演示")

    choice = input("\n请输入选项 (0-5): ").strip()

    # Map each menu option to the demo(s) it runs; option '0' runs
    # everything except the time-consuming basic demo.
    dispatch = {
        '1': (demo_basic_usage,),
        '2': (demo_custom_config,),
        '3': (demo_batch_processing,),
        '4': (demo_inspect_intermediate,),
        '5': (demo_export_markdown,),
        '0': (demo_custom_config, demo_batch_processing,
              demo_inspect_intermediate, demo_export_markdown),
    }

    selected = dispatch.get(choice)
    if selected is None:
        print("\n⚠️ 无效选项")
    else:
        for demo in selected:
            demo()

    print("\n" + "=" * 80)
    print("✅ 演示完成!")
    print("=" * 80)
    print("\n💡 提示:")
    print(" - 运行完整流程: python scripts/idea2story_pipeline.py")
    print(" - 查看文档: docs/QUICK_START_PIPELINE.md")
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
# Script entry point: run the menu, swallow Ctrl-C gracefully, and dump a
# traceback for any other unhandled error.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n⚠️ 用户中断")
    except Exception as e:
        print(f"\n❌ 错误: {e}")
        import traceback
        traceback.print_exc()
|
Paper-KG-Pipeline/scripts/demos/run_pipeline.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
一键运行完整Pipeline
|
| 3 |
+
从数据抽取到知识图谱构建的完整流程
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import subprocess
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Directory of this script (scripts/demos) and the parent scripts/ dir,
# used to resolve the step scripts launched below.
SCRIPT_DIR = Path(__file__).resolve().parent
SCRIPTS_DIR = SCRIPT_DIR.parent
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def run_step(step_num: int, name: str, script: str):
    """Run one pipeline step as a child process.

    Args:
        step_num: display number of the step.
        name: human-readable step name (printed in the banner).
        script: script filename relative to the scripts/ directory.

    Returns:
        bool: True when the script exists and exits with status 0.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"📌 Step {step_num}: {name}")
    print(f"{banner}")

    target = SCRIPTS_DIR / script
    if not target.exists():
        print(f"❌ 脚本不存在: {target}")
        return False

    # Run with the same interpreter, from the scripts/ directory.
    proc = subprocess.run([sys.executable, str(target)], cwd=str(SCRIPTS_DIR))

    if proc.returncode == 0:
        print(f"✅ Step {step_num} 完成")
        return True

    print(f"❌ Step {step_num} 失败")
    return False
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def main():
    """Drive the remaining pipeline steps one after another."""
    divider = "=" * 60
    print(divider)
    print("🚀 知识图谱Pipeline - 一键运行")
    print(divider)

    # Earlier steps (extraction, pattern clustering, entity building)
    # already ran offline; their outputs live under data/ and output/.
    # Only the recall step remains to be executed here.
    steps = [
        (3, "运行召回", "simple_recall_demo.py"),
    ]

    print("\n📋 将执行以下步骤:")
    for line in (
        " 1. 数据抽取 (已完成 - 结果在 data/)",
        " 2. Pattern聚类 (已完成 - 结果在 output/patterns_structured.json)",
        " 3. 构建知识图谱(已完成 - 结果在 output/nodes_xxx.json)",
        " 4. idea召回",
    ):
        print(line)

    for num, label, filename in steps:
        if not run_step(num, label, filename):
            print(f"\n❌ Pipeline在Step {num}中断")
            sys.exit(1)

    print("\n" + divider)
    print("🎉 Pipeline完成!")
    print(divider)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Script entry point.
if __name__ == '__main__':
    main()
|
Paper-KG-Pipeline/scripts/demos/simple_recall_demo.py
ADDED
|
@@ -0,0 +1,527 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
简化的召回系统Demo - 单个测试用例 (V3版本)
|
| 3 |
+
|
| 4 |
+
使用方法:
|
| 5 |
+
python scripts/simple_recall_demo.py "你的Idea描述"
|
| 6 |
+
|
| 7 |
+
示例:
|
| 8 |
+
python scripts/simple_recall_demo.py "使用Transformer进行文本分类"
|
| 9 |
+
|
| 10 |
+
V3版本更新:
|
| 11 |
+
- 适配V3节点结构 (Paper.idea为字符串,非嵌套字典)
|
| 12 |
+
- 路径1直接使用Idea.pattern_ids,无需通过Paper中转
|
| 13 |
+
- Paper通过review_stats获取质量分数,支持兼容旧结构
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import pickle
|
| 19 |
+
import sys
|
| 20 |
+
import time
|
| 21 |
+
from collections import defaultdict
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
from tqdm import tqdm
|
| 25 |
+
|
| 26 |
+
# 提前加载 .env(确保配置读取前生效)
|
| 27 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 28 |
+
SCRIPTS_DIR = SCRIPT_DIR.parent
|
| 29 |
+
PROJECT_ROOT = SCRIPTS_DIR.parent
|
| 30 |
+
REPO_ROOT = PROJECT_ROOT.parent
|
| 31 |
+
SRC_DIR = PROJECT_ROOT / "src"
|
| 32 |
+
if str(SRC_DIR) not in sys.path:
|
| 33 |
+
sys.path.insert(0, str(SRC_DIR))
|
| 34 |
+
if str(SCRIPTS_DIR) not in sys.path:
|
| 35 |
+
sys.path.insert(0, str(SCRIPTS_DIR))
|
| 36 |
+
|
| 37 |
+
# Best-effort: load .env early so configuration (e.g. API keys) is in the
# environment before any module reads it; failure is deliberately non-fatal.
try:
    from idea2paper.infra.dotenv import load_dotenv
    _DOTENV_STATUS = load_dotenv(REPO_ROOT / ".env", override=False)
except Exception:
    _DOTENV_STATUS = None
|
| 42 |
+
|
| 43 |
+
import numpy as np
|
| 44 |
+
import requests
|
| 45 |
+
|
| 46 |
+
from pipeline.run_context import get_logger
|
| 47 |
+
# ===================== Path configuration =====================
OUTPUT_DIR = PROJECT_ROOT / "output"

NODES_IDEA = OUTPUT_DIR / "nodes_idea.json"
NODES_PATTERN = OUTPUT_DIR / "nodes_pattern.json"
NODES_DOMAIN = OUTPUT_DIR / "nodes_domain.json"
NODES_PAPER = OUTPUT_DIR / "nodes_paper.json"
GRAPH_FILE = OUTPUT_DIR / "knowledge_graph_v2.gpickle"

# ===================== Recall parameters =====================
TOP_K_IDEAS = 10
TOP_K_PATTERNS_PATH1 = 10  # path 1: top-K patterns kept (primary channel)

TOP_K_DOMAINS = 5
TOP_K_PATTERNS_PATH2 = 5  # path 2: top-K patterns kept (auxiliary channel)

TOP_K_PAPERS = 20
TOP_K_PATTERNS_PATH3 = 10  # path 3: top-K patterns kept (primary channel)

FINAL_TOP_K = 10

PATH1_WEIGHT = 0.4  # similar ideas - important
PATH2_WEIGHT = 0.2  # related domains - auxiliary
PATH3_WEIGHT = 0.4  # similar papers - important

USE_EMBEDDING = True  # use embeddings for similarity (recommended)

# Two-stage recall optimization (coarse ranking + fine ranking)
TWO_STAGE_RECALL = True  # enable two-stage recall (large speedup)
COARSE_RECALL_SIZE = 100  # coarse recall size (fast Jaccard pre-filter)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ===================== 工具函数 =====================
|
| 80 |
+
def compute_similarity(text1, text2):
    """Similarity between two texts via the configured backend.

    Dispatches to the embedding-based cosine similarity when
    USE_EMBEDDING is set, otherwise to the bag-of-words Jaccard score.
    """
    backend = (
        compute_embedding_similarity
        if USE_EMBEDDING
        else compute_jaccard_similarity
    )
    return backend(text1, text2)
|
| 86 |
+
|
| 87 |
+
def compute_jaccard_similarity(text1, text2):
    """Bag-of-words Jaccard similarity (fast but coarse).

    Lower-cases and whitespace-tokenizes both texts; returns
    |A ∩ B| / |A ∪ B|, or 0.0 when either token set is empty.
    """
    words_a, words_b = (set(t.lower().split()) for t in (text1, text2))

    if words_a and words_b:
        return len(words_a & words_b) / len(words_a | words_b)
    return 0.0
|
| 98 |
+
|
| 99 |
+
def compute_embedding_similarity(text1, text2):
    """Cosine similarity over API embeddings (more accurate).

    Falls back to the Jaccard score when either embedding is unavailable
    (missing API key or API failure).
    """
    vec_a = get_embedding(text1)
    vec_b = get_embedding(text2)

    # Degraded mode: cheap lexical similarity instead of embeddings.
    if vec_a is None or vec_b is None:
        return compute_jaccard_similarity(text1, text2)

    vec_a, vec_b = np.array(vec_a), np.array(vec_b)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / denom)
|
| 112 |
+
|
| 113 |
+
# Process-wide embedding cache (text -> embedding vector), plus one-shot
# flags so degraded-mode warnings are printed at most once per process.
_embedding_cache = {}
_embedding_warning_shown = False
_embedding_error_shown = False
|
| 116 |
+
|
| 117 |
+
def get_embedding(text, max_retries=3):
    """Fetch a text embedding from the SiliconFlow API.

    Results are memoized in the module-level ``_embedding_cache``.  Input
    is truncated to 2000 characters before being sent.  Returns None (so
    callers can fall back to Jaccard) when the API key is missing or all
    retries fail; warnings are printed only once per process.

    Args:
        text: text to embed.
        max_retries: number of HTTP attempts before giving up.

    Returns:
        list[float] embedding vector, or None on failure.
    """
    global _embedding_warning_shown, _embedding_error_shown
    logger = get_logger()

    # Cache hit: avoid a network round-trip.
    if text in _embedding_cache:
        return _embedding_cache[text]

    api_key = os.environ.get('SILICONFLOW_API_KEY', '')

    if not api_key:
        # No key configured: warn once, then let the caller degrade.
        if not _embedding_warning_shown:
            print(" ⚠️ 未设置SILICONFLOW_API_KEY,降级到Jaccard相似度")
            _embedding_warning_shown = True
        return None

    url = "https://api.siliconflow.cn/v1/embeddings"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "Qwen/Qwen3-Embedding-8B",
        "input": text[:2000]
    }

    for attempt in range(max_retries):
        try:
            start_ts = time.time()
            response = requests.post(url, headers=headers, json=payload, timeout=10)
            response.raise_for_status()
            result = response.json()
            embedding = result['data'][0]['embedding']
            _embedding_cache[text] = embedding
            # Record the successful call (latency only; no payload echo).
            if logger:
                logger.log_embedding_call(
                    request={
                        "provider": "siliconflow",
                        "url": url,
                        "model": payload["model"],
                        "input_preview": text[:2000],
                        "timeout": 10
                    },
                    response={
                        "ok": True,
                        "latency_ms": int((time.time() - start_ts) * 1000)
                    }
                )
            return embedding
        except Exception as e:
            if attempt < max_retries - 1:
                # Brief backoff before retrying.
                time.sleep(0.5)
            else:
                # Final attempt failed: warn once, log the failure, give up.
                if not _embedding_error_shown:
                    print(f" ⚠️ Embedding API调用失败: {e},降级到Jaccard相似度")
                    _embedding_error_shown = True
                if logger:
                    logger.log_embedding_call(
                        request={
                            "provider": "siliconflow",
                            "url": url,
                            "model": payload["model"],
                            "input_preview": text[:2000],
                            "timeout": 10
                        },
                        response={
                            "ok": False,
                            "latency_ms": 0,
                            "error": str(e)
                        }
                    )
                return None

    return None
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def get_paper_quality(paper):
    """Return a paper's overall quality score, normalized to [0, 1].

    Prefers the new node structure's ``review_stats.avg_score`` (already
    a 0-1 value).  Falls back to the legacy ``reviews`` list, whose
    ``overall_score`` entries may be strings such as "7", "7/10", "7.0",
    or plain numbers.  Returns 0.5 (medium quality) when no usable
    review data is present.

    Args:
        paper: paper node dict.

    Returns:
        float in [0, 1].

    Fixes over the previous version:
    - a legitimate ``avg_score`` of 0.0 is no longer discarded by a
      truthiness check (now an explicit numeric-type check);
    - numeric ``overall_score`` values no longer raise an uncaught
      TypeError on the '/' containment test (coerced via str() first);
    - the plain mean no longer needs a local numpy import.
    """
    # New structure: review_stats.avg_score is already a 0-1 score.
    review_stats = paper.get('review_stats') or {}
    avg = review_stats.get('avg_score')
    if isinstance(avg, (int, float)):
        return float(avg)

    # Fallback: legacy structure (list of raw reviews).
    reviews = paper.get('reviews', [])

    if not reviews:
        return 0.5  # default: medium quality

    # Extract every parsable score.
    scores = []
    for review in reviews:
        raw = review.get('overall_score', '')
        # Scores may be "7", "7/10", "7.0" or a bare number.
        try:
            score_str = str(raw)
            if '/' in score_str:
                score_str = score_str.split('/')[0]
            scores.append(float(score_str.strip()))
        except (ValueError, AttributeError):
            continue

    if not scores:
        return 0.5

    # Average, then map the assumed 1-10 rating range onto [0, 1], clamped.
    avg_score = sum(scores) / len(scores)
    normalized_score = (avg_score - 1) / 9
    return min(max(normalized_score, 0.0), 1.0)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
# ===================== 主函数 =====================
|
| 240 |
+
def main() -> None:
    """Demo entry point for the three-path pattern recall system.

    Pipeline: load node/graph data, then run three independent recall
    paths (similar ideas, related domains, similar papers), fuse their
    pattern scores with fixed weights, and print the top-ranked patterns.
    Reads configuration from module-level constants (file paths, TOP_K_*
    limits, PATH*_WEIGHT, TWO_STAGE_RECALL, USE_EMBEDDING).
    """
    # Get the user's idea from argv, or fall back to a built-in demo query.
    if len(sys.argv) > 1:
        user_idea = " ".join(sys.argv[1:])
    else:
        # user_idea = "使用蒸馏技术完成Transformer跨领域文本分类任务,并在多个数据集上验证效果"
        user_idea = "Research on the Self-Evolution of Intelligent Agents Based on Reflection and Memory"

    print("=" * 80)
    print("🎯 三路召回系统 Demo")
    print("=" * 80)
    print(f"\n【用户Idea】\n{user_idea}\n")

    # Load node tables (JSON) and the knowledge graph (pickled networkx graph).
    print("📂 加载数据...")
    with open(NODES_IDEA, 'r', encoding='utf-8') as f:
        ideas = json.load(f)
    with open(NODES_PATTERN, 'r', encoding='utf-8') as f:
        patterns = json.load(f)
    with open(NODES_DOMAIN, 'r', encoding='utf-8') as f:
        domains = json.load(f)
    with open(NODES_PAPER, 'r', encoding='utf-8') as f:
        papers = json.load(f)
    with open(GRAPH_FILE, 'rb') as f:
        G = pickle.load(f)

    # Build id -> record lookup tables for each node type.
    idea_map = {i['idea_id']: i for i in ideas}
    pattern_map = {p['pattern_id']: p for p in patterns}
    domain_map = {d['domain_id']: d for d in domains}
    paper_map = {p['paper_id']: p for p in papers}

    print(f" ✓ Idea: {len(ideas)}, Pattern: {len(patterns)}, Domain: {len(domains)}, Paper: {len(papers)}")
    print(f" ✓ 图谱: {G.number_of_nodes()} 节点, {G.number_of_edges()} 边\n")

    # ===================== Path 1: similar-idea recall =====================
    print("🔍 [路径1] 相似Idea召回...")

    # Two-stage recall optimization: cheap Jaccard prefilter, then
    # embedding-based reranking of the surviving candidates.
    if TWO_STAGE_RECALL and USE_EMBEDDING:
        print(f" [粗排] 使用Jaccard快速筛选Top-{COARSE_RECALL_SIZE}...")
        coarse_similarities = []
        for idea in ideas:
            sim = compute_jaccard_similarity(user_idea, idea['description'])
            if sim > 0:
                coarse_similarities.append((idea['idea_id'], sim))

        coarse_similarities.sort(key=lambda x: x[1], reverse=True)
        candidates = coarse_similarities[:COARSE_RECALL_SIZE]

        print(f" [精排] 使用Embedding重排Top-{TOP_K_IDEAS}...")
        fine_similarities = []
        for idea_id, _ in candidates:
            idea = idea_map[idea_id]
            sim = compute_embedding_similarity(user_idea, idea['description'])
            if sim > 0:
                fine_similarities.append((idea_id, sim))

        fine_similarities.sort(key=lambda x: x[1], reverse=True)
        top_ideas = fine_similarities[:TOP_K_IDEAS]

        print(f" ✓ 粗排{len(coarse_similarities)}个 → 精排{len(candidates)}个 → 最终{len(top_ideas)}个")
    else:
        # Single-stage recall (original logic): one similarity pass.
        similarities = []
        for idea in ideas:
            sim = compute_similarity(user_idea, idea['description'])
            if sim > 0:
                similarities.append((idea['idea_id'], sim))

        similarities.sort(key=lambda x: x[1], reverse=True)
        top_ideas = similarities[:TOP_K_IDEAS]

        print(f" 找到 {len(similarities)} 个相似Idea,选择 Top-{TOP_K_IDEAS}")

    # Accumulate pattern scores: each matched idea votes for its patterns
    # with a weight equal to its similarity to the user idea.
    path1_scores = defaultdict(float)
    for idea_id, similarity in top_ideas:
        idea = idea_map[idea_id]
        # Print strong matches to aid debugging (preview widened to 300 chars).
        if similarity > 0.2:
            print(f" - 匹配 Idea [{idea_id}]: {idea['description'][:300]}... (sim={similarity:.3f})")

        # Path 1 recalls directly from the idea node's pattern_ids list.
        pattern_ids = idea.get('pattern_ids', [])
        for pid in pattern_ids:
            path1_scores[pid] += similarity

    # Sort and keep only the top-K patterns for this path.
    sorted_path1 = sorted(path1_scores.items(), key=lambda x: x[1], reverse=True)
    path1_scores = dict(sorted_path1[:TOP_K_PATTERNS_PATH1])

    print(f" ✓ 召回 {len(sorted_path1)} 个Pattern,保留Top-{TOP_K_PATTERNS_PATH1}\n")

    # ===================== Path 2: domain-related recall =====================
    print("🌍 [路径2] 领域相关性召回...")

    # Walk from the single most similar idea to its domains via
    # 'belongs_to' edges in the graph.
    top_idea = idea_map[top_ideas[0][0]] if top_ideas else None
    domain_scores = []

    if top_idea and G.has_node(top_idea['idea_id']):
        for successor in G.successors(top_idea['idea_id']):
            edge_data = G[top_idea['idea_id']][successor]
            if edge_data.get('relation') == 'belongs_to':
                domain_id = successor
                weight = edge_data.get('weight', 0.5)
                domain_scores.append((domain_id, weight))

    domain_scores.sort(key=lambda x: x[1], reverse=True)
    top_domains = domain_scores[:TOP_K_DOMAINS]

    print(f" 找到 {len(domain_scores)} 个相关Domain,选择 Top-{TOP_K_DOMAINS}")

    path2_scores = defaultdict(float)
    for domain_id, domain_weight in top_domains:
        # Print domain details for debugging.
        domain = domain_map.get(domain_id)
        if domain:
            domain_name = domain.get('name', 'N/A')
            paper_count = domain.get('paper_count', 0)
            sub_domains = domain.get('sub_domains', [])
            sub_domain_str = ', '.join(sub_domains[:5])  # show at most 5 sub-domains
            if len(sub_domains) > 5:
                sub_domain_str += f"... (共{len(sub_domains)}个)"

            print(f" - {domain_id} (名称={domain_name}, 相关度={domain_weight:.3f}, 论文数={paper_count})")
            if sub_domain_str:
                print(f" 子领域: {sub_domain_str}")

        # Patterns that 'work well in' this domain vote with a score of
        # domain weight x effectiveness (floored at 0.1) x confidence.
        for predecessor in G.predecessors(domain_id):
            edge_data = G[predecessor][domain_id]
            if edge_data.get('relation') == 'works_well_in':
                pattern_id = predecessor
                effectiveness = edge_data.get('effectiveness', 0.0)
                confidence = edge_data.get('confidence', 0.0)
                path2_scores[pattern_id] += domain_weight * max(effectiveness, 0.1) * confidence

    # Sort and keep only the top-K patterns for this path.
    sorted_path2 = sorted(path2_scores.items(), key=lambda x: x[1], reverse=True)
    path2_scores = dict(sorted_path2[:TOP_K_PATTERNS_PATH2])

    print(f" ✓ 召回 {len(sorted_path2)} 个Pattern,保留Top-{TOP_K_PATTERNS_PATH2}\n")

    # ===================== Path 3: similar-paper recall =====================
    print("📄 [路径3] 相似Paper召回...")

    # Same two-stage scheme as path 1, but matching against paper titles
    # and weighting by review-derived paper quality.
    if TWO_STAGE_RECALL and USE_EMBEDDING:
        print(f" [粗排] 使用Jaccard快速筛选Top-{COARSE_RECALL_SIZE}...")
        coarse_similarities = []
        for paper in tqdm(papers, desc="Processing papers"):
            paper_title = paper.get('title', '')
            if not paper_title:
                continue

            sim = compute_jaccard_similarity(user_idea, paper_title)
            if sim > 0.05:  # lowered threshold for the coarse stage
                coarse_similarities.append((paper['paper_id'], sim))

        coarse_similarities.sort(key=lambda x: x[1], reverse=True)
        candidates = coarse_similarities[:COARSE_RECALL_SIZE]

        print(f" [精排] 使用Embedding重排Top-{TOP_K_PAPERS}...")
        fine_similarities = []
        for paper_id, _ in candidates:
            paper = paper_map.get(paper_id)
            if not paper:
                continue
            paper_title = paper.get('title', '')

            sim = compute_embedding_similarity(user_idea, paper_title)
            if sim > 0.1 and G.has_node(paper_id):
                quality = get_paper_quality(paper)
                combined = sim * quality  # rank by similarity x quality
                fine_similarities.append((paper_id, sim, quality, combined))

        fine_similarities.sort(key=lambda x: x[3], reverse=True)
        top_papers = fine_similarities[:TOP_K_PAPERS]

        print(f" ✓ 粗排{len(coarse_similarities)}个 → 精排{len(candidates)}个 → 最终{len(top_papers)}个")
    else:
        # Single-stage recall (original logic).
        similarities = []
        for paper in tqdm(papers, desc="Processing papers"):
            paper_title = paper.get('title', '')
            if not paper_title:
                continue

            sim = compute_similarity(user_idea, paper_title)
            if sim > 0.1 and G.has_node(paper['paper_id']):
                quality = get_paper_quality(paper)
                combined = sim * quality
                similarities.append((paper['paper_id'], sim, quality, combined))

        similarities.sort(key=lambda x: x[3], reverse=True)
        top_papers = similarities[:TOP_K_PAPERS]

        print(f" 找到 {len(similarities)} 个相似Paper,选择 Top-{TOP_K_PAPERS}")

    path3_scores = defaultdict(float)
    for paper_id, similarity, quality, combined_weight in top_papers:
        paper = paper_map.get(paper_id, {})
        # Label where the quality score came from: review_stats first,
        # then raw reviews, otherwise the default value.
        if paper.get('review_stats'):
            quality_source = f"review({paper['review_stats'].get('review_count', 0)}条)"
        elif paper.get('reviews'):
            quality_source = "review"
        else:
            quality_source = "默认"
        title = paper.get('title', 'N/A')
        print(f" - {paper_id} (相似度={similarity:.3f}, 质量={quality:.3f} [{quality_source}])")
        print(f" 标题: {title}")

        if not G.has_node(paper_id):
            continue
        # Patterns this paper 'uses' vote with combined weight x edge quality.
        for successor in G.successors(paper_id):
            edge_data = G[paper_id][successor]
            if edge_data.get('relation') == 'uses_pattern':
                pattern_id = successor
                pattern_quality = edge_data.get('quality', 0.5)
                path3_scores[pattern_id] += combined_weight * pattern_quality

    # Sort and keep only the top-K patterns for this path.
    sorted_path3 = sorted(path3_scores.items(), key=lambda x: x[1], reverse=True)
    path3_scores = dict(sorted_path3[:TOP_K_PATTERNS_PATH3])

    print(f" ✓ 召回 {len(sorted_path3)} 个Pattern,保留Top-{TOP_K_PATTERNS_PATH3}\n")

    # ===================== Fuse the three recall paths =====================
    print("🔗 融合三路召回结果...\n")

    all_patterns = set(path1_scores.keys()) | set(path2_scores.keys()) | set(path3_scores.keys())

    # Weighted linear fusion of the per-path scores (missing path -> 0).
    final_scores = {}
    for pattern_id in all_patterns:
        score1 = path1_scores.get(pattern_id, 0.0) * PATH1_WEIGHT
        score2 = path2_scores.get(pattern_id, 0.0) * PATH2_WEIGHT
        score3 = path3_scores.get(pattern_id, 0.0) * PATH3_WEIGHT
        final_scores[pattern_id] = score1 + score2 + score3

    ranked = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
    top_k = ranked[:FINAL_TOP_K]

    # ===================== Print the final ranking =====================
    print("=" * 80)
    print(f"📊 召回结果 Top-{FINAL_TOP_K}")
    print("=" * 80)

    for rank, (pattern_id, final_score) in enumerate(top_k, 1):
        pattern_info = pattern_map.get(pattern_id, {})

        # Recompute per-path contributions for the breakdown display.
        score1 = path1_scores.get(pattern_id, 0.0) * PATH1_WEIGHT
        score2 = path2_scores.get(pattern_id, 0.0) * PATH2_WEIGHT
        score3 = path3_scores.get(pattern_id, 0.0) * PATH3_WEIGHT

        print(f"\n【Rank {rank}】 {pattern_id}")
        print(f" 名称: {pattern_info.get('name', 'N/A')}")
        print(f" 最终得分: {final_score:.4f}")

        if final_score > 0:
            print(f" - 路径1 (相似Idea): {score1:.4f} (占比 {score1/final_score*100:.1f}%)")
            print(f" - 路径2 (领域相关): {score2:.4f} (占比 {score2/final_score*100:.1f}%)")
            print(f" - 路径3 (相似Paper): {score3:.4f} (占比 {score3/final_score*100:.1f}%)")

        print(f" 聚类大小: {pattern_info.get('size', 0)} 篇论文")

        # V3: prefer the LLM-enhanced summary; otherwise show a raw example.
        if pattern_info.get('llm_enhanced_summary'):
            llm_summary = pattern_info['llm_enhanced_summary'].get('representative_ideas', '')
            print(f" 归纳总结: {llm_summary[:120]}...")
        else:
            summary = pattern_info.get('summary', {})
            # NOTE: this rebinds the loaded `ideas` list from above; harmless
            # here because the loaded list is no longer used past this point.
            ideas = summary.get('representative_ideas', [])
            if ideas:
                print(f" 示例Idea: {ideas[0][:120] if ideas else 'N/A'}...")

    print("\n" + "=" * 80)
    print("✅ 召回完成!")
    print("=" * 80)
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
if __name__ == '__main__':
    # Script entry point: run the demo and, on any failure, print the error
    # message followed by the full traceback for easier debugging.
    try:
        main()
    except Exception as exc:
        print(f"\n❌ 错误: {exc}")
        import traceback
        traceback.print_exc()
|
Paper-KG-Pipeline/scripts/dev/compare_pipeline_result.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import argparse
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, List, Tuple
|
| 6 |
+
|
| 7 |
+
IGNORE_KEYS = {
|
| 8 |
+
"run_id",
|
| 9 |
+
"results_dir",
|
| 10 |
+
"log_dir",
|
| 11 |
+
"run_log_dir",
|
| 12 |
+
"report_path",
|
| 13 |
+
"output_path",
|
| 14 |
+
"created_at",
|
| 15 |
+
"started_at",
|
| 16 |
+
"ended_at",
|
| 17 |
+
"duration",
|
| 18 |
+
"duration_ms",
|
| 19 |
+
"elapsed",
|
| 20 |
+
"elapsed_ms",
|
| 21 |
+
"timestamp",
|
| 22 |
+
"ts",
|
| 23 |
+
"time",
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _normalize(obj: Any) -> Any:
|
| 28 |
+
if isinstance(obj, dict):
|
| 29 |
+
out = {}
|
| 30 |
+
for k, v in obj.items():
|
| 31 |
+
if k in IGNORE_KEYS:
|
| 32 |
+
continue
|
| 33 |
+
out[k] = _normalize(v)
|
| 34 |
+
return out
|
| 35 |
+
if isinstance(obj, list):
|
| 36 |
+
return [_normalize(v) for v in obj]
|
| 37 |
+
return obj
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _diff(a: Any, b: Any, path: str = "") -> List[str]:
|
| 41 |
+
diffs: List[str] = []
|
| 42 |
+
if type(a) != type(b):
|
| 43 |
+
diffs.append(f"{path}: type {type(a).__name__} != {type(b).__name__}")
|
| 44 |
+
return diffs
|
| 45 |
+
if isinstance(a, dict):
|
| 46 |
+
a_keys = set(a.keys())
|
| 47 |
+
b_keys = set(b.keys())
|
| 48 |
+
for k in sorted(a_keys - b_keys):
|
| 49 |
+
diffs.append(f"{path}/{k}: missing in B")
|
| 50 |
+
for k in sorted(b_keys - a_keys):
|
| 51 |
+
diffs.append(f"{path}/{k}: extra in B")
|
| 52 |
+
for k in sorted(a_keys & b_keys):
|
| 53 |
+
diffs.extend(_diff(a[k], b[k], f"{path}/{k}"))
|
| 54 |
+
return diffs
|
| 55 |
+
if isinstance(a, list):
|
| 56 |
+
if len(a) != len(b):
|
| 57 |
+
diffs.append(f"{path}: len {len(a)} != {len(b)}")
|
| 58 |
+
for i, (va, vb) in enumerate(zip(a, b)):
|
| 59 |
+
diffs.extend(_diff(va, vb, f"{path}[{i}]"))
|
| 60 |
+
return diffs
|
| 61 |
+
if a != b:
|
| 62 |
+
diffs.append(f"{path}: {a!r} != {b!r}")
|
| 63 |
+
return diffs
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def load_json(path: Path) -> Any:
    """Parse *path* as UTF-8 encoded JSON and return the decoded object."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def main():
    """CLI entry point: diff two pipeline_result.json files after normalization.

    Exits with status 1 (and prints up to 200 differences) when the
    normalized trees disagree; prints an OK line otherwise.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--a", required=True, help="baseline pipeline_result.json")
    arg_parser.add_argument("--b", required=True, help="after pipeline_result.json")
    opts = arg_parser.parse_args()

    baseline = _normalize(load_json(Path(opts.a)))
    candidate = _normalize(load_json(Path(opts.b)))
    differences = _diff(baseline, candidate, "")
    if not differences:
        print("OK: normalized results match")
        return
    print("DIFF FOUND:")
    for entry in differences[:200]:  # cap output so huge diffs stay readable
        print("-", entry)
    print(f"Total diffs: {len(differences)}")
    raise SystemExit(1)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
|
| 89 |
+
main()
|
Paper-KG-Pipeline/scripts/dev/verify_recall_equivalence.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
# This script lives in <project>/scripts/dev/, so the project root is two
# levels up: parents[0] == dev, parents[1] == scripts, parents[2] == project.
# parents[1] previously pointed at scripts/, which contains no src/ package,
# so the idea2paper import below could not resolve from a clean checkout.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
SRC_DIR = PROJECT_ROOT / "src"
# Make the in-repo package importable without installation.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))
|
| 9 |
+
|
| 10 |
+
from idea2paper.recall.recall_system import RecallSystem
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _as_id_score_list(items, score_idx=1):
|
| 14 |
+
out = []
|
| 15 |
+
for item in items:
|
| 16 |
+
if isinstance(item, (list, tuple)) and len(item) > score_idx:
|
| 17 |
+
out.append((item[0], float(item[score_idx])))
|
| 18 |
+
return out
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _compare_lists(name, a, b, tol=1e-6):
|
| 22 |
+
if len(a) != len(b):
|
| 23 |
+
raise AssertionError(f"{name} length mismatch: {len(a)} vs {len(b)}")
|
| 24 |
+
for i, (xa, xb) in enumerate(zip(a, b)):
|
| 25 |
+
if xa[0] != xb[0]:
|
| 26 |
+
raise AssertionError(f"{name} id mismatch at {i}: {xa[0]} vs {xb[0]}")
|
| 27 |
+
if abs(xa[1] - xb[1]) > tol:
|
| 28 |
+
raise AssertionError(f"{name} score mismatch at {i}: {xa[1]} vs {xb[1]}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def run_once(rs: RecallSystem, user_idea: str):
    """Run one recall pass and snapshot each intermediate stage's output.

    Relies on the RecallSystem exposing its last-run internals via
    ``_last_path*`` attributes; missing attributes yield empty lists.
    """
    results = rs.recall(user_idea, verbose=False)

    def _stage(attr):
        # Debug attributes may be absent on builds without instrumentation.
        return _as_id_score_list(getattr(rs, attr, []), score_idx=1)

    # Path-3 fine rows carry (id, similarity, quality, combined) 4-tuples.
    path3_fine = [
        (row[0], float(row[1]), float(row[2]), float(row[3]))
        for row in getattr(rs, "_last_path3_top_papers", [])
        if isinstance(row, (list, tuple)) and len(row) >= 4
    ]

    return {
        "path1_coarse": _stage("_last_path1_candidates"),
        "path1_fine": _stage("_last_path1_top_ideas"),
        "path3_coarse": _stage("_last_path3_candidates"),
        "path3_fine": path3_fine,
        "final_top": [(pid, float(score)) for pid, _info, score in results],
    }
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def compare(baseline, optimized, tol=1e-6):
    """Assert the optimized run reproduces the baseline run stage by stage.

    Raises AssertionError on the first divergence; returns None on success.
    """
    # Simple (id, score) stages share one comparison helper.
    for stage in ("path1_coarse", "path1_fine", "path3_coarse"):
        _compare_lists(stage, baseline[stage], optimized[stage], tol)

    # path3_fine rows are 4-tuples: compare id plus three float columns.
    fine_a, fine_b = baseline["path3_fine"], optimized["path3_fine"]
    if len(fine_a) != len(fine_b):
        raise AssertionError("path3_fine length mismatch")
    for i, (a, b) in enumerate(zip(fine_a, fine_b)):
        if a[0] != b[0]:
            raise AssertionError(f"path3_fine id mismatch at {i}: {a[0]} vs {b[0]}")
        for j in range(1, 4):
            if abs(a[j] - b[j]) > tol:
                raise AssertionError(f"path3_fine value mismatch at {i}:{j} {a[j]} vs {b[j]}")

    _compare_lists("final_top", baseline["final_top"], optimized["final_top"], tol)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def main():
    """Verify a baseline RecallSystem matches one with optimizations enabled."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--idea", default="test idea")
    cli.add_argument("--use-offline-index", action="store_true", default=False)
    cli.add_argument("--tol", type=float, default=1e-6)
    args = cli.parse_args()

    # Reference system: every fast path disabled.
    reference = RecallSystem()
    reference._use_embed_batch = False
    reference._use_token_cache = False
    reference._use_offline_index = False

    # Candidate system: optimizations on (offline index remains opt-in).
    candidate = RecallSystem()
    candidate._use_embed_batch = True
    candidate._use_token_cache = True
    candidate._use_offline_index = bool(args.use_offline_index)

    # Left-to-right evaluation preserves the original baseline-first order.
    compare(run_once(reference, args.idea), run_once(candidate, args.idea), args.tol)
    print("PASS: recall outputs are equivalent")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
if __name__ == "__main__":
|
| 96 |
+
main()
|
Paper-KG-Pipeline/scripts/extract_paper_review.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
from pathlib import Path
import runpy

# Compatibility wrapper (scripts/ -> scripts/tools): the real implementation
# moved to scripts/tools/extract_paper_review.py, but existing callers may
# still invoke this legacy path.  run_name="__main__" makes the relocated
# script behave exactly as if it had been executed directly.
runpy.run_path(str(Path(__file__).parent / "tools" / "extract_paper_review.py"), run_name="__main__")
|
Paper-KG-Pipeline/scripts/extract_patterns_ICLR_en_local.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# extract_patterns_100.py
import os, json, time
from typing import Any, Dict, List
from tqdm import tqdm
from pathlib import Path

from openai import OpenAI

# ===== HF dataset =====
# Source dataset on the HuggingFace Hub and the split to read from it.
DATASET_NAME = "AgentAlphaAGI/Paper-Review-Dataset"
SPLIT = "train"
N = int(os.getenv("KG_EXTRACT_N", "0"))  # 0 = process all

# ===== LLM Model =====
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")  # can be switched to gpt-4.1 / gpt-4o etc.

# Project root (the knowledge-graph pipeline root, one level above scripts/).
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent

# ===== Local input file (downloaded JSONL) =====
# Overridable via the INPUT_JSONL_PATH environment variable.
INPUT_PATH = os.getenv(
    "INPUT_JSONL_PATH",
    str(PROJECT_ROOT / "data" / "ICLR_merged_cleaned_huggingface.jsonl")
)

# ===== Output file =====
# Destination for the extracted research patterns (JSONL, one paper per line).
OUTPUT_DIR = PROJECT_ROOT / "output"
OUT_PATH = OUTPUT_DIR / "iclr_patterns_full.jsonl"
| 32 |
+
|
| 33 |
+
# ====== English Prompt ======
|
| 34 |
+
PROMPT_TEMPLATE = r"""
|
| 35 |
+
【Role】
|
| 36 |
+
You are a “Top-Conference Research Pattern Extractor”.
|
| 37 |
+
Your task is NOT to summarize the paper, but to extract the reusable research narration framework that turns ordinary techniques into a convincing top-tier contribution through framing, conceptual packaging, and evidence design.
|
| 38 |
+
|
| 39 |
+
【Task Objective】
|
| 40 |
+
Given a paper’s title, keywords, and abstract, extract the paper’s Research Pattern core framework:
|
| 41 |
+
⟨Base Problem, Solution Pattern, Story⟩,
|
| 42 |
+
and identify the paper’s core idea.
|
| 43 |
+
|
| 44 |
+
This extraction is used to learn how top papers package methods into high-impact research narratives. The Story field is the most important.
|
| 45 |
+
|
| 46 |
+
【Key Definitions】
|
| 47 |
+
1) Research Pattern = ⟨Base Problem, Solution Pattern, Story⟩
|
| 48 |
+
- Base Problem:
|
| 49 |
+
The concrete, actionable pain point in a specific scenario that the paper targets.
|
| 50 |
+
Must be specific and grounded. Avoid vague phrasing such as “improves performance” without context.
|
| 51 |
+
- Solution Pattern:
|
| 52 |
+
The core technical route that solves the Base Problem.
|
| 53 |
+
Explicitly describe key components, pipeline structure, and the mechanism of improvement.
|
| 54 |
+
- Story (MOST IMPORTANT):
|
| 55 |
+
The conceptual packaging and narrative framing that makes the work look novel, forward-looking, and impactful.
|
| 56 |
+
Focus on narrative devices such as reframing the problem, introducing a new lens, elevating engineering issues into research questions, and highlighting long-term implications.
|
| 57 |
+
|
| 58 |
+
2) idea (Paper-level, OPTIONAL)
|
| 59 |
+
- A concise 1–2 sentence description of the paper’s key innovation or central insight.
|
| 60 |
+
- This should capture what is fundamentally new or different at a high level, without technical detail.
|
| 61 |
+
- If the core idea cannot be clearly stated, omit this field.
|
| 62 |
+
|
| 63 |
+
3) Taxonomy Fields (for retrieval)
|
| 64 |
+
- domain (string):
|
| 65 |
+
One primary top-level field (Level-1). Must be a broad discipline label.
|
| 66 |
+
Examples: "Machine Learning", "Computer Vision", "Natural Language Processing", "Systems", "Security & Privacy", "Fairness & Accountability".
|
| 67 |
+
- sub_domains (array of strings):
|
| 68 |
+
Level-2 tags under the chosen domain. 2–5 items preferred.
|
| 69 |
+
These should be specific and retrieval-friendly.
|
| 70 |
+
Rules:
|
| 71 |
+
- sub_domains must be consistent with the chosen domain
|
| 72 |
+
- avoid repeating the domain name itself
|
| 73 |
+
|
| 74 |
+
4) application (string)
|
| 75 |
+
Concrete deployable scenarios implied by the paper.
|
| 76 |
+
|
| 77 |
+
【Hard Output Constraints】
|
| 78 |
+
- Output STRICT JSON only. No extra text, no Markdown, no comments.
|
| 79 |
+
- All values must be in English, concise, academic.
|
| 80 |
+
- paper_id must be: {paper_info['paper_id']}
|
| 81 |
+
- paper_title must be the input title; if missing use "N/A"
|
| 82 |
+
- idea is OPTIONAL
|
| 83 |
+
- domain must be a non-empty string
|
| 84 |
+
- sub_domains must be a non-empty array (at least 1 item)
|
| 85 |
+
- research_patterns must be a non-empty array (at least 1 object)
|
| 86 |
+
- No field inside research_patterns is allowed to be empty.
|
| 87 |
+
|
| 88 |
+
【Output Schema】
|
| 89 |
+
{
|
| 90 |
+
"paper_id": "{paper_info['paper_id']}",
|
| 91 |
+
"paper_title": "paper title",
|
| 92 |
+
"idea": "concise 1–2 sentence description of the paper’s key innovation or core insight",
|
| 93 |
+
"domain": "Level-1 domain",
|
| 94 |
+
"sub_domains": ["Level-2 tag 1", "Level-2 tag 2"],
|
| 95 |
+
"research_patterns": [
|
| 96 |
+
{
|
| 97 |
+
"base_problem": "concrete pain point in a specific scenario",
|
| 98 |
+
"solution_pattern": "core technical route (components + workflow + mechanism)",
|
| 99 |
+
"story": "conceptual packaging that makes the work look novel and high-impact",
|
| 100 |
+
"application": "deployable scenarios"
|
| 101 |
+
}
|
| 102 |
+
]
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
========================
|
| 106 |
+
【Few-shot Example 1】
|
| 107 |
+
|
| 108 |
+
【Input】
|
| 109 |
+
- paper_title: Research on Self-Evolution of Intelligent Agents Based on Reflect+Memory
|
| 110 |
+
- keywords: Intelligent Agent, Self-Evolution, Memory Mechanism
|
| 111 |
+
- abstract: Existing agents fail to retain experience after task execution, repeatedly making the same mistakes in similar tasks. This work introduces a Reflect module and a long-term Memory module to store, summarize, and retrieve experience for improved task execution over time.
|
| 112 |
+
|
| 113 |
+
【Output】
|
| 114 |
+
{
|
| 115 |
+
"paper_id": "ARR_2022_106",
|
| 116 |
+
"paper_title": "Research on Self-Evolution of Intelligent Agents Based on Reflect+Memory",
|
| 117 |
+
"idea": "Enable intelligent agents to continuously improve by accumulating, summarizing, and reusing task experience through a reflection-plus-memory architecture",
|
| 118 |
+
"domain": "Artificial Intelligence",
|
| 119 |
+
"sub_domains": ["Agentic Systems", "Memory-Augmented Models", "Reflection", "Experience Retrieval"],
|
| 120 |
+
"research_patterns": [
|
| 121 |
+
{
|
| 122 |
+
"base_problem": "Task-executing agents do not accumulate reusable experience, causing repeated failures and stagnant capability when facing recurring task families",
|
| 123 |
+
"solution_pattern": "Augment the agent architecture with a reflection module that converts trajectories into distilled lessons, store them in long-term memory, and retrieve relevant lessons during inference to guide future decisions",
|
| 124 |
+
"story": "Reframe agents from one-shot executors into a self-evolving paradigm where accumulated experience becomes a scalable capability multiplier, enabling sustained improvement and long-horizon autonomy",
|
| 125 |
+
"application": "Customer-support automation with iterative playbook refinement, autonomous operations incident handling, robotics skill transfer across tasks, long-horizon decision-making assistants"
|
| 126 |
+
}
|
| 127 |
+
]
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
========================
|
| 131 |
+
【Few-shot Example 2】
|
| 132 |
+
|
| 133 |
+
【Input】
|
| 134 |
+
- paper_title: Quantifying and Mitigating the Impact of Label Errors on Model Disparity Metrics
|
| 135 |
+
- keywords: Fairness, Label Noise, Influence Function, Disparity Metrics
|
| 136 |
+
- abstract: Existing fairness evaluation methods assume reliable labels, but real-world data often contains label errors that disproportionately affect different groups. This work analyzes label noise impact on disparity metrics and proposes an influence-function-based method to identify and correct high-impact label errors.
|
| 137 |
+
|
| 138 |
+
【Output】
|
| 139 |
+
{
|
| 140 |
+
"paper_id": "ICLR_2023_089",
|
| 141 |
+
"paper_title": "Quantifying and Mitigating the Impact of Label Errors on Model Disparity Metrics",
|
| 142 |
+
"idea": "Diagnose and mitigate fairness failures by analyzing how individual label errors influence group-level disparity metrics",
|
| 143 |
+
"domain": "Fairness & Accountability",
|
| 144 |
+
"sub_domains": ["Label Noise", "Influence Functions", "Disparity Metrics", "Model Auditing"],
|
| 145 |
+
"research_patterns": [
|
| 146 |
+
{
|
| 147 |
+
"base_problem": "Fairness evaluation becomes unreliable in the presence of label noise, systematically distorting disparity metrics and disproportionately harming minority groups",
|
| 148 |
+
"solution_pattern": "Extend influence functions from loss-based analysis to group disparity metrics in order to quantify the effect of individual label perturbations and prioritize high-impact label corrections",
|
| 149 |
+
"story": "Reframe fairness from a model optimization problem into an auditing and reliability problem, introducing a principled framework for diagnosing and correcting data-induced fairness failures",
|
| 150 |
+
"application": "Fairness auditing in high-stakes decision systems, robustness evaluation under noisy labels, automated data quality inspection pipelines"
|
| 151 |
+
}
|
| 152 |
+
]
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
========================
|
| 156 |
+
【Now Process This Paper】
|
| 157 |
+
- paper_title: {paper_info['paper_title']}
|
| 158 |
+
- keywords: {paper_info['keywords']}
|
| 159 |
+
- abstract: {paper_info['abstract']}
|
| 160 |
+
|
| 161 |
+
Return STRICT JSON only.
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
"""
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def build_paper_info(row: Dict[str, Any]) -> Dict[str, Any]:
    """Map a raw HF-dataset row onto the fields the prompt template expects.

    Dataset columns: title/authors/abstract/pdf_url/source_url/id/
    related_notes/year/conference/content/content_meta. There is no
    dedicated keywords column, so ``keywords`` stays an empty list for now
    (a keyphrase-extraction module could populate it later).
    """
    info: Dict[str, Any] = {}
    info["paper_id"] = row.get("id", "") or ""
    info["paper_title"] = row.get("title", "") or "无"
    info["keywords"] = []  # no source column yet
    info["abstract"] = (row.get("abstract", "") or "").strip()
    info["source_url"] = row.get("source_url", "")
    info["pdf_url"] = row.get("pdf_url", "")
    info["year"] = str(row.get("year", ""))
    info["conference"] = row.get("conference", "")
    return info
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def render_prompt(paper_info: Dict[str, Any]) -> str:
    """Fill PROMPT_TEMPLATE by literal marker substitution.

    Plain ``str.replace`` keeps the template's JSON examples (full of
    braces) intact; a real template engine would be safer if the marker
    syntax ever collides with prompt content.
    """
    keywords = paper_info["keywords"]
    replacements = {
        "{paper_info['paper_id']}": paper_info["paper_id"],
        "{paper_info['paper_title']}": paper_info["paper_title"],
        "{paper_info['keywords']}": ", ".join(keywords) if keywords else "无",
        "{paper_info['abstract']}": paper_info["abstract"],
    }
    rendered = PROMPT_TEMPLATE
    for marker, value in replacements.items():
        rendered = rendered.replace(marker, value)
    return rendered
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def call_llm(client: OpenAI, prompt: str) -> dict:
    """Send a single-turn chat request and parse the reply as JSON.

    Args:
        client: Configured OpenAI-compatible client.
        prompt: Fully rendered prompt text.

    Returns:
        The parsed JSON object from the model reply.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    # Fix: content can legitimately be None (e.g. refusals); guard before strip.
    text = (resp.choices[0].message.content or "").strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fallback: extract the outermost {...} span from a chatty reply.
        start = text.find("{")
        end = text.rfind("}")
        # Fix: also require end > start so e.g. "}{…" does not slice garbage.
        if start != -1 and end > start:
            return json.loads(text[start:end + 1])
        raise
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# ===== Local input file (downloaded JSONL) =====
# Resolved once at import time; override with the INPUT_JSONL_PATH env var.
# NOTE(review): PROJECT_ROOT is defined earlier in this file (not shown here).
INPUT_PATH = os.getenv(
    "INPUT_JSONL_PATH",
    str(PROJECT_ROOT / "data" / "ICLR_merged_cleaned_huggingface.jsonl")
)
|
| 215 |
+
|
| 216 |
+
def iter_jsonl(path: str):
    """Yield one dict per non-blank line of a local JSONL file.

    Corrupted lines are skipped with a printed warning instead of aborting
    the whole run, so one bad record cannot kill a long extraction job.
    """
    with open(path, "r", encoding="utf-8") as fh:
        for line_no, raw in enumerate(fh, start=1):
            payload = raw.strip()
            if not payload:
                continue
            try:
                record = json.loads(payload)
            except Exception as e:
                # Keep a traceable warning for the skipped line.
                print(f"[WARN] bad json at line {line_no}: {e}")
                continue
            yield record
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def load_done_ids(out_path: Path) -> set:
    """Collect paper_ids already present in the output file.

    Resume key: both success records and error records count as done, so a
    resumed run never reprocesses a paper it has already attempted.
    """
    done: set = set()
    if not out_path.exists():
        return done
    with open(out_path, "r", encoding="utf-8") as fh:
        for raw in fh:
            payload = raw.strip()
            if not payload:
                continue
            try:
                record = json.loads(payload)
            except Exception:
                # Unreadable output lines are ignored for resume purposes.
                continue
            paper_id = record.get("paper_id")
            if paper_id:
                done.add(paper_id)
    return done
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def main():
    """Run the resumable pattern-extraction loop over the local JSONL input.

    Reads rows from INPUT_PATH, renders a prompt per paper, calls the LLM
    with up to 3 retries, and appends one JSON line per paper (success or
    error record) to OUT_PATH. Already-seen paper_ids are skipped so the
    script can be stopped and restarted safely.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Missing OPENAI_API_KEY env var")

    # Optional override for OpenAI-compatible gateways; None = default endpoint.
    base_url = os.getenv("OPENAI_BASE_URL") or None
    client = OpenAI(api_key=api_key, base_url=base_url)

    # Ensure output directory exists
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Resume: read already processed ids
    done_ids = load_done_ids(OUT_PATH)
    print(f"[resume] already processed: {len(done_ids)}")
    print(f"[input] local jsonl: {INPUT_PATH}")

    newly_written = 0
    skipped = 0
    seen = 0

    # Append mode for resume
    with open(OUT_PATH, "a", encoding="utf-8") as f:
        for row in tqdm(iter_jsonl(INPUT_PATH), desc="Extracting patterns (local+resume)"):
            seen += 1
            paper_info = build_paper_info(row)
            paper_id = paper_info.get("paper_id", "")

            # Rows without an id cannot be resumed safely; skip them.
            if not paper_id:
                skipped += 1
                continue

            if paper_id in done_ids:
                skipped += 1
                continue

            # Stop after writing N new records (0 = no limit)
            if N > 0 and newly_written >= N:
                break

            prompt = render_prompt(paper_info)

            last_err = None
            ok = False

            # Up to 3 attempts with linear backoff (2s, 4s, 6s).
            for attempt in range(3):
                try:
                    obj = call_llm(client, prompt)
                    f.write(json.dumps(obj, ensure_ascii=False) + "\n")
                    # Flush after every record so a crash loses at most one line.
                    f.flush()

                    done_ids.add(paper_id)
                    newly_written += 1
                    ok = True
                    break
                except Exception as e:
                    last_err = e
                    time.sleep(2.0 * (attempt + 1))

            if not ok:
                # Persist an error record so the failure is visible downstream.
                err = {
                    "paper_id": paper_id,
                    "paper_title": paper_info.get("paper_title", "N/A"),
                    "error": str(last_err),
                }
                f.write(json.dumps(err, ensure_ascii=False) + "\n")
                f.flush()

                # Mark as done to avoid infinite loop on the same paper
                done_ids.add(paper_id)
                # NOTE(review): error records count toward newly_written, so the
                # N-record stop limit includes failures — confirm this is intended.
                newly_written += 1

    print(f"[done] scanned={seen}, newly_written={newly_written}, skipped={skipped}, out={OUT_PATH}")
|
| 323 |
+
|
| 324 |
+
# Script entry point: run the extraction loop only when executed directly.
if __name__ == "__main__":
    main()
|
| 326 |
+
|
Paper-KG-Pipeline/scripts/generate_clusters.py
ADDED
|
@@ -0,0 +1,788 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
analyze_clusters.py
|
| 5 |
+
|
| 6 |
+
Purpose
|
| 7 |
+
- Flatten paper JSONL -> pattern records
|
| 8 |
+
- Embed pattern text (Story-centric by default)
|
| 9 |
+
- UMAP + HDBSCAN clustering
|
| 10 |
+
- Compute cluster coherence metrics
|
| 11 |
+
- Fit Zipf (rank-size) stats
|
| 12 |
+
- LLM-based concise cluster naming (instead of top-words)
|
| 13 |
+
- Auto-tier clusters (A/B/C) and write tier_A/B/C.jsonl
|
| 14 |
+
- Generate report.md with Zipf + noise share + Top-10 table
|
| 15 |
+
|
| 16 |
+
Input JSONL format (each line):
|
| 17 |
+
{
|
| 18 |
+
"paper_id": "...",
|
| 19 |
+
"paper_title": "...",
|
| 20 |
+
"idea": "...",
|
| 21 |
+
"domain": "...",
|
| 22 |
+
"sub_domains": [...],
|
| 23 |
+
"research_patterns": [
|
| 24 |
+
{"base_problem": "...", "solution_pattern": "...", "story": "...", "application": "..."}
|
| 25 |
+
]
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
Outputs (in --outdir):
|
| 29 |
+
- patterns_flat.jsonl (flattened pattern records)
|
| 30 |
+
- embeddings.npy (float32 matrix)
|
| 31 |
+
- assignments.jsonl (pattern -> cluster labels)
|
| 32 |
+
- clusters.jsonl (cluster-level summary, incl. coherence + llm name)
|
| 33 |
+
- cluster_library.jsonl (RAG-ready cluster objects w/ exemplars)
|
| 34 |
+
- tier_A.jsonl / tier_B.jsonl / tier_C.jsonl
|
| 35 |
+
- report.md
|
| 36 |
+
|
| 37 |
+
usage:
|
| 38 |
+
python analyze_clusters.py \
|
| 39 |
+
--input your_extracted_papers.jsonl \
|
| 40 |
+
--outdir output \
|
| 41 |
+
--sbert_model sentence-transformers/all-MiniLM-L6-v2 \
|
| 42 |
+
--llm_name \
|
| 43 |
+
--llm_model gpt-4.1-mini
|
| 44 |
+
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
from __future__ import annotations
|
| 48 |
+
|
| 49 |
+
import os
|
| 50 |
+
import re
|
| 51 |
+
import json
|
| 52 |
+
import math
|
| 53 |
+
import time
|
| 54 |
+
import argparse
|
| 55 |
+
from dataclasses import dataclass
|
| 56 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 57 |
+
|
| 58 |
+
import numpy as np
|
| 59 |
+
|
| 60 |
+
# Optional deps
|
| 61 |
+
try:
|
| 62 |
+
from sentence_transformers import SentenceTransformer
|
| 63 |
+
except Exception:
|
| 64 |
+
SentenceTransformer = None
|
| 65 |
+
|
| 66 |
+
try:
|
| 67 |
+
import umap
|
| 68 |
+
except Exception:
|
| 69 |
+
umap = None
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
import hdbscan
|
| 73 |
+
except Exception:
|
| 74 |
+
hdbscan = None
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ----------------------------
|
| 78 |
+
# IO utils
|
| 79 |
+
# ----------------------------
|
| 80 |
+
def read_jsonl(path: str) -> List[Dict[str, Any]]:
    """Load every non-blank line of a JSONL file as a dict."""
    records: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            payload = raw.strip()
            if payload:
                records.append(json.loads(payload))
    return records
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def write_jsonl(path: str, rows: List[Dict[str, Any]]) -> None:
    """Write one JSON object per line (UTF-8, non-ASCII preserved).

    Fix: ``os.makedirs("")`` raises FileNotFoundError when *path* has no
    directory component (a bare filename), so the parent directory is only
    created when there actually is one.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def write_text(path: str, text: str) -> None:
    """Write *text* to *path* as UTF-8, creating parent directories.

    Fix: ``os.makedirs("")`` raises FileNotFoundError when *path* is a bare
    filename with no directory component; only create the parent when one
    exists.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ----------------------------
|
| 105 |
+
# Math utils
|
| 106 |
+
# ----------------------------
|
| 107 |
+
def l2_normalize(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Row-wise L2 normalization; near-zero rows divide by *eps* instead of 0."""
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    norms = np.clip(norms, eps, None)
    return x / norms
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def cosine_sim_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Cosine similarity between row sets; assumes rows are already normalized."""
    return np.matmul(A, B.T)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def safe_mean(xs: List[float]) -> float:
    """Arithmetic mean of *xs*, or NaN for an empty list."""
    if not xs:
        return float("nan")
    return float(np.mean(xs))
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ----------------------------
|
| 122 |
+
# Flatten papers -> patterns
|
| 123 |
+
# ----------------------------
|
| 124 |
+
def ensure_list(x: Any) -> List[Any]:
    """Coerce *x* to a list: None -> [], list -> itself, anything else -> [x]."""
    if isinstance(x, list):
        return x
    return [] if x is None else [x]
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def flatten_papers_to_patterns(raw: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Flatten paper-level records into one row per research pattern.

    Each output row carries the paper metadata plus one entry from the
    paper's ``research_patterns`` list, with a per-paper ``pattern_id`` and a
    globally unique ``global_pattern_id``.

    Fix: LLM-extracted files occasionally contain non-dict entries (stray
    strings) inside ``research_patterns``; those previously crashed with
    AttributeError on ``.get`` and are now skipped.
    """
    items: List[Dict[str, Any]] = []
    pid_counter = 0
    for paper in raw:
        paper_id = paper.get("paper_id") or paper.get("id") or ""
        paper_title = paper.get("paper_title") or paper.get("title") or ""
        idea = (paper.get("idea") or "").strip()
        domain = (paper.get("domain") or "待明确领域").strip()
        sub_domains = ensure_list(paper.get("sub_domains"))
        sub_domains = [str(s).strip() for s in sub_domains if str(s).strip()]

        patterns = paper.get("research_patterns") or []
        for j, rp in enumerate(patterns):
            if not isinstance(rp, dict):
                continue  # skip malformed (non-dict) pattern entries
            base_problem = (rp.get("base_problem") or "").strip()
            solution_pattern = (rp.get("solution_pattern") or "").strip()
            story = (rp.get("story") or "").strip()
            application = (rp.get("application") or "").strip()

            items.append({
                "paper_id": paper_id,
                "paper_title": paper_title,
                "pattern_id": f"p{j}",
                "global_pattern_id": f"g{pid_counter}",
                "idea": idea,
                "domain": domain,
                "sub_domains": sub_domains,
                "base_problem": base_problem,
                "solution_pattern": solution_pattern,
                "story": story,
                "application": application,
            })
            pid_counter += 1
    return items
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def build_text(item: Dict[str, Any], template: str) -> str:
    """Fill *template* with stringified fields from *item*.

    Missing keys become empty strings; list-valued fields are joined with
    ", ". The result is stripped of surrounding whitespace.
    """
    def as_str(key: str) -> str:
        value = item.get(key, "")
        if isinstance(value, list):
            return ", ".join(str(v) for v in value)
        return str(value)

    field_names = (
        "story", "base_problem", "solution_pattern", "idea", "domain",
        "sub_domains", "application", "paper_title", "paper_id",
        "pattern_id", "global_pattern_id",
    )
    return template.format(**{name: as_str(name) for name in field_names}).strip()
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# ----------------------------
|
| 191 |
+
# Embedding
|
| 192 |
+
# ----------------------------
|
| 193 |
+
def embed_texts_sbert(texts: List[str], model_name: str, batch_size: int = 64) -> np.ndarray:
    """Encode *texts* with a SentenceTransformer model.

    Returns a float32 matrix of L2-normalized embeddings, one row per text.
    Raises RuntimeError when sentence-transformers is not installed.
    """
    if SentenceTransformer is None:
        raise RuntimeError("sentence-transformers is not installed. pip install sentence-transformers")
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return vectors.astype(np.float32)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ----------------------------
|
| 208 |
+
# Clustering
|
| 209 |
+
# ----------------------------
|
| 210 |
+
def run_umap_hdbscan(
    X: np.ndarray,
    umap_neighbors: int,
    umap_components: int,
    umap_min_dist: float,
    hdb_min_cluster_size: int,
    hdb_min_samples: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray]:
    """Reduce X with UMAP (cosine metric), then cluster the projection with HDBSCAN.

    Args:
        X: Embedding matrix (one row per pattern).
        umap_neighbors / umap_components / umap_min_dist: UMAP hyperparameters.
        hdb_min_cluster_size / hdb_min_samples: HDBSCAN hyperparameters.
        random_state: Seed for UMAP reproducibility.

    Returns:
        (labels, probs): integer cluster labels (-1 = noise) and per-point
        membership probabilities.

    Raises:
        RuntimeError: If umap-learn or hdbscan is not installed.
    """
    if umap is None:
        raise RuntimeError("umap-learn is not installed. pip install umap-learn")
    if hdbscan is None:
        raise RuntimeError("hdbscan is not installed. pip install hdbscan")

    # Cosine metric in the high-dim space; euclidean is used after reduction.
    reducer = umap.UMAP(
        n_neighbors=umap_neighbors,
        n_components=umap_components,
        min_dist=umap_min_dist,
        metric="cosine",
        random_state=random_state,
    )
    Z = reducer.fit_transform(X)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=hdb_min_cluster_size,
        min_samples=hdb_min_samples,
        metric="euclidean",
        cluster_selection_method="eom",
    )
    labels = clusterer.fit_predict(Z)
    # Fall back to all-ones probabilities if the attribute is missing.
    probs = getattr(clusterer, "probabilities_", np.ones(len(labels), dtype=np.float32))
    return labels.astype(int), probs.astype(np.float32)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# ----------------------------
|
| 245 |
+
# Coherence
|
| 246 |
+
# ----------------------------
|
| 247 |
+
@dataclass
class CoherenceStats:
    """Cosine-similarity summary for a single cluster's members."""
    centroid_mean: float
    centroid_p25: float
    centroid_p50: float
    centroid_p75: float
    pairwise_sample_mean: float
    pairwise_sample_p50: float


def compute_cluster_coherence(
    Xn: np.ndarray,
    idxs: np.ndarray,
    pairwise_sample_n: int = 120,
    rng: Optional[np.random.Generator] = None,
) -> CoherenceStats:
    """Summarize how tightly a cluster's members agree in embedding space.

    Args:
        Xn: Row-normalized embeddings (N, d).
        idxs: Indices of this cluster's members within Xn.
        pairwise_sample_n: Cap on members sampled for pairwise similarity.
        rng: Optional generator; defaults to a fixed seed for reproducibility.
    """
    rng = np.random.default_rng(42) if rng is None else rng

    members = Xn[idxs]
    if members.shape[0] == 0:
        nan = float("nan")
        return CoherenceStats(nan, nan, nan, nan, nan, nan)

    # Normalized centroid, so dot products below are cosine similarities.
    center = members.mean(axis=0, keepdims=True)
    center = center / np.clip(np.linalg.norm(center), 1e-12, None)

    sims = (members @ center.T).reshape(-1)
    mean_sim = float(np.mean(sims))
    q25 = float(np.quantile(sims, 0.25))
    q50 = float(np.quantile(sims, 0.50))
    q75 = float(np.quantile(sims, 0.75))

    # Sampled pairwise coherence (upper triangle only, no self-pairs).
    n_members = members.shape[0]
    if n_members < 2:
        pw_mean = float("nan")
        pw_p50 = float("nan")
    else:
        k = min(pairwise_sample_n, n_members)
        picked = members[rng.choice(n_members, size=k, replace=False)]
        sim_mat = picked @ picked.T
        upper = sim_mat[np.triu_indices(k, k=1)]
        pw_mean = float(np.mean(upper)) if upper.size else float("nan")
        pw_p50 = float(np.quantile(upper, 0.50)) if upper.size else float("nan")

    return CoherenceStats(
        centroid_mean=mean_sim,
        centroid_p25=q25,
        centroid_p50=q50,
        centroid_p75=q75,
        pairwise_sample_mean=pw_mean,
        pairwise_sample_p50=pw_p50,
    )
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
# ----------------------------
|
| 308 |
+
# Zipf fit
|
| 309 |
+
# ----------------------------
|
| 310 |
+
@dataclass
class ZipfStats:
    """Rank-size fit: exponent, goodness of fit, and top-k mass shares."""
    alpha: float
    r2: float
    topk_share: Dict[str, float]


def fit_zipf(cluster_sizes_desc: List[int], topk_list: List[int]) -> ZipfStats:
    """Fit log(size) = a + b*log(rank) over descending sizes; alpha = -b.

    Also reports, for each k in *topk_list*, the share of total mass held by
    the k largest clusters.
    """
    sizes = np.asarray(cluster_sizes_desc, dtype=np.float64)
    ranks = np.arange(1, sizes.size + 1, dtype=np.float64)

    log_rank = np.log(ranks)
    log_size = np.log(np.clip(sizes, 1e-12, None))

    # Ordinary least squares on the log-log scatter.
    slope, intercept = np.polyfit(log_rank, log_size, 1)
    predicted = intercept + slope * log_rank
    resid = float(np.sum((log_size - predicted) ** 2))
    total_var = float(np.sum((log_size - np.mean(log_size)) ** 2))
    r2 = 1.0 - resid / total_var if total_var > 0 else float("nan")

    grand_total = float(np.sum(sizes))
    shares: Dict[str, float] = {}
    for k in topk_list:
        shares[str(k)] = float(np.sum(sizes[:k]) / grand_total) if grand_total > 0 else float("nan")

    return ZipfStats(alpha=float(-slope), r2=r2, topk_share=shares)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
# ----------------------------
|
| 344 |
+
# Cluster naming via LLM (concise)
|
| 345 |
+
# ----------------------------
|
| 346 |
+
def _truncate(s: str, max_chars: int) -> str:
|
| 347 |
+
s = re.sub(r"\s+", " ", s).strip()
|
| 348 |
+
return s[:max_chars]
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def llm_cluster_name(
    exemplars: List[Dict[str, Any]],
    model: str,
    api_base: Optional[str] = None,
    temperature: float = 0.2,
    max_retries: int = 3,
    sleep_s: float = 0.8,
) -> str:
    """Ask an LLM for one concise English name for a cluster of exemplars.

    Uses OpenAI-compatible Chat Completions (no response_format).
    Requires OPENAI_API_KEY in environment.

    Args:
        exemplars: Pattern records; at most the first 8 are shown to the model.
        model: Chat model identifier.
        api_base: Optional alternative endpoint for OpenAI-compatible gateways.
        temperature: Sampling temperature for the naming call.
        max_retries: Attempts before the last exception is re-raised.
        sleep_s: Base backoff; waits sleep_s * attempt between retries.

    Returns:
        The model's cluster name (stripped, non-empty).

    Raises:
        RuntimeError: If the API key or openai package is missing.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set. Set it to enable LLM cluster naming.")

    # Lazy import to avoid hard dependency
    try:
        from openai import OpenAI
    except Exception as e:
        raise RuntimeError("openai package not installed. pip install openai") from e

    client_kwargs = {}
    if api_base:
        client_kwargs["base_url"] = api_base
    client = OpenAI(api_key=api_key, **client_kwargs)

    # Build a compact prompt: story-first
    lines = []
    for i, ex in enumerate(exemplars[:8]):
        # Truncation keeps the prompt compact even for verbose exemplars.
        story = _truncate(ex.get("story", ""), 220)
        bp = _truncate(ex.get("base_problem", ""), 180)
        sol = _truncate(ex.get("solution_pattern", ""), 180)
        dom = ex.get("domain", "")
        subs = ex.get("sub_domains", [])
        subs_s = ", ".join(subs) if isinstance(subs, list) else str(subs)
        lines.append(f"- Ex{i+1} Domain: {dom} | Sub: {subs_s}\n  BaseProblem: {bp}\n  Story: {story}\n  Solution: {sol}")

    prompt = (
        "You are labeling a cluster of research narrative/pattern exemplars extracted from top-tier machine learning papers.\n"
        "Task: produce ONE concise English cluster name (3–6 words) that captures the shared research narrative or pattern.\n"
        "Constraints:\n"
        "1) The name MUST be in English.\n"
        "2) Avoid vague or generic words such as 'method', 'framework', 'model', 'approach', 'improvement', or 'optimization'.\n"
        "3) Prefer a distinctive *research story* angle, such as problem reframing, assumption removal, auditability, robustness, reliability, scalability, or efficiency–generality trade-offs.\n"
        "4) The name should sound like a top-conference research theme or paradigm, not a paper title.\n"
        "5) Output ONLY the name, with no punctuation, quotes, or extra text.\n\n"
        "Exemplars:\n"
        + "\n".join(lines)
    )


    # Retry with linear backoff; re-raise the final failure.
    for attempt in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model,
                temperature=temperature,
                messages=[
                    {"role": "system", "content": "You are a precise research taxonomy assistant."},
                    {"role": "user", "content": prompt},
                ],
            )
            name = resp.choices[0].message.content.strip()
            # Post-processing (quote stripping / length clamp) intentionally disabled:
            #name = re.sub(r"[\"'“”‘’`]", "", name)
            #name = re.sub(r"\s+", "", name)
            # Hard clamp
            #if len(name) > 18:
            #    name = name[:18]
            if not name:
                raise ValueError("Empty name from LLM.")
            return name
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(sleep_s * (attempt + 1))

    raise RuntimeError("LLM naming failed unexpectedly.")
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
# ----------------------------
|
| 431 |
+
# Tiering + report
|
| 432 |
+
# ----------------------------
|
| 433 |
+
def assign_tiers(
    clusters: List[Dict[str, Any]],
    size_A: int,
    size_B: int,
    coh_A: float,
    coh_B: float,
    coh_field: str = "coherence_centroid_mean",
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Bucket non-noise clusters into quality tiers A/B/C.

    A cluster earns tier A (or B) when it meets both the size and coherence
    thresholds for that tier; everything else — including clusters with a
    missing/NaN coherence — falls to C. Each cluster dict gets a "tier" key
    set in place. The noise bucket (cluster_id == -1) is skipped entirely.
    """
    tier_a: List[Dict[str, Any]] = []
    tier_b: List[Dict[str, Any]] = []
    tier_c: List[Dict[str, Any]] = []

    for cluster in clusters:
        if cluster.get("cluster_id") == -1:
            continue  # noise is never tiered

        size = int(cluster.get("size", 0))
        raw = cluster.get(coh_field)
        if raw is None or (isinstance(raw, float) and math.isnan(raw)):
            coherence = float("nan")
        else:
            coherence = float(raw)
        has_coherence = not math.isnan(coherence)

        if size >= size_A and has_coherence and coherence >= coh_A:
            cluster["tier"] = "A"
            tier_a.append(cluster)
        elif size >= size_B and has_coherence and coherence >= coh_B:
            cluster["tier"] = "B"
            tier_b.append(cluster)
        else:
            cluster["tier"] = "C"
            tier_c.append(cluster)

    return tier_a, tier_b, tier_c
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
def md_table(rows: List[List[str]], headers: List[str]) -> str:
    """Render *rows* under *headers* as a GitHub-flavored markdown table."""
    def fmt(cells: List[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    lines = [fmt(headers), fmt(["---"] * len(headers))]
    lines.extend(fmt(row) for row in rows)
    return "\n".join(lines)
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def build_report_md(
    total_patterns: int,
    n_clusters_ex_noise: int,
    noise_count: int,
    zipf: ZipfStats,
    top10: List[Dict[str, Any]],
    tier_counts: Dict[str, int],
) -> str:
    """Assemble the markdown analysis report (summary, Zipf, tiers, Top-10).

    Args:
        total_patterns: Number of flattened pattern records.
        n_clusters_ex_noise: Cluster count excluding the -1 noise bucket.
        noise_count: Number of noise/outlier patterns.
        zipf: Rank-size fit statistics.
        top10: Largest clusters (dicts with cluster_id/size/tier/etc.).
        tier_counts: Counts keyed by tier letter ("A"/"B"/"C").

    Returns:
        The full report as a markdown string.
    """
    # NaN share when there are no patterns at all; renders as "nan%".
    noise_share = noise_count / total_patterns if total_patterns > 0 else float("nan")

    top_rows = []
    for c in top10:
        cid = str(c["cluster_id"])
        name = c.get("cluster_name", "")
        sz = int(c.get("size", 0))
        share = (sz / total_patterns) if total_patterns > 0 else float("nan")
        coh = c.get("coherence_centroid_mean", float("nan"))
        tier = c.get("tier", "")
        top_rows.append([
            cid,
            name,
            str(sz),
            f"{share:.3f}",
            # NaN coherence is rendered literally as "nan" rather than "nan0.000".
            (f"{coh:.3f}" if not (isinstance(coh, float) and math.isnan(coh)) else "nan"),
            tier,
        ])

    return f"""# Cluster Analysis Report

## Summary
- Patterns: **{total_patterns}**
- Clusters (excluding noise): **{n_clusters_ex_noise}**
- Noise/outliers (-1): **{noise_count}** (share: **{noise_share:.3%}**)

## Zipf (rank-size)
- alpha (rank-size slope): **{zipf.alpha:.3f}**
- r2 (log-log fit): **{zipf.r2:.3f}**
- topk_share: {json.dumps(zipf.topk_share, ensure_ascii=False)}

## Tiers
- Tier A: **{tier_counts.get("A", 0)}**
- Tier B: **{tier_counts.get("B", 0)}**
- Tier C: **{tier_counts.get("C", 0)}**

## Top-10 Clusters
{md_table(top_rows, ["cluster_id", "cluster_name", "size", "share", "coh", "tier"])}

"""
|
| 518 |
+
# ----------------------------
|
| 519 |
+
# Main
|
| 520 |
+
# ----------------------------
|
| 521 |
+
def main():
|
| 522 |
+
ap = argparse.ArgumentParser()
|
| 523 |
+
ap.add_argument("--input", required=True, help="Input JSONL of papers (paper-level objects).")
|
| 524 |
+
ap.add_argument("--outdir", default="output", help="Output directory.")
|
| 525 |
+
ap.add_argument(
|
| 526 |
+
"--template",
|
| 527 |
+
default="Story: {story}\nBase Problem: {base_problem}\nSolution: {solution_pattern}\nIdea: {idea}",
|
| 528 |
+
help="Text template used for embedding.",
|
| 529 |
+
)
|
| 530 |
+
|
| 531 |
+
# Embedding
|
| 532 |
+
ap.add_argument("--embed_backend", choices=["sbert"], default="sbert")
|
| 533 |
+
ap.add_argument("--sbert_model", default="sentence-transformers/all-MiniLM-L6-v2")
|
| 534 |
+
ap.add_argument("--embed_batch_size", type=int, default=64)
|
| 535 |
+
|
| 536 |
+
# UMAP/HDBSCAN
|
| 537 |
+
ap.add_argument("--umap_neighbors", type=int, default=15)
|
| 538 |
+
ap.add_argument("--umap_components", type=int, default=5)
|
| 539 |
+
ap.add_argument("--umap_min_dist", type=float, default=0.0)
|
| 540 |
+
ap.add_argument("--hdb_min_cluster_size", type=int, default=15)
|
| 541 |
+
ap.add_argument("--hdb_min_samples", type=int, default=5)
|
| 542 |
+
|
| 543 |
+
# Coherence
|
| 544 |
+
ap.add_argument("--pairwise_sample_n", type=int, default=120)
|
| 545 |
+
|
| 546 |
+
# Zipf
|
| 547 |
+
ap.add_argument("--zipf_topk", default="1,3,5,10,20")
|
| 548 |
+
|
| 549 |
+
# LLM naming
|
| 550 |
+
ap.add_argument("--llm_name", action="store_true", help="Use LLM to generate concise cluster_name.")
|
| 551 |
+
ap.add_argument("--llm_model", default="gpt-4.1-mini")
|
| 552 |
+
ap.add_argument("--llm_api_base", default=None)
|
| 553 |
+
ap.add_argument("--llm_temperature", type=float, default=0.2)
|
| 554 |
+
|
| 555 |
+
# Tiering thresholds
|
| 556 |
+
ap.add_argument("--tier_size_A", type=int, default=30)
|
| 557 |
+
ap.add_argument("--tier_size_B", type=int, default=10)
|
| 558 |
+
ap.add_argument("--tier_coh_A", type=float, default=0.40)
|
| 559 |
+
ap.add_argument("--tier_coh_B", type=float, default=0.30)
|
| 560 |
+
|
| 561 |
+
args = ap.parse_args()
|
| 562 |
+
|
| 563 |
+
outdir = args.outdir
|
| 564 |
+
os.makedirs(outdir, exist_ok=True)
|
| 565 |
+
|
| 566 |
+
raw = read_jsonl(args.input)
|
| 567 |
+
patterns = flatten_papers_to_patterns(raw)
|
| 568 |
+
print(f"Patterns: {len(patterns)}")
|
| 569 |
+
|
| 570 |
+
# Save flattened patterns
|
| 571 |
+
flat_path = os.path.join(outdir, "patterns_flat.jsonl")
|
| 572 |
+
write_jsonl(flat_path, patterns)
|
| 573 |
+
|
| 574 |
+
# Build embed texts
|
| 575 |
+
texts = [build_text(p, args.template) for p in patterns]
|
| 576 |
+
|
| 577 |
+
# Embed
|
| 578 |
+
if args.embed_backend == "sbert":
|
| 579 |
+
X = embed_texts_sbert(texts, args.sbert_model, args.embed_batch_size)
|
| 580 |
+
else:
|
| 581 |
+
raise RuntimeError("Unsupported embed_backend")
|
| 582 |
+
|
| 583 |
+
# Ensure normalized (SBERT normalize_embeddings=True already, but keep safe)
|
| 584 |
+
Xn = l2_normalize(X)
|
| 585 |
+
|
| 586 |
+
np.save(os.path.join(outdir, "embeddings.npy"), Xn)
|
| 587 |
+
|
| 588 |
+
# Cluster
|
| 589 |
+
labels, probs = run_umap_hdbscan(
|
| 590 |
+
Xn,
|
| 591 |
+
umap_neighbors=args.umap_neighbors,
|
| 592 |
+
umap_components=args.umap_components,
|
| 593 |
+
umap_min_dist=args.umap_min_dist,
|
| 594 |
+
hdb_min_cluster_size=args.hdb_min_cluster_size,
|
| 595 |
+
hdb_min_samples=args.hdb_min_samples,
|
| 596 |
+
)
|
| 597 |
+
|
| 598 |
+
# Assignments
|
| 599 |
+
assignments = []
|
| 600 |
+
for p, lab, pr in zip(patterns, labels, probs):
|
| 601 |
+
assignments.append({
|
| 602 |
+
"paper_id": p.get("paper_id"),
|
| 603 |
+
"paper_title": p.get("paper_title"),
|
| 604 |
+
"global_pattern_id": p.get("global_pattern_id"),
|
| 605 |
+
"pattern_id": p.get("pattern_id"),
|
| 606 |
+
"domain": p.get("domain"),
|
| 607 |
+
"sub_domains": p.get("sub_domains"),
|
| 608 |
+
"cluster_id": int(lab),
|
| 609 |
+
"cluster_prob": float(pr),
|
| 610 |
+
})
|
| 611 |
+
write_jsonl(os.path.join(outdir, "assignments.jsonl"), assignments)
|
| 612 |
+
|
| 613 |
+
# Build cluster index
|
| 614 |
+
cluster_to_idxs: Dict[int, List[int]] = {}
|
| 615 |
+
for i, lab in enumerate(labels):
|
| 616 |
+
cluster_to_idxs.setdefault(int(lab), []).append(i)
|
| 617 |
+
|
| 618 |
+
# Cluster summaries (excluding noise for counts)
|
| 619 |
+
noise_count = len(cluster_to_idxs.get(-1, []))
|
| 620 |
+
cluster_ids = sorted([cid for cid in cluster_to_idxs.keys() if cid != -1])
|
| 621 |
+
print(f"Clusters (excluding noise): {len(cluster_ids)}")
|
| 622 |
+
print(f"Noise/outliers (-1): {noise_count}")
|
| 623 |
+
|
| 624 |
+
# Compute per-cluster coherence + facets + exemplars
|
| 625 |
+
rng = np.random.default_rng(42)
|
| 626 |
+
cluster_summaries = []
|
| 627 |
+
cluster_library = []
|
| 628 |
+
|
| 629 |
+
for cid in cluster_ids + ([-1] if -1 in cluster_to_idxs else []):
|
| 630 |
+
idxs = np.array(cluster_to_idxs[cid], dtype=int)
|
| 631 |
+
size = int(idxs.size)
|
| 632 |
+
|
| 633 |
+
# Coherence only meaningful for non-noise clusters; for -1 keep NaN
|
| 634 |
+
if cid != -1 and size > 0:
|
| 635 |
+
coh = compute_cluster_coherence(Xn, idxs, pairwise_sample_n=args.pairwise_sample_n, rng=rng)
|
| 636 |
+
else:
|
| 637 |
+
coh = CoherenceStats(float("nan"), float("nan"), float("nan"), float("nan"), float("nan"), float("nan"))
|
| 638 |
+
|
| 639 |
+
# Domain/sub_domain distribution
|
| 640 |
+
doms = [patterns[i].get("domain", "UNKNOWN") for i in idxs]
|
| 641 |
+
subs = []
|
| 642 |
+
for i in idxs:
|
| 643 |
+
sd = patterns[i].get("sub_domains", [])
|
| 644 |
+
if isinstance(sd, list):
|
| 645 |
+
subs.extend(sd)
|
| 646 |
+
elif sd:
|
| 647 |
+
subs.append(str(sd))
|
| 648 |
+
|
| 649 |
+
def top_counts(xs: List[str], k: int = 5) -> List[Tuple[str, int]]:
|
| 650 |
+
from collections import Counter
|
| 651 |
+
c = Counter([x for x in xs if x])
|
| 652 |
+
return c.most_common(k)
|
| 653 |
+
|
| 654 |
+
dom_top = top_counts(doms, 5)
|
| 655 |
+
sub_top = top_counts(subs, 8)
|
| 656 |
+
|
| 657 |
+
# Choose exemplars by highest membership prob (fallback random)
|
| 658 |
+
# For -1, pick random few
|
| 659 |
+
if cid != -1:
|
| 660 |
+
idxs_list = idxs.tolist()
|
| 661 |
+
idxs_list.sort(key=lambda i: probs[i], reverse=True)
|
| 662 |
+
exemplar_idxs = idxs_list[:10]
|
| 663 |
+
else:
|
| 664 |
+
exemplar_idxs = idxs.tolist()[:10]
|
| 665 |
+
|
| 666 |
+
exemplars = []
|
| 667 |
+
for i in exemplar_idxs:
|
| 668 |
+
exemplars.append({
|
| 669 |
+
"paper_id": patterns[i].get("paper_id"),
|
| 670 |
+
"paper_title": patterns[i].get("paper_title"),
|
| 671 |
+
"global_pattern_id": patterns[i].get("global_pattern_id"),
|
| 672 |
+
"domain": patterns[i].get("domain"),
|
| 673 |
+
"sub_domains": patterns[i].get("sub_domains"),
|
| 674 |
+
"idea": patterns[i].get("idea"),
|
| 675 |
+
"base_problem": patterns[i].get("base_problem"),
|
| 676 |
+
"solution_pattern": patterns[i].get("solution_pattern"),
|
| 677 |
+
"story": patterns[i].get("story"),
|
| 678 |
+
"application": patterns[i].get("application"),
|
| 679 |
+
})
|
| 680 |
+
|
| 681 |
+
# LLM name (only for non-noise clusters)
|
| 682 |
+
cluster_name = ""
|
| 683 |
+
if cid != -1 and args.llm_name:
|
| 684 |
+
cluster_name = llm_cluster_name(
|
| 685 |
+
exemplars=exemplars,
|
| 686 |
+
model=args.llm_model,
|
| 687 |
+
api_base=args.llm_api_base,
|
| 688 |
+
temperature=args.llm_temperature,
|
| 689 |
+
)
|
| 690 |
+
else:
|
| 691 |
+
# Placeholder if LLM naming disabled; keep deterministic but minimal
|
| 692 |
+
cluster_name = f"Cluster{cid}"
|
| 693 |
+
|
| 694 |
+
summary = {
|
| 695 |
+
"cluster_id": int(cid),
|
| 696 |
+
"cluster_name": cluster_name,
|
| 697 |
+
"size": size,
|
| 698 |
+
|
| 699 |
+
"coherence_centroid_mean": coh.centroid_mean,
|
| 700 |
+
"coherence_centroid_p25": coh.centroid_p25,
|
| 701 |
+
"coherence_centroid_p50": coh.centroid_p50,
|
| 702 |
+
"coherence_centroid_p75": coh.centroid_p75,
|
| 703 |
+
"coherence_pairwise_sample_mean": coh.pairwise_sample_mean,
|
| 704 |
+
"coherence_pairwise_sample_p50": coh.pairwise_sample_p50,
|
| 705 |
+
|
| 706 |
+
"domain_top": [{"domain": d, "count": n} for d, n in dom_top],
|
| 707 |
+
"sub_domain_top": [{"sub_domain": s, "count": n} for s, n in sub_top],
|
| 708 |
+
}
|
| 709 |
+
cluster_summaries.append(summary)
|
| 710 |
+
|
| 711 |
+
# Cluster library object (RAG-ready, excluding noise)
|
| 712 |
+
if cid != -1:
|
| 713 |
+
cluster_library.append({
|
| 714 |
+
"cluster_id": int(cid),
|
| 715 |
+
"cluster_name": cluster_name,
|
| 716 |
+
"size": size,
|
| 717 |
+
"retrieval_facets": {
|
| 718 |
+
"domain": dom_top[0][0] if dom_top else "待明确领域",
|
| 719 |
+
"sub_domains": [x["sub_domain"] for x in [{"sub_domain": s, "count": n} for s, n in sub_top[:5]]],
|
| 720 |
+
},
|
| 721 |
+
"coherence": {
|
| 722 |
+
"centroid_mean": coh.centroid_mean,
|
| 723 |
+
"centroid_p50": coh.centroid_p50,
|
| 724 |
+
"pairwise_sample_mean": coh.pairwise_sample_mean,
|
| 725 |
+
"pairwise_sample_p50": coh.pairwise_sample_p50,
|
| 726 |
+
},
|
| 727 |
+
"exemplars": exemplars[:6],
|
| 728 |
+
})
|
| 729 |
+
|
| 730 |
+
# Save clusters + library
|
| 731 |
+
write_jsonl(os.path.join(outdir, "clusters.jsonl"), cluster_summaries)
|
| 732 |
+
write_jsonl(os.path.join(outdir, "cluster_library.jsonl"), cluster_library)
|
| 733 |
+
# Also save a size-sorted version of cluster_library (desc by size)
|
| 734 |
+
sorted_cluster_library = sorted(
|
| 735 |
+
cluster_library,
|
| 736 |
+
key=lambda x: (-int(x.get("size", 0)), int(x.get("cluster_id", -1)))
|
| 737 |
+
)
|
| 738 |
+
|
| 739 |
+
write_jsonl(os.path.join(outdir, "cluster_library_sorted.jsonl"), sorted_cluster_library)
|
| 740 |
+
|
| 741 |
+
# Zipf stats (exclude noise)
|
| 742 |
+
sizes_desc = sorted([c["size"] for c in cluster_summaries if c["cluster_id"] != -1], reverse=True)
|
| 743 |
+
topk_list = [int(x.strip()) for x in args.zipf_topk.split(",") if x.strip()]
|
| 744 |
+
zipf = fit_zipf(sizes_desc, topk_list)
|
| 745 |
+
|
| 746 |
+
print("Zipf:")
|
| 747 |
+
print(f" alpha (rank-size slope): {zipf.alpha}")
|
| 748 |
+
print(f" r2 (log-log fit): {zipf.r2}")
|
| 749 |
+
print(f" topk_share: {zipf.topk_share}")
|
| 750 |
+
|
| 751 |
+
# Tiering (exclude noise)
|
| 752 |
+
non_noise_clusters = [c for c in cluster_summaries if c["cluster_id"] != -1]
|
| 753 |
+
# Sort by size desc for reporting
|
| 754 |
+
non_noise_clusters.sort(key=lambda x: x["size"], reverse=True)
|
| 755 |
+
|
| 756 |
+
A, B, C = assign_tiers(
|
| 757 |
+
clusters=non_noise_clusters,
|
| 758 |
+
size_A=args.tier_size_A,
|
| 759 |
+
size_B=args.tier_size_B,
|
| 760 |
+
coh_A=args.tier_coh_A,
|
| 761 |
+
coh_B=args.tier_coh_B,
|
| 762 |
+
coh_field="coherence_centroid_mean",
|
| 763 |
+
)
|
| 764 |
+
|
| 765 |
+
write_jsonl(os.path.join(outdir, "tier_A.jsonl"), A)
|
| 766 |
+
write_jsonl(os.path.join(outdir, "tier_B.jsonl"), B)
|
| 767 |
+
write_jsonl(os.path.join(outdir, "tier_C.jsonl"), C)
|
| 768 |
+
|
| 769 |
+
tier_counts = {"A": len(A), "B": len(B), "C": len(C)}
|
| 770 |
+
|
| 771 |
+
# Top-10 table (by size)
|
| 772 |
+
top10 = non_noise_clusters[:10]
|
| 773 |
+
|
| 774 |
+
report_md = build_report_md(
|
| 775 |
+
total_patterns=len(patterns),
|
| 776 |
+
n_clusters_ex_noise=len(cluster_ids),
|
| 777 |
+
noise_count=noise_count,
|
| 778 |
+
zipf=zipf,
|
| 779 |
+
top10=top10,
|
| 780 |
+
tier_counts=tier_counts,
|
| 781 |
+
)
|
| 782 |
+
write_text(os.path.join(outdir, "report.md"), report_md)
|
| 783 |
+
|
| 784 |
+
print(f"Outputs written to: {outdir}/")
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
if __name__ == "__main__":
|
| 788 |
+
main()
|
Paper-KG-Pipeline/scripts/generate_patterns_old.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import runpy
|
| 4 |
+
|
| 5 |
+
# Compatibility wrapper (scripts/ -> scripts/legacy)
|
| 6 |
+
runpy.run_path(str(Path(__file__).parent / "legacy" / "generate_patterns_old.py"), run_name="__main__")
|
Paper-KG-Pipeline/scripts/idea2story_pipeline.py
ADDED
|
@@ -0,0 +1,687 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Idea2Story Pipeline - 从用户 Idea 到可发表的 Paper Story
|
| 3 |
+
|
| 4 |
+
实现流程:
|
| 5 |
+
Phase 1: Pattern Selection (策略选择)
|
| 6 |
+
Phase 2: Story Generation (结构化生成)
|
| 7 |
+
Phase 3: Multi-Agent Critic & Refine (评审与修正)
|
| 8 |
+
Phase 4: RAG Verification & Pivot (查重与规避)
|
| 9 |
+
|
| 10 |
+
使用方法:
|
| 11 |
+
python scripts/idea2story_pipeline.py "你的Idea描述"
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
import uuid
|
| 19 |
+
from datetime import datetime, timezone
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
# 提前加载 .env(确保 PipelineConfig 读取前生效)
|
| 23 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 24 |
+
PROJECT_ROOT = SCRIPT_DIR.parent
|
| 25 |
+
REPO_ROOT = PROJECT_ROOT.parent
|
| 26 |
+
SRC_DIR = PROJECT_ROOT / "src"
|
| 27 |
+
if str(SRC_DIR) not in sys.path:
|
| 28 |
+
sys.path.insert(0, str(SRC_DIR))
|
| 29 |
+
|
| 30 |
+
try:
|
| 31 |
+
from idea2paper.infra.dotenv import load_dotenv
|
| 32 |
+
_DOTENV_STATUS = load_dotenv(REPO_ROOT / ".env", override=False)
|
| 33 |
+
except Exception as _e:
|
| 34 |
+
_DOTENV_STATUS = {"loaded": 0, "path": str(REPO_ROOT / ".env"), "ok": False, "error": str(_e)}
|
| 35 |
+
|
| 36 |
+
# 导入 Pipeline 模块
|
| 37 |
+
try:
|
| 38 |
+
from pipeline import Idea2StoryPipeline, OUTPUT_DIR
|
| 39 |
+
from pipeline.config import (
|
| 40 |
+
LOG_ROOT,
|
| 41 |
+
ENABLE_RUN_LOGGING,
|
| 42 |
+
LOG_MAX_TEXT_CHARS,
|
| 43 |
+
REPO_ROOT,
|
| 44 |
+
RESULTS_ROOT,
|
| 45 |
+
RESULTS_ENABLE,
|
| 46 |
+
RESULTS_MODE,
|
| 47 |
+
RESULTS_KEEP_LOG,
|
| 48 |
+
NOVELTY_ENABLE,
|
| 49 |
+
NOVELTY_INDEX_DIR,
|
| 50 |
+
NOVELTY_INDEX_BUILD_BATCH_SIZE,
|
| 51 |
+
NOVELTY_INDEX_BUILD_RESUME,
|
| 52 |
+
NOVELTY_INDEX_BUILD_MAX_RETRIES,
|
| 53 |
+
NOVELTY_INDEX_BUILD_SLEEP_SEC,
|
| 54 |
+
NOVELTY_REQUIRE_EMBEDDING,
|
| 55 |
+
INDEX_DIR_MODE,
|
| 56 |
+
EMBEDDING_PROVIDER,
|
| 57 |
+
EMBEDDING_API_URL,
|
| 58 |
+
)
|
| 59 |
+
from pipeline.config import PipelineConfig
|
| 60 |
+
from idea2paper.infra.result_bundler import ResultBundler
|
| 61 |
+
from idea2paper.infra.index_preflight import (
|
| 62 |
+
validate_novelty_index,
|
| 63 |
+
validate_recall_index,
|
| 64 |
+
acquire_lock,
|
| 65 |
+
)
|
| 66 |
+
from idea2paper.infra.subdomain_taxonomy import (
|
| 67 |
+
validate_subdomain_taxonomy,
|
| 68 |
+
build_subdomain_taxonomy,
|
| 69 |
+
resolve_subdomain_taxonomy_paths,
|
| 70 |
+
)
|
| 71 |
+
from idea2paper.infra.embeddings import EMBEDDING_MODEL
|
| 72 |
+
from pipeline.run_logger import RunLogger
|
| 73 |
+
from pipeline.run_context import set_logger, reset_logger
|
| 74 |
+
from tools.build_novelty_index import build_novelty_index
|
| 75 |
+
from tools.build_recall_index import build_recall_index
|
| 76 |
+
from idea2paper.application.idea_packaging import IdeaPackager
|
| 77 |
+
except ImportError:
|
| 78 |
+
# 如果直接运行脚本,尝试添加当前目录到 path
|
| 79 |
+
import os
|
| 80 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
| 81 |
+
from pipeline import Idea2StoryPipeline, OUTPUT_DIR
|
| 82 |
+
from pipeline.config import (
|
| 83 |
+
LOG_ROOT,
|
| 84 |
+
ENABLE_RUN_LOGGING,
|
| 85 |
+
LOG_MAX_TEXT_CHARS,
|
| 86 |
+
REPO_ROOT,
|
| 87 |
+
RESULTS_ROOT,
|
| 88 |
+
RESULTS_ENABLE,
|
| 89 |
+
RESULTS_MODE,
|
| 90 |
+
RESULTS_KEEP_LOG,
|
| 91 |
+
NOVELTY_ENABLE,
|
| 92 |
+
NOVELTY_INDEX_DIR,
|
| 93 |
+
NOVELTY_INDEX_BUILD_BATCH_SIZE,
|
| 94 |
+
NOVELTY_INDEX_BUILD_RESUME,
|
| 95 |
+
NOVELTY_INDEX_BUILD_MAX_RETRIES,
|
| 96 |
+
NOVELTY_INDEX_BUILD_SLEEP_SEC,
|
| 97 |
+
NOVELTY_REQUIRE_EMBEDDING,
|
| 98 |
+
INDEX_DIR_MODE,
|
| 99 |
+
EMBEDDING_PROVIDER,
|
| 100 |
+
EMBEDDING_API_URL,
|
| 101 |
+
)
|
| 102 |
+
from pipeline.config import PipelineConfig
|
| 103 |
+
from idea2paper.infra.result_bundler import ResultBundler
|
| 104 |
+
from idea2paper.infra.index_preflight import (
|
| 105 |
+
validate_novelty_index,
|
| 106 |
+
validate_recall_index,
|
| 107 |
+
acquire_lock,
|
| 108 |
+
)
|
| 109 |
+
from idea2paper.infra.subdomain_taxonomy import (
|
| 110 |
+
validate_subdomain_taxonomy,
|
| 111 |
+
build_subdomain_taxonomy,
|
| 112 |
+
resolve_subdomain_taxonomy_paths,
|
| 113 |
+
)
|
| 114 |
+
from idea2paper.infra.embeddings import EMBEDDING_MODEL
|
| 115 |
+
from pipeline.run_logger import RunLogger
|
| 116 |
+
from pipeline.run_context import set_logger, reset_logger
|
| 117 |
+
from tools.build_novelty_index import build_novelty_index
|
| 118 |
+
from tools.build_recall_index import build_recall_index
|
| 119 |
+
from idea2paper.application.idea_packaging import IdeaPackager
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _log_event(logger, event_type: str, payload: dict):
|
| 123 |
+
if logger:
|
| 124 |
+
logger.log_event(event_type, payload)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _recall_focus_score(recall_audit: dict | None) -> float:
|
| 128 |
+
if not recall_audit:
|
| 129 |
+
return 0.0
|
| 130 |
+
path2 = recall_audit.get("path2", {}) or {}
|
| 131 |
+
candidate_stats = path2.get("candidate_stats", []) or []
|
| 132 |
+
ratios = []
|
| 133 |
+
for stat in candidate_stats:
|
| 134 |
+
if not stat:
|
| 135 |
+
continue
|
| 136 |
+
before = int(stat.get("candidates_before", 0) or 0)
|
| 137 |
+
after = int(stat.get("candidates_after", 0) or 0)
|
| 138 |
+
if before > 0:
|
| 139 |
+
ratios.append((before - after) / float(before))
|
| 140 |
+
if not ratios:
|
| 141 |
+
return 0.0
|
| 142 |
+
return sum(ratios) / len(ratios)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _truncate_text(text: str, max_len: int = 800) -> str:
|
| 146 |
+
if not isinstance(text, str):
|
| 147 |
+
return text
|
| 148 |
+
return text if len(text) <= max_len else text[:max_len]
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _shrink_brief(brief: dict | None, max_len: int = 600) -> dict | None:
|
| 152 |
+
if not isinstance(brief, dict):
|
| 153 |
+
return None
|
| 154 |
+
out = {}
|
| 155 |
+
for k, v in brief.items():
|
| 156 |
+
if isinstance(v, str):
|
| 157 |
+
out[k] = _truncate_text(v, max_len)
|
| 158 |
+
elif isinstance(v, list):
|
| 159 |
+
trimmed = []
|
| 160 |
+
for item in v[:5]:
|
| 161 |
+
if isinstance(item, str):
|
| 162 |
+
trimmed.append(_truncate_text(item, max_len))
|
| 163 |
+
else:
|
| 164 |
+
trimmed.append(item)
|
| 165 |
+
out[k] = trimmed
|
| 166 |
+
elif isinstance(v, dict):
|
| 167 |
+
sub = {}
|
| 168 |
+
for sk, sv in v.items():
|
| 169 |
+
if isinstance(sv, str):
|
| 170 |
+
sub[sk] = _truncate_text(sv, max_len)
|
| 171 |
+
elif isinstance(sv, list):
|
| 172 |
+
sub[sk] = [(_truncate_text(x, max_len) if isinstance(x, str) else x) for x in sv[:5]]
|
| 173 |
+
else:
|
| 174 |
+
sub[sk] = sv
|
| 175 |
+
out[k] = sub
|
| 176 |
+
else:
|
| 177 |
+
out[k] = v
|
| 178 |
+
return out
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def ensure_required_indexes(logger=None):
|
| 182 |
+
if not PipelineConfig.INDEX_AUTO_PREPARE:
|
| 183 |
+
return
|
| 184 |
+
|
| 185 |
+
_log_event(logger, "index_preflight_start", {
|
| 186 |
+
"novelty_enable": NOVELTY_ENABLE,
|
| 187 |
+
"recall_use_offline_index": PipelineConfig.RECALL_USE_OFFLINE_INDEX,
|
| 188 |
+
"allow_build": PipelineConfig.INDEX_ALLOW_BUILD,
|
| 189 |
+
"index_dir_mode": INDEX_DIR_MODE,
|
| 190 |
+
"novelty_index_dir": str(NOVELTY_INDEX_DIR),
|
| 191 |
+
"recall_index_dir": str(PipelineConfig.RECALL_INDEX_DIR),
|
| 192 |
+
"embedding_provider": EMBEDDING_PROVIDER,
|
| 193 |
+
"embedding_api_url": EMBEDDING_API_URL,
|
| 194 |
+
"embedding_model": EMBEDDING_MODEL,
|
| 195 |
+
})
|
| 196 |
+
|
| 197 |
+
# Novelty index preflight
|
| 198 |
+
if NOVELTY_ENABLE:
|
| 199 |
+
nodes_paper_path = OUTPUT_DIR / "nodes_paper.json"
|
| 200 |
+
status = validate_novelty_index(NOVELTY_INDEX_DIR, nodes_paper_path, EMBEDDING_MODEL)
|
| 201 |
+
if status.get("ok"):
|
| 202 |
+
_log_event(logger, "index_preflight_ok", {"index": "novelty", "status": status})
|
| 203 |
+
else:
|
| 204 |
+
_log_event(logger, "index_preflight_failed", {"index": "novelty", "status": status})
|
| 205 |
+
if PipelineConfig.INDEX_ALLOW_BUILD:
|
| 206 |
+
lock_path = NOVELTY_INDEX_DIR / ".build.lock"
|
| 207 |
+
_log_event(logger, "index_preflight_build_start", {
|
| 208 |
+
"index": "novelty",
|
| 209 |
+
"index_dir": str(NOVELTY_INDEX_DIR),
|
| 210 |
+
})
|
| 211 |
+
with acquire_lock(lock_path):
|
| 212 |
+
build_novelty_index(
|
| 213 |
+
index_dir=NOVELTY_INDEX_DIR,
|
| 214 |
+
batch_size=NOVELTY_INDEX_BUILD_BATCH_SIZE,
|
| 215 |
+
resume=NOVELTY_INDEX_BUILD_RESUME,
|
| 216 |
+
max_retries=NOVELTY_INDEX_BUILD_MAX_RETRIES,
|
| 217 |
+
sleep_sec=NOVELTY_INDEX_BUILD_SLEEP_SEC,
|
| 218 |
+
force_rebuild=False,
|
| 219 |
+
logger=logger,
|
| 220 |
+
)
|
| 221 |
+
status = validate_novelty_index(NOVELTY_INDEX_DIR, nodes_paper_path, EMBEDDING_MODEL)
|
| 222 |
+
_log_event(logger, "index_preflight_build_done", {"index": "novelty", "status": status})
|
| 223 |
+
if not status.get("ok") and NOVELTY_REQUIRE_EMBEDDING:
|
| 224 |
+
raise RuntimeError("Novelty index build failed or incomplete. Please run build_novelty_index.py manually.")
|
| 225 |
+
else:
|
| 226 |
+
if NOVELTY_REQUIRE_EMBEDDING:
|
| 227 |
+
raise RuntimeError(
|
| 228 |
+
"Novelty index missing or mismatched. Please run: "
|
| 229 |
+
"python Paper-KG-Pipeline/scripts/tools/build_novelty_index.py --resume"
|
| 230 |
+
)
|
| 231 |
+
print("⚠️ Novelty index missing/mismatch. Continuing because require_embedding=false.")
|
| 232 |
+
|
| 233 |
+
# Recall offline index (only if enabled)
|
| 234 |
+
if PipelineConfig.RECALL_USE_OFFLINE_INDEX:
|
| 235 |
+
nodes_paper_path = OUTPUT_DIR / "nodes_paper.json"
|
| 236 |
+
nodes_idea_path = OUTPUT_DIR / "nodes_idea.json"
|
| 237 |
+
status = validate_recall_index(PipelineConfig.RECALL_INDEX_DIR, nodes_paper_path, nodes_idea_path, EMBEDDING_MODEL)
|
| 238 |
+
if status.get("ok"):
|
| 239 |
+
_log_event(logger, "index_preflight_ok", {"index": "recall", "status": status})
|
| 240 |
+
else:
|
| 241 |
+
_log_event(logger, "index_preflight_failed", {"index": "recall", "status": status})
|
| 242 |
+
if PipelineConfig.INDEX_ALLOW_BUILD:
|
| 243 |
+
lock_path = Path(PipelineConfig.RECALL_INDEX_DIR) / ".build.lock"
|
| 244 |
+
_log_event(logger, "index_preflight_build_start", {
|
| 245 |
+
"index": "recall",
|
| 246 |
+
"index_dir": str(PipelineConfig.RECALL_INDEX_DIR),
|
| 247 |
+
})
|
| 248 |
+
with acquire_lock(lock_path):
|
| 249 |
+
build_recall_index(
|
| 250 |
+
index_dir=PipelineConfig.RECALL_INDEX_DIR,
|
| 251 |
+
batch_size=PipelineConfig.RECALL_EMBED_BATCH_SIZE,
|
| 252 |
+
resume=True,
|
| 253 |
+
max_retries=PipelineConfig.RECALL_EMBED_MAX_RETRIES,
|
| 254 |
+
sleep_sec=PipelineConfig.RECALL_EMBED_SLEEP_SEC,
|
| 255 |
+
force_rebuild=False,
|
| 256 |
+
logger=logger,
|
| 257 |
+
)
|
| 258 |
+
status = validate_recall_index(PipelineConfig.RECALL_INDEX_DIR, nodes_paper_path, nodes_idea_path, EMBEDDING_MODEL)
|
| 259 |
+
_log_event(logger, "index_preflight_build_done", {"index": "recall", "status": status})
|
| 260 |
+
else:
|
| 261 |
+
print("⚠️ Recall offline index missing/mismatch. Continuing with online batch fallback.")
|
| 262 |
+
|
| 263 |
+
# Subdomain taxonomy preflight (optional)
|
| 264 |
+
if PipelineConfig.SUBDOMAIN_TAXONOMY_ENABLE:
|
| 265 |
+
tax_path, patterns_path = resolve_subdomain_taxonomy_paths()
|
| 266 |
+
_log_event(logger, "subdomain_taxonomy_preflight_start", {
|
| 267 |
+
"taxonomy_path": str(tax_path),
|
| 268 |
+
"patterns_path": str(patterns_path),
|
| 269 |
+
"embedding_model": EMBEDDING_MODEL,
|
| 270 |
+
"embedding_api_url": EMBEDDING_API_URL,
|
| 271 |
+
})
|
| 272 |
+
if not patterns_path.exists():
|
| 273 |
+
_log_event(logger, "subdomain_taxonomy_missing_patterns", {
|
| 274 |
+
"patterns_path": str(patterns_path),
|
| 275 |
+
})
|
| 276 |
+
return
|
| 277 |
+
status = validate_subdomain_taxonomy(tax_path, patterns_path)
|
| 278 |
+
if status.get("ok"):
|
| 279 |
+
_log_event(logger, "subdomain_taxonomy_preflight_ok", {"status": status})
|
| 280 |
+
else:
|
| 281 |
+
_log_event(logger, "subdomain_taxonomy_preflight_failed", {"status": status})
|
| 282 |
+
if PipelineConfig.INDEX_ALLOW_BUILD:
|
| 283 |
+
lock_path = tax_path.parent / ".subdomain_taxonomy.build.lock"
|
| 284 |
+
_log_event(logger, "subdomain_taxonomy_build_start", {"taxonomy_path": str(tax_path)})
|
| 285 |
+
with acquire_lock(lock_path):
|
| 286 |
+
build_subdomain_taxonomy(
|
| 287 |
+
patterns_path=patterns_path,
|
| 288 |
+
papers_path=OUTPUT_DIR / "nodes_paper.json",
|
| 289 |
+
output_path=tax_path,
|
| 290 |
+
embed_batch_size=PipelineConfig.RECALL_EMBED_BATCH_SIZE,
|
| 291 |
+
embed_max_retries=PipelineConfig.RECALL_EMBED_MAX_RETRIES,
|
| 292 |
+
embed_sleep_sec=PipelineConfig.RECALL_EMBED_SLEEP_SEC,
|
| 293 |
+
embed_timeout=120,
|
| 294 |
+
logger=logger,
|
| 295 |
+
)
|
| 296 |
+
status = validate_subdomain_taxonomy(tax_path, patterns_path)
|
| 297 |
+
_log_event(logger, "subdomain_taxonomy_build_done", {"status": status})
|
| 298 |
+
if not status.get("ok"):
|
| 299 |
+
_log_event(logger, "subdomain_taxonomy_unavailable", {"status": status})
|
| 300 |
+
|
| 301 |
+
# ===================== 主函数 =====================
|
| 302 |
+
def main():
|
| 303 |
+
"""主函数"""
|
| 304 |
+
# Fix encoding for Windows consoles
|
| 305 |
+
if sys.stdout.encoding != 'utf-8':
|
| 306 |
+
try:
|
| 307 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
| 308 |
+
except AttributeError:
|
| 309 |
+
pass # Python < 3.7 or weird environment
|
| 310 |
+
|
| 311 |
+
# 获取用户输入
|
| 312 |
+
if len(sys.argv) > 1:
|
| 313 |
+
user_idea = " ".join(sys.argv[1:])
|
| 314 |
+
else:
|
| 315 |
+
user_idea = "LLM-Assisted Domain Data Extraction and Cleaning"
|
| 316 |
+
|
| 317 |
+
# 加载召回结果(调用 simple_recall_demo 的结果)
|
| 318 |
+
print("📂 加载数据...")
|
| 319 |
+
|
| 320 |
+
logger = None
|
| 321 |
+
token = None
|
| 322 |
+
start_time = time.time()
|
| 323 |
+
start_dt = datetime.now(timezone.utc)
|
| 324 |
+
run_id = f"run_{start_dt.strftime('%Y%m%d_%H%M%S')}_{os.getpid()}_{uuid.uuid4().hex[:6]}"
|
| 325 |
+
success = False
|
| 326 |
+
|
| 327 |
+
try:
|
| 328 |
+
if ENABLE_RUN_LOGGING:
|
| 329 |
+
logger = RunLogger(
|
| 330 |
+
base_dir=LOG_ROOT,
|
| 331 |
+
run_id=run_id,
|
| 332 |
+
meta={
|
| 333 |
+
"user_idea": user_idea,
|
| 334 |
+
"argv": sys.argv,
|
| 335 |
+
"entrypoint": __file__,
|
| 336 |
+
},
|
| 337 |
+
max_text_chars=LOG_MAX_TEXT_CHARS
|
| 338 |
+
)
|
| 339 |
+
token = set_logger(logger)
|
| 340 |
+
logger.log_event("run_start", {"user_idea": user_idea})
|
| 341 |
+
if _DOTENV_STATUS:
|
| 342 |
+
logger.log_event("dotenv_loaded", _DOTENV_STATUS)
|
| 343 |
+
# Startup preflight (fail-fast): check LLM/Embedding connectivity + embedding dim consistency (if local index exists)
|
| 344 |
+
from idea2paper.infra.startup_preflight import run_startup_preflight
|
| 345 |
+
pre = run_startup_preflight()
|
| 346 |
+
if not pre.ok:
|
| 347 |
+
print("\n❌ 启动前自检失败,已终止运行。")
|
| 348 |
+
print(f" - LLM endpoint: {pre.llm_endpoint}")
|
| 349 |
+
print(f" - Embedding endpoint: {pre.embedding_endpoint}")
|
| 350 |
+
if pre.embedding_dim is not None:
|
| 351 |
+
print(f" - Online embedding_dim: {pre.embedding_dim}")
|
| 352 |
+
print(f" - Error: {pre.error}\n")
|
| 353 |
+
raise RuntimeError(pre.error)
|
| 354 |
+
# Preflight & auto-prepare required indexes (quality-first)
|
| 355 |
+
ensure_required_indexes(logger)
|
| 356 |
+
# 加载节点数据
|
| 357 |
+
with open(OUTPUT_DIR / "nodes_pattern.json", 'r', encoding='utf-8') as f:
|
| 358 |
+
patterns = json.load(f)
|
| 359 |
+
with open(OUTPUT_DIR / "nodes_paper.json", 'r', encoding='utf-8') as f:
|
| 360 |
+
papers = json.load(f)
|
| 361 |
+
|
| 362 |
+
print(f" ✓ 加载 {len(patterns)} 个 Pattern")
|
| 363 |
+
print(f" ✓ 加载 {len(papers)} 个 Paper")
|
| 364 |
+
papers_by_id = {p.get("paper_id"): p for p in papers if p.get("paper_id")}
|
| 365 |
+
|
| 366 |
+
# 运行召回(复用 simple_recall_demo 的逻辑)
|
| 367 |
+
# 注意:这里为了复用逻辑,直接导入了 simple_recall_demo
|
| 368 |
+
# 在生产环境中,建议将召回逻辑封装为独立的类
|
| 369 |
+
|
| 370 |
+
# 临时保存原始 argv
|
| 371 |
+
original_argv = sys.argv.copy()
|
| 372 |
+
sys.argv = ['simple_recall_demo.py', user_idea]
|
| 373 |
+
|
| 374 |
+
# 运行召回(使用 RecallSystem 类,支持两阶段优化)
|
| 375 |
+
print("\n🔍 运行召回系统...")
|
| 376 |
+
print("-" * 80)
|
| 377 |
+
|
| 378 |
+
# 【优化】直接使用 RecallSystem 类(支持两阶段召回,大幅提速)
|
| 379 |
+
from recall_system import RecallSystem
|
| 380 |
+
|
| 381 |
+
print(" 初始化召回系统...")
|
| 382 |
+
recall_system = RecallSystem()
|
| 383 |
+
|
| 384 |
+
print("\n 执行三路召回(优化版,支持两阶段加速)...")
|
| 385 |
+
raw_user_idea = user_idea
|
| 386 |
+
idea_brief_best = None
|
| 387 |
+
retrieval_query_best = raw_user_idea
|
| 388 |
+
idea_packaging_meta = None
|
| 389 |
+
|
| 390 |
+
if PipelineConfig.IDEA_PACKAGING_ENABLE:
|
| 391 |
+
try:
|
| 392 |
+
packager = IdeaPackager(logger=logger)
|
| 393 |
+
brief_a, query_a = packager.parse_raw_idea(raw_user_idea)
|
| 394 |
+
if not query_a:
|
| 395 |
+
query_a = raw_user_idea
|
| 396 |
+
|
| 397 |
+
first_recall = recall_system.recall(query_a, verbose=False)
|
| 398 |
+
topn = max(1, int(PipelineConfig.IDEA_PACKAGING_TOPN_PATTERNS))
|
| 399 |
+
candidate_k = max(1, int(PipelineConfig.IDEA_PACKAGING_CANDIDATE_K))
|
| 400 |
+
top_patterns = first_recall[:topn]
|
| 401 |
+
|
| 402 |
+
candidates = []
|
| 403 |
+
judge_candidates = []
|
| 404 |
+
for pattern_id, pattern_info, score in top_patterns[:candidate_k]:
|
| 405 |
+
evidence = packager.build_pattern_evidence(
|
| 406 |
+
pattern_id,
|
| 407 |
+
pattern_info,
|
| 408 |
+
papers_by_id,
|
| 409 |
+
max_exemplar_papers=PipelineConfig.IDEA_PACKAGING_MAX_EXEMPLAR_PAPERS,
|
| 410 |
+
)
|
| 411 |
+
brief_c, query_c = packager.package_with_pattern(raw_user_idea, brief_a, evidence)
|
| 412 |
+
candidates.append({
|
| 413 |
+
"pattern_id": pattern_id,
|
| 414 |
+
"pattern_name": pattern_info.get("name", ""),
|
| 415 |
+
"score": float(score),
|
| 416 |
+
"brief": brief_c,
|
| 417 |
+
"query": query_c,
|
| 418 |
+
})
|
| 419 |
+
judge_candidates.append({
|
| 420 |
+
"pattern_id": pattern_id,
|
| 421 |
+
"pattern_name": pattern_info.get("name", ""),
|
| 422 |
+
"brief": brief_c,
|
| 423 |
+
})
|
| 424 |
+
|
| 425 |
+
best_idx, judge_info = packager.judge_best_candidate(raw_user_idea, judge_candidates)
|
| 426 |
+
chosen_idx = best_idx if candidates else 0
|
| 427 |
+
|
| 428 |
+
select_mode = (PipelineConfig.IDEA_PACKAGING_SELECT_MODE or "llm_then_recall").lower()
|
| 429 |
+
recall_scores = {}
|
| 430 |
+
if select_mode in ("llm_then_recall", "recall_only") and candidates:
|
| 431 |
+
for idx, cand in enumerate(candidates):
|
| 432 |
+
query = cand.get("query") or raw_user_idea
|
| 433 |
+
_ = recall_system.recall(query, verbose=False)
|
| 434 |
+
audit = getattr(recall_system, "last_audit", None)
|
| 435 |
+
recall_scores[idx] = _recall_focus_score(audit)
|
| 436 |
+
recall_best_idx = max(recall_scores, key=recall_scores.get) if recall_scores else chosen_idx
|
| 437 |
+
if select_mode == "recall_only":
|
| 438 |
+
chosen_idx = recall_best_idx
|
| 439 |
+
else:
|
| 440 |
+
if recall_scores.get(recall_best_idx, 0.0) > recall_scores.get(chosen_idx, 0.0) + 0.05:
|
| 441 |
+
chosen_idx = recall_best_idx
|
| 442 |
+
|
| 443 |
+
chosen = candidates[chosen_idx] if candidates else None
|
| 444 |
+
if chosen:
|
| 445 |
+
idea_brief_best = chosen.get("brief")
|
| 446 |
+
retrieval_query_best = chosen.get("query") or raw_user_idea
|
| 447 |
+
else:
|
| 448 |
+
idea_brief_best = brief_a
|
| 449 |
+
retrieval_query_best = query_a
|
| 450 |
+
|
| 451 |
+
idea_packaging_meta = {
|
| 452 |
+
"raw_idea": raw_user_idea,
|
| 453 |
+
"brief_a": brief_a,
|
| 454 |
+
"query_a": query_a,
|
| 455 |
+
"candidates": candidates,
|
| 456 |
+
"judge": judge_info,
|
| 457 |
+
"recall_scores": recall_scores,
|
| 458 |
+
"chosen_index": chosen_idx,
|
| 459 |
+
"query_best": retrieval_query_best,
|
| 460 |
+
"brief_best": idea_brief_best,
|
| 461 |
+
}
|
| 462 |
+
if logger:
|
| 463 |
+
logger.log_event("idea_packaging", {
|
| 464 |
+
"enabled": True,
|
| 465 |
+
"topn_patterns": topn,
|
| 466 |
+
"candidate_k": candidate_k,
|
| 467 |
+
"select_mode": select_mode,
|
| 468 |
+
"raw_idea": _truncate_text(raw_user_idea, 800),
|
| 469 |
+
"query_best": _truncate_text(retrieval_query_best, 800),
|
| 470 |
+
"brief_best": _shrink_brief(idea_brief_best, 600),
|
| 471 |
+
"candidates": [
|
| 472 |
+
{
|
| 473 |
+
"pattern_id": c.get("pattern_id"),
|
| 474 |
+
"pattern_name": c.get("pattern_name"),
|
| 475 |
+
"query": _truncate_text(c.get("query", ""), 300),
|
| 476 |
+
} for c in candidates
|
| 477 |
+
],
|
| 478 |
+
"judge": judge_info,
|
| 479 |
+
"recall_scores": recall_scores,
|
| 480 |
+
"chosen_index": chosen_idx,
|
| 481 |
+
})
|
| 482 |
+
except Exception as e:
|
| 483 |
+
if logger:
|
| 484 |
+
logger.log_event("idea_packaging_failed", {"error": str(e)})
|
| 485 |
+
idea_brief_best = None
|
| 486 |
+
retrieval_query_best = raw_user_idea
|
| 487 |
+
|
| 488 |
+
recall_results = recall_system.recall(retrieval_query_best, verbose=True)
|
| 489 |
+
recall_audit = getattr(recall_system, "last_audit", None)
|
| 490 |
+
|
| 491 |
+
# 如果召回为空:说明当前 idea 无法匹配到可用的领域/Pattern 数据,直接提示用户并停止程序
|
| 492 |
+
if not recall_results:
|
| 493 |
+
print("\n" + "=" * 80)
|
| 494 |
+
print("❌ 召回为空:未能从知识图谱中召回到任何可用 Pattern / 领域数据")
|
| 495 |
+
print("=" * 80)
|
| 496 |
+
print("可能原因:")
|
| 497 |
+
print("- 输入的 idea 过于抽象/过于口语化/缺少领域关键词")
|
| 498 |
+
print("- idea 与当前内置数据集(ICLR 图谱)覆盖范围差异较大")
|
| 499 |
+
print("- 语言不匹配(建议尽量用英文关键词描述检索意图)")
|
| 500 |
+
print("\n建议你:")
|
| 501 |
+
print("- 换一个更具体的 idea(包含方法/任务/数据/约束等关键词)")
|
| 502 |
+
print("- 或用英文重写 idea(加入核心术语,如 retrieval / diffusion / transformer / graph 等)")
|
| 503 |
+
print()
|
| 504 |
+
|
| 505 |
+
if logger:
|
| 506 |
+
logger.log_event("recall_empty", {
|
| 507 |
+
"raw_user_idea": _truncate_text(raw_user_idea, 800),
|
| 508 |
+
"retrieval_query_best": _truncate_text(retrieval_query_best, 800),
|
| 509 |
+
})
|
| 510 |
+
|
| 511 |
+
raise SystemExit(2)
|
| 512 |
+
|
| 513 |
+
# 【关键修复】加载完整的 patterns_structured.json 以合并数据
|
| 514 |
+
patterns_structured_file = OUTPUT_DIR / "patterns_structured.json"
|
| 515 |
+
if patterns_structured_file.exists():
|
| 516 |
+
with open(patterns_structured_file, 'r', encoding='utf-8') as f:
|
| 517 |
+
patterns_structured = json.load(f)
|
| 518 |
+
|
| 519 |
+
# 构建 pattern_id -> structured_data 的映射
|
| 520 |
+
structured_map = {}
|
| 521 |
+
for p in patterns_structured:
|
| 522 |
+
pattern_id = f"pattern_{p.get('pattern_id')}"
|
| 523 |
+
structured_map[pattern_id] = p
|
| 524 |
+
|
| 525 |
+
# 合并 skeleton_examples 和 common_tricks 到召回结果
|
| 526 |
+
merged_results = []
|
| 527 |
+
for pattern_id, pattern_info, score in recall_results:
|
| 528 |
+
merged_pattern = dict(pattern_info)
|
| 529 |
+
if pattern_id in structured_map:
|
| 530 |
+
merged_pattern['skeleton_examples'] = structured_map[pattern_id].get('skeleton_examples', [])
|
| 531 |
+
merged_pattern['common_tricks'] = structured_map[pattern_id].get('common_tricks', [])
|
| 532 |
+
merged_results.append((pattern_id, merged_pattern, score))
|
| 533 |
+
|
| 534 |
+
recalled_patterns = merged_results
|
| 535 |
+
else:
|
| 536 |
+
# 如果没有 patterns_structured.json,直接使用召回结果
|
| 537 |
+
recalled_patterns = recall_results
|
| 538 |
+
|
| 539 |
+
# 加载 papers 数据 (Pipeline 需要用于 RAG 查重)
|
| 540 |
+
print("\n 加载 Papers 数据用于查重...")
|
| 541 |
+
with open(OUTPUT_DIR / "nodes_paper.json", 'r', encoding='utf-8') as f:
|
| 542 |
+
papers = json.load(f)
|
| 543 |
+
|
| 544 |
+
# 恢复 argv
|
| 545 |
+
sys.argv = original_argv
|
| 546 |
+
|
| 547 |
+
print("-" * 80)
|
| 548 |
+
print(f"✅ 召回完成: Top-{len(recalled_patterns)} Patterns\n")
|
| 549 |
+
|
| 550 |
+
# 运行 Pipeline(传递 user_idea 用于 Pattern 智能分类)
|
| 551 |
+
pipeline = Idea2StoryPipeline(
|
| 552 |
+
raw_user_idea,
|
| 553 |
+
recalled_patterns,
|
| 554 |
+
papers,
|
| 555 |
+
run_id=run_id,
|
| 556 |
+
idea_brief=idea_brief_best,
|
| 557 |
+
)
|
| 558 |
+
result = pipeline.run()
|
| 559 |
+
if recall_audit is not None:
|
| 560 |
+
result["recall_audit"] = recall_audit
|
| 561 |
+
if logger and PipelineConfig.RECALL_AUDIT_IN_EVENTS:
|
| 562 |
+
logger.log_event("recall_audit", recall_audit)
|
| 563 |
+
if idea_packaging_meta:
|
| 564 |
+
result["idea_packaging"] = idea_packaging_meta
|
| 565 |
+
success = True
|
| 566 |
+
|
| 567 |
+
# 保存结果
|
| 568 |
+
output_file = OUTPUT_DIR / "final_story.json"
|
| 569 |
+
with open(output_file, 'w', encoding='utf-8') as f:
|
| 570 |
+
json.dump(result['final_story'], f, ensure_ascii=False, indent=2)
|
| 571 |
+
|
| 572 |
+
print(f"\n💾 最终 Story 已保存到: {output_file}")
|
| 573 |
+
|
| 574 |
+
# 保存完整结果
|
| 575 |
+
full_result_file = OUTPUT_DIR / "pipeline_result.json"
|
| 576 |
+
results_dir = str(RESULTS_ROOT / run_id) if RESULTS_ENABLE else None
|
| 577 |
+
with open(full_result_file, 'w', encoding='utf-8') as f:
|
| 578 |
+
json.dump({
|
| 579 |
+
'user_idea': user_idea,
|
| 580 |
+
'success': result['success'],
|
| 581 |
+
'iterations': result['iterations'],
|
| 582 |
+
'selected_patterns': result['selected_patterns'],
|
| 583 |
+
'final_story': result['final_story'],
|
| 584 |
+
'review_history': result['review_history'],
|
| 585 |
+
'results_dir': results_dir,
|
| 586 |
+
'novelty_report': result.get('novelty_report'),
|
| 587 |
+
'recall_audit': result.get('recall_audit'),
|
| 588 |
+
'review_summary': {
|
| 589 |
+
'total_reviews': len(result['review_history']),
|
| 590 |
+
'final_score': result['review_history'][-1]['avg_score'] if result['review_history'] else 0
|
| 591 |
+
},
|
| 592 |
+
'refinement_summary': {
|
| 593 |
+
'total_refinements': len(result['refinement_history']),
|
| 594 |
+
'issues_addressed': [r['issue'] for r in result['refinement_history']]
|
| 595 |
+
},
|
| 596 |
+
'verification_summary': {
|
| 597 |
+
'collision_detected': result['verification_result']['collision_detected'],
|
| 598 |
+
'max_similarity': result['verification_result']['max_similarity']
|
| 599 |
+
},
|
| 600 |
+
'idea_packaging': result.get('idea_packaging')
|
| 601 |
+
}, f, ensure_ascii=False, indent=2)
|
| 602 |
+
|
| 603 |
+
print(f"💾 完整结果已保存到: {full_result_file}")
|
| 604 |
+
|
| 605 |
+
# 聚合产物到 repo 根 results/
|
| 606 |
+
if RESULTS_ENABLE:
|
| 607 |
+
try:
|
| 608 |
+
bundler = ResultBundler(
|
| 609 |
+
repo_root=REPO_ROOT,
|
| 610 |
+
results_root=RESULTS_ROOT,
|
| 611 |
+
mode=RESULTS_MODE,
|
| 612 |
+
keep_log=RESULTS_KEEP_LOG,
|
| 613 |
+
)
|
| 614 |
+
run_log_dir = (LOG_ROOT / run_id) if ENABLE_RUN_LOGGING else None
|
| 615 |
+
novelty_report_path = None
|
| 616 |
+
if isinstance(result.get("novelty_report"), dict):
|
| 617 |
+
novelty_report_path = result["novelty_report"].get("report_path")
|
| 618 |
+
bundle_status = bundler.bundle(
|
| 619 |
+
run_id=run_id,
|
| 620 |
+
user_idea=user_idea,
|
| 621 |
+
success=success,
|
| 622 |
+
output_dir=OUTPUT_DIR,
|
| 623 |
+
run_log_dir=run_log_dir,
|
| 624 |
+
extra={
|
| 625 |
+
"config_snapshot": {
|
| 626 |
+
"results": {
|
| 627 |
+
"enable": RESULTS_ENABLE,
|
| 628 |
+
"dir": str(RESULTS_ROOT),
|
| 629 |
+
"mode": RESULTS_MODE,
|
| 630 |
+
"keep_log": RESULTS_KEEP_LOG,
|
| 631 |
+
},
|
| 632 |
+
"logging": {
|
| 633 |
+
"enable": ENABLE_RUN_LOGGING,
|
| 634 |
+
"dir": str(LOG_ROOT),
|
| 635 |
+
"max_text_chars": LOG_MAX_TEXT_CHARS,
|
| 636 |
+
},
|
| 637 |
+
"critic": {
|
| 638 |
+
"strict_json": PipelineConfig.CRITIC_STRICT_JSON,
|
| 639 |
+
"json_retries": PipelineConfig.CRITIC_JSON_RETRIES,
|
| 640 |
+
},
|
| 641 |
+
"pass": {
|
| 642 |
+
"mode": PipelineConfig.PASS_MODE,
|
| 643 |
+
"min_pattern_papers": PipelineConfig.PASS_MIN_PATTERN_PAPERS,
|
| 644 |
+
"fallback": PipelineConfig.PASS_FALLBACK,
|
| 645 |
+
"fixed_score": PipelineConfig.PASS_SCORE,
|
| 646 |
+
},
|
| 647 |
+
},
|
| 648 |
+
"novelty_report_path": novelty_report_path
|
| 649 |
+
},
|
| 650 |
+
)
|
| 651 |
+
if bundle_status.get("ok"):
|
| 652 |
+
print(f"✅ Results bundled to: {bundle_status.get('results_dir')}")
|
| 653 |
+
if logger:
|
| 654 |
+
logger.log_event("results_bundled", {
|
| 655 |
+
"results_dir": bundle_status.get("results_dir"),
|
| 656 |
+
"mode": RESULTS_MODE,
|
| 657 |
+
"partial": bundle_status.get("partial", False)
|
| 658 |
+
})
|
| 659 |
+
else:
|
| 660 |
+
if logger:
|
| 661 |
+
logger.log_event("results_bundle_failed", {
|
| 662 |
+
"errors": bundle_status.get("errors", []),
|
| 663 |
+
"mode": RESULTS_MODE
|
| 664 |
+
})
|
| 665 |
+
except Exception as e:
|
| 666 |
+
print(f"[results] warning: bundling failed: {e}")
|
| 667 |
+
if logger:
|
| 668 |
+
logger.log_event("results_bundle_failed", {"error": str(e)})
|
| 669 |
+
|
| 670 |
+
except Exception as e:
|
| 671 |
+
print(f"\n❌ 错误: {e}")
|
| 672 |
+
if logger:
|
| 673 |
+
logger.log_event("run_error", {"error": str(e)})
|
| 674 |
+
import traceback
|
| 675 |
+
traceback.print_exc()
|
| 676 |
+
finally:
|
| 677 |
+
if logger:
|
| 678 |
+
logger.log_event("run_end", {
|
| 679 |
+
"success": success,
|
| 680 |
+
"duration_ms": int((time.time() - start_time) * 1000)
|
| 681 |
+
})
|
| 682 |
+
if token is not None:
|
| 683 |
+
reset_logger(token)
|
| 684 |
+
|
| 685 |
+
|
| 686 |
+
if __name__ == '__main__':
|
| 687 |
+
main()
|
Paper-KG-Pipeline/scripts/legacy/generate_patterns_old.py
ADDED
|
@@ -0,0 +1,776 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
基于skeleton+tricks聚类生成patterns
|
| 3 |
+
输出:
|
| 4 |
+
1. patterns_structured.json - 结构化数据
|
| 5 |
+
2. patterns_guide.txt - 用户指导文档
|
| 6 |
+
3. patterns_statistics.json - 统计报告
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import glob
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import numpy as np
|
| 14 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 15 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 16 |
+
import requests
|
| 17 |
+
from typing import Dict, List, Tuple
|
| 18 |
+
from collections import Counter, defaultdict
|
| 19 |
+
import time
|
| 20 |
+
|
| 21 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 22 |
+
SCRIPTS_DIR = SCRIPT_DIR.parent
|
| 23 |
+
PROJECT_ROOT = SCRIPTS_DIR.parent
|
| 24 |
+
|
| 25 |
+
# LLM配置 - 请配置环境变量或修改此处
|
| 26 |
+
LLM_CONFIG = {
|
| 27 |
+
"api_url": os.environ.get("LLM_API_URL", "https://api.openai.com/v1/chat/completions"),
|
| 28 |
+
"auth_token": os.environ.get("LLM_AUTH_TOKEN", ""),
|
| 29 |
+
"model": os.environ.get("LLM_MODEL", "gpt-4")
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
# Embedding配置
|
| 33 |
+
EMBED_CONFIG = {
|
| 34 |
+
"api_url": os.environ.get("EMBED_API_URL", "https://api.openai.com/v1/embeddings"),
|
| 35 |
+
"auth_token": os.environ.get("LLM_AUTH_TOKEN", ""),
|
| 36 |
+
"model": os.environ.get("EMBED_MODEL", "text-embedding-3-small")
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
if not LLM_CONFIG["auth_token"]:
|
| 40 |
+
print("⚠️ 警告: 未设置 LLM_AUTH_TOKEN 环境变量")
|
| 41 |
+
print(" Pattern生成功能将不可用,但可以直接使用已生成的 patterns_structured.json")
|
| 42 |
+
|
| 43 |
+
# 聚类参数
|
| 44 |
+
CLUSTER_PARAMS = {
|
| 45 |
+
"distance_threshold": 0.35, # 距离阈值
|
| 46 |
+
"min_cluster_size": 5, # 最小cluster大小
|
| 47 |
+
"skeleton_weight": 0.4, # skeleton权重
|
| 48 |
+
"tricks_weight": 0.6, # tricks权重
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def get_embedding(text: str, max_retries: int = 3) -> List[float]:
|
| 53 |
+
"""获取文本的embedding向量"""
|
| 54 |
+
url = EMBED_CONFIG["api_url"]
|
| 55 |
+
headers = {
|
| 56 |
+
"Authorization": EMBED_CONFIG["auth_token"],
|
| 57 |
+
"Content-Type": "application/json"
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
payload = {
|
| 61 |
+
"model": EMBED_CONFIG["model"],
|
| 62 |
+
"input": text[:8000]
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
for attempt in range(max_retries):
|
| 66 |
+
try:
|
| 67 |
+
response = requests.post(url, headers=headers, json=payload, timeout=30)
|
| 68 |
+
response.raise_for_status()
|
| 69 |
+
result = response.json()
|
| 70 |
+
return result['data'][0]['embedding']
|
| 71 |
+
except Exception as e:
|
| 72 |
+
if attempt < max_retries - 1:
|
| 73 |
+
time.sleep(2 ** attempt)
|
| 74 |
+
else:
|
| 75 |
+
print(f" ❌ Embedding失败: {e}")
|
| 76 |
+
return [0.0] * 4096
|
| 77 |
+
|
| 78 |
+
return [0.0] * 4096
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def call_llm(prompt: str, max_retries: int = 3) -> str:
|
| 82 |
+
"""调用LLM API"""
|
| 83 |
+
url = LLM_CONFIG["api_url"]
|
| 84 |
+
headers = {
|
| 85 |
+
"Authorization": LLM_CONFIG["auth_token"],
|
| 86 |
+
"Content-Type": "application/json"
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
payload = {
|
| 90 |
+
"model": LLM_CONFIG["model"],
|
| 91 |
+
"messages": [
|
| 92 |
+
{"role": "user", "content": prompt}
|
| 93 |
+
],
|
| 94 |
+
"temperature": 0.3
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
for attempt in range(max_retries):
|
| 98 |
+
try:
|
| 99 |
+
response = requests.post(url, headers=headers, json=payload, timeout=60)
|
| 100 |
+
response.raise_for_status()
|
| 101 |
+
result = response.json()
|
| 102 |
+
return result['choices'][0]['message']['content'].strip()
|
| 103 |
+
except Exception as e:
|
| 104 |
+
if attempt < max_retries - 1:
|
| 105 |
+
time.sleep(2 ** attempt)
|
| 106 |
+
else:
|
| 107 |
+
print(f" ❌ LLM调用失败: {e}")
|
| 108 |
+
return ""
|
| 109 |
+
|
| 110 |
+
return ""
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def load_all_papers(base_dir: str = None) -> List[Dict]:
|
| 114 |
+
"""加载所有论文数据"""
|
| 115 |
+
if base_dir is None:
|
| 116 |
+
# 默认使用Pipeline的data目录
|
| 117 |
+
base_dir = str(PROJECT_ROOT / "data")
|
| 118 |
+
|
| 119 |
+
all_papers = []
|
| 120 |
+
|
| 121 |
+
# 遍历所有会议目录
|
| 122 |
+
for conf_dir in glob.glob(os.path.join(base_dir, "*")):
|
| 123 |
+
if not os.path.isdir(conf_dir):
|
| 124 |
+
continue
|
| 125 |
+
|
| 126 |
+
conf_name = os.path.basename(conf_dir)
|
| 127 |
+
files = glob.glob(os.path.join(conf_dir, "*_paper_node.json"))
|
| 128 |
+
|
| 129 |
+
if not files:
|
| 130 |
+
continue
|
| 131 |
+
|
| 132 |
+
print(f"📁 加载 {conf_name}: {len(files)} 篇论文")
|
| 133 |
+
|
| 134 |
+
for file_path in files:
|
| 135 |
+
try:
|
| 136 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 137 |
+
paper = json.load(f)
|
| 138 |
+
paper['conference'] = conf_name
|
| 139 |
+
paper['file_path'] = file_path
|
| 140 |
+
|
| 141 |
+
# 验证必要字段
|
| 142 |
+
if 'skeleton' in paper and 'tricks' in paper:
|
| 143 |
+
all_papers.append(paper)
|
| 144 |
+
except Exception as e:
|
| 145 |
+
print(f" ⚠️ 读取失败 {file_path}: {e}")
|
| 146 |
+
|
| 147 |
+
return all_papers
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def build_pattern_embeddings(papers: List[Dict]) -> Tuple[np.ndarray, List[Dict]]:
|
| 151 |
+
"""构建pattern的embedding表示(skeleton + tricks融合)"""
|
| 152 |
+
print(f"\n🔢 构建pattern embeddings...")
|
| 153 |
+
|
| 154 |
+
embeddings = []
|
| 155 |
+
pattern_data = []
|
| 156 |
+
|
| 157 |
+
for i, paper in enumerate(papers):
|
| 158 |
+
if (i + 1) % 50 == 0:
|
| 159 |
+
print(f" 进度: {i+1}/{len(papers)}")
|
| 160 |
+
|
| 161 |
+
# 1. Skeleton文本
|
| 162 |
+
skeleton = paper.get('skeleton', {})
|
| 163 |
+
skeleton_text = " ".join([
|
| 164 |
+
skeleton.get('problem_framing', ''),
|
| 165 |
+
skeleton.get('gap_pattern', ''),
|
| 166 |
+
skeleton.get('method_story', ''),
|
| 167 |
+
skeleton.get('experiments_story', '')
|
| 168 |
+
])
|
| 169 |
+
|
| 170 |
+
# 2. Tricks文本
|
| 171 |
+
tricks = paper.get('tricks', [])
|
| 172 |
+
tricks_text = " ".join([
|
| 173 |
+
f"{t.get('name', '')}: {t.get('description', '')}"
|
| 174 |
+
for t in tricks
|
| 175 |
+
])
|
| 176 |
+
|
| 177 |
+
# 3. 分别计算embedding
|
| 178 |
+
skeleton_emb = get_embedding(skeleton_text.strip())
|
| 179 |
+
time.sleep(0.1)
|
| 180 |
+
tricks_emb = get_embedding(tricks_text.strip())
|
| 181 |
+
time.sleep(0.1)
|
| 182 |
+
|
| 183 |
+
# 4. 加权融合
|
| 184 |
+
skeleton_emb = np.array(skeleton_emb)
|
| 185 |
+
tricks_emb = np.array(tricks_emb)
|
| 186 |
+
pattern_emb = (CLUSTER_PARAMS['skeleton_weight'] * skeleton_emb +
|
| 187 |
+
CLUSTER_PARAMS['tricks_weight'] * tricks_emb)
|
| 188 |
+
|
| 189 |
+
embeddings.append(pattern_emb)
|
| 190 |
+
pattern_data.append({
|
| 191 |
+
'paper_id': paper.get('paper_id', ''),
|
| 192 |
+
'title': paper.get('title', ''),
|
| 193 |
+
'conference': paper.get('conference', ''),
|
| 194 |
+
'skeleton': skeleton,
|
| 195 |
+
'tricks': tricks,
|
| 196 |
+
'skeleton_text': skeleton_text[:500],
|
| 197 |
+
'tricks_text': tricks_text[:500]
|
| 198 |
+
})
|
| 199 |
+
|
| 200 |
+
return np.array(embeddings), pattern_data
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def cluster_patterns(embeddings: np.ndarray) -> np.ndarray:
|
| 204 |
+
"""对patterns进行层次聚类"""
|
| 205 |
+
print(f"\n🔄 开始聚类...")
|
| 206 |
+
|
| 207 |
+
# 层次聚类(使用cosine距离)
|
| 208 |
+
clusterer = AgglomerativeClustering(
|
| 209 |
+
n_clusters=None,
|
| 210 |
+
distance_threshold=CLUSTER_PARAMS['distance_threshold'],
|
| 211 |
+
affinity='cosine',
|
| 212 |
+
linkage='average'
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
labels = clusterer.fit_predict(embeddings)
|
| 216 |
+
|
| 217 |
+
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
|
| 218 |
+
print(f" 生成 {n_clusters} 个clusters")
|
| 219 |
+
|
| 220 |
+
# 统计cluster大小
|
| 221 |
+
cluster_sizes = Counter(labels)
|
| 222 |
+
for cluster_id, size in cluster_sizes.most_common():
|
| 223 |
+
if cluster_id != -1:
|
| 224 |
+
print(f" Cluster {cluster_id}: {size} 篇")
|
| 225 |
+
|
| 226 |
+
return labels
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def analyze_cluster(cluster_papers: List[Dict], cluster_id: int) -> Dict:
|
| 230 |
+
"""分析单个cluster的特征"""
|
| 231 |
+
print(f"\n 📊 分析 Cluster {cluster_id} ({len(cluster_papers)} 篇)...")
|
| 232 |
+
|
| 233 |
+
# 1. 统计高频tricks
|
| 234 |
+
trick_counter = Counter()
|
| 235 |
+
trick_examples = defaultdict(list)
|
| 236 |
+
|
| 237 |
+
for paper in cluster_papers:
|
| 238 |
+
for trick in paper['tricks']:
|
| 239 |
+
trick_name = trick.get('name', '')
|
| 240 |
+
if not trick_name:
|
| 241 |
+
continue
|
| 242 |
+
|
| 243 |
+
trick_counter[trick_name] += 1
|
| 244 |
+
trick_examples[trick_name].append({
|
| 245 |
+
'paper_id': paper['paper_id'],
|
| 246 |
+
'title': paper['title'],
|
| 247 |
+
'description': trick.get('description', ''),
|
| 248 |
+
'type': trick.get('type', ''),
|
| 249 |
+
'purpose': trick.get('purpose', '')
|
| 250 |
+
})
|
| 251 |
+
|
| 252 |
+
# 2. 选择代表性skeleton例子(取最前面3个)
|
| 253 |
+
skeleton_examples = []
|
| 254 |
+
for paper in cluster_papers[:3]:
|
| 255 |
+
skeleton_examples.append({
|
| 256 |
+
'paper_id': paper['paper_id'],
|
| 257 |
+
'title': paper['title'],
|
| 258 |
+
'skeleton': paper['skeleton']
|
| 259 |
+
})
|
| 260 |
+
|
| 261 |
+
# 3. 计算coherence(类内平均相似度)
|
| 262 |
+
# 这里简化处理,实际可以重新计算
|
| 263 |
+
coherence = 0.75 # 占位值
|
| 264 |
+
|
| 265 |
+
return {
|
| 266 |
+
'cluster_id': cluster_id,
|
| 267 |
+
'size': len(cluster_papers),
|
| 268 |
+
'skeleton_examples': skeleton_examples,
|
| 269 |
+
'trick_frequency': trick_counter.most_common(20),
|
| 270 |
+
'trick_examples': trick_examples,
|
| 271 |
+
'coherence': coherence,
|
| 272 |
+
'all_papers': cluster_papers
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def generate_pattern_summary(cluster_analysis: Dict) -> str:
|
| 277 |
+
"""生成pattern总结(LLM)- 使用cluster完整信息"""
|
| 278 |
+
|
| 279 |
+
all_papers = cluster_analysis['all_papers']
|
| 280 |
+
cluster_size = cluster_analysis['size']
|
| 281 |
+
|
| 282 |
+
# ============================================================
|
| 283 |
+
# 1. 构建完整的 skeleton 信息(所有论文,完整四个维度)
|
| 284 |
+
# ============================================================
|
| 285 |
+
skeleton_info_list = []
|
| 286 |
+
for i, paper in enumerate(all_papers[:8]): # 最多取8篇避免token过长
|
| 287 |
+
skeleton = paper.get('skeleton', {})
|
| 288 |
+
skeleton_info_list.append(f"""
|
| 289 |
+
论文{i+1}:《{paper.get('title', '')[:60]}》
|
| 290 |
+
- 问题定位:{skeleton.get('problem_framing', '')}
|
| 291 |
+
- 研究缺口:{skeleton.get('gap_pattern', '')}
|
| 292 |
+
- 方法叙述:{skeleton.get('method_story', '')}
|
| 293 |
+
- 实验设计:{skeleton.get('experiments_story', '')}""")
|
| 294 |
+
|
| 295 |
+
skeleton_full_text = "\n".join(skeleton_info_list)
|
| 296 |
+
|
| 297 |
+
# ============================================================
|
| 298 |
+
# 2. 构建完整的 tricks 信息(包含 name + description + purpose)
|
| 299 |
+
# ============================================================
|
| 300 |
+
tricks_info_list = []
|
| 301 |
+
seen_tricks = set() # 去重
|
| 302 |
+
for paper in all_papers:
|
| 303 |
+
for trick in paper.get('tricks', []):
|
| 304 |
+
trick_name = trick.get('name', '')
|
| 305 |
+
if trick_name and trick_name not in seen_tricks:
|
| 306 |
+
seen_tricks.add(trick_name)
|
| 307 |
+
tricks_info_list.append({
|
| 308 |
+
'name': trick_name,
|
| 309 |
+
'type': trick.get('type', ''),
|
| 310 |
+
'description': trick.get('description', ''),
|
| 311 |
+
'purpose': trick.get('purpose', ''),
|
| 312 |
+
'location': trick.get('location', '')
|
| 313 |
+
})
|
| 314 |
+
|
| 315 |
+
# 按频率统计,取前15个高频trick的完整信息
|
| 316 |
+
trick_freq = cluster_analysis['trick_frequency']
|
| 317 |
+
top_trick_names = [name for name, _ in trick_freq[:15]]
|
| 318 |
+
|
| 319 |
+
tricks_full_list = []
|
| 320 |
+
for trick_info in tricks_info_list:
|
| 321 |
+
if trick_info['name'] in top_trick_names:
|
| 322 |
+
tricks_full_list.append(
|
| 323 |
+
f"- {trick_info['name']} [{trick_info['type']}]\n"
|
| 324 |
+
f" 描述:{trick_info['description']}\n"
|
| 325 |
+
f" 目的:{trick_info['purpose']}\n"
|
| 326 |
+
f" 位置:{trick_info['location']}"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
tricks_full_text = "\n".join(tricks_full_list[:10]) # 最多10个完整trick
|
| 330 |
+
|
| 331 |
+
# ============================================================
|
| 332 |
+
# 3. 统计信息(用于prompt参考)
|
| 333 |
+
# ============================================================
|
| 334 |
+
trick_stats = ", ".join([f"{name}({count}次)" for name, count in trick_freq[:10]])
|
| 335 |
+
|
| 336 |
+
# ============================================================
|
| 337 |
+
# 4. 构建完整的 prompt
|
| 338 |
+
# ============================================================
|
| 339 |
+
prompt = f"""
|
| 340 |
+
你是NLP研究专家。请基于以下cluster的完整信息,生成一个技术性总结。
|
| 341 |
+
|
| 342 |
+
【Cluster概览】
|
| 343 |
+
- 包含 {cluster_size} 篇论文
|
| 344 |
+
- 高频Tricks统计:{trick_stats}
|
| 345 |
+
|
| 346 |
+
【所有论文的Skeleton信息】
|
| 347 |
+
{skeleton_full_text}
|
| 348 |
+
|
| 349 |
+
【高频Tricks详细信息】
|
| 350 |
+
{tricks_full_text}
|
| 351 |
+
|
| 352 |
+
【任务要求】
|
| 353 |
+
请分析上述论文的共同特征,生成一个 150-200 字的技术性总结。
|
| 354 |
+
|
| 355 |
+
要求:
|
| 356 |
+
1. 找出这些论文在研究问题、方法设计、实验策略上的共性
|
| 357 |
+
2. 保留具体技术词(模型名、方法名、数据集名)
|
| 358 |
+
3. 突出这类论文的核心写作套路和技术特征
|
| 359 |
+
4. 避免空泛的描述,要有可操作的具体信息
|
| 360 |
+
|
| 361 |
+
【输出格式】(分三段):
|
| 362 |
+
第1段(60字):核心研究问题与技术路线 - 这类论文主要解决什么问题,用什么方法
|
| 363 |
+
第2段(60字):关键技术组合与写作策略 - skeleton特点 + 常用tricks组合
|
| 364 |
+
第3段(60字):适用场景与预期效果 - 什么任务/数据/目标适合这个套路
|
| 365 |
+
|
| 366 |
+
【示例对比】:
|
| 367 |
+
✅ 好: "针对跨语言理解任务中的数据稀缺问题,采用多语言预训练+零样本迁移的技术路线。
|
| 368 |
+
skeleton通常以'低资源语言困境'开篇,通过'多语言对齐不足'指出gap,
|
| 369 |
+
方法部分采用'对比学习+语言标签'组合。高频使用消融实验验证各组件贡献,
|
| 370 |
+
配合多数据集验证增强泛化性。适用于低资源NLP任务,预期提升5-10%零样本性能。"
|
| 371 |
+
|
| 372 |
+
❌ 差: "这些论文采用问题导向的叙事结构,首先指出现有方法不足,然后提出创新方法..."
|
| 373 |
+
|
| 374 |
+
直接输出总结:
|
| 375 |
+
"""
|
| 376 |
+
|
| 377 |
+
summary = call_llm(prompt)
|
| 378 |
+
|
| 379 |
+
# 验证质量
|
| 380 |
+
bad_patterns = ['叙事结构', '写作结构', '首先...接着', '这些论文采用']
|
| 381 |
+
if len(summary) < 100 or any(bad in summary for bad in bad_patterns):
|
| 382 |
+
print(f" ⚠️ Summary质量不佳,尝试重新生成...")
|
| 383 |
+
summary = call_llm(prompt) # 再试一次
|
| 384 |
+
|
| 385 |
+
return summary
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
def extract_pattern_name(summary: str) -> str:
|
| 389 |
+
"""从summary提取简短名称"""
|
| 390 |
+
prompt = f"""
|
| 391 |
+
从以下pattern总结中提取一个简短的名称(不超过12个字)。
|
| 392 |
+
|
| 393 |
+
总结:{summary}
|
| 394 |
+
|
| 395 |
+
要求:
|
| 396 |
+
- 突出核心技术特征
|
| 397 |
+
- 简洁、专业
|
| 398 |
+
- 不要"XX研究"、"XX论文"等后缀
|
| 399 |
+
|
| 400 |
+
直接输出名称:
|
| 401 |
+
"""
|
| 402 |
+
|
| 403 |
+
name = call_llm(prompt)
|
| 404 |
+
return name.strip()
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
def generate_writing_guide_text(pattern_name: str, summary: str, skeleton_examples: List[Dict],
                                common_tricks: List[Dict], cluster_size: int) -> str:
    """Render a pattern's writing-guide text (consumed by downstream agents).

    Args:
        pattern_name: Short name of the pattern.
        summary: The pattern's technical summary paragraph(s).
        skeleton_examples: Flattened skeleton dicts (title, problem_framing, ...).
        common_tricks: Trick dicts with trick_name/frequency/percentage/examples.
        cluster_size: Accepted for interface compatibility; not referenced here.

    Returns:
        The assembled guide as a single newline-joined string.
    """
    # Header + pattern focus + skeleton-section intro, built as one literal.
    lines: List[str] = [
        f"写作模板:{pattern_name}",
        "",
        "【模板聚焦】",
        summary,
        "",
        "【代表性论文骨架示例】",
        f"该套路包含 {len(skeleton_examples)} 个代表性论文的骨架示例,可直观体现该模式的论文撰写框架:",
        "",
    ]

    # One four-field snippet per representative paper skeleton.
    for idx, sk in enumerate(skeleton_examples, start=1):
        lines.append(f"示例 {idx}:《{sk['title']}》")
        lines.append(f"  • 问题定位:{compress_text(sk['problem_framing'], 150)}")
        lines.append(f"  • 现有研究缺口:{compress_text(sk['gap_pattern'], 150)}")
        lines.append(f"  • 核心方法:{compress_text(sk['method_story'], 150)}")
        lines.append(f"  • 实验设计:{compress_text(sk['experiments_story'], 150)}")
        lines.append("")

    # Tricks section intro.
    lines.append("【高频研究技巧】")
    lines.append(f"该模式下有以下 {len(common_tricks)} 个高频使用的研究技巧:")
    lines.append("")

    # Cap at the top 10 tricks; show the first example's type/description if any.
    for idx, trick in enumerate(common_tricks[:10], start=1):
        first_example = trick['examples'][0] if trick['examples'] else {}
        lines.append(f"{idx}. {trick['trick_name']}(使用频率 {trick['frequency']} 次,占比 {trick['percentage']})")
        lines.append(f"   类型:{first_example.get('type', '通用技巧')}")
        lines.append(f"   应用:{compress_text(first_example.get('description', ''), 150)}")
        lines.append("")

    return "\n".join(lines)
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def assemble_pattern(cluster_analysis: Dict, summary: str) -> Dict:
    """Assemble the final pattern record from a cluster analysis and its summary.

    Args:
        cluster_analysis: Output of ``analyze_cluster`` (size, coherence,
            skeleton_examples, trick_frequency, trick_examples, all_papers, ...).
        summary: LLM-generated technical summary for the cluster.

    Returns:
        A dict with identity, summary, writing guide, examples, tricks and metadata.
    """
    pattern_name = extract_pattern_name(summary)
    cluster_size = cluster_analysis['size']

    # Flatten each representative paper's nested skeleton into a single dict.
    skeleton_examples = []
    for entry in cluster_analysis['skeleton_examples']:
        skeleton = entry['skeleton']
        skeleton_examples.append({
            'paper_id': entry['paper_id'],
            'title': entry['title'],
            'problem_framing': skeleton.get('problem_framing', ''),
            'gap_pattern': skeleton.get('gap_pattern', ''),
            'method_story': skeleton.get('method_story', ''),
            'experiments_story': skeleton.get('experiments_story', ''),
        })

    # Top-15 tricks: frequency, share of the cluster, and up to 3 examples each.
    common_tricks = []
    for trick_name, count in cluster_analysis['trick_frequency'][:15]:
        common_tricks.append({
            'trick_name': trick_name,
            'frequency': count,
            'percentage': f"{count/cluster_size*100:.1f}%",
            'examples': cluster_analysis['trick_examples'][trick_name][:3],
        })

    writing_guide = generate_writing_guide_text(
        pattern_name, summary, skeleton_examples, common_tricks, cluster_size
    )

    return {
        'pattern_id': cluster_analysis['cluster_id'],
        'pattern_name': pattern_name,
        'pattern_summary': summary,
        # Full writing-guide text, intended for downstream agents.
        'writing_guide': writing_guide,
        'skeleton_examples': skeleton_examples,
        'common_tricks': common_tricks,
        'metadata': {
            'cluster_size': cluster_size,
            'coherence_score': cluster_analysis['coherence'],
            'all_paper_ids': [p['paper_id'] for p in cluster_analysis['all_papers']],
        },
    }
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
def compress_text(text: str, max_len: int = 100) -> str:
    """Shorten *text* to at most *max_len* characters, preferring sentence cuts.

    Whole sentences (split on the Chinese full stop '。') are kept while they
    fit within ``max_len - 3``; if not even the first sentence fits, fall back
    to a hard truncation with a trailing "...".

    Args:
        text: Input string (assumed to use '。' as sentence separator).
        max_len: Maximum allowed length of the returned string.

    Returns:
        The original text if short enough, otherwise a shortened version.
    """
    if len(text) <= max_len:
        return text

    budget = max_len - 3  # reserve room matching the "..." fallback
    kept = ""
    for sentence in text.split('。'):
        # +1 accounts for the '。' re-appended to each kept sentence.
        if len(kept) + len(sentence) + 1 > budget:
            break
        kept += sentence + "。"

    # Empty ``kept`` means even the first sentence exceeded the budget.
    return kept or text[:max_len - 3] + "..."
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
def generate_user_guide(patterns: List[Dict]) -> str:
    """Build the human-readable user-guide document covering all patterns.

    Args:
        patterns: Assembled pattern dicts (see ``assemble_pattern``).

    Returns:
        The full guide as one newline-joined string: an introduction, a
        pattern directory, then a detailed section per pattern.
    """
    print(f"\n📝 生成用户指导文档...")

    # 1. Overall introduction (totals computed across all patterns).
    total_papers = sum(p['metadata']['cluster_size'] for p in patterns)

    guide_lines = [
        "="*80,
        "NLP 论文写作模式(Patterns)指南",
        "="*80,
        "",
        "【整体介绍】",
        "",
        f"本指南基于 {total_papers} 篇 NLP 顶会论文的深度分析,通过对论文骨架(skeleton)和",
        f"研究技巧(tricks)的聚类,抽象出 {len(patterns)} 个可复用的写作模式(patterns)。",
        "",
        "每个 pattern 包含:",
        "  • 模式总结:该类论文的核心技术路线和写作特点",
        "  • 骨架示例:2-3 篇代表性论文的完整结构框架",
        "  • 高频技巧:统计排序的常用研究技巧及使用频率",
        "  • 使用建议:针对性的写作和研究建议",
        "",
        "【如何使用本指南】",
        "",
        "1️⃣ 定位你的研究类型",
        "   - 浏览各个 pattern 的【模板聚焦】部分",
        "   - 找到与你研究最相关的 1-2 个 patterns",
        "",
        "2️⃣ 学习论文结构",
        "   - 参考【代表性论文骨架示例】",
        "   - 理解问题定位、缺口分析、方法叙述、实验设计的逻辑",
        "",
        "3️⃣ 选择合适技巧",
        "   - 查看【高频研究技巧】列表",
        "   - 根据使用频率和适用场景,选择 3-5 个技巧应用到你的论文",
        "",
        "4️⃣ 追溯具体论文",
        "   - 通过【相关论文】列表,找到具体论文深度学习",
        "",
        "【Pattern 列表】",
        ""
    ]

    # 2. Pattern directory (one line per pattern with its size).
    for p in patterns:
        guide_lines.append(
            f"  Pattern #{p['pattern_id']:02d} - {p['pattern_name']} "
            f"({p['metadata']['cluster_size']}篇论文)"
        )

    guide_lines.extend(["", "="*80, ""])

    # 3. Detailed section per pattern.
    for pattern in patterns:
        guide_lines.extend([
            "="*80,
            f"写作模板 #{pattern['pattern_id']}:{pattern['pattern_name']}",
            "="*80,
            "",
            "【模板聚焦】",
            pattern['pattern_summary'],
            "",
            "-"*80,
            "【代表性论文骨架示例】",
            "-"*80,
            "",
            f"该套路包含 {len(pattern['skeleton_examples'])} 个代表性论文的骨架示例,可直观体现该模式的论文撰写框架:",
            ""
        ])

        # Skeleton examples — note: compressed to 120 chars here vs. 150 in
        # generate_writing_guide_text (tighter limit for the human document).
        for sk in pattern['skeleton_examples']:
            guide_lines.extend([
                f"📄 论文标题:《{sk['title']}》",
                "",
                f"  • 问题定位:{compress_text(sk['problem_framing'], 120)}",
                "",
                f"  • 现有研究缺口:{compress_text(sk['gap_pattern'], 120)}",
                "",
                f"  • 核心方法:{compress_text(sk['method_story'], 120)}",
                "",
                f"  • 实验设计:{compress_text(sk['experiments_story'], 120)}",
                ""
            ])

        # High-frequency tricks section header.
        guide_lines.extend([
            "-"*80,
            "【高频研究技巧】",
            "-"*80,
            "",
            f"该模式下梳理出以下 {len(pattern['common_tricks'])} 个高频使用的研究技巧,含使用频率、占比及具体示例:",
            ""
        ])

        # Only the top 10 tricks are shown; the first stored example (if any)
        # supplies the type/description fields.
        for i, trick in enumerate(pattern['common_tricks'][:10]):
            example = trick['examples'][0] if trick['examples'] else {}
            guide_lines.extend([
                f"{i+1}. {trick['trick_name']}",
                f"   - 使用频率:{trick['frequency']} 次(占比 {trick['percentage']})",
                f"   - 技巧类型:{example.get('type', '通用技巧')}",
                f"   - 典型应用:{compress_text(example.get('description', ''), 150)}",
                ""
            ])

        # Related papers: list at most 15 ids, then a "... and N more" line.
        paper_ids = pattern['metadata']['all_paper_ids']
        guide_lines.extend([
            "-"*80,
            f"【相关论文】(共 {len(paper_ids)} 篇)",
            "-"*80
        ])

        for i, paper_id in enumerate(paper_ids[:15]):
            guide_lines.append(f"  [{i+1}] {paper_id}")

        if len(paper_ids) > 15:
            guide_lines.append(f"  ... 及其他 {len(paper_ids) - 15} 篇")

        guide_lines.extend(["", "="*80, ""])

    return "\n".join(guide_lines)
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
def generate_statistics(patterns: List[Dict]) -> Dict:
    """Build an aggregate statistics report over all generated patterns.

    Args:
        patterns: Assembled pattern dicts (see ``assemble_pattern``).

    Returns:
        A dict with pattern/paper counts, cluster-size stats, the 20 most
        frequent tricks globally, and a bucketed cluster-size histogram.
    """
    print(f"\n📊 生成统计报告...")

    # Sum each trick's frequency across every pattern it appears in.
    trick_totals = Counter()
    for pattern in patterns:
        for trick in pattern['common_tricks']:
            trick_totals[trick['trick_name']] += trick['frequency']

    sizes = [p['metadata']['cluster_size'] for p in patterns]

    return {
        'total_patterns': len(patterns),
        'total_papers': sum(sizes),
        # Cast numpy scalars to plain floats so the dict is JSON-serializable.
        'average_cluster_size': float(np.mean(sizes)),
        'median_cluster_size': float(np.median(sizes)),
        'cluster_size_distribution': {
            'min': min(sizes),
            'max': max(sizes),
            'std': float(np.std(sizes)),
        },
        'top_global_tricks': [
            {'name': trick_name, 'total_count': total}
            for trick_name, total in trick_totals.most_common(20)
        ],
        'pattern_size_distribution': {
            'small (<10)': sum(1 for s in sizes if s < 10),
            'medium (10-20)': sum(1 for s in sizes if 10 <= s < 20),
            'large (20-30)': sum(1 for s in sizes if 20 <= s < 30),
            'xlarge (>=30)': sum(1 for s in sizes if s >= 30),
        },
    }
|
| 691 |
+
|
| 692 |
+
|
| 693 |
+
def main():
    """End-to-end driver: load papers, embed, cluster, build patterns, write outputs.

    Side effects: creates <PROJECT_ROOT>/output and writes three files there
    (patterns_structured.json, patterns_guide.txt, patterns_statistics.json).
    """
    print("="*80)
    print("基于 Skeleton + Tricks 聚类生成 Patterns")
    print("="*80)

    # 1. Load paper data (load_all_papers defined elsewhere in this file).
    print("\n【Step 1】加载论文数据")
    papers = load_all_papers()
    print(f"✅ 共加载 {len(papers)} 篇论文")

    # 2. Build one embedding per paper's skeleton+tricks representation.
    print("\n【Step 2】构建pattern embeddings")
    embeddings, pattern_data = build_pattern_embeddings(papers)
    print(f"✅ 完成 {len(embeddings)} 个pattern的embedding")

    # 3. Cluster the embeddings; a -1 label denotes noise/outliers.
    print("\n【Step 3】聚类")
    labels = cluster_patterns(embeddings)

    # 4. Analyze each cluster and generate a pattern from it.
    # NOTE(review): assumes cluster labels are contiguous 0..n_clusters-1
    # (with -1 reserved for noise) — confirm against cluster_patterns.
    print("\n【Step 4】生成patterns")
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    patterns = []

    for cluster_id in range(n_clusters):
        cluster_indices = [i for i in range(len(labels)) if labels[i] == cluster_id]

        # Skip clusters below the configured minimum size.
        if len(cluster_indices) < CLUSTER_PARAMS['min_cluster_size']:
            print(f"  ⚠️ Cluster {cluster_id}: {len(cluster_indices)}篇 (过小,跳过)")
            continue

        cluster_papers = [pattern_data[i] for i in cluster_indices]

        # Analyze the cluster (tricks frequency, skeleton examples, coherence).
        cluster_analysis = analyze_cluster(cluster_papers, cluster_id)

        # LLM-generated technical summary for the cluster.
        summary = generate_pattern_summary(cluster_analysis)
        print(f"  Summary: {summary[:80]}...")

        # Assemble the final pattern record.
        pattern = assemble_pattern(cluster_analysis, summary)
        patterns.append(pattern)

    print(f"\n✅ 共生成 {len(patterns)} 个patterns")

    # 5. Write the output files.
    print("\n【Step 5】生成输出文件")

    # Resolve and create the output directory (idempotent).
    output_dir = str(PROJECT_ROOT / "output")
    os.makedirs(output_dir, exist_ok=True)

    # 5.1 Structured JSON (machine-readable).
    with open(os.path.join(output_dir, 'patterns_structured.json'), 'w', encoding='utf-8') as f:
        json.dump(patterns, f, ensure_ascii=False, indent=2)
    print("  ✅ patterns_structured.json")

    # 5.2 Human-readable user guide.
    guide_text = generate_user_guide(patterns)
    with open(os.path.join(output_dir, 'patterns_guide.txt'), 'w', encoding='utf-8') as f:
        f.write(guide_text)
    print("  ✅ patterns_guide.txt")

    # 5.3 Statistics report.
    statistics = generate_statistics(patterns)
    with open(os.path.join(output_dir, 'patterns_statistics.json'), 'w', encoding='utf-8') as f:
        json.dump(statistics, f, ensure_ascii=False, indent=2)
    print("  ✅ patterns_statistics.json")

    print("\n" + "="*80)
    print("🎉 完成!")
    print("="*80)
    print(f"\n生成了 {len(patterns)} 个patterns,覆盖 {statistics['total_papers']} 篇论文")
    print(f"平均每个pattern包含 {statistics['average_cluster_size']:.1f} 篇论文")
    print(f"\n输出文件:")
    print(f"  1. patterns_structured.json - 结构化数据(给程序用)")
    print(f"  2. patterns_guide.txt - 用户指导文档(给人看)")
    print(f"  3. patterns_statistics.json - 统计报告")


if __name__ == '__main__':
    main()
|
Paper-KG-Pipeline/scripts/pipeline/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Package shim: put the project's src/ directory on sys.path, then re-export
the idea2paper pipeline API so callers can import everything from this package."""
from pathlib import Path
import sys

# Resolve <repo>/Paper-KG-Pipeline from this file's location
# (scripts/pipeline/__init__.py -> two levels up).
CURRENT_DIR = Path(__file__).parent
PROJECT_ROOT = CURRENT_DIR.parent.parent
SRC_ROOT = PROJECT_ROOT / "src"
# Prepend src/ so `import idea2paper` below resolves; guarded to stay idempotent.
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))

# This import must run AFTER the sys.path manipulation above.
from idea2paper import (
    PipelineConfig,
    OUTPUT_DIR,
    MultiAgentCritic,
    Idea2StoryPipeline,
    PatternSelector,
    StoryPlanner,
    create_planner,
    RefinementEngine,
    StoryGenerator,
    RAGVerifier,
    ReviewIndex,
    call_llm,
)

# Public API of this package: the re-exported idea2paper names plus the
# locally computed PROJECT_ROOT path.
__all__ = [
    'Idea2StoryPipeline',
    'PipelineConfig',
    'PatternSelector',
    'StoryPlanner',
    'create_planner',
    'StoryGenerator',
    'MultiAgentCritic',
    'RefinementEngine',
    'ReviewIndex',
    'RAGVerifier',
    'call_llm',
    'PROJECT_ROOT',
    'OUTPUT_DIR'
]
|
Paper-KG-Pipeline/scripts/pipeline/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (1.06 kB). View file
|
|
|