skapoor-wpi commited on
Commit
8a2906d
·
1 Parent(s): 6acb635

RAG pipeline introduced — spruced up the UI and added more insights

Browse files
.gitignore CHANGED
@@ -39,3 +39,13 @@ backend/package-lock.json
39
  # Cached embeddings
40
  backend/.description_embeddings.npy
41
  .env.example
 
 
 
 
 
 
 
 
 
 
 
39
  # Cached embeddings
40
  backend/.description_embeddings.npy
41
  .env.example
42
+
43
+ # ChromaDB vector database (rebuilt from papers via ingestion pipeline)
44
+ chroma_db/
45
+
46
+ # PDF papers (downloaded from arXiv, not checked into git)
47
+ papers/*.pdf
48
+
49
+ # Temporary / scratch
50
+ /tmp/
51
+ *.tmp
backend/app.py CHANGED
@@ -87,13 +87,40 @@ Key distinctions that matter:
87
  Why clustering matters: Methods that cluster together share fundamental design choices. Separation between clusters often reflects genuinely different philosophies (e.g., learning-based vs. analytical, or 2D vs. 3D grasp representations)."""
88
 
89
  # AI Copilot configuration
90
- # Supports: "ollama" (local, no key needed) or "huggingface" (needs HF_API_TOKEN)
91
- AI_PROVIDER = os.environ.get('AI_PROVIDER', 'ollama')
92
  OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434')
93
  OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'llama3.1:8b')
94
  HF_API_TOKEN = os.environ.get('HF_API_TOKEN', os.environ.get('HF_TOKEN', ''))
95
  HF_MODEL = os.environ.get('AI_MODEL', 'Qwen/Qwen2.5-72B-Instruct')
 
 
96
  USE_RAG = os.environ.get('USE_RAG', 'false').lower() == 'true'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
 
99
  def llm_chat(messages, max_tokens=2048, temperature=0.3):
@@ -114,6 +141,19 @@ def llm_chat(messages, max_tokens=2048, temperature=0.3):
114
  with urllib.request.urlopen(req, timeout=120) as resp:
115
  result = json.loads(resp.read().decode('utf-8'))
116
  return result['message']['content'].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  else:
118
  # HuggingFace Inference API
119
  if not HF_API_TOKEN:
@@ -616,15 +656,18 @@ DEFAULT WEIGHTS:
616
  {json.dumps(dict(DEFAULT_WEIGHTS), indent=2)}"""
617
 
618
 
 
 
 
 
 
619
  def build_method_summaries(df):
620
- """Build compact one-line summaries of all methods."""
621
  summaries = []
622
  for _, row in df.iterrows():
623
  name = row.get('Name', '')
624
  parts = []
625
- for col in DEFAULT_WEIGHTS.keys():
626
- if col == 'Description':
627
- continue
628
  val = str(row.get(col, '')) if pd.notna(row.get(col, '')) else ''
629
  if val:
630
  short = SHORT_COLUMN_NAMES.get(col, col)
@@ -633,10 +676,29 @@ def build_method_summaries(df):
633
  return '\n'.join(summaries)
634
 
635
 
636
- def retrieve_relevant_chunks(query):
637
- """Placeholder for RAG retrieval. Returns empty string.
638
- Future: query ChromaDB for relevant paper chunks."""
639
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
 
642
  def build_ai_system_prompt(df, query):
@@ -644,11 +706,18 @@ def build_ai_system_prompt(df, query):
644
  schema = build_schema_context(df)
645
  methods = build_method_summaries(df)
646
 
647
- retrieved = ""
648
- if USE_RAG:
649
- retrieved = retrieve_relevant_chunks(query)
650
- if retrieved:
651
- retrieved = f"\n\nRELEVANT PAPER EXCERPTS:\n{retrieved}"
 
 
 
 
 
 
 
652
 
653
  return f"""You are an AI copilot for the Grasp Planner Explorer, a visualization tool that shows 56 robotic grasp planning methods projected via weighted UMAP.
654
 
@@ -656,7 +725,7 @@ def build_ai_system_prompt(df, query):
656
 
657
  ALL {len(df)} METHODS:
658
  {methods}
659
- {retrieved}
660
 
661
  YOUR TASK (Pass 1 — Configuration):
662
  Given a natural language query from a researcher, respond with a JSON object containing:
@@ -667,6 +736,7 @@ Given a natural language query from a researcher, respond with a JSON object con
667
  - For SEARCH queries ("find methods for X"): highlight the strongest matches for X.
668
  - For COMPARISON queries ("how do X and Y differ?"): highlight representative examples from EACH side — e.g., 3-4 examples of X AND 3-4 examples of Y so the user sees both groups.
669
  - For EXPLORATION queries ("overview of the field"): highlight diverse, well-known methods spanning different clusters.
 
670
 
671
  FILTERING GUIDELINES:
672
  - When the query specifies attributes (e.g., "cluttered scenes"), filter to methods that have those attributes.
@@ -786,122 +856,134 @@ WHAT YOU DID:
786
  {cluster_summary}
787
 
788
  YOUR TASK (Pass 2 — Insight):
789
- Based on the ACTUAL clustering results and your domain knowledge, write concise bullet points. Format as bullet points starting with "- ".
790
 
791
  Write 3-5 bullet points that:
792
- - Explain WHY methods group together using domain knowledge (e.g., "these methods share a sampling-based approach which requires different input representations than direct regression methods")
793
- - Point out meaningful patterns not just what's in each group, but WHY that grouping matters for the researcher's query
794
- - Call attention to the highlighted best-match methods what group did they land in and what does that tell us?
795
- - Note any surprising groupings or trade-offs the researcher should be aware of
 
796
 
797
- IMPORTANT: Do NOT reference cluster numbers (e.g., "Cluster 0", "Cluster 3"). Instead, refer to groups by their defining characteristics (e.g., "the sampling-based group", "the RL + multi-finger group"). Reference specific method names and attribute values. Ground insights in both the data AND domain knowledge.
 
 
 
 
798
 
799
  Respond with ONLY the bullet points, no JSON, no markdown fences, no headers."""
800
 
801
 
802
  @app.route('/api/ai-query', methods=['POST'])
803
  def ai_query():
804
- """Two-pass AI copilot:
805
- Pass 1: LLM decides filter, weights, colorBy, highlights (from raw metadata)
806
- Pass 2: Run UMAP/clustering, feed results back to LLM for grounded insight
 
807
  """
808
- response_text = ''
809
  try:
810
  data = request.get_json() or {}
811
  query = data.get('query', '').strip()
812
  if not query:
813
  return jsonify({'success': False, 'error': 'Empty query'}), 400
814
 
815
- current_weights = data.get('currentWeights', DEFAULT_WEIGHTS)
816
- current_color_by = data.get('currentColorBy', 'cluster')
817
-
818
  df = pd.read_csv(CSV_FILE)
819
- valid_names = set(df['Name'].tolist())
820
-
821
- # ── Pass 1: Decide configuration ──────────────────────────────
822
- print(f"[Pass 1] Query: '{query}'")
823
- system_prompt = build_ai_system_prompt(df, query)
824
- user_message = f"""Current weights: {json.dumps(current_weights)}
825
- Current color-by: {current_color_by}
826
-
827
- Researcher's query: {query}"""
828
-
829
- response_text = llm_chat([
830
- {'role': 'system', 'content': system_prompt},
831
- {'role': 'user', 'content': user_message}
832
- ])
833
-
834
- # Handle potential markdown fences
835
- if response_text.startswith('```'):
836
- lines = response_text.split('\n')
837
- response_text = '\n'.join(lines[1:-1])
838
-
839
- result = json.loads(response_text)
840
-
841
- # Validate required fields (insight no longer required from Pass 1)
842
- required = ['weights', 'colorBy', 'highlightMethods']
843
- for field in required:
844
- if field not in result:
845
- return jsonify({
846
- 'success': False,
847
- 'error': f'AI response missing field: {field}'
848
- }), 500
849
-
850
- # Validate filterMethods
851
- if 'filterMethods' in result:
852
- result['filterMethods'] = [
853
- m for m in result['filterMethods'] if m in valid_names
854
- ]
855
- if not result['filterMethods'] or len(result['filterMethods']) >= len(valid_names):
856
- result['filterMethods'] = None
857
- else:
858
- result['filterMethods'] = None
859
 
860
- result['highlightMethods'] = [
861
- m for m in result['highlightMethods'] if m in valid_names
862
- ]
 
 
 
 
 
 
 
 
 
 
 
863
 
864
- # Clamp weights to 0-20
865
- for col in result['weights']:
866
- result['weights'][col] = max(0, min(20, int(result['weights'][col])))
 
 
867
 
868
- print(f"[Pass 1] Filter: {len(result['filterMethods']) if result['filterMethods'] else 'none'}, "
869
- f"Highlights: {len(result['highlightMethods'])}, ColorBy: {result['colorBy']}")
 
870
 
871
- # ── Run UMAP/Clustering pipeline ──────────────────────────────
872
- print("[Pipeline] Running UMAP + K-Means on AI-configured data...")
873
  response_data, clustering_info, _, _ = run_umap_pipeline(
874
  result['weights'], result['filterMethods']
875
  )
876
  print(f"[Pipeline] Done: {len(response_data)} methods, {clustering_info['n_clusters']} clusters")
877
 
878
- # ── Pass 2: Generate grounded insight ─────────────────────────
879
- print("[Pass 2] Generating insight from clustering results...")
880
- insight_prompt = build_insight_prompt(
881
- query, response_data, clustering_info,
882
- result['weights'], result['colorBy'],
883
- result['highlightMethods'], result['filterMethods']
884
  )
885
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
886
  insight_text = llm_chat([
887
  {'role': 'user', 'content': insight_prompt}
888
  ], max_tokens=1024)
889
- # Clean up any markdown formatting
890
  if insight_text.startswith('```'):
891
  lines = insight_text.split('\n')
892
  insight_text = '\n'.join(lines[1:-1])
893
  result['insight'] = insight_text
894
 
895
- print(f"[Pass 2] Insight generated ({len(insight_text)} chars)")
896
 
897
- # Include the UMAP data and cluster stats in the response
898
- _, cluster_stats = build_cluster_stats(response_data, clustering_info, result['weights'])
899
  result['umapData'] = response_data
900
  result['clustering'] = {
901
  'n_clusters': clustering_info['n_clusters'],
902
  'value_cluster_map': clustering_info['value_cluster_map']
903
  }
904
  result['clusterStats'] = cluster_stats
 
 
 
 
 
 
905
 
906
  return jsonify({'success': True, **result})
907
 
@@ -972,6 +1054,29 @@ Respond with ONLY the bullet points, no JSON, no markdown fences, no headers."""
972
  return jsonify({'success': False, 'error': str(e)}), 500
973
 
974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975
  @app.route('/api/health')
976
  def health():
977
  """Health check endpoint."""
 
87
  Why clustering matters: Methods that cluster together share fundamental design choices. Separation between clusters often reflects genuinely different philosophies (e.g., learning-based vs. analytical, or 2D vs. 3D grasp representations)."""
88
 
89
  # AI Copilot configuration
90
+ # Supports: "ollama", "huggingface", or "groq"
91
+ AI_PROVIDER = os.environ.get('AI_PROVIDER', 'groq')
92
  OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434')
93
  OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'llama3.1:8b')
94
  HF_API_TOKEN = os.environ.get('HF_API_TOKEN', os.environ.get('HF_TOKEN', ''))
95
  HF_MODEL = os.environ.get('AI_MODEL', 'Qwen/Qwen2.5-72B-Instruct')
96
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '')
97
+ GROQ_MODEL = os.environ.get('GROQ_MODEL', 'llama-3.3-70b-versatile')
98
  USE_RAG = os.environ.get('USE_RAG', 'false').lower() == 'true'
99
+ USE_TOOLS = os.environ.get('USE_TOOLS', 'true').lower() == 'true'
100
+
101
+ # RAG + Tool calling initialization (lazy-loaded)
102
+ _rag_retriever = None
103
+ _rag_config = None
104
+
105
def _get_rag_config():
    """Return the cached RAG config, loading it on first use.

    Reads rag_config.yaml next to the app (BASE_DIR). When the file is
    absent the cached value stays None and the check repeats on the next
    call, so the config can be dropped in without restarting.
    """
    global _rag_config
    if _rag_config is not None:
        return _rag_config
    cfg_file = os.path.join(BASE_DIR, 'rag_config.yaml')
    if os.path.exists(cfg_file):
        from rag.config import load_config
        _rag_config = load_config(cfg_file)
    return _rag_config
113
+
114
def _get_rag_retriever():
    """Return the cached RAG retriever, building it lazily.

    Returns None when no RAG config is available. The embedder reuses the
    app-level sentence-transformer instance (st_model) so the model is not
    loaded twice.
    """
    global _rag_retriever
    if _rag_retriever is not None:
        return _rag_retriever
    config = _get_rag_config()
    if config:
        from rag.retrieval.retriever import RAGRetriever
        from rag.ingest.embedder import ChunkEmbedder
        shared_embedder = ChunkEmbedder(
            model_name=config.embedding_model, model_instance=st_model
        )
        _rag_retriever = RAGRetriever(config=config, embedder=shared_embedder)
    return _rag_retriever
124
 
125
 
126
  def llm_chat(messages, max_tokens=2048, temperature=0.3):
 
141
  with urllib.request.urlopen(req, timeout=120) as resp:
142
  result = json.loads(resp.read().decode('utf-8'))
143
  return result['message']['content'].strip()
144
+ elif AI_PROVIDER == 'groq':
145
+ # Groq (free, fast, OpenAI-compatible)
146
+ if not GROQ_API_KEY:
147
+ raise ValueError('GROQ_API_KEY not configured. Set it as an environment variable.')
148
+ from groq import Groq
149
+ client = Groq(api_key=GROQ_API_KEY)
150
+ completion = client.chat.completions.create(
151
+ model=GROQ_MODEL,
152
+ messages=messages,
153
+ max_tokens=max_tokens,
154
+ temperature=temperature,
155
+ )
156
+ return completion.choices[0].message.content.strip()
157
  else:
158
  # HuggingFace Inference API
159
  if not HF_API_TOKEN:
 
656
  {json.dumps(dict(DEFAULT_WEIGHTS), indent=2)}"""
657
 
658
 
659
+ SUMMARY_COLUMNS = [
660
+ 'Planning Method', 'End-effector Hardware', 'Input Data',
661
+ 'Training Data', 'Object Configuration',
662
+ ]
663
+
664
  def build_method_summaries(df):
665
+ """Build compact one-line summaries of all methods (key columns only to save tokens)."""
666
  summaries = []
667
  for _, row in df.iterrows():
668
  name = row.get('Name', '')
669
  parts = []
670
+ for col in SUMMARY_COLUMNS:
 
 
671
  val = str(row.get(col, '')) if pd.notna(row.get(col, '')) else ''
672
  if val:
673
  short = SHORT_COLUMN_NAMES.get(col, col)
 
676
  return '\n'.join(summaries)
677
 
678
 
679
def retrieve_relevant_chunks(query, paper_ids=None):
    """Retrieve relevant paper chunks from ChromaDB.

    Returns (prompt_text, citations) where prompt_text is formatted for LLM
    injection and citations is structured data for the frontend.
    """
    # Only consult the retriever when RAG is enabled; otherwise degrade to
    # an empty context so callers need no special-casing.
    retriever = _get_rag_retriever() if USE_RAG else None
    if retriever is None:
        return "", []
    try:
        from rag.retrieval.formatter import format_for_prompt, format_chunk_citations
        config = _get_rag_config()
        chunks = retriever.retrieve(query, paper_ids=paper_ids)
        budget = config.retrieval.token_budget if config else 3000
        prompt_text = format_for_prompt(chunks, token_budget=budget)
        citations = format_chunk_citations(chunks)
        print(f"[RAG] Retrieved {len(chunks)} chunks ({len(prompt_text)} chars)")
        return prompt_text, citations
    except Exception as e:
        # Best-effort: retrieval failures fall back to no context rather
        # than failing the whole request.
        print(f"[RAG] Error: {e}")
        return "", []
702
 
703
 
704
  def build_ai_system_prompt(df, query):
 
706
  schema = build_schema_context(df)
707
  methods = build_method_summaries(df)
708
 
709
+ tools_section = ""
710
+ if USE_TOOLS or USE_RAG:
711
+ try:
712
+ import rag.tools # triggers registration of all tools including search_papers
713
+ from rag.tools.registry import get_tool_prompt_section
714
+ tools_section = "\n\n" + get_tool_prompt_section()
715
+ except Exception:
716
+ pass
717
+
718
+ tools_instruction = ""
719
+ if tools_section:
720
+ tools_instruction = '\n5. "tools" (OPTIONAL) - Array of tool calls if the query needs computed results or paper content. Each: {"name": "tool_name", "arguments": {...}}. Use "search_papers" when the query asks about specific techniques, loss functions, architectures, or anything that requires reading actual paper content.'
721
 
722
  return f"""You are an AI copilot for the Grasp Planner Explorer, a visualization tool that shows 56 robotic grasp planning methods projected via weighted UMAP.
723
 
 
725
 
726
  ALL {len(df)} METHODS:
727
  {methods}
728
+ {tools_section}
729
 
730
  YOUR TASK (Pass 1 — Configuration):
731
  Given a natural language query from a researcher, respond with a JSON object containing:
 
736
  - For SEARCH queries ("find methods for X"): highlight the strongest matches for X.
737
  - For COMPARISON queries ("how do X and Y differ?"): highlight representative examples from EACH side — e.g., 3-4 examples of X AND 3-4 examples of Y so the user sees both groups.
738
  - For EXPLORATION queries ("overview of the field"): highlight diverse, well-known methods spanning different clusters.
739
+ {tools_instruction}
740
 
741
  FILTERING GUIDELINES:
742
  - When the query specifies attributes (e.g., "cluttered scenes"), filter to methods that have those attributes.
 
856
  {cluster_summary}
857
 
858
  YOUR TASK (Pass 2 — Insight):
859
+ Based on the ACTUAL clustering results, paper excerpts (if provided), and computed tool results (if provided), write concise bullet points. Format as bullet points starting with "- ".
860
 
861
  Write 3-5 bullet points that:
862
+ - DIRECTLY ANSWER the researcher's query using specific evidence from the paper excerpts and clustering results
863
+ - When paper excerpts are provided, CITE specific papers by name (e.g., "Contact-GraspNet uses a binary cross-entropy loss on predicted contact points")
864
+ - Reference concrete technical details from the papers, not generic descriptions
865
+ - Point out meaningful patterns relevant to the query, grounded in actual paper content
866
+ - If computed results are provided (e.g., similarity scores, distributions), incorporate the exact numbers
867
 
868
+ IMPORTANT RULES:
869
+ - Do NOT reference cluster numbers (e.g., "Cluster 0", "Cluster 3"). Refer to groups by their defining characteristics.
870
+ - Do NOT give generic overviews of the clusters. Focus on answering the specific query.
871
+ - When paper excerpts are available, prioritize insights derived from actual paper content over general domain knowledge.
872
+ - Reference specific method names and attribute values.
873
 
874
  Respond with ONLY the bullet points, no JSON, no markdown fences, no headers."""
875
 
876
 
877
  @app.route('/api/ai-query', methods=['POST'])
878
  def ai_query():
879
+ """Deterministic pipeline + single LLM call:
880
+ 1. Deterministic: embed query, search vector DB, compute weights/filters/highlights
881
+ 2. Pipeline: run UMAP + HDBSCAN with computed weights
882
+ 3. LLM: interpret results with RAG context (single, small prompt)
883
  """
 
884
  try:
885
  data = request.get_json() or {}
886
  query = data.get('query', '').strip()
887
  if not query:
888
  return jsonify({'success': False, 'error': 'Empty query'}), 400
889
 
 
 
 
890
  df = pd.read_csv(CSV_FILE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
 
892
+ # ── Step 1: Deterministic query analysis ──────────────────────
893
+ print(f"[Query] '{query}'")
894
+ from rag.query_engine import deterministic_query_pipeline
895
+ retriever = _get_rag_retriever()
896
+ pipeline_result = deterministic_query_pipeline(
897
+ query, df, st_model, DEFAULT_WEIGHTS, retriever=retriever
898
+ )
899
+
900
+ result = {
901
+ 'weights': pipeline_result['weights'],
902
+ 'colorBy': pipeline_result['colorBy'],
903
+ 'filterMethods': pipeline_result['filterMethods'],
904
+ 'highlightMethods': pipeline_result['highlightMethods'],
905
+ }
906
 
907
+ rag_text = pipeline_result['rag_text']
908
+ rag_citations = pipeline_result['rag_citations']
909
+ rag_analytics = pipeline_result.get('rag_analytics', {})
910
+ method_relevance = pipeline_result.get('method_relevance', [])
911
+ method_summaries = pipeline_result['relevant_method_summaries']
912
 
913
+ print(f"[Deterministic] Filter: {len(result['filterMethods']) if result['filterMethods'] else 'none'}, "
914
+ f"Highlights: {len(result['highlightMethods'])}, ColorBy: {result['colorBy']}, "
915
+ f"RAG chunks: {len(rag_citations)}")
916
 
917
+ # ── Step 2: Run UMAP/Clustering pipeline ─────────────────────
918
+ print("[Pipeline] Running UMAP + HDBSCAN...")
919
  response_data, clustering_info, _, _ = run_umap_pipeline(
920
  result['weights'], result['filterMethods']
921
  )
922
  print(f"[Pipeline] Done: {len(response_data)} methods, {clustering_info['n_clusters']} clusters")
923
 
924
+ # ── Step 3: Single LLM call (interpret results) ──────────────
925
+ print("[LLM] Generating insight...")
926
+ _, cluster_stats = build_cluster_stats(
927
+ response_data, clustering_info, result['weights']
 
 
928
  )
929
 
930
+ # Build compact cluster summary
931
+ compact_clusters = []
932
+ for cs in cluster_stats:
933
+ compact_clusters.append(f"- {cs['label']} ({cs['size']} methods): {', '.join(cs['methods'][:5])}")
934
+ cluster_text = '\n'.join(compact_clusters)
935
+
936
+ # Build the single, well-structured prompt
937
+ insight_prompt = f"""You are an expert research assistant for a robotic grasp planning visualization tool. A researcher has queried the system and you have access to real data from academic papers and clustering analysis.
938
+
939
+ RESEARCHER'S QUESTION: "{query}"
940
+
941
+ EVIDENCE FROM PAPERS:
942
+ {rag_text if rag_text else '(No paper excerpts available for this query)'}
943
+
944
+ RELEVANT METHODS IN THE DATASET:
945
+ {method_summaries}
946
+
947
+ CLUSTERING RESULTS ({len(response_data)} methods in {len(cluster_stats)} groups):
948
+ {cluster_text}
949
+
950
+ Highlighted methods (most relevant to query): {', '.join(result['highlightMethods'][:6])}
951
+
952
+ INSTRUCTIONS:
953
+ Write exactly 3-5 bullet points that answer the researcher's question. Each bullet must start with "- ".
954
+
955
+ Rules:
956
+ 1. Lead with evidence from the paper excerpts. Quote specific techniques, equations, or results by paper name (e.g., "Contact-GraspNet uses a binary cross-entropy loss on predicted contact points").
957
+ 2. When no paper excerpt covers a point, draw on the method metadata (planning approach, gripper type, etc.) to provide grounded analysis.
958
+ 3. Connect findings to the clustering: explain why methods using similar approaches end up in the same group.
959
+ 4. Be specific and technical. Avoid generic statements like "various methods use different approaches."
960
+ 5. Never reference cluster numbers. Use group names like "the sampling-based parallel-jaw group."
961
+
962
+ Respond with ONLY the bullet points, nothing else."""
963
+
964
  insight_text = llm_chat([
965
  {'role': 'user', 'content': insight_prompt}
966
  ], max_tokens=1024)
 
967
  if insight_text.startswith('```'):
968
  lines = insight_text.split('\n')
969
  insight_text = '\n'.join(lines[1:-1])
970
  result['insight'] = insight_text
971
 
972
+ print(f"[LLM] Insight: {len(insight_text)} chars")
973
 
974
+ # ── Build response ────────────────────────────────────────────
 
975
  result['umapData'] = response_data
976
  result['clustering'] = {
977
  'n_clusters': clustering_info['n_clusters'],
978
  'value_cluster_map': clustering_info['value_cluster_map']
979
  }
980
  result['clusterStats'] = cluster_stats
981
+ if rag_citations:
982
+ result['ragCitations'] = rag_citations
983
+ if rag_analytics:
984
+ result['ragAnalytics'] = rag_analytics
985
+ if method_relevance:
986
+ result['methodRelevance'] = method_relevance
987
 
988
  return jsonify({'success': True, **result})
989
 
 
1054
  return jsonify({'success': False, 'error': str(e)}), 500
1055
 
1056
 
1057
@app.route('/api/papers/<path:paper_id>')
def serve_paper(paper_id):
    """Serve a PDF from the papers directory.

    The id is sanitized to a whitelist of filename characters (letters,
    digits, dot, hyphen, underscore) so path separators can never reach the
    filesystem. Dots must be allowed — /api/papers lists ids such as arXiv
    "2103.14127" which contain one — so ".." is rejected explicitly, and
    send_from_directory re-checks directory containment as a second layer.
    """
    papers_dir = os.path.join(BASE_DIR, 'papers')
    import re as _re
    safe_id = _re.sub(r'[^a-zA-Z0-9.\-_]', '', paper_id.replace('.pdf', ''))
    if not safe_id or '..' in safe_id:
        return jsonify({'error': 'Paper not found'}), 404
    pdf_path = os.path.join(papers_dir, f'{safe_id}.pdf')
    if os.path.isfile(pdf_path):
        return send_from_directory(papers_dir, f'{safe_id}.pdf', mimetype='application/pdf')
    return jsonify({'error': 'Paper not found'}), 404
1068
+
1069
+
1070
@app.route('/api/papers')
def list_papers():
    """List available PDF papers as ids (filenames without the .pdf extension)."""
    papers_dir = os.path.join(BASE_DIR, 'papers')
    if not os.path.isdir(papers_dir):
        return jsonify({'papers': []})
    # splitext strips only the final extension; str.replace('.pdf', '')
    # would also mangle a '.pdf' occurring earlier in the filename.
    pdfs = [os.path.splitext(f)[0]
            for f in sorted(os.listdir(papers_dir))
            if f.endswith('.pdf')]
    return jsonify({'papers': pdfs})
1078
+
1079
+
1080
  @app.route('/api/health')
1081
  def health():
1082
  """Health check endpoint."""
backend/rag/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Domain-agnostic RAG pipeline for academic paper exploration."""
backend/rag/config.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain-agnostic RAG configuration. One YAML file describes any paper collection."""
2
+
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional
6
+
7
+ import yaml
8
+
9
+
10
@dataclass
class ChunkingConfig:
    """Token-size and splitting parameters for the three chunking layers.

    Coarse chunks cover paper-level overviews, mid chunks follow semantic
    topic boundaries within sections (with overlap), and fine chunks are
    small sentence groups for precise retrieval.
    """

    coarse_max_tokens: int = 800
    mid_min_tokens: int = 200
    mid_max_tokens: int = 800
    mid_overlap_ratio: float = 0.15
    fine_min_tokens: int = 50
    fine_max_tokens: int = 300
    semantic_similarity_threshold: float = 0.35
    strategies: list = field(default_factory=lambda: ["semantic"])
    # Domain keyword vocabulary used for topic tagging of chunks.
    domain_topics: list = field(default_factory=list)
21
+
22
+
23
@dataclass
class RetrievalConfig:
    """How many chunks to pull from each layer and the overall token budget."""

    coarse_top_k: int = 2
    mid_top_k: int = 4
    fine_top_k: int = 4
    # Total tokens of retrieved context injected into the LLM prompt.
    token_budget: int = 3000
    rerank: bool = False
30
+
31
+
32
@dataclass
class RAGConfig:
    """Top-level, domain-agnostic RAG settings for one paper collection.

    Maps a dataset CSV (name/description/link columns), an embedding model,
    and a ChromaDB location to the nested chunking and retrieval settings.
    """

    project_name: str = "default"
    domain_context: str = ""
    csv_path: str = ""
    name_column: str = "Name"
    description_column: str = "Description"
    link_column: str = "Link(s)"
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dimensions: int = 384
    chroma_persist_dir: str = "./chroma_db"
    collection_name: str = "papers"
    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)
    tools_enabled: bool = True
    dataset_columns: list = field(default_factory=list)
48
+
49
+
50
def load_config(path: str) -> RAGConfig:
    """Load RAG configuration from a YAML file.

    Unknown keys are filtered out rather than raising, so configs written by
    newer versions of the tool still load. An empty file (safe_load returns
    None) or an explicit ``chunking: null`` / ``retrieval: null`` yields the
    corresponding all-defaults section instead of crashing.
    """
    with open(path, 'r') as f:
        raw = yaml.safe_load(f) or {}  # None for an empty YAML document

    chunking_raw = raw.pop('chunking', {}) or {}
    retrieval_raw = raw.pop('retrieval', {}) or {}

    config = RAGConfig(**{k: v for k, v in raw.items() if k in RAGConfig.__dataclass_fields__})
    config.chunking = ChunkingConfig(**{k: v for k, v in chunking_raw.items() if k in ChunkingConfig.__dataclass_fields__})
    config.retrieval = RetrievalConfig(**{k: v for k, v in retrieval_raw.items() if k in RetrievalConfig.__dataclass_fields__})

    return config
63
+
64
+
65
def create_default_config(project_name: str, csv_path: str, domain_context: str = "") -> RAGConfig:
    """Generate a sensible starting config for a new project."""
    cfg = RAGConfig()
    cfg.project_name = project_name
    cfg.domain_context = domain_context
    cfg.csv_path = csv_path
    return cfg
72
+
73
+
74
def save_config(config: RAGConfig, path: str):
    """Save configuration to YAML.

    Serializes every dataclass field — including the nested chunking and
    retrieval sections — in declaration order, so the file round-trips
    through load_config. Using dataclasses.asdict instead of a hand-written
    field dict means newly added fields are saved automatically instead of
    being silently dropped.
    """
    from dataclasses import asdict  # local: module top imports only dataclass/field
    data = asdict(config)
    with open(path, 'w') as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False)
backend/rag/ingest/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Paper ingestion: PDF parsing, chunking, embedding, and storage."""
backend/rag/ingest/chunker.py ADDED
@@ -0,0 +1,762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hybrid structural + semantic chunker with domain-aware metadata.
2
+
3
+ Three-layer hierarchy:
4
+ - Coarse: paper-level overview (abstract, section summaries, figure captions)
5
+ - Mid: semantic splitting within sections (topic-boundary detection via
6
+ sentence embeddings), with configurable overlap between chunks
7
+ - Fine: granular sentence groups for precise retrieval
8
+
9
+ Each chunk is enriched with:
10
+ - domain_topics: matched keywords from a configurable domain vocabulary
11
+ - rhetorical_role: heuristic classification (algorithm_description, result, etc.)
12
+ - content_type: theory vs implementation vs evaluation
13
+ - chunk_type: abstract, equation, figure, citation, plain, etc.
14
+ """
15
+
16
+ import re
17
+ import numpy as np
18
+ from abc import ABC, abstractmethod
19
+ from collections import Counter
20
+ from dataclasses import dataclass, field
21
+
22
+ from .pdf_parser import ParsedPaper, ParsedSection
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Chunk dataclass
27
+ # ---------------------------------------------------------------------------
28
+
29
@dataclass
class Chunk:
    """One retrievable unit of paper text plus its retrieval metadata."""

    chunk_id: str
    paper_id: str
    paper_title: str
    text: str
    # Hierarchy layer: "coarse", "mid", or "fine".
    layer: str
    # One of: "abstract", "section_summary", "figure_captions", "equation",
    # "citation_context", "semantic_group", "paragraph".
    chunk_type: str
    section: str
    subsection: str = ""
    page: int = 0
    # Normalized position within the paper, 0.0 (start) to 1.0 (end).
    position: float = 0.0
    token_count: int = 0
    domain_topics: list = field(default_factory=list)
    # Heuristic label: algorithm_description, experimental_setup, result, ...
    rhetorical_role: str = ""
    # Coarse classification: theory, implementation, evaluation.
    content_type: str = ""
    metadata: dict = field(default_factory=dict)
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Utility functions
51
+ # ---------------------------------------------------------------------------
52
+
53
def estimate_tokens(text: str) -> int:
    """Rough token estimate: one token per whitespace-separated word."""
    words = text.split()
    return len(words)
56
+
57
+
58
+ def _normalize_section_name(title: str) -> str:
59
+ """Strip leading numbers from section titles."""
60
+ clean = re.sub(r'^\d+\.?\d*\.?\s*', '', title).strip()
61
+ return clean if clean else title
62
+
63
+
64
+ def _split_paragraphs(text: str) -> list:
65
+ """Split on double newlines."""
66
+ paragraphs = re.split(r'\n\s*\n|\n{2,}', text)
67
+ return [p.strip() for p in paragraphs if p.strip()]
68
+
69
+
70
+ def _split_sentences(text: str) -> list:
71
+ """Split at sentence boundaries (period/question/exclamation followed by uppercase)."""
72
+ sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
73
+ return [s.strip() for s in sentences if s.strip()]
74
+
75
+
76
+ SKIP_SECTIONS = {'references', 'acknowledgments', 'acknowledgements', 'bibliography'}
77
+
78
+
79
+ def _should_skip_section(name: str) -> bool:
80
+ return name.lower().strip() in SKIP_SECTIONS
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Domain topic extraction
85
+ # ---------------------------------------------------------------------------
86
+
87
def extract_domain_topics(text: str, domain_keywords: list) -> list:
    """Match chunk text against a domain keyword vocabulary.

    Case-insensitive matching: multi-word keywords use substring matching,
    single-word keywords use word-boundary matching (so "arm" does not
    match "harmonic").

    Args:
        text: Chunk text to scan.
        domain_keywords: Vocabulary of domain terms (original casing kept
            in the result).

    Returns:
        Deduplicated list of matched keywords sorted by frequency of
        occurrence in the text, most frequent first; ties keep vocabulary
        order. (The previous implementation documented frequency sorting
        but returned vocabulary order.)
    """
    if not domain_keywords:
        return []
    text_lower = text.lower()
    counts = {}  # keyword -> occurrence count; insertion order = vocabulary order
    for kw in domain_keywords:
        if kw in counts:
            continue  # deduplicate repeated vocabulary entries
        kw_lower = kw.lower()
        if ' ' in kw_lower:
            n = text_lower.count(kw_lower)
        else:
            n = len(re.findall(r'\b' + re.escape(kw_lower) + r'\b', text_lower))
        if n > 0:
            counts[kw] = n
    # sorted() is stable, so equal-frequency keywords keep vocabulary order.
    return sorted(counts, key=counts.get, reverse=True)
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Rhetorical role and content type classification (heuristic)
111
+ # ---------------------------------------------------------------------------
112
+
113
# Regex cues for each rhetorical role; a role's score is the number of its
# patterns that match the (lowercased) chunk text.
ROLE_PATTERNS = {
    'problem_statement': [
        r'\b(we address|the problem of|challenge of|goal is to|aim to)\b',
    ],
    'algorithm_description': [
        r'\b(we propose|our method|our approach|architecture|pipeline|network|module)\b',
        r'\b(algorithm \d|step \d|procedure)\b',
    ],
    'experimental_setup': [
        r'\b(we evaluate|experiment|setup|dataset|baseline|benchmark|hardware|robot platform)\b',
        r'\b(training details|hyperparameter|implementation detail|we train)\b',
    ],
    'result': [
        r'\b(table \d|figure \d|fig\.\s*\d|results show|we achieve|accuracy|success rate|outperform)\b',
        r'\b(ablation|comparison|performance|improvement|f1|precision|recall)\b',
    ],
    'comparison': [
        r'\b(compared to|in contrast|unlike|whereas|prior work|related work|existing method)\b',
    ],
    'limitation': [
        r'\b(limitation|failure|drawback|future work|open question|cannot|does not)\b',
    ],
    'definition': [
        r'\b(we define|denoted by|let \w+ be|formally|definition)\b',
    ],
}

# Coarse mapping from rhetorical role to content type, used when the
# section name gives no signal.
CONTENT_TYPE_MAP = {
    'algorithm_description': 'theory',
    'definition': 'theory',
    'problem_statement': 'theory',
    'experimental_setup': 'implementation',
    'result': 'evaluation',
    'comparison': 'evaluation',
    'limitation': 'evaluation',
}


def classify_rhetorical_role(text: str) -> str:
    """Pick the rhetorical role whose patterns match the text most often.

    Ties resolve to the role declared earliest in ROLE_PATTERNS; "general"
    when nothing matches.
    """
    lowered = text.lower()
    scores = {
        role: hits
        for role, patterns in ROLE_PATTERNS.items()
        if (hits := sum(1 for p in patterns if re.search(p, lowered))) > 0
    }
    return max(scores, key=scores.get) if scores else "general"


def classify_content_type(rhetorical_role: str, section_name: str) -> str:
    """Derive content_type, letting the section name override the role mapping."""
    sec = section_name.lower()
    # Section-name rules are checked in priority order.
    section_rules = (
        ('evaluation', ('experiment', 'result', 'evaluation', 'ablation')),
        ('theory', ('method', 'approach', 'model', 'architecture', 'algorithm')),
        ('implementation', ('implement', 'training', 'setup', 'detail')),
    )
    for content_type, keywords in section_rules:
        if any(k in sec for k in keywords):
            return content_type
    # Fall back to the rhetorical-role mapping.
    return CONTENT_TYPE_MAP.get(rhetorical_role, 'general')
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Equation and citation detection
180
+ # ---------------------------------------------------------------------------
181
+
182
# LaTeX environments, inline "$...$" math, or bare "x = <long expression>".
EQUATION_RE = re.compile(
    r'(?:'
    r'\\begin\{(?:equation|align|gather)\}.*?\\end\{(?:equation|align|gather)\}'
    r'|[A-Za-z]\s*=\s*[^,\n]{10,}'
    r'|\$[^$]+\$'
    r')',
    re.DOTALL
)

# Numeric-bracket citations "[12, 13]" or author-year "(Smith et al., 2020)".
CITATION_RE = re.compile(
    r'(?:\[[\d,\s\-]+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\))',
)


def detect_chunk_type(text: str) -> str:
    """Classify a chunk as equation-heavy, citation-heavy, or plain text.

    A chunk counts as "equation" with 3+ equation matches or >2% equation
    density, as "citation_context" with 4+ citation matches or >3% density;
    equation wins when both apply. Empty text is "plain".
    """
    equations = len(EQUATION_RE.findall(text))
    citations = len(CITATION_RE.findall(text))
    tokens = estimate_tokens(text)
    if tokens:
        if equations >= 3 or equations / tokens > 0.02:
            return "equation"
        if citations >= 4 or citations / tokens > 0.03:
            return "citation_context"
    return "plain"
209
+
210
+
211
+ # ---------------------------------------------------------------------------
212
+ # Semantic sentence similarity (for topic-boundary detection)
213
+ # ---------------------------------------------------------------------------
214
+
215
+ def _compute_sentence_similarities(sentences: list, model) -> np.ndarray:
216
+ """Embed sentences and compute consecutive cosine similarities.
217
+
218
+ Returns array of shape (n_sentences - 1,) where element i is the
219
+ cosine similarity between sentence i and sentence i+1.
220
+ """
221
+ if len(sentences) < 2:
222
+ return np.array([])
223
+ embeddings = model.encode(sentences, show_progress_bar=False)
224
+ # Normalize
225
+ norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
226
+ norms[norms == 0] = 1
227
+ embeddings = embeddings / norms
228
+ # Consecutive cosine similarities
229
+ sims = np.array([
230
+ np.dot(embeddings[i], embeddings[i + 1])
231
+ for i in range(len(embeddings) - 1)
232
+ ])
233
+ return sims
234
+
235
+
236
+ def _find_semantic_boundaries(sims: np.ndarray, threshold: float) -> list:
237
+ """Find indices where consecutive similarity drops below threshold.
238
+
239
+ Returns list of split points (indices into the similarity array).
240
+ A split at index i means: cut AFTER sentence i.
241
+ """
242
+ boundaries = []
243
+ for i, sim in enumerate(sims):
244
+ if sim < threshold:
245
+ boundaries.append(i)
246
+ return boundaries
247
+
248
+
249
+ def _group_sentences_by_boundaries(sentences: list, boundaries: list) -> list:
250
+ """Group sentences into segments based on boundary indices."""
251
+ groups = []
252
+ start = 0
253
+ for b in sorted(boundaries):
254
+ cut = b + 1 # cut after sentence b
255
+ if cut > start:
256
+ groups.append(sentences[start:cut])
257
+ start = cut
258
+ if start < len(sentences):
259
+ groups.append(sentences[start:])
260
+ return groups
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # Overlap generation
265
+ # ---------------------------------------------------------------------------
266
+
267
def _apply_overlap(chunks: list, overlap_ratio: float) -> list:
    """Add overlap between consecutive same-section mid-level chunks.

    Takes the last N sentences of chunk i and prepends them to chunk i+1,
    where N is determined by overlap_ratio * chunk_i token count.

    Args:
        chunks: Mid-level chunks in document order.
        overlap_ratio: Fraction of the previous chunk's tokens to repeat at
            the start of the next chunk; <= 0 disables overlap.

    Returns:
        New list. Overlapped entries are fresh Chunk objects (the inputs are
        never mutated), flagged with metadata['has_overlap'] = True.
    """
    if overlap_ratio <= 0 or len(chunks) < 2:
        return chunks

    result = [chunks[0]]
    for i in range(1, len(chunks)):
        prev = chunks[i - 1]
        curr = chunks[i]

        # Only overlap within same section — repeating text across a
        # section boundary would blur the structural cut.
        if prev.section != curr.section:
            result.append(curr)
            continue

        overlap_tokens = int(prev.token_count * overlap_ratio)
        # Skip negligible overlaps (< 10 tokens): noise without benefit.
        if overlap_tokens < 10:
            result.append(curr)
            continue

        # Walk backwards over the previous chunk's sentences, collecting
        # whole sentences until the token budget would be exceeded.
        prev_sentences = _split_sentences(prev.text)
        overlap_sents = []
        acc = 0
        for s in reversed(prev_sentences):
            t = estimate_tokens(s)
            if acc + t > overlap_tokens:
                break
            overlap_sents.insert(0, s)  # restore original sentence order
            acc += t

        if overlap_sents:
            overlap_text = ' '.join(overlap_sents)
            new_text = overlap_text + ' ' + curr.text
            # Rebuild rather than mutate so callers holding the original
            # chunk objects are unaffected; token_count is recomputed.
            result.append(Chunk(
                chunk_id=curr.chunk_id,
                paper_id=curr.paper_id,
                paper_title=curr.paper_title,
                text=new_text,
                layer=curr.layer,
                chunk_type=curr.chunk_type,
                section=curr.section,
                subsection=curr.subsection,
                page=curr.page,
                position=curr.position,
                token_count=estimate_tokens(new_text),
                domain_topics=curr.domain_topics,
                rhetorical_role=curr.rhetorical_role,
                content_type=curr.content_type,
                metadata={**curr.metadata, 'has_overlap': True},
            ))
        else:
            # First sentence alone already exceeds the budget: keep as-is.
            result.append(curr)

    return result
326
+
327
+
328
+ # ---------------------------------------------------------------------------
329
+ # Enrichment: apply domain topics, rhetorical role, content type to all chunks
330
+ # ---------------------------------------------------------------------------
331
+
332
def _enrich_chunk(chunk: Chunk, domain_keywords: list) -> Chunk:
    """Add domain_topics, rhetorical_role, and content_type to a chunk.

    Mutates and returns the same Chunk instance.
    """
    chunk.domain_topics = extract_domain_topics(chunk.text, domain_keywords)
    chunk.rhetorical_role = classify_rhetorical_role(chunk.text)
    chunk.content_type = classify_content_type(chunk.rhetorical_role, chunk.section)

    # For generic body chunks only, an equation- or citation-dense body
    # overrides the chunk_type so the retriever can filter on it. Abstract
    # and figure-caption chunks keep their structural types.
    if chunk.chunk_type in ('plain', 'semantic_group', 'paragraph', 'subsection'):
        detected = detect_chunk_type(chunk.text)
        if detected != 'plain':
            chunk.chunk_type = detected

    return chunk
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # Chunking strategies
349
+ # ---------------------------------------------------------------------------
350
+
351
class ChunkingStrategy(ABC):
    """Interface for one chunking layer: turn a parsed paper into Chunk objects."""

    @abstractmethod
    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        # config: chunking configuration with token limits / thresholds.
        # model: optional sentence-transformer used by semantic strategies.
        pass
355
+
356
+
357
class CoarseChunker(ChunkingStrategy):
    """Layer 1: Paper-level overview chunks."""

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Build coarse chunks: title+abstract, per-section summaries, figure captions.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``coarse_max_tokens``.
            model: Unused at this layer (kept for strategy-interface parity).
        """
        chunks = []
        max_tokens = config.coarse_max_tokens

        # 1. Title + Abstract
        if paper.abstract:
            text = f"{paper.title}\n\n{paper.abstract}"
            # Hard-truncate by word count when over budget.
            if estimate_tokens(text) > max_tokens:
                text = ' '.join(text.split()[:max_tokens])
            chunks.append(Chunk(
                chunk_id=f"{paper.paper_id}_coarse_abstract",
                paper_id=paper.paper_id,
                paper_title=paper.title,
                text=text,
                layer="coarse",
                chunk_type="abstract",
                section="Abstract",
                page=0,
                position=0.0,
                token_count=estimate_tokens(text),
            ))

        # 2. Section summaries
        total_sections = len(paper.sections) or 1  # avoid division by zero
        for i, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            tokens = estimate_tokens(section.text)
            if tokens <= max_tokens:
                summary_text = section.text
            else:
                # Cheap extractive summary: first + last paragraph, which
                # typically carry the section's setup and wrap-up.
                paragraphs = _split_paragraphs(section.text)
                if len(paragraphs) >= 2:
                    summary_text = paragraphs[0] + "\n\n" + paragraphs[-1]
                else:
                    summary_text = ' '.join(section.text.split()[:max_tokens])

            chunks.append(Chunk(
                chunk_id=f"{paper.paper_id}_coarse_sec_{i}",
                paper_id=paper.paper_id,
                paper_title=paper.title,
                text=summary_text,
                layer="coarse",
                chunk_type="section_summary",
                section=section_name,
                page=section.page_start,
                position=round(i / total_sections, 2),
                token_count=estimate_tokens(summary_text),
            ))

        # 3. Figure/table captions — all captions merged into one chunk.
        if paper.figures:
            captions = '\n'.join(f.caption for f in paper.figures)
            chunks.append(Chunk(
                chunk_id=f"{paper.paper_id}_coarse_figures",
                paper_id=paper.paper_id,
                paper_title=paper.title,
                text=captions,
                layer="coarse",
                chunk_type="figure_captions",
                section="Figures",
                position=0.5,  # nominal mid-paper position
                token_count=estimate_tokens(captions),
            ))

        return chunks
428
+
429
+
430
class SemanticChunker(ChunkingStrategy):
    """Layer 2: Structural boundaries as hard cuts, semantic similarity for
    soft topic-boundary detection within sections.

    Within each section:
    1. Split into sentences
    2. Embed every sentence with the sentence-transformer
    3. Compute consecutive cosine similarities
    4. Cut where similarity drops below threshold (topic shift)
    5. Group sentences between cuts into chunks
    6. Apply min/max token constraints (merge small groups, split large ones)
    7. Add overlap between consecutive chunks
    """

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Build mid-level "semantic_group" chunks for every content section.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``mid_min_tokens``, ``mid_max_tokens`` and
                ``semantic_similarity_threshold``.
            model: Optional sentence-transformer; when None (or a section
                has fewer than 3 sentences), falls back to paragraph-based
                grouping with no embeddings.
        """
        chunks = []
        min_tokens = config.mid_min_tokens
        max_tokens = config.mid_max_tokens
        threshold = config.semantic_similarity_threshold
        total_sections = len(paper.sections) or 1  # avoid division by zero

        for sec_idx, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            sentences = _split_sentences(section.text)
            if not sentences:
                continue

            # --- Semantic boundary detection ---
            if model is not None and len(sentences) >= 3:
                sims = _compute_sentence_similarities(sentences, model)
                boundaries = _find_semantic_boundaries(sims, threshold)
                groups = _group_sentences_by_boundaries(sentences, boundaries)
            else:
                # Fallback: paragraph-based grouping (no embeddings needed)
                paragraphs = _split_paragraphs(section.text)
                groups = [_split_sentences(p) for p in paragraphs if p.strip()]
                if not groups:
                    groups = [sentences]

            # --- Enforce min/max token constraints ---
            # Greedily pack sentence groups into a buffer up to max_tokens;
            # undersized buffers absorb the next group or merge backwards.
            merged_groups = []
            buffer = []
            buffer_tokens = 0

            for group in groups:
                group_text = ' '.join(group)
                group_tokens = estimate_tokens(group_text)

                if buffer_tokens + group_tokens <= max_tokens:
                    # Group fits: keep accumulating.
                    buffer.extend(group)
                    buffer_tokens += group_tokens
                else:
                    if buffer and buffer_tokens >= min_tokens:
                        # Buffer is big enough to stand alone; emit it.
                        merged_groups.append(buffer)
                    elif buffer:
                        # Buffer too small, absorb this group into it.
                        # NOTE(review): if the combined size were still below
                        # min_tokens these sentences would be dropped; with a
                        # sane config (max_tokens >= min_tokens) the combined
                        # size always exceeds max_tokens >= min_tokens.
                        buffer.extend(group)
                        buffer_tokens += group_tokens
                        if buffer_tokens >= min_tokens:
                            merged_groups.append(buffer)
                        buffer = []
                        buffer_tokens = 0
                        continue

                    # Start new buffer
                    if group_tokens > max_tokens:
                        # Split oversized group at sentence granularity,
                        # emitting full sub-buffers as we go.
                        sub_buffer = []
                        sub_tokens = 0
                        for s in group:
                            st = estimate_tokens(s)
                            if sub_tokens + st > max_tokens and sub_buffer:
                                merged_groups.append(sub_buffer)
                                sub_buffer = []
                                sub_tokens = 0
                            sub_buffer.append(s)
                            sub_tokens += st
                        # The trailing sub-buffer seeds the next iteration.
                        buffer = sub_buffer
                        buffer_tokens = sub_tokens
                    else:
                        buffer = list(group)
                        buffer_tokens = group_tokens

            # Flush remaining buffer: emit it if big enough (or it is the
            # only content), otherwise merge it into the previous group.
            if buffer:
                if buffer_tokens >= min_tokens or not merged_groups:
                    merged_groups.append(buffer)
                elif merged_groups:
                    merged_groups[-1].extend(buffer)

            # --- Create chunk objects ---
            for chunk_idx, group in enumerate(merged_groups):
                text = ' '.join(group)
                chunks.append(Chunk(
                    chunk_id=f"{paper.paper_id}_mid_{sec_idx}_{chunk_idx}",
                    paper_id=paper.paper_id,
                    paper_title=paper.title,
                    text=text,
                    layer="mid",
                    chunk_type="semantic_group",
                    section=section_name,
                    subsection=section.title,
                    page=section.page_start,
                    position=round(sec_idx / total_sections, 2),
                    token_count=estimate_tokens(text),
                ))

        return chunks
541
+
542
+
543
class StructuralChunker(ChunkingStrategy):
    """Layer 2 fallback: paragraph-based grouping within sections (no embeddings needed)."""

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Pack whole paragraphs into mid-level "subsection" chunks.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``mid_min_tokens`` and ``mid_max_tokens``.
            model: Unused (kept for strategy-interface parity).
        """
        chunks = []
        min_tokens = config.mid_min_tokens
        max_tokens = config.mid_max_tokens
        total_sections = len(paper.sections) or 1  # avoid division by zero

        for sec_idx, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            paragraphs = _split_paragraphs(section.text)
            if not paragraphs:
                continue

            current_text = []
            current_tokens = 0
            chunk_idx = 0

            for para in paragraphs:
                para_tokens = estimate_tokens(para)

                # Emit the accumulated paragraphs before this one would
                # push the chunk over budget.
                if current_tokens + para_tokens > max_tokens and current_text:
                    text = '\n\n'.join(current_text)
                    chunks.append(Chunk(
                        chunk_id=f"{paper.paper_id}_mid_{sec_idx}_{chunk_idx}",
                        paper_id=paper.paper_id,
                        paper_title=paper.title,
                        text=text,
                        layer="mid",
                        chunk_type="subsection",
                        section=section_name,
                        subsection=section.title,
                        page=section.page_start,
                        position=round(sec_idx / total_sections, 2),
                        token_count=estimate_tokens(text),
                    ))
                    chunk_idx += 1
                    current_text = []
                    current_tokens = 0

                current_text.append(para)
                current_tokens += para_tokens

            # Flush the section's trailing paragraphs.
            if current_text:
                text = '\n\n'.join(current_text)
                if estimate_tokens(text) >= min_tokens or chunk_idx == 0:
                    # Big enough, or the section's only chunk: emit as-is.
                    chunks.append(Chunk(
                        chunk_id=f"{paper.paper_id}_mid_{sec_idx}_{chunk_idx}",
                        paper_id=paper.paper_id,
                        paper_title=paper.title,
                        text=text,
                        layer="mid",
                        chunk_type="subsection",
                        section=section_name,
                        subsection=section.title,
                        page=section.page_start,
                        position=round(sec_idx / total_sections, 2),
                        token_count=estimate_tokens(text),
                    ))
                elif chunks:
                    # Undersized tail: merge into the previous chunk, which
                    # belongs to this same section (chunk_idx > 0 here).
                    prev = chunks[-1]
                    merged = prev.text + '\n\n' + text
                    chunks[-1] = Chunk(
                        chunk_id=prev.chunk_id,
                        paper_id=prev.paper_id,
                        paper_title=prev.paper_title,
                        text=merged,
                        layer=prev.layer,
                        chunk_type=prev.chunk_type,
                        section=prev.section,
                        subsection=prev.subsection,
                        page=prev.page,
                        position=prev.position,
                        token_count=estimate_tokens(merged),
                    )

        return chunks
624
+
625
+
626
class FineChunker(ChunkingStrategy):
    """Layer 3: Sentence-level fine chunks for precise retrieval."""

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Emit one fine chunk per paragraph, splitting oversized paragraphs
        into sentence groups.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``fine_min_tokens`` and ``fine_max_tokens``.
            model: Unused (kept for strategy-interface parity).

        Paragraphs below ``fine_min_tokens`` are dropped entirely at this
        layer (they are still covered by the coarse/mid layers).
        """
        chunks = []
        max_tokens = config.fine_max_tokens
        min_tokens = config.fine_min_tokens
        total_sections = len(paper.sections) or 1  # avoid division by zero

        for sec_idx, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            paragraphs = _split_paragraphs(section.text)
            chunk_idx = 0

            for para in paragraphs:
                tokens = estimate_tokens(para)
                if tokens < min_tokens:
                    continue  # too short to be a useful fine chunk

                if tokens > max_tokens:
                    # Oversized paragraph: greedily pack sentences up to the
                    # token budget, emitting a chunk each time it fills.
                    sentences = _split_sentences(para)
                    current = []
                    current_tokens = 0
                    for sent in sentences:
                        st = estimate_tokens(sent)
                        if current_tokens + st > max_tokens and current:
                            text = ' '.join(current)
                            chunks.append(Chunk(
                                chunk_id=f"{paper.paper_id}_fine_{sec_idx}_{chunk_idx}",
                                paper_id=paper.paper_id,
                                paper_title=paper.title,
                                text=text,
                                layer="fine",
                                chunk_type="paragraph",
                                section=section_name,
                                subsection=section.title,
                                page=section.page_start,
                                position=round(sec_idx / total_sections, 2),
                                token_count=estimate_tokens(text),
                            ))
                            chunk_idx += 1
                            current = []
                            current_tokens = 0
                        current.append(sent)
                        current_tokens += st
                    # Flush the trailing sentence group if big enough.
                    if current:
                        text = ' '.join(current)
                        if estimate_tokens(text) >= min_tokens:
                            chunks.append(Chunk(
                                chunk_id=f"{paper.paper_id}_fine_{sec_idx}_{chunk_idx}",
                                paper_id=paper.paper_id,
                                paper_title=paper.title,
                                text=text,
                                layer="fine",
                                chunk_type="paragraph",
                                section=section_name,
                                subsection=section.title,
                                page=section.page_start,
                                position=round(sec_idx / total_sections, 2),
                                token_count=estimate_tokens(text),
                            ))
                            chunk_idx += 1
                else:
                    # Paragraph fits within budget: one chunk as-is.
                    chunks.append(Chunk(
                        chunk_id=f"{paper.paper_id}_fine_{sec_idx}_{chunk_idx}",
                        paper_id=paper.paper_id,
                        paper_title=paper.title,
                        text=para,
                        layer="fine",
                        chunk_type="paragraph",
                        section=section_name,
                        subsection=section.title,
                        page=section.page_start,
                        position=round(sec_idx / total_sections, 2),
                        token_count=tokens,
                    ))
                    chunk_idx += 1

        return chunks
708
+
709
+
710
+ # ---------------------------------------------------------------------------
711
+ # Strategy registry
712
+ # ---------------------------------------------------------------------------
713
+
714
# Registry mapping strategy names (as used in config.strategies) to classes.
STRATEGIES = {
    "coarse": CoarseChunker,
    "semantic": SemanticChunker,
    "structural": StructuralChunker,
    "fine": FineChunker,
}
720
+
721
+
722
+ # ---------------------------------------------------------------------------
723
+ # Main entry point
724
+ # ---------------------------------------------------------------------------
725
+
726
def chunk_paper(paper: ParsedPaper, config, model=None) -> list:
    """Chunk a parsed paper using the configured strategy with full metadata enrichment.

    Args:
        paper: ParsedPaper from the PDF parser.
        config: ChunkingConfig with token limits, thresholds, domain keywords.
        model: Optional SentenceTransformer for semantic chunking. If None and
            strategy is 'semantic', falls back to structural chunking.

    Returns:
        List of Chunk objects across all three layers, enriched with domain
        topics, rhetorical roles, and content types.
    """
    all_chunks = []
    # domain_topics may be absent from older configs; default to no vocabulary.
    domain_keywords = getattr(config, 'domain_topics', [])

    # Layer 1: Coarse (always)
    all_chunks.extend(CoarseChunker().chunk(paper, config, model))

    # Layer 2: Mid-level (semantic or structural based on config).
    # Only the FIRST configured strategy selects the mid layer.
    strategy_name = config.strategies[0] if config.strategies else "semantic"
    if strategy_name == "semantic":
        # SemanticChunker itself falls back to paragraph grouping per
        # section when model is None, so this is safe without an embedder.
        mid_chunks = SemanticChunker().chunk(paper, config, model)
    else:
        mid_chunks = StructuralChunker().chunk(paper, config, model)

    # Apply overlap between consecutive same-section mid-level chunks.
    mid_chunks = _apply_overlap(mid_chunks, config.mid_overlap_ratio)
    all_chunks.extend(mid_chunks)

    # Layer 3: Fine
    all_chunks.extend(FineChunker().chunk(paper, config, model))

    # Enrich all chunks with domain topics, rhetorical role, content type.
    # Runs after overlap so the enrichment sees the final chunk text.
    all_chunks = [_enrich_chunk(c, domain_keywords) for c in all_chunks]

    return all_chunks
backend/rag/ingest/embedder.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Embedding wrapper for chunk and query encoding.
2
+
3
+ Pluggable model via config. Prepends section context to chunk text
4
+ before embedding to steer vectors toward the right semantic neighborhood.
5
+ """
6
+
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ from .chunker import Chunk
11
+
12
+
13
class ChunkEmbedder:
    """Thin wrapper around a SentenceTransformer for chunk and query embedding."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", model_instance=None):
        """Initialize embedder.

        Args:
            model_name: HuggingFace model name for sentence-transformers.
            model_instance: Optional pre-loaded SentenceTransformer to reuse
                (avoids loading the model twice at runtime).
        """
        if model_instance is not None:
            self.model = model_instance
        else:
            self.model = SentenceTransformer(model_name)
        # NOTE(review): when model_instance is supplied, model_name may not
        # describe the actual model — it is recorded as given.
        self.model_name = model_name

    def _prepare_text(self, chunk: Chunk) -> str:
        """Prepend "[Section: Subsection] " context to steer the embedding.

        The subsection is included only when it differs from the section.
        """
        prefix = f"[{chunk.section}"
        if chunk.subsection and chunk.subsection != chunk.section:
            prefix += f": {chunk.subsection}"
        prefix += "] "
        return prefix + chunk.text

    def embed_chunks(self, chunks: list, batch_size: int = 32) -> np.ndarray:
        """Embed a list of chunks. Returns array of shape (n_chunks, dim);
        empty array for an empty input."""
        if not chunks:
            return np.array([])
        texts = [self._prepare_text(c) for c in chunks]
        embeddings = self.model.encode(texts, batch_size=batch_size, show_progress_bar=True)
        return np.array(embeddings)

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a single query string (no section prefix is added)."""
        return self.model.encode(query)
backend/rag/ingest/pdf_parser.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF parsing with pdfplumber + heuristic section detection.
2
+
3
+ Domain-agnostic: uses font-size changes and numbering patterns to detect
4
+ section boundaries. Falls back to known academic header keywords when available.
5
+ """
6
+
7
+ import re
8
+ from dataclasses import dataclass, field
9
+
10
+ import pdfplumber
11
+
12
+
13
# Canonical lowercase section titles common in academic papers; matched
# after leading numbers are stripped from a candidate header line.
KNOWN_HEADERS = {
    'abstract', 'introduction', 'related work', 'background',
    'method', 'methods', 'methodology', 'approach',
    'experiment', 'experiments', 'results', 'evaluation',
    'discussion', 'conclusion', 'conclusions',
    'acknowledgments', 'acknowledgements', 'references', 'appendix',
}

# "3. Method" / "IV. Results" style numbered headers.
NUMBERED_HEADER_RE = re.compile(r'^(\d+\.?\s+|[IVXLC]+\.?\s+)[A-Z]')
# "A. Setup" style lettered sub-headers.
LETTERED_HEADER_RE = re.compile(r'^[A-Z]\.\s+[A-Z]')
23
+
24
+
25
@dataclass
class ParsedSection:
    """One detected section of a paper."""
    title: str  # Header line as it appeared in the PDF
    level: int  # 0=paper, 1=section, 2=subsection
    text: str   # Body text (header line excluded)
    page_start: int
    page_end: int
32
+
33
+
34
@dataclass
class ParsedFigure:
    """A figure/table/algorithm caption extracted from a page."""
    caption: str
    page: int
    nearby_text: str = ""  # Reserved for surrounding context; not populated by _extract_figures
39
+
40
+
41
@dataclass
class ParsedPaper:
    """Structured result of parsing one PDF."""
    paper_id: str
    title: str
    abstract: str
    sections: list = field(default_factory=list)  # list of ParsedSection
    figures: list = field(default_factory=list)   # list of ParsedFigure
    raw_text: str = ""                            # Full concatenated page text
49
+
50
+
51
+ def _estimate_median_font_size(page):
52
+ """Get median font size from a page's character data."""
53
+ chars = page.chars
54
+ if not chars:
55
+ return 10.0
56
+ sizes = [c.get('size', 10.0) for c in chars]
57
+ sizes.sort()
58
+ return sizes[len(sizes) // 2]
59
+
60
+
61
+ def _line_font_size(page, line_text, line_top):
62
+ """Estimate font size for a specific line by matching characters near its y-position."""
63
+ chars = page.chars
64
+ if not chars or not line_text.strip():
65
+ return None
66
+ line_chars = [c for c in chars if abs(c.get('top', 0) - line_top) < 3]
67
+ if not line_chars:
68
+ return None
69
+ sizes = [c.get('size', 10.0) for c in line_chars]
70
+ return sum(sizes) / len(sizes)
71
+
72
+
73
def _is_header_line(line: str, font_size: float, median_size: float) -> tuple:
    """Determine if a line is a section header.

    Combines three signals: known academic header keywords, numbering
    patterns ("3. Method", "IV. Results", "A. Setup"), and a font-size
    boost relative to the page median.

    Args:
        line: Candidate line text.
        font_size: Estimated font size for this line; may be None.
        median_size: Median font size of the page.

    Returns:
        (is_header, level): level is 2 for "3.1"-style subsections, 1 for
        sections, 0 when the line is not a header.
    """
    stripped = line.strip()
    # Headers are short; anything over 100 chars is body text.
    if not stripped or len(stripped) > 100:
        return False, 0

    # Check font size (headers are typically larger). font_size may be
    # None, in which case size_boost is falsy.
    size_boost = font_size and median_size and font_size > median_size * 1.1

    # Check known academic headers
    lower = stripped.lower().rstrip(':').strip()
    # Remove leading numbers for matching
    clean = re.sub(r'^\d+\.?\s*', '', lower).strip()
    is_known = clean in KNOWN_HEADERS

    # Check numbering pattern (e.g., "3. Method", "IV. Results")
    has_number = bool(NUMBERED_HEADER_RE.match(stripped)) or bool(LETTERED_HEADER_RE.match(stripped))

    # Subsection pattern (e.g., "3.1 Dynamics Model")
    is_subsection = bool(re.match(r'^\d+\.\d+\.?\s+', stripped))

    # A known keyword alone suffices; a numbering pattern needs the font
    # boost to back it up (numbers also start ordinary sentences).
    if is_known or (has_number and size_boost):
        level = 2 if is_subsection else 1
        return True, level
    # Last resort: a short, larger-font line that does not end a sentence.
    if size_boost and len(stripped) < 60 and not stripped.endswith('.'):
        level = 2 if is_subsection else 1
        return True, level

    return False, 0
102
+
103
+
104
def _extract_figures(page, page_num: int) -> list:
    """Extract figure/table captions from a page.

    Captions start with "Figure N", "Fig. N", "Table N" or "Algorithm N"
    (case-insensitive) and are captured up to a blank line, the start of
    the next paragraph, or end of text. Captions of 20 characters or fewer
    are discarded as likely false positives (e.g. in-text references).
    """
    text = page.extract_text() or ""
    figures = []
    caption_re = re.compile(
        r'((?:Figure|Fig\.|Table|Algorithm)\s*\d+[.:]\s*.+?)(?:\n\n|\n(?=[A-Z0-9])|\Z)',
        re.IGNORECASE | re.DOTALL
    )
    for match in caption_re.finditer(text):
        caption = match.group(1).strip()
        if len(caption) > 20:
            figures.append(ParsedFigure(caption=caption, page=page_num))
    return figures
117
+
118
+
119
def parse_pdf(pdf_path: str, paper_id: str = None) -> ParsedPaper:
    """Extract structured text from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        paper_id: Identifier for the paper. Defaults to filename stem.

    Returns:
        ParsedPaper with title, abstract, sections, figures, and raw text.
    """
    import os
    if paper_id is None:
        paper_id = os.path.splitext(os.path.basename(pdf_path))[0]

    all_text_lines = []
    page_lines = []  # (line_text, page_num, line_top, font_size, median_size)
    figures = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            median_size = _estimate_median_font_size(page)

            lines = text.split('\n')
            # extract_text drops per-line y-coordinates, so track an
            # approximate running y-position to match lines back to
            # character font sizes.
            current_top = 0
            for line in lines:
                fs = _line_font_size(page, line, current_top)
                page_lines.append((line, page_num, current_top, fs, median_size))
                current_top += 12  # approximate line height in points

            all_text_lines.extend(lines)
            figures.extend(_extract_figures(page, page_num))

    raw_text = '\n'.join(all_text_lines)

    # Title: first non-empty line of the document.
    title = ""
    for line, pn, top, fs, ms in page_lines:
        if line.strip():
            title = line.strip()
            break

    # Abstract: everything between the "Abstract" marker and the next header.
    in_abstract = False
    abstract_lines = []
    for line, pn, top, fs, ms in page_lines:
        stripped = line.strip()
        lower = stripped.lower()
        if lower.startswith('abstract') and not in_abstract:
            in_abstract = True
            # Remove the "Abstract" prefix itself; keep any trailing text.
            remainder = re.sub(r'^abstract[:\s\-]*', '', stripped, flags=re.IGNORECASE).strip()
            if remainder:
                abstract_lines.append(remainder)
            continue
        if in_abstract:
            is_hdr, _ = _is_header_line(stripped, fs, ms)
            if is_hdr and abstract_lines:
                break
            if stripped:
                abstract_lines.append(stripped)
    abstract = ' '.join(abstract_lines)

    # Build sections by scanning for header lines.
    sections = []
    current_section = None
    current_level = 1  # header level of the section currently accumulating
    current_lines = []
    current_page_start = 0

    for line, pn, top, fs, ms in page_lines:
        stripped = line.strip()
        is_hdr, level = _is_header_line(stripped, fs, ms)

        if is_hdr and stripped.lower().rstrip(':').strip() != 'abstract':
            # Close the previous section. Fix: record it with the level
            # captured when ITS header was seen (current_level), not the
            # level of the header that terminates it.
            if current_section is not None:
                section_text = '\n'.join(current_lines).strip()
                if section_text:
                    sections.append(ParsedSection(
                        title=current_section,
                        level=current_level,
                        text=section_text,
                        page_start=current_page_start,
                        page_end=pn,
                    ))
            current_section = stripped
            current_level = level
            current_lines = []
            current_page_start = pn
        elif current_section is not None:
            current_lines.append(stripped)

    # Close the final section.
    if current_section and current_lines:
        section_text = '\n'.join(current_lines).strip()
        if section_text:
            sections.append(ParsedSection(
                title=current_section,
                level=current_level,
                text=section_text,
                page_start=current_page_start,
                page_end=page_lines[-1][1] if page_lines else 0,
            ))

    # Fallback: no headers detected -> one section spanning the whole text.
    if not sections and raw_text.strip():
        sections.append(ParsedSection(
            title="Full Text",
            level=1,
            text=raw_text,
            page_start=0,
            page_end=0,
        ))

    return ParsedPaper(
        paper_id=paper_id,
        title=title,
        abstract=abstract,
        sections=sections,
        figures=figures,
        raw_text=raw_text,
    )
backend/rag/ingest/pipeline.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ingestion pipeline CLI: parse PDFs, chunk, embed, and store in ChromaDB.
2
+
3
+ Usage:
4
+ python -m rag.ingest.pipeline --papers-dir ./papers/ --config rag_config.yaml
5
+
6
+ Scans the papers directory for PDFs, matches them to dataset rows by filename,
7
+ and runs the full parse -> chunk -> embed -> store pipeline.
8
+ """
9
+
10
+ import argparse
11
+ import os
12
+ import sys
13
+ import time
14
+
15
+ from ..config import load_config, RAGConfig
16
+ from .pdf_parser import parse_pdf
17
+ from .chunker import chunk_paper
18
+ from .embedder import ChunkEmbedder
19
+ from .store import get_client, create_or_get_collection, upsert_chunks, delete_paper, get_collection_stats
20
+
21
+
22
def find_pdfs(papers_dir: str) -> list:
    """Return sorted full paths of every PDF (case-insensitive) in papers_dir."""
    entries = sorted(os.listdir(papers_dir))
    return [os.path.join(papers_dir, entry)
            for entry in entries
            if entry.lower().endswith('.pdf')]
29
+
30
+
31
def paper_id_from_path(pdf_path: str) -> str:
    """Derive a slug-style paper_id from the PDF filename (extension dropped)."""
    stem, _ = os.path.splitext(os.path.basename(pdf_path))
    # Slugify: lowercase, spaces and underscores become hyphens
    slug = stem.lower().strip()
    for separator in (' ', '_'):
        slug = slug.replace(separator, '-')
    return slug
38
+
39
+
40
def ingest_single(pdf_path: str, config: RAGConfig, embedder: ChunkEmbedder, collection) -> dict:
    """Ingest one PDF end-to-end: parse -> chunk -> embed -> store.

    Args:
        pdf_path: Path to the PDF file.
        config: RAG configuration (supplies chunking parameters).
        embedder: Shared ChunkEmbedder; its underlying model is also reused
            for semantic chunking.
        collection: Target ChromaDB collection.

    Returns:
        Stats dict: paper_id, status ("success" or "empty"), n_chunks, and on
        success also n_sections plus per-layer chunk counts.
    """
    paper_id = paper_id_from_path(pdf_path)
    print(f"\n Parsing: {os.path.basename(pdf_path)} (id={paper_id})")

    # Parse
    paper = parse_pdf(pdf_path, paper_id=paper_id)
    print(f" Title: {paper.title[:80]}")
    print(f" Sections: {len(paper.sections)}, Figures: {len(paper.figures)}")
    print(f" Abstract: {len(paper.abstract)} chars")

    # Chunk (pass embedder's model for semantic chunking)
    chunks = chunk_paper(paper, config.chunking, model=embedder.model)
    coarse = sum(1 for c in chunks if c.layer == "coarse")
    mid = sum(1 for c in chunks if c.layer == "mid")
    fine = sum(1 for c in chunks if c.layer == "fine")
    # Count enrichment stats (topics / rhetorical roles attached by the chunker)
    with_topics = sum(1 for c in chunks if c.domain_topics)
    roles = set(c.rhetorical_role for c in chunks if c.rhetorical_role)
    print(f" Chunks: {len(chunks)} total (coarse={coarse}, mid={mid}, fine={fine})")
    print(f" Enrichment: {with_topics} chunks with domain topics, roles: {roles}")

    if not chunks:
        print(f" WARNING: No chunks produced, skipping")
        return {"paper_id": paper_id, "status": "empty", "n_chunks": 0}

    # Delete existing chunks for this paper (idempotent re-ingestion)
    delete_paper(collection, paper_id)

    # Embed
    embeddings = embedder.embed_chunks(chunks)
    print(f" Embeddings: {embeddings.shape}")

    # Store
    upsert_chunks(collection, chunks, embeddings)
    print(f" Stored in ChromaDB")

    return {
        "paper_id": paper_id,
        "status": "success",
        "n_chunks": len(chunks),
        "n_sections": len(paper.sections),
        "layers": {"coarse": coarse, "mid": mid, "fine": fine},
    }
84
+
85
+
86
def run_ingestion(papers_dir: str, config: RAGConfig) -> dict:
    """Run the full ingestion pipeline over every PDF in a directory.

    Papers that raise are reported in `errors` and excluded from `results`;
    the loop continues with the remaining PDFs.

    Args:
        papers_dir: Directory containing PDF files.
        config: RAG configuration.

    Returns:
        Summary dict with stats: n_papers, n_chunks, elapsed_seconds,
        errors, results.
    """
    pdfs = find_pdfs(papers_dir)
    if not pdfs:
        print(f"No PDF files found in {papers_dir}")
        # NOTE(review): this early return omits the "elapsed_seconds" and
        # "results" keys present in the normal return — confirm callers
        # tolerate the narrower dict.
        return {"n_papers": 0, "n_chunks": 0, "errors": []}

    print(f"Found {len(pdfs)} PDFs in {papers_dir}")
    print(f"Embedding model: {config.embedding_model}")
    print(f"ChromaDB path: {config.chroma_persist_dir}")

    # Initialize shared resources once — loading the embedding model is the
    # expensive step, so it must not happen per PDF.
    embedder = ChunkEmbedder(model_name=config.embedding_model)
    client = get_client(config)
    collection = create_or_get_collection(config, client)

    results = []
    errors = []
    start = time.time()

    for pdf_path in pdfs:
        try:
            result = ingest_single(pdf_path, config, embedder, collection)
            results.append(result)
        except Exception as e:
            # Best-effort batch: one bad PDF must not abort the rest.
            error_msg = f"{os.path.basename(pdf_path)}: {str(e)}"
            print(f" ERROR: {error_msg}")
            errors.append(error_msg)

    elapsed = time.time() - start
    total_chunks = sum(r.get("n_chunks", 0) for r in results)

    stats = get_collection_stats(collection)

    print(f"\n{'='*60}")
    print(f"Ingestion complete in {elapsed:.1f}s")
    print(f" Papers processed: {len(results)}")
    print(f" Total chunks: {total_chunks}")
    print(f" Errors: {len(errors)}")
    print(f" Collection total: {stats['total_chunks']} chunks")

    return {
        "n_papers": len(results),
        "n_chunks": total_chunks,
        "elapsed_seconds": round(elapsed, 1),
        "errors": errors,
        "results": results,
    }
142
+
143
+
144
def main():
    """CLI entry point: validate arguments, then run the ingestion pipeline.

    Exits non-zero on bad arguments or if any paper failed to ingest.
    """
    arg_parser = argparse.ArgumentParser(description="Ingest academic papers into ChromaDB")
    arg_parser.add_argument("--papers-dir", required=True, help="Directory containing PDF files")
    arg_parser.add_argument("--config", required=True, help="Path to rag_config.yaml")
    cli = arg_parser.parse_args()

    # Fail fast on bad paths before loading any models.
    if not os.path.isdir(cli.papers_dir):
        print(f"Error: {cli.papers_dir} is not a directory")
        sys.exit(1)
    if not os.path.isfile(cli.config):
        print(f"Error: {cli.config} not found")
        sys.exit(1)

    summary = run_ingestion(cli.papers_dir, load_config(cli.config))

    if summary["errors"]:
        print(f"\nErrors encountered:")
        for err in summary["errors"]:
            print(f" - {err}")
        sys.exit(1)


if __name__ == "__main__":
    main()
backend/rag/ingest/store.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ChromaDB storage operations for paper chunks.
2
+
3
+ Handles collection creation, chunk upserting, and deletion.
4
+ Uses PersistentClient so the index survives restarts.
5
+ """
6
+
7
+ import numpy as np
8
+ import chromadb
9
+
10
+ from .chunker import Chunk
11
+ from ..config import RAGConfig
12
+
13
+
14
def get_client(config: RAGConfig) -> chromadb.ClientAPI:
    """Create a persistent ChromaDB client rooted at config.chroma_persist_dir,
    so the index survives process restarts."""
    return chromadb.PersistentClient(path=config.chroma_persist_dir)
17
+
18
+
19
def create_or_get_collection(config: RAGConfig, client: chromadb.ClientAPI = None):
    """Return the paper-chunks collection, creating it if absent.

    The collection is configured for cosine distance; a fresh persistent
    client is created when none is supplied.
    """
    active_client = client if client is not None else get_client(config)
    return active_client.get_or_create_collection(
        name=config.collection_name,
        metadata={"hnsw:space": "cosine"}
    )
27
+
28
+
29
def upsert_chunks(collection, chunks: list, embeddings: np.ndarray):
    """Batch upsert chunks with embeddings and metadata into ChromaDB.

    Args:
        collection: ChromaDB collection.
        chunks: List of Chunk objects.
        embeddings: numpy array of shape (n_chunks, dim), row-aligned with chunks.
    """
    if not chunks:
        return

    def _metadata(c):
        # Chroma metadata values must be scalars: list fields are joined
        # with ", " and optional fields fall back to "".
        return {
            "paper_id": c.paper_id,
            "paper_title": c.paper_title,
            "layer": c.layer,
            "chunk_type": c.chunk_type,
            "section": c.section,
            "subsection": c.subsection or "",
            "page": c.page,
            "position": c.position,
            "token_count": c.token_count,
            "domain_topics": ", ".join(c.domain_topics) if c.domain_topics else "",
            "rhetorical_role": c.rhetorical_role or "",
            "content_type": c.content_type or "",
        }

    # ChromaDB has a batch limit; send at most 500 records per call.
    step = 500
    for start in range(0, len(chunks), step):
        group = chunks[start:start + step]
        vectors = embeddings[start:start + step]
        collection.upsert(
            ids=[c.chunk_id for c in group],
            embeddings=[v.tolist() for v in vectors],
            documents=[c.text for c in group],
            metadatas=[_metadata(c) for c in group],
        )
65
+
66
+
67
def delete_paper(collection, paper_id: str):
    """Remove every chunk whose metadata paper_id matches.

    Called before re-ingesting a paper so ingestion stays idempotent.
    """
    collection.delete(where={"paper_id": paper_id})
70
+
71
+
72
def get_collection_stats(collection) -> dict:
    """Summarize the collection: total chunk count plus a few sample ids."""
    total = collection.count()
    if total > 0:
        sample_ids = collection.peek(limit=5).get("ids", [])
    else:
        sample_ids = []
    return {"total_chunks": total, "sample_ids": sample_ids}
backend/rag/query_engine.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic query engine: replaces LLM Pass 1 with ML-based decisions.
2
+
3
+ Given a natural language query, this module:
4
+ 1. Embeds the query with sentence-transformer
5
+ 2. Searches ChromaDB for relevant paper chunks
6
+ 3. Computes query-to-method similarity to find relevant methods
7
+ 4. Adjusts column weights based on query-column similarity
8
+ 5. Picks color-by and highlight methods deterministically
9
+
10
+ The LLM is only used for Pass 2: interpreting results.
11
+ """
12
+
13
+ import re
14
+ import numpy as np
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ from collections import Counter, defaultdict
17
+
18
+
19
# Column keywords for deterministic weight boosting.
# Keys must match the dataset's column headers exactly — including the
# trailing space in 'Metric(s) Used ' and the long parenthesized names;
# presumably these mirror the source spreadsheet headers (confirm against
# the dataset CSV). Keywords are matched as substrings of the lowercased query.
COLUMN_KEYWORDS = {
    'Planning Method': [
        'planning', 'sampling', 'regression', 'analytical', 'optimization',
        'reinforcement learning', 'rl', 'generative', 'diffusion', 'vae',
    ],
    'Training Data': [
        'training', 'sim', 'real', 'sim-to-real', 'transfer', 'dataset',
        'synthetic', 'self-supervised', 'supervised',
    ],
    'End-effector Hardware': [
        'gripper', 'two-finger', 'parallel-jaw', 'multi-finger', 'dexterous',
        'suction', 'hand', 'end-effector',
    ],
    'Object Configuration': [
        'cluttered', 'piled', 'singulated', 'packed', 'bin picking',
        'scene', 'objects', 'stacked',
    ],
    'Input Data': [
        'point cloud', 'depth', 'rgb', 'rgbd', 'image', 'voxel', 'tsdf',
        'tactile', 'sensor', 'camera',
    ],
    'Output Pose': [
        '6-dof', '7-dof', 'grasp pose', 'pose', 'configuration',
        'rectangle', 'quality',
    ],
    'Backbone': [
        'pointnet', 'resnet', 'vgg', 'transformer', 'cnn', 'architecture',
        'network', 'encoder', 'decoder',
    ],
    'Metric(s) Used ': [
        'metric', 'loss', 'loss function', 'success rate', 'accuracy',
        'precision', 'recall', 'evaluation',
    ],
    'Corresponding Dataset (see repository linked above)': [
        'dataset', 'benchmark', 'acronym', 'graspnet', 'ycb', 'shapenet',
    ],
    'Simulator (see repository linked above)': [
        'simulator', 'simulation', 'isaac', 'mujoco', 'pybullet', 'gazebo',
    ],
    'Camera Position(s)': [
        'camera', 'overhead', 'eye-in-hand', 'multi-view', 'viewpoint',
    ],
    'Language': [
        'pytorch', 'tensorflow', 'python', 'framework', 'implementation',
    ],
    'Description': [
        'describe', 'overview', 'summary', 'about', 'explain',
    ],
}

# Color-by mapping: query keywords -> best column to color by.
# Consumed by pick_color_by(), which scores each entry by substring hits
# and keeps the first column with the strictly highest score, so earlier
# entries win ties.
COLOR_BY_KEYWORDS = {
    'Planning Method': ['planning', 'sampling', 'regression', 'rl', 'approach', 'method type'],
    'Training Data': ['training', 'sim', 'real', 'sim-to-real', 'transfer'],
    'End-effector Hardware': ['gripper', 'finger', 'dexterous', 'suction', 'end-effector', 'hand'],
    'Object Configuration': ['cluttered', 'piled', 'scene', 'objects', 'singulated', 'bin'],
    'Input Data': ['point cloud', 'depth', 'rgb', 'image', 'sensor', 'input'],
    'Backbone': ['architecture', 'network', 'pointnet', 'transformer', 'cnn', 'backbone'],
    'Learning Paradigm': ['learning', 'paradigm', 'classical', 'hybrid'],
    'Sensor Complexity': ['sensor', 'modality', 'multimodal', '3d', '2d'],
    'Scene Difficulty': ['difficulty', 'easy', 'hard', 'complex'],
    'Gripper Type': ['gripper type', 'parallel-jaw', 'dexterous', 'suction'],
    'Method Era': ['year', 'era', 'recent', 'old', 'modern', 'pioneer'],
}
84
+
85
+
86
def compute_query_column_relevance(query: str, model) -> dict:
    """Score each dataset column's relevance to the query by keyword matching.

    Counts how many of a column's COLUMN_KEYWORDS entries appear as
    substrings of the lowercased query. This is purely keyword-based —
    despite the original docstring, no embeddings are involved; the `model`
    parameter is currently unused (presumably reserved for an
    embedding-based scorer — confirm before removing).

    Returns:
        Dict of column_name -> keyword match count (0 when nothing matched).
    """
    query_lower = query.lower()

    # Keyword-based scoring (fast, deterministic)
    scores = {}
    for col, keywords in COLUMN_KEYWORDS.items():
        score = sum(1 for kw in keywords if kw in query_lower)
        scores[col] = score

    return scores
100
+
101
+
102
def compute_weights_from_query(query: str, default_weights: dict, model=None) -> dict:
    """Adjust column weights based on query relevance.

    Boosts columns whose keywords appear in the query by +3 per hit, capped
    at +10 extra; the resulting weight is clamped to 20. Columns absent from
    default_weights, or with no hits, keep their default. Input dict is not
    mutated.
    """
    relevance = compute_query_column_relevance(query, model)

    weights = dict(default_weights)
    for col, score in relevance.items():
        if col in weights and score > 0:
            # +3 per keyword match, at most +10 boost; final weight clamped to 20
            boost = min(score * 3, 10)
            weights[col] = min(20, weights[col] + boost)

    return weights
114
+
115
+
116
def pick_color_by(query: str) -> str:
    """Deterministically choose the color-by column whose keywords best match the query.

    Ties are broken by COLOR_BY_KEYWORDS dict order (first wins); with no
    keyword hits at all, the generic 'cluster' coloring is returned.
    """
    lowered = query.lower()
    winner = 'cluster'
    winning_hits = 0

    for column, keywords in COLOR_BY_KEYWORDS.items():
        hits = sum(kw in lowered for kw in keywords)
        if hits > winning_hits:
            winner, winning_hits = column, hits

    return winner
129
+
130
+
131
def find_relevant_methods(query: str, df, model, top_k: int = 10) -> list:
    """Find methods most relevant to the query using embedding similarity.

    Builds one text per method (Description concatenated with key attribute
    columns for broader matching), embeds the query and all method texts,
    and ranks methods by cosine similarity. Review fixes: the unused
    `descriptions` local is removed, each cell is fetched once instead of
    three times, and cosine similarity is computed directly with numpy.

    Args:
        query: Natural language query.
        df: DataFrame with 'Name', 'Description' and attribute columns.
        model: Sentence-embedding model exposing `.encode(...)`.
        top_k: Maximum number of results.

    Returns:
        List of (method_name, similarity) tuples, best first, length <= top_k.
    """
    query_embedding = np.asarray(model.encode(query), dtype=float).reshape(1, -1)

    names = df['Name'].tolist()

    # One combined text per method: Description plus key columns
    combined = []
    for _, row in df.iterrows():
        parts = [str(row.get('Description', ''))]
        for col in ['Planning Method', 'End-effector Hardware', 'Input Data',
                    'Object Configuration', 'Training Data']:
            raw = row.get(col, '')
            if isinstance(raw, float) and np.isnan(raw):
                continue  # NaN cells contribute nothing
            val = str(raw)
            if val:
                parts.append(val)
        combined.append(' '.join(parts))

    method_embeddings = np.asarray(
        model.encode(combined, show_progress_bar=False), dtype=float)

    # Cosine similarity via numpy: row-normalize, then dot with the query.
    # Small epsilon guards against zero-norm rows.
    q = query_embedding / (np.linalg.norm(query_embedding, axis=1, keepdims=True) + 1e-12)
    m = method_embeddings / (np.linalg.norm(method_embeddings, axis=1, keepdims=True) + 1e-12)
    sims = (m @ q.T).ravel()

    # Sort by similarity, best first
    ranked = sorted(zip(names, sims), key=lambda x: x[1], reverse=True)

    return ranked[:top_k]
164
+
165
+
166
def should_filter(query: str) -> bool:
    """Decide whether the query implies narrowing the view to a subset of methods.

    Comparison/exploration queries return False (show the whole landscape);
    explicit "find/show me methods ..." phrasing returns True; anything else
    defaults to False.

    Fix: 'vs' is now matched as a standalone word. The previous plain
    substring test matched inside words like "canvas", wrongly disabling
    filtering.
    """
    query_lower = query.lower()
    # Comparison and exploration queries should NOT filter
    no_filter_signals = ['compare', 'overview', 'all methods', 'landscape', 'field',
                         'difference between', 'versus', 'how do', 'survey']
    if any(s in query_lower for s in no_filter_signals):
        return False
    # 'vs' / 'vs.' only as a whole word
    if re.search(r'\bvs\.?\b', query_lower):
        return False
    # Filter signals
    filter_signals = ['which methods', 'find methods', 'methods for', 'methods that',
                      'best for', 'suitable for', 'show me', 'i need']
    return any(s in query_lower for s in filter_signals)
178
+
179
+
180
def extract_citations_from_chunks(chunks) -> list:
    """Extract academic citations referenced within retrieved chunk text.

    Only author-year citations are collected, e.g.:
        (Smith et al., 2022), (Smith and Jones, 2020), (Smith, 2019)

    Numbered citations like [1] or [3, 7] are deliberately NOT extracted:
    their meaning is paper-specific, so counting them across papers is
    meaningless. (Previously they were counted and then discarded at the
    end, which both wasted work and crowded real author-year references out
    of the most_common window.)

    Args:
        chunks: Objects exposing `.text` and `.paper_title`.

    Returns:
        Up to 15 dicts {name, count, source_papers} ordered by frequency.
    """
    # Pattern for author-year citations: (Author et al., YYYY) or (Author and Author, YYYY)
    author_year_re = re.compile(
        r'\(([A-Z][a-z]+(?:\s+(?:et\s+al\.|and\s+[A-Z][a-z]+))?)[.,]?\s*(\d{4})\)'
    )

    citation_counts = Counter()
    citation_sources = defaultdict(set)  # which source paper mentioned this citation

    for chunk in chunks:
        text = chunk.text
        source = chunk.paper_title

        for match in author_year_re.finditer(text):
            author = match.group(1).strip()
            year = match.group(2)
            ref_name = f"{author}, {year}"
            citation_counts[ref_name] += 1
            citation_sources[ref_name].add(source)

    return [
        {
            'name': ref,
            'count': count,
            'source_papers': list(citation_sources[ref]),
        }
        for ref, count in citation_counts.most_common(15)
    ]
233
+
234
+
235
def deterministic_query_pipeline(query: str, df, model, default_weights: dict,
                                 retriever=None) -> dict:
    """Full deterministic query pipeline. Replaces LLM Pass 1.

    Steps 1-5 are pure keyword/embedding heuristics over the dataset;
    step 6 (optional, when `retriever` is given) pulls supporting passages
    and analytics from the RAG vector store; steps 7-8 prepare compact
    context for the LLM interpretation pass.

    Returns dict with:
        weights, colorBy, filterMethods, highlightMethods,
        rag_text, rag_citations, rag_analytics,
        relevant_method_summaries, ranked_methods, method_relevance
    """
    # 1. Compute weights from query
    weights = compute_weights_from_query(query, default_weights, model)

    # 2. Pick color-by
    color_by = pick_color_by(query)

    # 3. Find relevant methods via embedding similarity
    ranked_methods = find_relevant_methods(query, df, model, top_k=15)

    # 4. Decide filtering
    filter_methods = None
    if should_filter(query):
        # Filter to methods with similarity > threshold or top 15.
        # Only apply when it yields a meaningful proper subset (>= 3 methods,
        # fewer than the whole dataset).
        threshold = 0.15
        relevant = [name for name, sim in ranked_methods if sim > threshold]
        if 3 <= len(relevant) < len(df):
            filter_methods = relevant

    # 5. Highlights: top 5-8 most relevant methods
    highlight_methods = [name for name, sim in ranked_methods[:min(8, len(ranked_methods))]]

    # 6. RAG retrieval from vector DB — best-effort: any failure here simply
    # degrades the response to dataset-only results.
    rag_text = ""
    rag_citations = []
    rag_analytics = {}
    if retriever is not None:
        try:
            from .retrieval.formatter import format_for_prompt, format_chunk_citations
            from .ingest.store import get_client, create_or_get_collection

            chunks = retriever.retrieve(query)
            rag_text = format_for_prompt(chunks, token_budget=1500)
            rag_citations = format_chunk_citations(chunks)

            # Build analytics from retrieved chunk metadata
            config = retriever.config
            client = get_client(config)
            col = create_or_get_collection(config, client)

            # Fetch full metadata for retrieved chunks
            if chunks:
                chunk_ids = [c.chunk_id for c in chunks]
                meta_result = col.get(ids=chunk_ids, include=['metadatas'])
                metas = meta_result.get('metadatas', [])

                # Paper source distribution (use paper_id slug as display name since parsed titles can be garbled)
                def format_paper_id(pid):
                    return pid.replace('-', ' ').title()
                paper_counts = Counter(format_paper_id(c.paper_id) for c in chunks)
                # Domain topic frequency across retrieved chunks
                # (metadata stores topics as a ", "-joined string; see store.upsert_chunks)
                topic_counts = Counter()
                for m in metas:
                    topics_str = m.get('domain_topics', '')
                    if topics_str:
                        for t in topics_str.split(', '):
                            if t.strip():
                                topic_counts[t.strip()] += 1
                # Rhetorical role distribution
                role_counts = Counter(m.get('rhetorical_role', 'unknown') for m in metas)
                # Content type distribution
                content_type_counts = Counter(m.get('content_type', 'unknown') for m in metas)
                # Section distribution
                section_counts = Counter(m.get('section', 'unknown') for m in metas)

                # Extract cited references from chunk text
                cited_refs = extract_citations_from_chunks(chunks)

                rag_analytics = {
                    'paperSources': [{'name': k, 'count': v} for k, v in paper_counts.most_common(10)],
                    'domainTopics': [{'topic': k, 'count': v} for k, v in topic_counts.most_common(15)],
                    'rhetoricalRoles': [{'role': k, 'count': v} for k, v in role_counts.most_common()],
                    'contentTypes': [{'type': k, 'count': v} for k, v in content_type_counts.most_common()],
                    'sections': [{'section': k, 'count': v} for k, v in section_counts.most_common()],
                    'citedReferences': cited_refs,
                }
        except Exception as e:
            print(f"[RAG] Error: {e}")
            import traceback
            traceback.print_exc()

    # 7. Build compact summaries for only the relevant methods (for LLM context)
    relevant_names = set(name for name, _ in ranked_methods[:10])
    method_summaries = []
    for _, row in df.iterrows():
        name = row.get('Name', '')
        if name in relevant_names:
            desc = str(row.get('Description', ''))[:150]
            plan = str(row.get('Planning Method', ''))
            hw = str(row.get('End-effector Hardware', ''))
            inp = str(row.get('Input Data', ''))
            method_summaries.append(f"- {name}: {plan}; {hw}; {inp}; {desc}")

    # 8. Method relevance scores for visualization
    method_relevance = [
        {'name': name, 'score': round(float(sim), 4)}
        for name, sim in ranked_methods
    ]

    return {
        'weights': weights,
        'colorBy': color_by,
        'filterMethods': filter_methods,
        'highlightMethods': highlight_methods,
        'rag_text': rag_text,
        'rag_citations': rag_citations,
        'rag_analytics': rag_analytics,
        'relevant_method_summaries': '\n'.join(method_summaries),
        'ranked_methods': ranked_methods,
        'method_relevance': method_relevance,
    }
backend/rag/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Chunk retrieval: query routing, vector search, and formatting."""
backend/rag/retrieval/formatter.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Format retrieved chunks for LLM prompt injection and frontend display."""
2
+
3
+ from .retriever import RetrievedChunk
4
+
5
+
6
def estimate_tokens(text: str) -> int:
    """Rough token estimate: one token per whitespace-separated word."""
    return sum(1 for _ in text.split())
9
+
10
+
11
def format_for_prompt(chunks: list, token_budget: int = 3000) -> str:
    """Format retrieved chunks as text for LLM prompt injection.

    Walks chunks in their given (relevance) order, emitting each one with a
    provenance header, and stops once the approximate token budget would be
    exceeded. The first chunk is always emitted even if it alone exceeds the
    budget, so the prompt is never empty when chunks exist.

    Args:
        chunks: List of RetrievedChunk sorted by score descending.
        token_budget: Maximum tokens (approximate, whitespace words) to include.

    Returns:
        Formatted string for injection into the LLM prompt ("" if no chunks).
    """
    if not chunks:
        return ""

    rendered = []
    spent = 0

    for chunk in chunks:
        # Same approximation as estimate_tokens(): whitespace-separated words.
        cost = len(chunk.text.split())
        if rendered and spent + cost > token_budget:
            break

        where = chunk.section
        if chunk.subsection and chunk.subsection != chunk.section:
            where += f" > {chunk.subsection}"
        rendered.append(f'--- From "{chunk.paper_title}" ({where}, relevance: {chunk.score:.2f}) ---')
        rendered.append(chunk.text)
        rendered.append("")

        spent += cost

    return '\n'.join(rendered).strip()
+
48
+
49
+ def format_chunk_citations(chunks: list) -> list:
50
+ """Format chunks as structured data for the frontend.
51
+
52
+ Returns a list of citation dicts for rendering in the InsightCard.
53
+ """
54
+ citations = []
55
+ for chunk in chunks:
56
+ # Return full text for top chunks so frontend can do keyword highlighting
57
+ text = chunk.text
58
+ snippet = text[:300] + "..." if len(text) > 300 else text
59
+
60
+ citations.append({
61
+ "paper_title": chunk.paper_title,
62
+ "paper_id": chunk.paper_id,
63
+ "section": chunk.section,
64
+ "subsection": chunk.subsection or "",
65
+ "layer": chunk.layer,
66
+ "score": chunk.score,
67
+ "snippet": snippet,
68
+ "full_text": text,
69
+ "page": getattr(chunk, 'page', 0),
70
+ })
71
+ return citations
backend/rag/retrieval/retriever.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG retriever: query ChromaDB with intent-based routing and multi-layer mixing."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import numpy as np
6
+
7
+ from ..config import RAGConfig
8
+ from ..ingest.embedder import ChunkEmbedder
9
+ from ..ingest.store import create_or_get_collection, get_client
10
+ from .router import classify_intent, build_metadata_filter, QueryIntent
11
+
12
+
13
@dataclass
class RetrievedChunk:
    """A single ChromaDB search hit, flattened for formatting and display."""
    chunk_id: str      # unique id used for cross-layer deduplication
    text: str          # chunk document text
    paper_id: str      # slug identifying the source paper
    paper_title: str   # parsed paper title (may be garbled; see query_engine)
    section: str       # section the chunk came from
    subsection: str    # subsection ("" if none)
    layer: str         # chunking layer: "coarse" | "mid" | "fine"
    chunk_type: str    # chunk content kind, from ingestion metadata
    score: float       # cosine similarity (1 - ChromaDB distance)
    page: int = 0      # source page number (0 when unknown)
    rank: int = 0      # 1-based rank assigned after merging/sorting
26
+
27
+
28
class RAGRetriever:
    """Queries ChromaDB for paper chunks with intent-based layer routing."""

    def __init__(self, config: RAGConfig, embedder: ChunkEmbedder = None):
        self.config = config
        self.embedder = embedder or ChunkEmbedder(model_name=config.embedding_model)
        self._client = get_client(config)
        self._collection = create_or_get_collection(config, self._client)

    def retrieve(self, query: str, paper_ids: list = None, intent: QueryIntent = None) -> list:
        """Retrieve relevant chunks for a query.

        Args:
            query: Natural language query string.
            paper_ids: Optional list of paper IDs to restrict search to.
            intent: Optional pre-classified intent (auto-classified if None).

        Returns:
            List of RetrievedChunk objects sorted by relevance, deduplicated,
            with 1-based `rank` assigned.
        """
        if self._collection.count() == 0:
            return []

        if intent is None:
            intent = classify_intent(query)

        query_embedding = self.embedder.embed_query(query)

        # Multi-layer search: query each target layer separately, then merge
        chunks = self._multi_layer_search(query_embedding, intent, paper_ids)

        # Deduplicate by chunk_id (same chunk might match across queries)
        seen = set()
        unique = []
        for chunk in chunks:
            if chunk.chunk_id not in seen:
                seen.add(chunk.chunk_id)
                unique.append(chunk)

        # Sort by score descending
        unique.sort(key=lambda c: c.score, reverse=True)

        # Assign ranks
        for i, chunk in enumerate(unique):
            chunk.rank = i + 1

        return unique

    @staticmethod
    def _chunks_from_result(query_result) -> list:
        """Convert one ChromaDB query result into RetrievedChunk objects.

        Shared by the layer-filtered and fallback searches so both produce
        identical records. (Fix: the old fallback path omitted `page`,
        silently losing page provenance for broad-search results.)
        """
        chunks = []
        if not query_result or not query_result.get("ids") or not query_result["ids"][0]:
            return chunks

        ids = query_result["ids"][0]
        docs = query_result["documents"][0]
        metas = query_result["metadatas"][0]
        distances = query_result["distances"][0]

        for j in range(len(ids)):
            # ChromaDB returns cosine distance; convert to similarity
            meta = metas[j]
            chunks.append(RetrievedChunk(
                chunk_id=ids[j],
                text=docs[j],
                paper_id=meta.get("paper_id", ""),
                paper_title=meta.get("paper_title", ""),
                section=meta.get("section", ""),
                subsection=meta.get("subsection", ""),
                layer=meta.get("layer", ""),
                chunk_type=meta.get("chunk_type", ""),
                score=round(1.0 - distances[j], 4),
                page=meta.get("page", 0),
            ))
        return chunks

    def _multi_layer_search(self, query_embedding: np.ndarray, intent: QueryIntent, paper_ids: list = None) -> list:
        """Query ChromaDB per target layer, then merge. Falls back to broad search."""
        from .router import INTENT_SECTIONS

        routing = INTENT_SECTIONS[intent]
        target_layers = routing["layers"]
        results = []
        retrieval_cfg = self.config.retrieval
        # Per-layer result counts (loop-invariant, so built once)
        layer_top_k = {
            "coarse": retrieval_cfg.coarse_top_k,
            "mid": retrieval_cfg.mid_top_k,
            "fine": retrieval_cfg.fine_top_k,
        }
        total_top_k = (retrieval_cfg.coarse_top_k +
                       retrieval_cfg.mid_top_k +
                       retrieval_cfg.fine_top_k)

        # Strategy: try layer-filtered search first, fall back to broad search.
        # Section filtering is skipped because parsed section names may not
        # match the canonical names in the routing table.
        for layer in target_layers:
            top_k = layer_top_k.get(layer, 4)

            conditions = [{"layer": layer}]
            if paper_ids:
                conditions.append({"paper_id": {"$in": paper_ids}})
            where_filter = conditions[0] if len(conditions) == 1 else {"$and": conditions}

            try:
                query_result = self._collection.query(
                    query_embeddings=[query_embedding.tolist()],
                    n_results=top_k,
                    where=where_filter,
                    include=["documents", "metadatas", "distances"],
                )
            except Exception:
                # A bad filter or empty layer must not abort the other layers.
                continue

            results.extend(self._chunks_from_result(query_result))

        # Fallback: if layer-filtered search returned nothing, do a broad search
        if not results:
            where_filter = {"paper_id": {"$in": paper_ids}} if paper_ids else None
            try:
                query_result = self._collection.query(
                    query_embeddings=[query_embedding.tolist()],
                    n_results=total_top_k,
                    where=where_filter,
                    include=["documents", "metadatas", "distances"],
                )
                results.extend(self._chunks_from_result(query_result))
            except Exception:
                pass

        return results
backend/rag/retrieval/router.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Query intent classifier for routing retrieval to the right chunks.
2
+
3
+ Keyword-based (no ML model). Maps query intent to ChromaDB metadata
4
+ filters so we search the right sections and layers.
5
+ """
6
+
7
+ from enum import Enum
8
+
9
+
10
class QueryIntent(Enum):
    """High-level categories of user questions, used to route retrieval.

    Each intent maps (via INTENT_SECTIONS) to the chunk layers and paper
    sections most likely to answer that kind of question.
    """
    BROAD = "broad"            # overview/survey questions; default when no keywords match
    TECHNICAL = "technical"    # architecture / loss / training details
    EVALUATION = "evaluation"  # benchmarks, datasets, experimental results
    COMPARISON = "comparison"  # method-vs-method contrasts
    LIMITATION = "limitation"  # failure modes, gaps, future work
16
+
17
+
18
# Substring triggers per intent, matched against the lowercased query in
# classify_intent(). More hits -> stronger vote for that intent.
# QueryIntent.BROAD has no entry: it is the fallback when nothing matches.
INTENT_KEYWORDS = {
    QueryIntent.TECHNICAL: [
        "equation", "loss", "reward", "objective", "architecture", "algorithm",
        "network", "model", "training", "backbone", "policy", "dynamics",
        "controller", "optimization", "gradient", "inference", "pipeline",
    ],
    QueryIntent.EVALUATION: [
        "benchmark", "dataset", "result", "accuracy", "success rate",
        "real-world", "experiment", "ablation", "baseline", "metric",
        "performance", "evaluation", "table", "figure", "demo",
    ],
    QueryIntent.COMPARISON: [
        "compare", "comparison", "differ", "difference", "vs", "versus",
        "better", "worse", "advantage", "disadvantage", "trade-off",
    ],
    QueryIntent.LIMITATION: [
        "limitation", "failure", "gap", "future", "weakness", "drawback",
        "challenge", "issue", "problem", "cannot", "unable",
    ],
}
38
+
39
# Routing table: which chunk layers and paper sections to search per intent.
# "layers" maps to the ingestion pipeline's granularity levels
# (coarse/mid/fine); "sections" of None means no section filter is applied.
# Consumed by build_metadata_filter() to produce a ChromaDB where-clause.
INTENT_SECTIONS = {
    QueryIntent.BROAD: {
        "layers": ["coarse"],
        "sections": None,  # No section filter
    },
    QueryIntent.TECHNICAL: {
        "layers": ["mid", "fine"],
        "sections": ["Method", "Methods", "Methodology", "Approach", "Background"],
    },
    QueryIntent.EVALUATION: {
        "layers": ["mid", "fine"],
        "sections": ["Experiments", "Results", "Evaluation", "Figures"],
    },
    QueryIntent.COMPARISON: {
        "layers": ["coarse", "mid"],
        "sections": ["Related Work", "Introduction", "Discussion"],
    },
    QueryIntent.LIMITATION: {
        "layers": ["mid"],
        "sections": ["Discussion", "Conclusion", "Conclusions"],
    },
}
61
+
62
+
63
def classify_intent(query: str) -> QueryIntent:
    """Pick the intent whose keyword list best matches *query*.

    Counts how many of each intent's keywords occur as substrings of the
    lowercased query; the first intent reaching the highest count wins.
    Falls back to BROAD when no keyword matches at all.
    """
    text = query.lower()

    best_intent = QueryIntent.BROAD
    best_hits = 0
    for intent, keywords in INTENT_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in text)
        # Strict '>' keeps the first intent on ties, matching dict order.
        if hits > best_hits:
            best_intent, best_hits = intent, hits

    return best_intent
81
+
82
+
83
def build_metadata_filter(intent: QueryIntent, paper_ids: list = None) -> dict:
    """Translate an intent (plus optional paper scoping) into a ChromaDB where-clause.

    Up to three sub-conditions are combined: the intent's chunk layers, the
    intent's target sections, and an optional paper_id restriction. A single
    condition is returned unwrapped; multiple conditions are joined with
    "$and"; no conditions yields {}.
    """
    routing = INTENT_SECTIONS[intent]

    clauses = []
    if routing["layers"]:
        clauses.append({"layer": {"$in": routing["layers"]}})
    if routing.get("sections"):
        clauses.append({"section": {"$in": routing["sections"]}})
    if paper_ids:
        clauses.append({"paper_id": {"$in": paper_ids}})

    if not clauses:
        return {}
    return clauses[0] if len(clauses) == 1 else {"$and": clauses}
backend/rag/tools/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Tool calling: registry and domain-agnostic statistical/ML/data/RAG tools."""
2
+
3
+ # Import all tool modules to trigger @register_tool decorators
4
+ from . import statistical
5
+ from . import ml_tools
6
+ from . import data_tools
7
+ from . import rag_tool
backend/rag/tools/data_tools.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data grounding tools: filtering, aggregation, cross-tabulation."""
2
+
3
+ from collections import Counter
4
+
5
+ from .registry import register_tool, ToolContext
6
+
7
+
8
@register_tool(
    name="filter_and_count",
    description="Filter the dataset by column values and return matching method names and count",
    parameters={
        "type": "object",
        "properties": {
            "filters": {
                "type": "object",
                "description": "Column-value pairs to filter by, e.g. {\"Planning Method\": \"Sampling\", \"Training Data\": \"Sim\"}",
            },
        },
        "required": ["filters"],
    },
    category="data",
)
def filter_and_count_tool(context: ToolContext, filters: dict) -> dict:
    """Return the method names whose columns contain the given filter values.

    Each filter value is matched as a case-insensitive *literal* substring of
    the stringified cell. Columns not present in the dataset are skipped
    (rather than erroring) so an LLM-supplied filter with a typo degrades
    gracefully.

    Args:
        context: Shared ToolContext; only context.df is used. The first
            column is assumed to hold the method name.
        filters: Mapping of column name -> value substring to require.

    Returns:
        dict with the filters actually applied, the match count, and the
        matching method names.
    """
    df = context.df
    name_col = df.columns[0]
    mask = [True] * len(df)

    applied = []
    for col, value in filters.items():
        if col not in df.columns:
            continue  # tolerate hallucinated/mistyped column names
        # Bug fix: regex=False forces a literal match. The previous default
        # regex mode broke on filter values containing metacharacters such
        # as "C++" or "(RL)". str(value) guards against non-string values.
        col_mask = df[col].fillna('').astype(str).str.contains(
            str(value), case=False, na=False, regex=False
        )
        mask = [m and c for m, c in zip(mask, col_mask)]
        applied.append(f"{col}={value}")

    matching = df.loc[mask, name_col].tolist()
    return {
        "filters_applied": applied,
        "count": len(matching),
        "methods": matching,
    }
+ }
42
+
43
+
44
@register_tool(
    name="cross_tabulate",
    description="Create a contingency table of two columns showing co-occurrence counts",
    parameters={
        "type": "object",
        "properties": {
            "column_a": {"type": "string", "description": "First column name"},
            "column_b": {"type": "string", "description": "Second column name"},
        },
        "required": ["column_a", "column_b"],
    },
    category="data",
)
def cross_tabulate_tool(context: ToolContext, column_a: str, column_b: str) -> dict:
    """Build a nested co-occurrence table between two (possibly multi-valued) columns.

    Cells containing comma-separated values ("A, B") contribute one count per
    value pair, so a row with 2 values in column_a and 3 in column_b adds six
    co-occurrences.

    Raises:
        ValueError: if either column is missing from the dataset.
    """
    df = context.df
    if column_a not in df.columns:
        raise ValueError(f"Column '{column_a}' not found")
    if column_b not in df.columns:
        raise ValueError(f"Column '{column_b}' not found")

    table = {}
    for _, row in df.iterrows():
        left = [v.strip() for v in str(row.get(column_a, '')).split(',') if v.strip()]
        right = [v.strip() for v in str(row.get(column_b, '')).split(',') if v.strip()]
        for lv in left:
            bucket = table.setdefault(lv, {})
            for rv in right:
                bucket[rv] = bucket.get(rv, 0) + 1

    return {
        "column_a": column_a,
        "column_b": column_b,
        "table": table,
    }
+
80
+
81
@register_tool(
    name="value_distribution",
    description="Get value counts for a column, properly handling multi-value cells",
    parameters={
        "type": "object",
        "properties": {
            "column": {"type": "string", "description": "Column name to analyze"},
        },
        "required": ["column"],
    },
    category="data",
)
def value_distribution_tool(context: ToolContext, column: str) -> dict:
    """Frequency of each individual value in *column*.

    Comma-separated cells are split so "Sampling, Analytical" counts once for
    each value. Counts are returned in descending frequency order.

    Raises:
        ValueError: if the column does not exist.
    """
    if column not in context.df.columns:
        raise ValueError(f"Column '{column}' not found")

    tokens = [
        piece.strip()
        for cell in context.df[column].fillna('').astype(str)
        for piece in cell.split(',')
        if piece.strip()
    ]

    counts = dict(Counter(tokens).most_common())
    return {
        "column": column,
        "total_entries": len(context.df),
        "total_values": len(tokens),
        "unique_values": len(counts),
        "distribution": counts,
    }
+ }
backend/rag/tools/ml_tools.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ML grounding tools: nearest neighbors, cluster analysis, feature importance."""
2
+
3
+ import numpy as np
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
+ from .registry import register_tool, ToolContext
7
+
8
+
9
def _get_method_index(context: ToolContext, method_name: str) -> int:
    """Locate the dataframe row index of *method_name*.

    Tries an exact match on the first (name) column, then a case-insensitive
    match. Raises ValueError when neither finds a row.
    """
    name_col = context.df.columns[0]
    names = context.df[name_col]

    hits = context.df[names == method_name].index
    if len(hits) == 0:
        hits = context.df[names.str.lower() == method_name.lower()].index
    if len(hits) == 0:
        raise ValueError(f"Method '{method_name}' not found")
    return hits[0]
18
+
19
+
20
@register_tool(
    name="nearest_neighbors",
    description="Find the k most similar methods to a given method based on feature embeddings",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the target method"},
            "k": {"type": "integer", "description": "Number of neighbors (default 5)"},
        },
        "required": ["method"],
    },
    category="ml",
)
def nearest_neighbors_tool(context: ToolContext, method: str, k: int = 5) -> dict:
    """Top-k most similar methods by cosine similarity of feature embeddings.

    Raises:
        ValueError: if no feature matrix is available or the method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    target = _get_method_index(context, method)
    sims = cosine_similarity(
        context.feature_matrix[target].reshape(1, -1), context.feature_matrix
    )[0]

    name_col = context.df.columns[0]
    neighbors = []
    # Walk candidates from most to least similar, skipping the method itself.
    for cand in np.argsort(sims)[::-1]:
        if cand == target:
            continue
        neighbors.append({
            "name": context.df.iloc[cand][name_col],
            "similarity": round(float(sims[cand]), 4),
        })
        if len(neighbors) >= k:
            break

    return {"method": method, "k": k, "neighbors": neighbors}
56
+
57
+
58
@register_tool(
    name="cluster_membership",
    description="Get the cluster assignment for a method, including its co-members and cluster characteristics",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the method"},
        },
        "required": ["method"],
    },
    category="ml",
)
def cluster_membership_tool(context: ToolContext, method: str) -> dict:
    """Report which cluster *method* belongs to and who shares it.

    Raises:
        ValueError: if cluster labels are unavailable or the method is unknown.
    """
    if context.cluster_labels is None:
        raise ValueError("Cluster labels not available")

    target = _get_method_index(context, method)
    target_cluster = context.cluster_labels[target]
    name_col = context.df.columns[0]

    co_members = [
        context.df.iloc[i][name_col]
        for i, label in enumerate(context.cluster_labels)
        if label == target_cluster and i != target
    ]

    return {
        "method": method,
        "cluster_id": int(target_cluster),
        "cluster_size": len(co_members) + 1,  # +1 counts the method itself
        "co_members": co_members,
    }
89
+
90
+
91
@register_tool(
    name="feature_importance",
    description="Identify which feature dimensions most distinguish a method from the dataset average",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the method"},
            "top_n": {"type": "integer", "description": "Number of top features to return (default 10)"},
        },
        "required": ["method"],
    },
    category="ml",
)
def feature_importance_tool(context: ToolContext, method: str, top_n: int = 10) -> dict:
    """Feature dimensions where *method* deviates most (by |z-score|) from the dataset mean.

    Raises:
        ValueError: if no feature matrix is available or the method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    row = context.feature_matrix[_get_method_index(context, method)]
    mu = context.feature_matrix.mean(axis=0)
    sigma = context.feature_matrix.std(axis=0)
    sigma[sigma == 0] = 1.0  # constant dimensions: avoid division by zero

    z = (row - mu) / sigma
    ranked = np.argsort(np.abs(z))[::-1][:top_n]

    features = [
        {
            "dimension": int(d),
            "z_score": round(float(z[d]), 3),
            "value": round(float(row[d]), 4),
            "mean": round(float(mu[d]), 4),
            "direction": "above average" if z[d] > 0 else "below average",
        }
        for d in ranked
    ]

    return {"method": method, "top_features": features}
129
+
130
+
131
@register_tool(
    name="outlier_score",
    description="Compute how atypical a method is relative to the dataset (average distance to all other methods)",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the method"},
        },
        "required": ["method"],
    },
    category="ml",
)
def outlier_score_tool(context: ToolContext, method: str) -> dict:
    """Score how atypical *method* is versus every other method.

    A method's typicality is its mean cosine similarity to all other rows;
    the percentile says what fraction of methods are *more* typical. The
    interpretation buckets are: <70th percentile "typical", 70-90 "somewhat
    unusual", >=90 "outlier".

    Performance fix: the similarity matrix is computed once (a single n x n
    cosine_similarity call) instead of the previous per-row loop that
    launched n separate 1 x n computations.

    Raises:
        ValueError: if no feature matrix is available, fewer than two
            methods exist, or the method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    n = len(context.feature_matrix)
    if n < 2:
        # Robustness: with a single method, "similarity to others" is undefined
        # (previously produced NaN via a mean over an empty array).
        raise ValueError("Outlier score requires at least two methods")

    idx = _get_method_index(context, method)
    sim_matrix = cosine_similarity(context.feature_matrix)

    sims = sim_matrix[idx]
    other_sims = np.concatenate([sims[:idx], sims[idx + 1:]])
    avg_sim = float(other_sims.mean())
    min_sim = float(other_sims.min())

    # Average similarity-to-others for every method (excluding self),
    # read straight off the precomputed matrix.
    all_avg_sims = (sim_matrix.sum(axis=1) - sim_matrix.diagonal()) / (n - 1)

    percentile = float(np.sum(all_avg_sims > avg_sim) / n * 100)
    interpretation = "typical" if percentile < 70 else "somewhat unusual" if percentile < 90 else "outlier"

    return {
        "method": method,
        "avg_similarity_to_others": round(avg_sim, 4),
        "min_similarity": round(min_sim, 4),
        "outlier_percentile": round(percentile, 1),
        "interpretation": interpretation,
    }
backend/rag/tools/rag_tool.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG search as a tool: lets the LLM request paper content on demand."""
2
+
3
+ from .registry import register_tool, ToolContext
4
+
5
+
6
@register_tool(
    name="search_papers",
    description="Search the academic paper corpus for relevant passages. Use this when the query asks about specific techniques, loss functions, architectures, training details, experimental results, or anything requiring actual paper content.",
    parameters={
        "type": "object",
        "properties": {
            "search_query": {
                "type": "string",
                "description": "What to search for in the papers (e.g., 'loss function for grasp quality', 'sim-to-real transfer', 'PointNet architecture')",
            },
        },
        "required": ["search_query"],
    },
    category="rag",
)
def search_papers_tool(context: ToolContext, search_query: str) -> dict:
    """Search ChromaDB for relevant paper chunks.

    Builds a RAGRetriever from rag_config.yaml (reusing the app's
    sentence-transformer via context.st_model so the model is not loaded a
    second time), retrieves chunks for *search_query*, and returns both a
    prompt-ready text block for the LLM and structured citations for the
    frontend.

    Returns:
        dict with:
            found: number of chunks retrieved (0 when nothing matched)
            formatted: text block to inject into the LLM prompt
            citations: structured citation list (present only when found > 0;
                a miss instead carries an empty "excerpts" list)
    """
    import os  # local import keeps tool modules cheap to load (unused `sys` removed)

    # base_dir resolves to the backend/ directory (tools/ -> rag/ -> backend/);
    # the config file sits one level above it, at the repo root.
    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    config_path = os.path.join(base_dir, '..', 'rag_config.yaml')

    # Deferred imports: the RAG stack (ChromaDB etc.) is only needed when
    # this tool is actually invoked.
    from ..config import load_config
    from ..retrieval.retriever import RAGRetriever
    from ..ingest.embedder import ChunkEmbedder
    from ..retrieval.formatter import format_for_prompt, format_chunk_citations

    config = load_config(config_path)
    embedder = ChunkEmbedder(model_name=config.embedding_model, model_instance=context.st_model)
    retriever = RAGRetriever(config=config, embedder=embedder)

    chunks = retriever.retrieve(search_query)
    if not chunks:
        return {"found": 0, "excerpts": [], "formatted": "No relevant paper content found."}

    prompt_text = format_for_prompt(chunks, token_budget=config.retrieval.token_budget)
    citations = format_chunk_citations(chunks)

    # Return both formatted text (for LLM) and structured citations (for frontend)
    return {
        "found": len(chunks),
        "formatted": prompt_text,
        "citations": citations,
    }
backend/rag/tools/registry.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool registry with decorator-based registration.
2
+
3
+ Tools are Python functions that the LLM can request via its JSON response.
4
+ The registry generates JSON schemas for prompt injection and dispatches
5
+ tool calls safely with error handling.
6
+ """
7
+
8
+ import json
9
+ from dataclasses import dataclass, field
10
+ from typing import Callable
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+
16
@dataclass
class ToolContext:
    """Shared context passed to all tool functions.

    Only `df` is mandatory; the remaining fields are optional capabilities —
    tools that need one raise ValueError when it is None.
    """
    df: pd.DataFrame                   # dataset, one row per method; first column is the method name
    feature_matrix: np.ndarray = None  # weighted feature embeddings, row-aligned with df
    cluster_labels: list = None        # HDBSCAN-style cluster id per row, aligned with df
    weights: dict = None               # per-column feature weights
    st_model: object = None  # SentenceTransformer instance
24
+
25
+
26
@dataclass
class ToolSpec:
    """One registered tool: its LLM-facing schema plus the Python callable."""
    name: str         # unique tool identifier used in LLM tool-call requests
    description: str  # shown to the LLM so it knows when to use the tool
    parameters: dict  # JSON-schema-style argument description
    function: Callable  # implementation; called as function(context=..., **arguments)
    category: str     # grouping label (e.g. "statistical", "ml", "data", "rag")
33
+
34
+
35
# Global registry of ToolSpec objects, keyed by tool name.
_TOOL_REGISTRY: dict = {}


def register_tool(name: str, description: str, parameters: dict, category: str = "general"):
    """Decorator factory that records a callable in the global tool registry.

    The decorated function is returned unchanged, so it stays directly
    callable in addition to being LLM-dispatchable via execute_tool().
    """
    def decorator(fn):
        spec = ToolSpec(
            name=name,
            description=description,
            parameters=parameters,
            function=fn,
            category=category,
        )
        _TOOL_REGISTRY[name] = spec
        return fn

    return decorator
51
+
52
+
53
def get_tool_schemas() -> list:
    """Return JSON-schema descriptions of all registered tools (for LLM prompt)."""
    schemas = []
    for spec in _TOOL_REGISTRY.values():
        schemas.append({
            "name": spec.name,
            "description": spec.description,
            "parameters": spec.parameters,
            "category": spec.category,
        })
    return schemas
64
+
65
+
66
def get_tool_prompt_section() -> str:
    """Render the registered tools as a plain-text section for the LLM system prompt.

    Returns "" when no tools are registered, so callers can drop the section
    entirely.
    """
    schemas = get_tool_schemas()
    if not schemas:
        return ""

    out = [
        "AVAILABLE TOOLS:",
        "You may request computations by including a \"tools\" array in your JSON response.",
        "Each tool call: {\"name\": \"tool_name\", \"arguments\": {...}}",
        "Only request tools when the query genuinely needs computed results. Most queries don't need tools.",
        "",
    ]

    for schema in schemas:
        out.append(f"- {schema['name']}: {schema['description']}")
        required = schema["parameters"].get("required", [])
        # One indented line per parameter, flagged required/optional.
        for pname, pdef in schema["parameters"].get("properties", {}).items():
            flag = "(required)" if pname in required else "(optional)"
            out.append(f"  {pname}: {pdef.get('description', pdef.get('type', ''))} {flag}")
        out.append("")

    return '\n'.join(out)
92
+
93
+
94
def execute_tool(name: str, arguments: dict, context: ToolContext) -> dict:
    """Dispatch one tool call; never raises.

    Returns {"success", "result", "error"} where exactly one of result/error
    is populated.
    """
    spec = _TOOL_REGISTRY.get(name)
    if spec is None:
        return {"success": False, "result": None, "error": f"Unknown tool: {name}"}
    try:
        output = spec.function(context=context, **arguments)
    except Exception as exc:  # a buggy tool must not crash the whole request
        return {"success": False, "result": None, "error": str(exc)}
    return {"success": True, "result": output, "error": None}
104
+
105
+
106
def execute_tool_calls(tool_calls: list, context: ToolContext, max_calls: int = 5) -> list:
    """Execute a list of tool calls from the LLM response.

    Args:
        tool_calls: List of {"name": str, "arguments": dict}; entries beyond
            max_calls are silently dropped (safety cap on LLM requests).
        context: Shared ToolContext with dataset and features.
        max_calls: Safety limit on number of tool calls per query.

    Returns:
        List of {"name", "arguments", "success", "result", "error"}.
    """
    outcomes = []
    for call in tool_calls[:max_calls]:
        tool_name = call.get("name", "")
        tool_args = call.get("arguments", {})
        record = {"name": tool_name, "arguments": tool_args}
        record.update(execute_tool(tool_name, tool_args, context))
        outcomes.append(record)
    return outcomes
backend/rag/tools/statistical.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Statistical grounding tools.
2
+
3
+ These let the LLM request real computations instead of hallucinating numbers.
4
+ """
5
+
6
+ import numpy as np
7
+ from collections import Counter
8
+ from sklearn.metrics.pairwise import cosine_similarity as sk_cosine
9
+
10
+ from .registry import register_tool, ToolContext
11
+
12
+
13
def _get_method_index(context: ToolContext, method_name: str) -> int:
    """Find row index for a method name. Raises ValueError if not found.

    An exact match on the first (name) column is tried first, then a
    case-insensitive one.
    """
    name_col = context.df.columns[0]
    names = context.df[name_col]

    hits = context.df[names == method_name].index
    if len(hits) == 0:
        hits = context.df[names.str.lower() == method_name.lower()].index
    if len(hits) == 0:
        raise ValueError(f"Method '{method_name}' not found in dataset")
    return hits[0]
+
24
+
25
@register_tool(
    name="cosine_similarity",
    description="Compute cosine similarity between two methods based on their weighted feature embeddings",
    parameters={
        "type": "object",
        "properties": {
            "method_a": {"type": "string", "description": "Name of first method"},
            "method_b": {"type": "string", "description": "Name of second method"},
        },
        "required": ["method_a", "method_b"],
    },
    category="statistical",
)
def cosine_similarity_tool(context: ToolContext, method_a: str, method_b: str) -> dict:
    """Cosine similarity between two methods' weighted feature vectors.

    The interpretation buckets are: >0.8 "very similar", >0.5 "moderately
    similar", otherwise "dissimilar".

    Raises:
        ValueError: if no feature matrix is available or a method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    row_a = context.feature_matrix[_get_method_index(context, method_a)].reshape(1, -1)
    row_b = context.feature_matrix[_get_method_index(context, method_b)].reshape(1, -1)
    sim = float(sk_cosine(row_a, row_b)[0, 0])

    if sim > 0.8:
        interpretation = "very similar"
    elif sim > 0.5:
        interpretation = "moderately similar"
    else:
        interpretation = "dissimilar"

    return {
        "method_a": method_a,
        "method_b": method_b,
        "cosine_similarity": round(sim, 4),
        "interpretation": interpretation,
    }
+
54
+
55
@register_tool(
    name="pairwise_distances",
    description="Compute pairwise cosine distances between a set of methods",
    parameters={
        "type": "object",
        "properties": {
            "methods": {"type": "array", "items": {"type": "string"}, "description": "List of method names (2-10)"},
        },
        "required": ["methods"],
    },
    category="statistical",
)
def pairwise_distances_tool(context: ToolContext, methods: list) -> dict:
    """Cosine distances for every unordered pair among up to 10 methods.

    Pairs are returned sorted by ascending distance (closest first).

    Raises:
        ValueError: if no feature matrix is available or a method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    methods = methods[:10]  # safety cap, matches the schema's 2-10 guidance

    rows = [_get_method_index(context, m) for m in methods]
    dist_matrix = 1.0 - sk_cosine(context.feature_matrix[rows])

    pairs = [
        {
            "method_a": methods[i],
            "method_b": methods[j],
            "distance": round(float(dist_matrix[i, j]), 4),
        }
        for i in range(len(methods))
        for j in range(i + 1, len(methods))
    ]
    pairs.sort(key=lambda p: p["distance"])

    return {"methods": methods, "pairs": pairs}
+ return {"methods": methods, "pairs": pairs}
89
+
90
+
91
@register_tool(
    name="distribution_stats",
    description="Get value distribution for a dataset column, with optional grouping",
    parameters={
        "type": "object",
        "properties": {
            "column": {"type": "string", "description": "Column name to analyze"},
            "group_by": {"type": "string", "description": "Optional column to group by"},
        },
        "required": ["column"],
    },
    category="statistical",
)
def distribution_stats_tool(context: ToolContext, column: str, group_by: str = None) -> dict:
    """Value counts for *column* (splitting comma-separated cells), optionally grouped.

    Top-level counts keep the 20 most common values; per-group counts keep
    the top 5. An unknown group_by column is ignored.

    Raises:
        ValueError: if the main column does not exist.
    """
    if column not in context.df.columns:
        raise ValueError(f"Column '{column}' not found. Available: {list(context.df.columns)}")

    tokens = [
        piece.strip()
        for cell in context.df[column].fillna('').astype(str)
        for piece in cell.split(',')
        if piece.strip()
    ]

    result = {
        "column": column,
        "total_values": len(tokens),
        "unique_values": len(set(tokens)),
        "distribution": dict(Counter(tokens).most_common(20)),
    }

    if group_by and group_by in context.df.columns:
        grouped = {}
        for _, row in context.df.iterrows():
            key = str(row.get(group_by, ''))
            for piece in str(row.get(column, '')).split(','):
                token = piece.strip()
                if token:
                    grouped.setdefault(key, []).append(token)
        result["grouped"] = {k: dict(Counter(v).most_common(5)) for k, v in grouped.items()}

    return result
+ return result
backend/requirements.txt CHANGED
@@ -8,3 +8,6 @@ hdbscan>=0.8.33
8
  umap-learn==0.5.5
9
  sentence-transformers>=2.2.0
10
  huggingface_hub>=0.20.0
 
 
 
 
8
  umap-learn==0.5.5
9
  sentence-transformers>=2.2.0
10
  huggingface_hub>=0.20.0
11
+ chromadb>=0.5.0
12
+ pdfplumber>=0.10.0
13
+ pyyaml>=6.0
docs/pipeline-architecture.md ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grasp Explorer: Full Pipeline Architecture
2
+
3
+ ## What This Product Does
4
+
5
+ The Grasp Explorer is an interactive dashboard for the NSF-funded COMPARE ecosystem (robot-manipulation.org) that lets researchers explore relationships among 56+ robotic grasp planning methods. Instead of manually comparing papers in a spreadsheet, researchers type natural language questions ("How do point cloud methods compare to depth image approaches for cluttered bin picking?") and the system:
6
+
7
+ 1. Finds the most relevant methods and papers using vector similarity
8
+ 2. Adjusts the visualization to emphasize the attributes that matter for the question
9
+ 3. Clusters methods into natural groups using density-based clustering
10
+ 4. Retrieves actual passages from the research papers via a vector database
11
+ 5. Generates grounded, citation-backed insights using a single LLM call
12
+ 6. Displays interactive visualizations showing method similarity, evidence breakdown, and topic distribution
13
+
14
+ The key design principle is **deterministic computation + grounded LLM interpretation**. The system never asks the LLM to guess at data or make decisions that can be computed. Every number, every grouping, every similarity score is computed by the pipeline. The LLM only interprets results that have already been computed.
15
+
16
+ ---
17
+
18
+ ## Layer 1: Feature Engineering
19
+
20
+ ### Raw Dataset
21
+
22
+ The dataset is a CSV (`datasets/csv-gp-combined.csv`) with 56 rows (one per grasp planning method) and 20 columns. 13 of these columns are weighted features used for computing method similarity.
23
+
24
+ ### Categorical Feature Processing (TF-IDF)
25
+
26
+ Each categorical column (Planning Method, End-effector Hardware, Input Data, etc.) is converted to a numerical vector using TF-IDF (Term Frequency-Inverse Document Frequency).
27
+
28
+ - **Vectorizer settings**: `max_features=50`, `ngram_range=(1, 2)` to capture both single terms and bigrams
29
+ - **Multi-value handling**: Many cells contain comma-separated values (e.g., "Sampling, Direct regression"). The `smart_split()` function parses these respecting quoted fields, so "6-DoF grasp pose (x, y, z, r, p, y)" is treated as one value, not six.
30
+ - **Normalization**: Multi-value cells are sorted alphabetically before TF-IDF so that "Sampling, Analytical" and "Analytical, Sampling" produce identical vectors.
31
+ - **Result**: Each column becomes a sparse matrix of shape (56, up to 50).
32
+
33
+ ### Description Embeddings (Sentence-Transformer)
34
+
35
+ The free-text Description column is processed differently from categorical columns:
36
+
37
+ 1. Each description is encoded using `all-MiniLM-L6-v2` (a sentence-transformer model) producing a 384-dimensional dense vector.
38
+ 2. PCA reduces these to 50 dimensions (`n_components=min(50, n_rows - 1)`, `random_state=42`).
39
+ 3. This prevents the description embeddings (384 dims) from dominating the feature matrix over TF-IDF columns (~5-50 dims each).
40
+ 4. Full-dataset embeddings are cached to `.description_embeddings.npy` to avoid recomputation on every request.
41
+
42
+ ### Weight Application
43
+
44
+ Each column's feature matrix is scaled by the square root of its weight:
45
+
46
+ ```
47
+ weighted_features = features * sqrt(weight)
48
+ ```
49
+
50
+ Default weights reflect domain importance:
51
+ - **Weight 10**: Planning Method, Object Configuration, Output Pose (most important for distinguishing methods)
52
+ - **Weight 8**: Training Data
53
+ - **Weight 7**: Description
54
+ - **Weight 6**: End-effector Hardware, Input Data
55
+ - **Weight 5**: Backbone, Metrics, Dataset
56
+ - **Weight 4**: Camera Position, Language
57
+ - **Weight 3**: Simulator
58
+
59
+ When a researcher asks a question, the deterministic query engine boosts weights for columns that match the query keywords. For example, "cluttered bin picking" boosts Object Configuration weight.
60
+
61
+ ### Combined Feature Matrix
62
+
63
+ All weighted feature matrices are concatenated horizontally:
64
+
65
+ ```
66
+ combined = np.hstack([tfidf_col1 * sqrt(w1), tfidf_col2 * sqrt(w2), ..., desc_pca * sqrt(w_desc)])
67
+ ```
68
+
69
+ Result: a matrix of shape (56 methods, ~379 features).
70
+
71
+ ### Derived Features
72
+
73
+ Seven higher-level features are computed from raw columns for the frontend UI (color-by, detail panel). These are NOT fed into the embedding matrix.
74
+
75
+ | Feature | Derived From | Categories |
76
+ |---------|-------------|------------|
77
+ | Grasp Dimensionality | Output Pose | 6-DoF, 7-DoF, 2D, Policy, Evaluation, Other |
78
+ | Learning Paradigm | Planning Method + Training Data | Classical, Learning-based, RL-based, Hybrid |
79
+ | Sensor Complexity | Input Data | Multimodal, 3D, 2.5D, 2D, Other |
80
+ | Scene Difficulty | Object Configuration | Singulated, Structured, Cluttered, Packed, Piled (ordinal, takes max) |
81
+ | Gripper Type | End-effector Hardware | Parallel-jaw, Dexterous, Suction, Multi-gripper |
82
+ | ML Framework | Language | PyTorch, TensorFlow, Keras, None |
83
+ | Method Era | Year | Pioneer (2016-2018), Growth (2019-2021), Modern (2022+) |
84
+
85
+ ---
86
+
87
+ ## Layer 2: Dimensionality Reduction (UMAP)
88
+
89
+ ### How UMAP Projects to 2D
90
+
91
+ The combined feature matrix is high-dimensional (~379 features). UMAP reduces this to 2D coordinates for the scatter plot.
92
+
93
+ 1. **Pairwise cosine distances** are computed between all 56 methods: `pairwise_distances(features, metric='cosine')`. This produces a 56x56 distance matrix.
94
+ 2. **UMAP** takes this precomputed distance matrix (not the raw features) and finds a 2D layout that preserves both local neighborhoods and global structure.
95
+
96
+ UMAP is agnostic to whether the input was categorical or embedding. All the mixing and weighting happened before UMAP ever sees the data.
97
+
98
+ ### UMAP Parameters
99
+
100
+ | Parameter | Value | Why |
101
+ |-----------|-------|-----|
102
+ | n_neighbors | 15 (capped to n_methods-1 for small sets) | Balances local vs global structure |
103
+ | min_dist | 0.1 | Allows some overlap, not too spread out |
104
+ | metric | precomputed (cosine distances) | Cosine works well for sparse TF-IDF + dense embeddings |
105
+ | random_state | 42 | Reproducible projections |
106
+ | n_jobs | 1 | Avoids macOS OpenMP segfault |
107
+
108
+ ### Small Dataset Handling
109
+
110
+ - 1 method: placed at origin [0, 0]
111
+ - 2-3 methods: PCA fallback (UMAP needs at least 4 points)
112
+ - 4+ methods: standard UMAP
113
+
114
+ ---
115
+
116
+ ## Layer 3: Clustering (HDBSCAN)
117
+
118
+ ### Why HDBSCAN
119
+
120
+ HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) was chosen over K-Means because:
121
+ - It discovers the natural number of clusters (no need to specify k)
122
+ - It handles non-spherical cluster shapes
123
+ - It identifies outliers (noise points) rather than forcing every method into a cluster
124
+
125
+ ### HDBSCAN Parameters
126
+
127
+ | Parameter | Value | Why |
128
+ |-----------|-------|-----|
129
+ | min_cluster_size | max(3, n_methods // 15) | Scales with dataset size, capping the cluster count at roughly 15; minimum cluster size of 3 |
130
+ | min_samples | 1 | Allows small clusters |
131
+ | metric | euclidean | On 2D UMAP coordinates |
132
+ | cluster_selection_method | eom (excess of mass) | Finds the most persistent clusters |
133
+
134
+ ### Noise Point Reassignment
135
+
136
+ Since all 56 methods are relevant papers (none should be excluded), noise points (labeled -1 by HDBSCAN) are reassigned to their nearest real cluster:
137
+
138
+ 1. Identify all points with label -1
139
+ 2. Compute pairwise distances between noise points and all real cluster members
140
+ 3. Assign each noise point to the cluster of its nearest neighbor
141
+ 4. If all points are noise (edge case): assign everyone to cluster 0
142
+
143
+ ### Descriptive Cluster Labels
144
+
145
+ Instead of meaningless IDs like "Cluster 0", each cluster gets a descriptive label derived from its dominant attributes:
146
+
147
+ 1. For each cluster, count the most common values in Planning Method, End-effector Hardware, and Object Configuration
148
+ 2. Take the top value from each column
149
+ 3. Join as: "Sampling / Two-finger / Piled"
150
+
151
+ A `value_cluster_map` is also built, mapping each attribute value to its dominant cluster. This is used for the cluster legend.
152
+
153
+ ---
154
+
155
+ ## Layer 4: RAG Pipeline (Retrieval-Augmented Generation)
156
+
157
+ ### Overview
158
+
159
+ The RAG pipeline indexes 34 research papers (1,074 text chunks) in a ChromaDB vector database. When a researcher asks a question, relevant paper passages are retrieved and fed to the LLM so it can cite actual paper content instead of hallucinating.
160
+
161
+ ### 4a. PDF Parsing
162
+
163
+ Papers are stored as PDFs in `papers/`. The parser (`pdf_parser.py`) uses pdfplumber to extract text with font metadata.
164
+
165
+ **Section detection** uses a scoring system:
166
+ - Lines with larger font size (>1.1x median) are header candidates
167
+ - Lines matching known academic headers (Abstract, Introduction, Method, etc.) get a boost
168
+ - Lines matching numbering patterns ("3.1 ", "IV. ") are classified as section/subsection headers
169
+ - Fallback: if no sections detected, the entire document becomes a single "Full Text" section
170
+
171
+ **Extracted structure**:
172
+ - `ParsedPaper`: title, abstract, sections (list of ParsedSection), figures (captions), raw_text
173
+ - `ParsedSection`: title, level (1=section, 2=subsection), text, page_start, page_end
174
+
175
+ ### 4b. Chunking (3-Layer Hybrid: Structural + Semantic)
176
+
177
+ The chunker produces chunks at three granularity levels, all enriched with domain-aware metadata.
178
+
179
+ **Layer 1: Coarse (paper-level overview)**
180
+ - Title + Abstract chunk (max 800 tokens)
181
+ - One summary chunk per major section (first + last paragraph if section exceeds token budget)
182
+ - All figure/table captions concatenated into one chunk
183
+ - Purpose: retrieved for broad "what is this paper about?" queries
184
+
185
+ **Layer 2: Mid-level (semantic topic boundaries within sections)**
186
+ 1. Each section is split into sentences
187
+ 2. Every sentence is embedded with the sentence-transformer
188
+ 3. Consecutive sentence similarities are computed (cosine similarity between adjacent embeddings)
189
+ 4. Where similarity drops below 0.35, a topic boundary is detected
190
+ 5. Sentences between boundaries are grouped into chunks
191
+ 6. Token constraints enforced: merge groups under 200 tokens, split groups over 800 tokens
192
+ 7. 15% overlap added between consecutive same-section chunks (last N sentences of chunk i prepended to chunk i+1)
193
+ - Purpose: captures natural topic shifts within a section (e.g., "grasp representation" to "network architecture" to "training procedure")
194
+
195
+ **Layer 3: Fine (paragraph-level)**
196
+ - Individual paragraphs, split by sentences if over 300 tokens
197
+ - Minimum 50 tokens (skip very short fragments)
198
+ - Purpose: precise retrieval for specific technical questions
199
+
200
+ **Why semantic chunking over fixed-window?** Fixed-size chunking (e.g., every 512 tokens) breaks arguments mid-sentence and mixes unrelated content. Semantic chunking detects real topic shifts so each chunk is about one coherent idea.
201
+
202
+ ### 4c. Chunk Metadata Enrichment
203
+
204
+ Every chunk gets rich metadata for retrieval filtering and visualization:
205
+
206
+ **Domain Topic Extraction**: Each chunk is scanned against a configurable vocabulary of 80+ domain-specific keywords (e.g., "point cloud", "6-DoF", "sim-to-real", "parallel-jaw", "PointNet"). Matched terms are stored as `domain_topics`. The vocabulary is defined in `rag_config.yaml` and can be swapped for any domain.
207
+
208
+ **Rhetorical Role Classification**: Heuristic keyword patterns classify each chunk's communicative purpose:
209
+ - `algorithm_description`: "we propose", "our method", "architecture"
210
+ - `experimental_setup`: "we evaluate", "dataset", "baseline"
211
+ - `result`: "table 1", "success rate", "outperform"
212
+ - `comparison`: "compared to", "unlike", "prior work"
213
+ - `problem_statement`: "we address", "the problem of"
214
+ - `limitation`: "limitation", "failure", "future work"
215
+ - `definition`: "we define", "denoted by"
216
+
217
+ **Content Type**: Derived from rhetorical role + section name:
218
+ - `theory` = how the method works (algorithms, math)
219
+ - `implementation` = how to build it (hyperparameters, training details)
220
+ - `evaluation` = how it performs (benchmarks, results)
221
+
222
+ **Chunk Type Detection**: Auto-detects equation-heavy chunks (LaTeX patterns) and citation-dense chunks.
223
+
224
+ ### 4d. Embedding and Storage
225
+
226
+ - **Embedding model**: Same `all-MiniLM-L6-v2` as the main app (reuses the loaded model instance at runtime)
227
+ - **Section-prefix strategy**: Before embedding, each chunk's text is prepended with `[SectionName: Subsection]` to ground the embedding in document structure
228
+ - **Storage**: ChromaDB with cosine distance (HNSW index), persistent at `./chroma_db`
229
+ - **Metadata**: All chunk metadata stored as ChromaDB metadata fields for filtered search
230
+
231
+ ### 4e. Retrieval
232
+
233
+ When a query comes in, retrieval happens in stages:
234
+
235
+ 1. **Intent classification**: Keywords in the query determine intent (BROAD, TECHNICAL, EVALUATION, COMPARISON, LIMITATION)
236
+ 2. **Layer routing**: Each intent maps to target layers and sections. A "loss function" query routes to mid/fine chunks in Methods sections.
237
+ 3. **Multi-layer search**: ChromaDB is queried once per target layer with appropriate top-k (2 coarse + 4 mid + 4 fine)
238
+ 4. **Fallback**: If layer-filtered search returns nothing, a broad search across all layers is performed
239
+ 5. **Deduplication and ranking**: Results merged, deduplicated by chunk_id, sorted by cosine similarity score
240
+
241
+ **Token budget**: Retrieved chunks are formatted for the LLM prompt with a 1500-token budget. Chunks are added in order of relevance until the budget is exhausted.
242
+
243
+ ---
244
+
245
+ ## Layer 5: Deterministic Query Engine
246
+
247
+ When a researcher submits a query, the system does NOT ask the LLM to decide how to configure the visualization. Instead, a deterministic pipeline handles all decisions:
248
+
249
+ ### Step 1: Weight Adjustment
250
+
251
+ The query is scanned against keyword dictionaries for each column. Keywords like "cluttered", "piled", "bin picking" match Object Configuration. Keywords like "PointNet", "transformer", "CNN" match Backbone.
252
+
253
+ Each keyword match adds a boost: `new_weight = min(20, default_weight + min(matches * 3, 10))`.
254
+
255
+ ### Step 2: Method Relevance
256
+
257
+ The query is embedded with the sentence-transformer, and cosine similarity is computed against every method's description + key columns (concatenated and embedded). The top 15 methods by similarity are returned as candidates.
258
+
259
+ ### Step 3: Filter Decision
260
+
261
+ Keyword signals determine whether to filter the scatter plot:
262
+ - "compare", "overview", "vs" = show all methods (no filter)
263
+ - "which methods", "find methods", "best for" = filter to relevant subset
264
+
265
+ If filtering: methods with similarity > 0.15 are kept, as long as the filtered set has 3+ methods.
266
+
267
+ ### Step 4: Color-by Selection
268
+
269
+ Query keywords are matched against a color-by mapping. "gripper" or "finger" selects End-effector Hardware. "training" or "sim" selects Training Data. Default: "cluster".
270
+
271
+ ### Step 5: Highlight Selection
272
+
273
+ Top 8 methods by query similarity are highlighted (larger, brighter points on the scatter plot).
274
+
275
+ ### Step 6: RAG Retrieval
276
+
277
+ ChromaDB is searched for relevant paper passages (see Layer 4e above).
278
+
279
+ ### Step 7: Analytics Computation
280
+
281
+ From the retrieved chunks:
282
+ - **Paper source distribution**: Which papers contributed the most chunks
283
+ - **Domain topic frequency**: Which technical terms appear most in the evidence
284
+ - **Rhetorical role distribution**: How much of the evidence is "method design" vs "results" vs "experiment setup"
285
+ - **Content type breakdown**: Theory vs implementation vs evaluation
286
+ - **Cited references extraction**: Author-year citations found within chunk text (e.g., "Smith et al., 2022"), counted across all retrieved passages
287
+
288
+ ---
289
+
290
+ ## Layer 6: LLM Inference (Single Call)
291
+
292
+ After all deterministic computation is complete, a single LLM call generates the insight text. The prompt is structured as:
293
+
294
+ ```
295
+ You are an expert research assistant for a robotic grasp planning visualization tool.
296
+
297
+ RESEARCHER'S QUESTION: "{query}"
298
+
299
+ EVIDENCE FROM PAPERS:
300
+ --- From "Paper Title" (Section, relevance: 0.65) ---
301
+ [actual text from the paper]
302
+
303
+ RELEVANT METHODS IN THE DATASET:
304
+ - Method1: Planning=Sampling; Gripper=Two-finger; Input=Point cloud
305
+ - Method2: Planning=Direct regression; Gripper=Multi-finger; Input=RGBD
306
+
307
+ CLUSTERING RESULTS (56 methods in 7 groups):
308
+ - Sampling / Two-finger / Piled (12 methods): Method1, Method2, ...
309
+ - RL / Multi-finger / Singulated (5 methods): Method3, Method4, ...
310
+
311
+ Highlighted methods: Method1, Method5, Method8
312
+
313
+ INSTRUCTIONS:
314
+ Write exactly 3-5 bullet points that answer the researcher's question.
315
+ Rules:
316
+ 1. Lead with evidence from the paper excerpts. Cite specific papers by name.
317
+ 2. Reference concrete technical details from the papers.
318
+ 3. Connect findings to the clustering.
319
+ 4. Be specific. Avoid generic statements.
320
+ 5. Never reference cluster numbers. Use descriptive group names.
321
+ ```
322
+
323
+ ### Why One LLM Call, Not Two
324
+
325
+ The original design used two LLM calls (Pass 1: decide config, Pass 2: interpret results). This was replaced because:
326
+ - Pass 1 decisions (weights, filters, highlights) can all be computed deterministically via embedding similarity and keyword matching
327
+ - Removing Pass 1 cuts the prompt size in half and eliminates a point of failure
328
+ - The LLM's only job is now interpretation of already-computed results, which is what it's best at
329
+
330
+ ### LLM Provider
331
+
332
+ - **Primary**: Groq (Llama 3.3 70B, free tier, fastest inference available)
333
+ - **Fallback**: HuggingFace Inference API (Qwen2.5-72B-Instruct)
334
+ - **Local**: Ollama (disabled, crashes the development laptop)
335
+ - **Temperature**: 0.3 (deterministic-leaning but allows slight phrasing variation)
336
+ - **Max tokens**: 1024 for insight generation
337
+
338
+ ### Grounding
339
+
340
+ The LLM cannot hallucinate patterns because:
341
+ 1. It only sees real paper excerpts (from ChromaDB), not its training data
342
+ 2. It only sees real clustering results (from HDBSCAN), not imagined groupings
343
+ 3. It only sees real method metadata (from the CSV), not hallucinated attributes
344
+ 4. The prompt explicitly instructs it to cite papers and reference specific numbers
345
+
346
+ ---
347
+
348
+ ## Layer 7: Frontend Visualization
349
+
350
+ ### Scatter Plot (ScatterPlot.js)
351
+
352
+ UMAP 2D projection rendered with Plotly.js. Each point is a grasp planning method.
353
+ - **Color**: by cluster (discrete colors) or by any column value (continuous colorscale)
354
+ - **Size**: 10px normal, 16px highlighted, 20px hovered
355
+ - **Opacity**: 0.35 for non-highlighted methods when a query is active, 0.9 normal
356
+ - **Labels**: shown only for highlighted methods to avoid clutter
357
+
358
+ ### Method Table (MethodTable.js)
359
+
360
+ Sortable table showing all method metadata. Highlighted methods are sorted to the top. Max height 520px with scroll.
361
+
362
+ ### Insight Card (InsightCard.js)
363
+
364
+ Displays the LLM-generated bullet points with:
365
+ - **Entity highlighting**: Paper names are bold purple. Technical terms are color-coded by category (blue for architectures like PointNet, green for techniques like UMAP, yellow for gripper types). Hover any highlighted term for a plain-English definition.
366
+ - **Paper Evidence panel**: List of source papers with "View PDF" buttons. Clicking opens the actual PDF in a modal viewer with keyword highlights on the text layer.
367
+
368
+ ### Query Explanation (QueryExplanation.js)
369
+
370
+ A numbered step-by-step section explaining what the system did to answer the query:
371
+ 1. How the query was embedded and compared to methods
372
+ 2. How many methods were highlighted and why
373
+ 3. Whether filtering was applied
374
+ 4. How HDBSCAN found N clusters with M methods
375
+ 5. How many paper passages were retrieved from the vector database
376
+
377
+ Every technical term (sentence-transformer, cosine similarity, HDBSCAN, UMAP, vector database) has a hover tooltip with a plain-English explanation. Designed for robotics experts who may not be familiar with ML terminology.
378
+
379
+ ### Analytics Dashboard (AnalyticsDashboard.js)
380
+
381
+ Five visualization cards that appear after a query:
382
+
383
+ 1. **Query-Method Similarity**: Horizontal bar chart showing the cosine similarity between the query and each method's description. Shows which methods are most semantically related to the question.
384
+
385
+ 2. **Cited References in Evidence**: Author-year citations extracted from the retrieved paper passages (e.g., "Smith et al., 2022" found 3x). Shows which foundational works are most relevant, including papers outside the 56-method dataset.
386
+
387
+ 3. **Papers Referenced**: Bar chart showing how many passages were retrieved from each paper. Indicates which papers have the most content relevant to the query.
388
+
389
+ 4. **Key Topics in Evidence**: Tag cloud of domain-specific terms found in retrieved passages (e.g., "point cloud", "gripper", "6-DoF"). Larger tags appear more frequently.
390
+
391
+ 5. **What Kind of Evidence?**: Stacked bar showing the breakdown of retrieved content by type ("How It Works" / "How To Build It" / "How It Performs") and by purpose ("Method Design" / "Results" / "Experiment Setup").
392
+
393
+ All chart headings have ? tooltips explaining what the chart shows and why, written for non-ML audiences.
394
+
395
+ ### PDF Viewer (PdfViewer.js)
396
+
397
+ Full PDF viewer using react-pdf (PDF.js wrapper). Opens as a modal overlay when the user clicks "View PDF" on a citation. Features:
398
+ - Page navigation (previous/next)
399
+ - Auto-opens to the page where the retrieved chunk was found
400
+ - Keyword highlights overlaid on the PDF text layer
401
+ - Yellow bar showing which search terms are being highlighted
402
+
403
+ ---
404
+
405
+ ## Domain-Agnostic Design
406
+
407
+ The entire pipeline is configured via `rag_config.yaml`. To use this system for a different paper collection:
408
+
409
+ 1. Replace the CSV with your dataset
410
+ 2. Place your PDFs in `papers/`
411
+ 3. Update `rag_config.yaml`:
412
+ - `domain_context`: describe your domain
413
+ - `name_column`, `description_column`: map to your CSV columns
414
+ - `domain_topics`: list your domain's keyword vocabulary
415
+ 4. Run `python -m rag.ingest.pipeline --papers-dir ./papers/ --config rag_config.yaml`
416
+
417
+ No code changes needed. The chunking, embedding, retrieval, and visualization pipeline adapts to any collection of academic papers.
418
+
419
+ ---
420
+
421
+ ## Data Flow Summary
422
+
423
+ ```
424
+ User Query
425
+ |
426
+ v
427
+ [Deterministic Query Engine]
428
+ |--- Embed query (sentence-transformer)
429
+ |--- Compute method similarity (cosine)
430
+ |--- Adjust weights (keyword matching)
431
+ |--- Pick color-by, highlights, filter
432
+ |--- Search ChromaDB (intent-routed, multi-layer)
433
+ |--- Extract citations from chunks
434
+ |--- Compute analytics (topics, roles, content types)
435
+ |
436
+ v
437
+ [UMAP + HDBSCAN Pipeline]
438
+ |--- Build weighted feature matrix (TF-IDF + embeddings)
439
+ |--- Compute cosine distance matrix
440
+ |--- UMAP project to 2D
441
+ |--- HDBSCAN cluster with noise reassignment
442
+ |--- Generate descriptive cluster labels
443
+ |
444
+ v
445
+ [Single LLM Call]
446
+ |--- Prompt: query + paper excerpts + method summaries + cluster results
447
+ |--- Output: 3-5 grounded bullet points citing specific papers
448
+ |
449
+ v
450
+ [Frontend Dashboard]
451
+ |--- Scatter plot (UMAP projection)
452
+ |--- Insight card (entity-highlighted bullets + paper evidence + PDF viewer)
453
+ |--- Query explanation (step-by-step pipeline walkthrough)
454
+ |--- Analytics dashboard (similarity, citations, topics, evidence types)
455
+ ```
frontend/package-lock.json CHANGED
@@ -1,20 +1,22 @@
1
  {
2
- "name": "frontend-copilot",
3
  "version": "0.1.0",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
- "name": "frontend-copilot",
9
  "version": "0.1.0",
10
  "dependencies": {
11
  "@testing-library/dom": "^10.4.1",
12
  "@testing-library/jest-dom": "^6.9.1",
13
  "@testing-library/react": "^16.3.2",
14
  "@testing-library/user-event": "^13.5.0",
 
15
  "plotly.js": "^3.4.0",
16
  "react": "^19.2.4",
17
  "react-dom": "^19.2.4",
 
18
  "react-plotly.js": "^2.6.0",
19
  "react-scripts": "5.0.1",
20
  "web-vitals": "^2.1.4"
@@ -2995,6 +2997,271 @@
2995
  "integrity": "sha512-gRa9gwYU3ECmQYv3lslts5hxuIa90veaEcxDYuu3QGOIAEM2mOZkVHp48ANJuu1CURtRdHKUBY5Lm1tHV+sD4g==",
2996
  "license": "ISC"
2997
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2998
  "node_modules/@nicolo-ribaudo/eslint-scope-5-internals": {
2999
  "version": "5.1.1-v1",
3000
  "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/eslint-scope-5-internals/-/eslint-scope-5-internals-5.1.1-v1.tgz",
@@ -6013,6 +6280,15 @@
6013
  "wrap-ansi": "^7.0.0"
6014
  }
6015
  },
 
 
 
 
 
 
 
 
 
6016
  "node_modules/co": {
6017
  "version": "4.6.0",
6018
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@@ -12409,6 +12685,22 @@
12409
  "node": ">=4.0"
12410
  }
12411
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12412
  "node_modules/kdbush": {
12413
  "version": "4.0.2",
12414
  "resolved": "https://registry.npmjs.org/kdbush/-/kdbush-4.0.2.tgz",
@@ -12639,6 +12931,15 @@
12639
  "sourcemap-codec": "^1.4.8"
12640
  }
12641
  },
 
 
 
 
 
 
 
 
 
12642
  "node_modules/make-dir": {
12643
  "version": "3.1.0",
12644
  "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
@@ -12663,6 +12964,15 @@
12663
  "semver": "bin/semver.js"
12664
  }
12665
  },
 
 
 
 
 
 
 
 
 
12666
  "node_modules/makeerror": {
12667
  "version": "1.0.12",
12668
  "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz",
@@ -12917,6 +13227,23 @@
12917
  "url": "https://github.com/sponsors/sindresorhus"
12918
  }
12919
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12920
  "node_modules/merge-stream": {
12921
  "version": "2.0.0",
12922
  "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
@@ -13803,6 +14130,18 @@
13803
  "pbf": "bin/pbf"
13804
  }
13805
  },
 
 
 
 
 
 
 
 
 
 
 
 
13806
  "node_modules/performance-now": {
13807
  "version": "2.1.0",
13808
  "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
@@ -15703,6 +16042,35 @@
15703
  "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
15704
  "license": "MIT"
15705
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15706
  "node_modules/react-plotly.js": {
15707
  "version": "2.6.0",
15708
  "resolved": "https://registry.npmjs.org/react-plotly.js/-/react-plotly.js-2.6.0.tgz",
@@ -18026,6 +18394,12 @@
18026
  "integrity": "sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA==",
18027
  "license": "MIT"
18028
  },
 
 
 
 
 
 
18029
  "node_modules/tinycolor2": {
18030
  "version": "1.6.0",
18031
  "resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz",
@@ -18705,6 +19079,15 @@
18705
  "makeerror": "1.0.12"
18706
  }
18707
  },
 
 
 
 
 
 
 
 
 
18708
  "node_modules/watchpack": {
18709
  "version": "2.5.1",
18710
  "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
 
1
  {
2
+ "name": "grasp-explorer",
3
  "version": "0.1.0",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
+ "name": "grasp-explorer",
9
  "version": "0.1.0",
10
  "dependencies": {
11
  "@testing-library/dom": "^10.4.1",
12
  "@testing-library/jest-dom": "^6.9.1",
13
  "@testing-library/react": "^16.3.2",
14
  "@testing-library/user-event": "^13.5.0",
15
+ "katex": "^0.16.39",
16
  "plotly.js": "^3.4.0",
17
  "react": "^19.2.4",
18
  "react-dom": "^19.2.4",
19
+ "react-pdf": "^10.4.1",
20
  "react-plotly.js": "^2.6.0",
21
  "react-scripts": "5.0.1",
22
  "web-vitals": "^2.1.4"
 
2997
  "integrity": "sha512-gRa9gwYU3ECmQYv3lslts5hxuIa90veaEcxDYuu3QGOIAEM2mOZkVHp48ANJuu1CURtRdHKUBY5Lm1tHV+sD4g==",
2998
  "license": "ISC"
2999
  },
3000
+ "node_modules/@napi-rs/canvas": {
3001
+ "version": "0.1.97",
3002
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
3003
+ "integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
3004
+ "license": "MIT",
3005
+ "optional": true,
3006
+ "workspaces": [
3007
+ "e2e/*"
3008
+ ],
3009
+ "engines": {
3010
+ "node": ">= 10"
3011
+ },
3012
+ "funding": {
3013
+ "type": "github",
3014
+ "url": "https://github.com/sponsors/Brooooooklyn"
3015
+ },
3016
+ "optionalDependencies": {
3017
+ "@napi-rs/canvas-android-arm64": "0.1.97",
3018
+ "@napi-rs/canvas-darwin-arm64": "0.1.97",
3019
+ "@napi-rs/canvas-darwin-x64": "0.1.97",
3020
+ "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.97",
3021
+ "@napi-rs/canvas-linux-arm64-gnu": "0.1.97",
3022
+ "@napi-rs/canvas-linux-arm64-musl": "0.1.97",
3023
+ "@napi-rs/canvas-linux-riscv64-gnu": "0.1.97",
3024
+ "@napi-rs/canvas-linux-x64-gnu": "0.1.97",
3025
+ "@napi-rs/canvas-linux-x64-musl": "0.1.97",
3026
+ "@napi-rs/canvas-win32-arm64-msvc": "0.1.97",
3027
+ "@napi-rs/canvas-win32-x64-msvc": "0.1.97"
3028
+ }
3029
+ },
3030
+ "node_modules/@napi-rs/canvas-android-arm64": {
3031
+ "version": "0.1.97",
3032
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.97.tgz",
3033
+ "integrity": "sha512-V1c/WVw+NzH8vk7ZK/O8/nyBSCQimU8sfMsB/9qeSvdkGKNU7+mxy/bIF0gTgeBFmHpj30S4E9WHMSrxXGQuVQ==",
3034
+ "cpu": [
3035
+ "arm64"
3036
+ ],
3037
+ "license": "MIT",
3038
+ "optional": true,
3039
+ "os": [
3040
+ "android"
3041
+ ],
3042
+ "engines": {
3043
+ "node": ">= 10"
3044
+ },
3045
+ "funding": {
3046
+ "type": "github",
3047
+ "url": "https://github.com/sponsors/Brooooooklyn"
3048
+ }
3049
+ },
3050
+ "node_modules/@napi-rs/canvas-darwin-arm64": {
3051
+ "version": "0.1.97",
3052
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.97.tgz",
3053
+ "integrity": "sha512-ok+SCEF4YejcxuJ9Rm+WWunHHpf2HmiPxfz6z1a/NFQECGXtsY7A4B8XocK1LmT1D7P174MzwPF9Wy3AUAwEPw==",
3054
+ "cpu": [
3055
+ "arm64"
3056
+ ],
3057
+ "license": "MIT",
3058
+ "optional": true,
3059
+ "os": [
3060
+ "darwin"
3061
+ ],
3062
+ "engines": {
3063
+ "node": ">= 10"
3064
+ },
3065
+ "funding": {
3066
+ "type": "github",
3067
+ "url": "https://github.com/sponsors/Brooooooklyn"
3068
+ }
3069
+ },
3070
+ "node_modules/@napi-rs/canvas-darwin-x64": {
3071
+ "version": "0.1.97",
3072
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.97.tgz",
3073
+ "integrity": "sha512-PUP6e6/UGlclUvAQNnuXCcnkpdUou6VYZfQOQxExLp86epOylmiwLkqXIvpFmjoTEDmPmXrI+coL/9EFU1gKPA==",
3074
+ "cpu": [
3075
+ "x64"
3076
+ ],
3077
+ "license": "MIT",
3078
+ "optional": true,
3079
+ "os": [
3080
+ "darwin"
3081
+ ],
3082
+ "engines": {
3083
+ "node": ">= 10"
3084
+ },
3085
+ "funding": {
3086
+ "type": "github",
3087
+ "url": "https://github.com/sponsors/Brooooooklyn"
3088
+ }
3089
+ },
3090
+ "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
3091
+ "version": "0.1.97",
3092
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.97.tgz",
3093
+ "integrity": "sha512-XyXH2L/cic8eTNtbrXCcvqHtMX/nEOxN18+7rMrAM2XtLYC/EB5s0wnO1FsLMWmK+04ZSLN9FBGipo7kpIkcOw==",
3094
+ "cpu": [
3095
+ "arm"
3096
+ ],
3097
+ "license": "MIT",
3098
+ "optional": true,
3099
+ "os": [
3100
+ "linux"
3101
+ ],
3102
+ "engines": {
3103
+ "node": ">= 10"
3104
+ },
3105
+ "funding": {
3106
+ "type": "github",
3107
+ "url": "https://github.com/sponsors/Brooooooklyn"
3108
+ }
3109
+ },
3110
+ "node_modules/@napi-rs/canvas-linux-arm64-gnu": {
3111
+ "version": "0.1.97",
3112
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.97.tgz",
3113
+ "integrity": "sha512-Kuq/M3djq0K8ktgz6nPlK7Ne5d4uWeDxPpyKWOjWDK2RIOhHVtLtyLiJw2fuldw7Vn4mhw05EZXCEr4Q76rs9w==",
3114
+ "cpu": [
3115
+ "arm64"
3116
+ ],
3117
+ "libc": [
3118
+ "glibc"
3119
+ ],
3120
+ "license": "MIT",
3121
+ "optional": true,
3122
+ "os": [
3123
+ "linux"
3124
+ ],
3125
+ "engines": {
3126
+ "node": ">= 10"
3127
+ },
3128
+ "funding": {
3129
+ "type": "github",
3130
+ "url": "https://github.com/sponsors/Brooooooklyn"
3131
+ }
3132
+ },
3133
+ "node_modules/@napi-rs/canvas-linux-arm64-musl": {
3134
+ "version": "0.1.97",
3135
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.97.tgz",
3136
+ "integrity": "sha512-kKmSkQVnWeqg7qdsiXvYxKhAFuHz3tkBjW/zyQv5YKUPhotpaVhpBGv5LqCngzyuRV85SXoe+OFj+Tv0a0QXkQ==",
3137
+ "cpu": [
3138
+ "arm64"
3139
+ ],
3140
+ "libc": [
3141
+ "musl"
3142
+ ],
3143
+ "license": "MIT",
3144
+ "optional": true,
3145
+ "os": [
3146
+ "linux"
3147
+ ],
3148
+ "engines": {
3149
+ "node": ">= 10"
3150
+ },
3151
+ "funding": {
3152
+ "type": "github",
3153
+ "url": "https://github.com/sponsors/Brooooooklyn"
3154
+ }
3155
+ },
3156
+ "node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
3157
+ "version": "0.1.97",
3158
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.97.tgz",
3159
+ "integrity": "sha512-Jc7I3A51jnEOIAXeLsN/M/+Z28LUeakcsXs07FLq9prXc0eYOtVwsDEv913Gr+06IRo34gJJVgT0TXvmz+N2VA==",
3160
+ "cpu": [
3161
+ "riscv64"
3162
+ ],
3163
+ "libc": [
3164
+ "glibc"
3165
+ ],
3166
+ "license": "MIT",
3167
+ "optional": true,
3168
+ "os": [
3169
+ "linux"
3170
+ ],
3171
+ "engines": {
3172
+ "node": ">= 10"
3173
+ },
3174
+ "funding": {
3175
+ "type": "github",
3176
+ "url": "https://github.com/sponsors/Brooooooklyn"
3177
+ }
3178
+ },
3179
+ "node_modules/@napi-rs/canvas-linux-x64-gnu": {
3180
+ "version": "0.1.97",
3181
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.97.tgz",
3182
+ "integrity": "sha512-iDUBe7AilfuBSRbSa8/IGX38Mf+iCSBqoVKLSQ5XaY2JLOaqz1TVyPFEyIck7wT6mRQhQt5sN6ogfjIDfi74tg==",
3183
+ "cpu": [
3184
+ "x64"
3185
+ ],
3186
+ "libc": [
3187
+ "glibc"
3188
+ ],
3189
+ "license": "MIT",
3190
+ "optional": true,
3191
+ "os": [
3192
+ "linux"
3193
+ ],
3194
+ "engines": {
3195
+ "node": ">= 10"
3196
+ },
3197
+ "funding": {
3198
+ "type": "github",
3199
+ "url": "https://github.com/sponsors/Brooooooklyn"
3200
+ }
3201
+ },
3202
+ "node_modules/@napi-rs/canvas-linux-x64-musl": {
3203
+ "version": "0.1.97",
3204
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.97.tgz",
3205
+ "integrity": "sha512-AKLFd/v0Z5fvgqBDqhvqtAdx+fHMJ5t9JcUNKq4FIZ5WH+iegGm8HPdj00NFlCSnm83Fp3Ln8I2f7uq1aIiWaA==",
3206
+ "cpu": [
3207
+ "x64"
3208
+ ],
3209
+ "libc": [
3210
+ "musl"
3211
+ ],
3212
+ "license": "MIT",
3213
+ "optional": true,
3214
+ "os": [
3215
+ "linux"
3216
+ ],
3217
+ "engines": {
3218
+ "node": ">= 10"
3219
+ },
3220
+ "funding": {
3221
+ "type": "github",
3222
+ "url": "https://github.com/sponsors/Brooooooklyn"
3223
+ }
3224
+ },
3225
+ "node_modules/@napi-rs/canvas-win32-arm64-msvc": {
3226
+ "version": "0.1.97",
3227
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.97.tgz",
3228
+ "integrity": "sha512-u883Yr6A6fO7Vpsy9YE4FVCIxzzo5sO+7pIUjjoDLjS3vQaNMkVzx5bdIpEL+ob+gU88WDK4VcxYMZ6nmnoX9A==",
3229
+ "cpu": [
3230
+ "arm64"
3231
+ ],
3232
+ "license": "MIT",
3233
+ "optional": true,
3234
+ "os": [
3235
+ "win32"
3236
+ ],
3237
+ "engines": {
3238
+ "node": ">= 10"
3239
+ },
3240
+ "funding": {
3241
+ "type": "github",
3242
+ "url": "https://github.com/sponsors/Brooooooklyn"
3243
+ }
3244
+ },
3245
+ "node_modules/@napi-rs/canvas-win32-x64-msvc": {
3246
+ "version": "0.1.97",
3247
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.97.tgz",
3248
+ "integrity": "sha512-sWtD2EE3fV0IzN+iiQUqr/Q1SwqWhs2O1FKItFlxtdDkikpEj5g7DKQpY3x55H/MAOnL8iomnlk3mcEeGiUMoQ==",
3249
+ "cpu": [
3250
+ "x64"
3251
+ ],
3252
+ "license": "MIT",
3253
+ "optional": true,
3254
+ "os": [
3255
+ "win32"
3256
+ ],
3257
+ "engines": {
3258
+ "node": ">= 10"
3259
+ },
3260
+ "funding": {
3261
+ "type": "github",
3262
+ "url": "https://github.com/sponsors/Brooooooklyn"
3263
+ }
3264
+ },
3265
  "node_modules/@nicolo-ribaudo/eslint-scope-5-internals": {
3266
  "version": "5.1.1-v1",
3267
  "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/eslint-scope-5-internals/-/eslint-scope-5-internals-5.1.1-v1.tgz",
 
6280
  "wrap-ansi": "^7.0.0"
6281
  }
6282
  },
6283
+ "node_modules/clsx": {
6284
+ "version": "2.1.1",
6285
+ "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz",
6286
+ "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==",
6287
+ "license": "MIT",
6288
+ "engines": {
6289
+ "node": ">=6"
6290
+ }
6291
+ },
6292
  "node_modules/co": {
6293
  "version": "4.6.0",
6294
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
 
12685
  "node": ">=4.0"
12686
  }
12687
  },
12688
+ "node_modules/katex": {
12689
+ "version": "0.16.39",
12690
+ "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.39.tgz",
12691
+ "integrity": "sha512-FR2f6y85+81ZLO0GPhyQ+EJl/E5ILNWltJhpAeOTzRny952Z13x2867lTFDmvMZix//Ux3CuMQ2VkLXRbUwOFg==",
12692
+ "funding": [
12693
+ "https://opencollective.com/katex",
12694
+ "https://github.com/sponsors/katex"
12695
+ ],
12696
+ "license": "MIT",
12697
+ "dependencies": {
12698
+ "commander": "^8.3.0"
12699
+ },
12700
+ "bin": {
12701
+ "katex": "cli.js"
12702
+ }
12703
+ },
12704
  "node_modules/kdbush": {
12705
  "version": "4.0.2",
12706
  "resolved": "https://registry.npmjs.org/kdbush/-/kdbush-4.0.2.tgz",
 
12931
  "sourcemap-codec": "^1.4.8"
12932
  }
12933
  },
12934
+ "node_modules/make-cancellable-promise": {
12935
+ "version": "2.0.0",
12936
+ "resolved": "https://registry.npmjs.org/make-cancellable-promise/-/make-cancellable-promise-2.0.0.tgz",
12937
+ "integrity": "sha512-3SEQqTpV9oqVsIWqAcmDuaNeo7yBO3tqPtqGRcKkEo0lrzD3wqbKG9mkxO65KoOgXqj+zH2phJ2LiAsdzlogSw==",
12938
+ "license": "MIT",
12939
+ "funding": {
12940
+ "url": "https://github.com/wojtekmaj/make-cancellable-promise?sponsor=1"
12941
+ }
12942
+ },
12943
  "node_modules/make-dir": {
12944
  "version": "3.1.0",
12945
  "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
 
12964
  "semver": "bin/semver.js"
12965
  }
12966
  },
12967
+ "node_modules/make-event-props": {
12968
+ "version": "2.0.0",
12969
+ "resolved": "https://registry.npmjs.org/make-event-props/-/make-event-props-2.0.0.tgz",
12970
+ "integrity": "sha512-G/hncXrl4Qt7mauJEXSg3AcdYzmpkIITTNl5I+rH9sog5Yw0kK6vseJjCaPfOXqOqQuPUP89Rkhfz5kPS8ijtw==",
12971
+ "license": "MIT",
12972
+ "funding": {
12973
+ "url": "https://github.com/wojtekmaj/make-event-props?sponsor=1"
12974
+ }
12975
+ },
12976
  "node_modules/makeerror": {
12977
  "version": "1.0.12",
12978
  "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz",
 
13227
  "url": "https://github.com/sponsors/sindresorhus"
13228
  }
13229
  },
13230
+ "node_modules/merge-refs": {
13231
+ "version": "2.0.0",
13232
+ "resolved": "https://registry.npmjs.org/merge-refs/-/merge-refs-2.0.0.tgz",
13233
+ "integrity": "sha512-3+B21mYK2IqUWnd2EivABLT7ueDhb0b8/dGK8LoFQPrU61YITeCMn14F7y7qZafWNZhUEKb24cJdiT5Wxs3prg==",
13234
+ "license": "MIT",
13235
+ "funding": {
13236
+ "url": "https://github.com/wojtekmaj/merge-refs?sponsor=1"
13237
+ },
13238
+ "peerDependencies": {
13239
+ "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
13240
+ },
13241
+ "peerDependenciesMeta": {
13242
+ "@types/react": {
13243
+ "optional": true
13244
+ }
13245
+ }
13246
+ },
13247
  "node_modules/merge-stream": {
13248
  "version": "2.0.0",
13249
  "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
 
14130
  "pbf": "bin/pbf"
14131
  }
14132
  },
14133
+ "node_modules/pdfjs-dist": {
14134
+ "version": "5.4.296",
14135
+ "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
14136
+ "integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==",
14137
+ "license": "Apache-2.0",
14138
+ "engines": {
14139
+ "node": ">=20.16.0 || >=22.3.0"
14140
+ },
14141
+ "optionalDependencies": {
14142
+ "@napi-rs/canvas": "^0.1.80"
14143
+ }
14144
+ },
14145
  "node_modules/performance-now": {
14146
  "version": "2.1.0",
14147
  "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
 
16042
  "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
16043
  "license": "MIT"
16044
  },
16045
+ "node_modules/react-pdf": {
16046
+ "version": "10.4.1",
16047
+ "resolved": "https://registry.npmjs.org/react-pdf/-/react-pdf-10.4.1.tgz",
16048
+ "integrity": "sha512-kS/35staVCBqS29verTQJQZXw7RfsRCPO3fdJoW1KXylcv7A9dw6DZ3vJXC2w+bIBgLw5FN4pOFvKSQtkQhPfA==",
16049
+ "license": "MIT",
16050
+ "dependencies": {
16051
+ "clsx": "^2.0.0",
16052
+ "dequal": "^2.0.3",
16053
+ "make-cancellable-promise": "^2.0.0",
16054
+ "make-event-props": "^2.0.0",
16055
+ "merge-refs": "^2.0.0",
16056
+ "pdfjs-dist": "5.4.296",
16057
+ "tiny-invariant": "^1.0.0",
16058
+ "warning": "^4.0.0"
16059
+ },
16060
+ "funding": {
16061
+ "url": "https://github.com/wojtekmaj/react-pdf?sponsor=1"
16062
+ },
16063
+ "peerDependencies": {
16064
+ "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0",
16065
+ "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0",
16066
+ "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
16067
+ },
16068
+ "peerDependenciesMeta": {
16069
+ "@types/react": {
16070
+ "optional": true
16071
+ }
16072
+ }
16073
+ },
16074
  "node_modules/react-plotly.js": {
16075
  "version": "2.6.0",
16076
  "resolved": "https://registry.npmjs.org/react-plotly.js/-/react-plotly.js-2.6.0.tgz",
 
18394
  "integrity": "sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA==",
18395
  "license": "MIT"
18396
  },
18397
+ "node_modules/tiny-invariant": {
18398
+ "version": "1.3.3",
18399
+ "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz",
18400
+ "integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==",
18401
+ "license": "MIT"
18402
+ },
18403
  "node_modules/tinycolor2": {
18404
  "version": "1.6.0",
18405
  "resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz",
 
19079
  "makeerror": "1.0.12"
19080
  }
19081
  },
19082
+ "node_modules/warning": {
19083
+ "version": "4.0.3",
19084
+ "resolved": "https://registry.npmjs.org/warning/-/warning-4.0.3.tgz",
19085
+ "integrity": "sha512-rpJyN222KWIvHJ/F53XSZv0Zl/accqHR8et1kpaMTD/fLCRxtV8iX8czMzY7sVZupTI3zcUTg8eycS2kNF9l6w==",
19086
+ "license": "MIT",
19087
+ "dependencies": {
19088
+ "loose-envify": "^1.0.0"
19089
+ }
19090
+ },
19091
  "node_modules/watchpack": {
19092
  "version": "2.5.1",
19093
  "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
frontend/package.json CHANGED
@@ -7,9 +7,11 @@
7
  "@testing-library/jest-dom": "^6.9.1",
8
  "@testing-library/react": "^16.3.2",
9
  "@testing-library/user-event": "^13.5.0",
 
10
  "plotly.js": "^3.4.0",
11
  "react": "^19.2.4",
12
  "react-dom": "^19.2.4",
 
13
  "react-plotly.js": "^2.6.0",
14
  "react-scripts": "5.0.1",
15
  "web-vitals": "^2.1.4"
 
7
  "@testing-library/jest-dom": "^6.9.1",
8
  "@testing-library/react": "^16.3.2",
9
  "@testing-library/user-event": "^13.5.0",
10
+ "katex": "^0.16.39",
11
  "plotly.js": "^3.4.0",
12
  "react": "^19.2.4",
13
  "react-dom": "^19.2.4",
14
+ "react-pdf": "^10.4.1",
15
  "react-plotly.js": "^2.6.0",
16
  "react-scripts": "5.0.1",
17
  "web-vitals": "^2.1.4"
frontend/src/App.css CHANGED
@@ -117,6 +117,7 @@ body {
117
  gap: 0.75rem;
118
  margin-bottom: 0.75rem;
119
  min-height: 480px;
 
120
  }
121
 
122
  .scatter-panel {
@@ -199,6 +200,86 @@ body {
199
  color: #777; font-size: 0.68rem; line-height: 1.3;
200
  }
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  /* Insight bullet points */
203
  .insight-bullets {
204
  margin: 0; padding: 0 0 0 1.2rem;
@@ -218,6 +299,137 @@ body {
218
  font-size: 0.85rem;
219
  }
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  .table-panel {
222
  width: 480px;
223
  flex-shrink: 0;
@@ -227,6 +439,7 @@ body {
227
  display: flex;
228
  flex-direction: column;
229
  overflow: hidden;
 
230
  }
231
 
232
  .table-panel-header {
@@ -390,29 +603,186 @@ body {
390
  @keyframes spin { to { transform: rotate(360deg); } }
391
  .error-screen { text-align: center; padding: 2rem; color: #991b1b; }
392
 
393
- /* Dendrogram */
394
- .dendro-card {
 
 
 
 
 
 
 
 
395
  background: white;
396
  border-radius: 8px;
397
  box-shadow: 0 1px 3px rgba(0,0,0,0.06);
398
- margin-bottom: 0.75rem;
399
  overflow: hidden;
400
  }
401
- .dendro-header {
402
- display: flex; align-items: center; justify-content: space-between;
403
- padding: 0.5rem 0.75rem;
404
- background: #f8fafc;
405
- border-bottom: 1px solid #e5e7eb;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  }
407
- .dendro-title {
408
- font-size: 0.82rem; font-weight: 600; color: #444;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  }
410
- .dendro-subtitle {
411
- font-size: 0.72rem; color: #888;
 
 
 
 
 
 
 
 
 
412
  }
413
- .dendro-loading {
414
- padding: 2rem; text-align: center;
415
- font-size: 0.85rem; color: #667eea; font-style: italic;
 
 
 
 
 
 
416
  }
417
 
418
  /* Footer */
 
117
  gap: 0.75rem;
118
  margin-bottom: 0.75rem;
119
  min-height: 480px;
120
+ align-items: flex-start;
121
  }
122
 
123
  .scatter-panel {
 
200
  color: #777; font-size: 0.68rem; line-height: 1.3;
201
  }
202
 
203
+ /* Tooltip (fixed position, renders via portal to body) */
204
+ .tooltip-wrapper {
205
+ position: relative; display: inline;
206
+ }
207
+ .tooltip-bubble-fixed {
208
+ position: fixed;
209
+ background: #1e293b; color: #f1f5f9;
210
+ font-size: 0.73rem; line-height: 1.5; font-weight: 400;
211
+ padding: 0.6rem 0.8rem; border-radius: 8px;
212
+ white-space: normal;
213
+ box-shadow: 0 8px 24px rgba(0,0,0,0.25);
214
+ z-index: 10000; pointer-events: none;
215
+ max-width: 90vw;
216
+ }
217
+ .tooltip-bubble-fixed.tooltip-wide { width: 320px; }
218
+
219
+ /* Entity color coding in insights */
220
+ .entity-tag { font-weight: 600; border-radius: 2px; cursor: default; }
221
+ .entity-paper { color: #4338ca; }
222
+ .entity-architecture {
223
+ color: #0369a1; background: #e0f2fe; padding: 0 0.15rem; border-radius: 3px;
224
+ }
225
+ .entity-technique {
226
+ color: #065f46; background: #d1fae5; padding: 0 0.15rem; border-radius: 3px;
227
+ }
228
+ .entity-gripper {
229
+ color: #92400e; background: #fef3c7; padding: 0 0.15rem; border-radius: 3px;
230
+ }
231
+
232
+ /* Chart help button */
233
+ .chart-help, .explanation-help {
234
+ display: inline-flex; align-items: center; justify-content: center;
235
+ width: 16px; height: 16px; border-radius: 50%;
236
+ background: #e8ecf1; color: #667eea; font-size: 0.6rem; font-weight: 700;
237
+ margin-left: 0.4rem; cursor: help; vertical-align: middle;
238
+ }
239
+ .chart-help:hover, .explanation-help:hover { background: #667eea; color: white; }
240
+
241
+ /* Query Explanation Section */
242
+ .query-explanation {
243
+ background: white; border-radius: 8px;
244
+ box-shadow: 0 1px 3px rgba(0,0,0,0.06);
245
+ padding: 1.2rem 1.5rem; margin: 0.75rem 0;
246
+ border-left: 4px solid #667eea;
247
+ }
248
+ .explanation-header {
249
+ display: flex; align-items: center; margin-bottom: 0.75rem;
250
+ }
251
+ .explanation-title {
252
+ font-size: 0.95rem; font-weight: 700; color: #222;
253
+ }
254
+ .explanation-steps {
255
+ display: flex; flex-direction: column; gap: 0.4rem;
256
+ }
257
+ .explanation-step {
258
+ display: flex; gap: 0.6rem; align-items: flex-start;
259
+ }
260
+ .step-number {
261
+ width: 22px; height: 22px; border-radius: 50%;
262
+ background: #667eea; color: white;
263
+ font-size: 0.68rem; font-weight: 700;
264
+ display: flex; align-items: center; justify-content: center;
265
+ flex-shrink: 0; margin-top: 0.05rem;
266
+ }
267
+ .step-content {
268
+ flex: 1;
269
+ }
270
+ .step-label {
271
+ font-size: 0.88rem; font-weight: 600; color: #333;
272
+ display: block; margin-bottom: 0.15rem;
273
+ }
274
+ .step-detail {
275
+ font-size: 0.82rem; color: #555; line-height: 1.6;
276
+ }
277
+ .inline-term {
278
+ color: #065f46; background: #d1fae5; padding: 0.05rem 0.25rem;
279
+ border-radius: 3px; font-weight: 600; font-size: 0.82rem;
280
+ cursor: help; border-bottom: 1px dashed #065f46;
281
+ }
282
+
283
  /* Insight bullet points */
284
  .insight-bullets {
285
  margin: 0; padding: 0 0 0 1.2rem;
 
299
  font-size: 0.85rem;
300
  }
301
 
302
+ /* Paper references in insights */
303
+ .paper-reference {
304
+ color: #4338ca; font-weight: 700;
305
+ }
306
+
307
+ /* Paper Evidence Panel */
308
+ .paper-evidence-panel {
309
+ margin-top: 0.6rem;
310
+ border-top: 1px solid #e8ecf1;
311
+ padding-top: 0.5rem;
312
+ }
313
+ .evidence-header {
314
+ display: flex; justify-content: space-between; align-items: center;
315
+ margin-bottom: 0.5rem;
316
+ }
317
+ .evidence-title {
318
+ font-size: 0.82rem; font-weight: 700; color: #333;
319
+ }
320
+ .evidence-count {
321
+ font-size: 0.7rem; color: #888;
322
+ }
323
+ .evidence-paper-list {
324
+ display: flex; flex-direction: column; gap: 0.35rem;
325
+ }
326
+ .evidence-paper-row {
327
+ display: flex; align-items: center; justify-content: space-between;
328
+ padding: 0.5rem 0.7rem; border-radius: 6px;
329
+ border: 1px solid #e8ecf1; background: #fafaff;
330
+ transition: border-color 0.15s;
331
+ }
332
+ .evidence-paper-row:hover { border-color: #667eea; }
333
+ .evidence-paper-info {
334
+ display: flex; align-items: center; gap: 0.6rem; flex: 1; min-width: 0;
335
+ }
336
+ .evidence-paper-name {
337
+ font-size: 0.8rem; font-weight: 600; color: #333;
338
+ white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
339
+ }
340
+ .evidence-paper-score {
341
+ font-size: 0.72rem; font-weight: 600; color: #667eea;
342
+ white-space: nowrap; background: #f0f2ff; padding: 0.1rem 0.4rem;
343
+ border-radius: 8px; flex-shrink: 0;
344
+ }
345
+ .evidence-detail {
346
+ flex: 1; border-radius: 8px;
347
+ padding: 0.8rem 1rem; border: 1px solid #e2e6f0;
348
+ overflow-y: auto; max-height: 320px;
349
+ }
350
+ .evidence-detail.paper-style {
351
+ background: #fffef8;
352
+ border: 1px solid #d4d0c8;
353
+ box-shadow: 2px 2px 8px rgba(0,0,0,0.06);
354
+ }
355
+ .paper-title-bar {
356
+ display: flex; justify-content: space-between; align-items: flex-start;
357
+ margin-bottom: 0.3rem;
358
+ }
359
+ .paper-title-text {
360
+ font-family: 'Georgia', 'Times New Roman', serif;
361
+ font-size: 0.95rem; font-weight: 700; color: #1a1a1a; line-height: 1.3;
362
+ max-width: 75%;
363
+ }
364
+ .paper-relevance-badge {
365
+ font-size: 0.68rem; font-weight: 600; color: #667eea;
366
+ white-space: nowrap; background: #f0f2ff; padding: 0.15rem 0.5rem;
367
+ border-radius: 10px; flex-shrink: 0;
368
+ }
369
+ .paper-section-tag {
370
+ font-size: 0.72rem; color: #666; font-style: italic;
371
+ margin-bottom: 0.3rem;
372
+ }
373
+ .paper-divider {
374
+ height: 1px; background: #c8c4b8; margin: 0.4rem 0 0.6rem 0;
375
+ }
376
+ .evidence-detail-text {
377
+ font-size: 0.82rem; color: #333; line-height: 1.75;
378
+ word-break: break-word;
379
+ }
380
+ .evidence-text-body {
381
+ font-family: 'Georgia', 'Times New Roman', serif;
382
+ letter-spacing: 0.01em;
383
+ }
384
+ .rag-highlight {
385
+ background: linear-gradient(120deg, #fef08a 0%, #fde047 100%);
386
+ padding: 0.08rem 0.2rem; border-radius: 3px;
387
+ font-weight: 600; color: #333;
388
+ box-shadow: 0 1px 2px rgba(0,0,0,0.05);
389
+ }
390
+ .latex-inline { display: inline; vertical-align: baseline; }
391
+ .latex-block {
392
+ display: block; text-align: center;
393
+ margin: 0.5rem 0; padding: 0.3rem;
394
+ background: #f8f9fa; border-radius: 4px;
395
+ }
396
+ .latex-fallback {
397
+ font-family: 'Courier New', monospace; font-size: 0.78rem;
398
+ background: #f0f0f0; padding: 0.1rem 0.3rem; border-radius: 3px;
399
+ }
400
+ .evidence-keywords {
401
+ display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.5rem;
402
+ padding-top: 0.4rem; border-top: 1px solid #e8ecf1;
403
+ align-items: center;
404
+ }
405
+ .evidence-keywords-label {
406
+ font-size: 0.68rem; color: #888; font-weight: 500; margin-right: 0.2rem;
407
+ }
408
+ .evidence-keyword-tag {
409
+ font-size: 0.67rem; background: #fef9c3; color: #854d0e;
410
+ padding: 0.12rem 0.45rem; border-radius: 10px; font-weight: 500;
411
+ }
412
+ .evidence-show-more {
413
+ background: none; border: none; color: #667eea; font-size: 0.72rem;
414
+ cursor: pointer; padding: 0.3rem 0; font-weight: 500; text-align: left;
415
+ }
416
+ .evidence-show-more:hover { text-decoration: underline; }
417
+
418
+ /* Tool Results */
419
+ .tool-results-list { margin-top: 0.3rem; }
420
+ .tool-result-item {
421
+ background: #f7f8fc; border-radius: 6px; padding: 0.4rem 0.6rem;
422
+ margin-bottom: 0.3rem; border-left: 3px solid #48bb78;
423
+ }
424
+ .tool-result-name {
425
+ font-size: 0.75rem; font-weight: 600; color: #333; margin-bottom: 0.2rem;
426
+ }
427
+ .tool-result-data {
428
+ font-size: 0.7rem; color: #555; line-height: 1.4;
429
+ margin: 0; white-space: pre-wrap; max-height: 6rem; overflow-y: auto;
430
+ background: #fff; padding: 0.3rem; border-radius: 4px;
431
+ }
432
+
433
  .table-panel {
434
  width: 480px;
435
  flex-shrink: 0;
 
439
  display: flex;
440
  flex-direction: column;
441
  overflow: hidden;
442
+ max-height: 520px;
443
  }
444
 
445
  .table-panel-header {
 
603
  @keyframes spin { to { transform: rotate(360deg); } }
604
  .error-screen { text-align: center; padding: 2rem; color: #991b1b; }
605
 
606
+ /* Analytics Dashboard */
607
+ .analytics-dashboard {
608
+ margin: 0.75rem 0;
609
+ }
610
+ .analytics-grid {
611
+ display: grid;
612
+ grid-template-columns: 1fr 1fr;
613
+ gap: 0.75rem;
614
+ }
615
+ .analytics-card {
616
  background: white;
617
  border-radius: 8px;
618
  box-shadow: 0 1px 3px rgba(0,0,0,0.06);
619
+ padding: 0.85rem 1rem;
620
  overflow: hidden;
621
  }
622
+ .analytics-card-title {
623
+ font-size: 0.82rem;
624
+ font-weight: 700;
625
+ color: #222;
626
+ margin: 0 0 0.15rem 0;
627
+ }
628
+ .analytics-card-subtitle {
629
+ font-size: 0.7rem;
630
+ color: #888;
631
+ margin: 0 0 0.5rem 0;
632
+ line-height: 1.3;
633
+ }
634
+ .topic-cloud {
635
+ display: flex;
636
+ flex-wrap: wrap;
637
+ gap: 0.4rem;
638
+ padding: 0.3rem 0;
639
+ }
640
+ .topic-tag {
641
+ padding: 0.3rem 0.7rem;
642
+ border-radius: 14px;
643
+ font-weight: 500;
644
+ display: inline-flex;
645
+ align-items: center;
646
+ gap: 0.3rem;
647
+ transition: transform 0.1s;
648
+ }
649
+ .topic-tag:hover { transform: scale(1.05); }
650
+ .topic-count {
651
+ font-size: 0.6rem;
652
+ padding: 0.08rem 0.35rem;
653
+ border-radius: 8px;
654
+ font-weight: 700;
655
+ }
656
+ /* Stacked bar for evidence types */
657
+ .evidence-type-section {
658
+ margin-bottom: 0.7rem;
659
+ }
660
+ .evidence-type-label {
661
+ font-size: 0.72rem; font-weight: 600; color: #555;
662
+ display: block; margin-bottom: 0.3rem;
663
+ }
664
+ .stacked-bar {
665
+ display: flex; height: 28px; border-radius: 6px; overflow: hidden;
666
+ box-shadow: inset 0 1px 2px rgba(0,0,0,0.05);
667
+ }
668
+ .stacked-bar-segment {
669
+ display: flex; align-items: center; justify-content: center;
670
+ min-width: 4px; transition: opacity 0.15s;
671
+ }
672
+ .stacked-bar-segment:hover { opacity: 0.85; }
673
+ .segment-label {
674
+ font-size: 0.65rem; color: white; font-weight: 600;
675
+ text-shadow: 0 1px 2px rgba(0,0,0,0.2);
676
+ white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
677
+ padding: 0 0.3rem;
678
+ }
679
+ .evidence-type-legend {
680
+ display: flex; flex-wrap: wrap; gap: 0.5rem; margin-top: 0.3rem;
681
+ }
682
+ .legend-item {
683
+ font-size: 0.68rem; color: #555;
684
+ display: flex; align-items: center; gap: 0.25rem;
685
+ }
686
+ .legend-dot {
687
+ width: 8px; height: 8px; border-radius: 2px; flex-shrink: 0;
688
+ }
689
+
690
+ .cited-refs-note {
691
+ font-size: 0.68rem; color: #888; margin-top: 0.3rem;
692
+ font-style: italic;
693
+ }
694
+
695
+ /* View PDF button */
696
+ .evidence-actions-row {
697
+ display: flex; justify-content: space-between; align-items: flex-start;
698
+ margin-top: 0.5rem; padding-top: 0.4rem; border-top: 1px solid #e8ecf1;
699
+ flex-wrap: wrap; gap: 0.4rem;
700
+ }
701
+ .view-pdf-btn {
702
+ background: #667eea; color: white; border: none; border-radius: 6px;
703
+ padding: 0.35rem 0.8rem; font-size: 0.75rem; font-weight: 600;
704
+ cursor: pointer; white-space: nowrap; flex-shrink: 0;
705
+ }
706
+ .view-pdf-btn:hover { background: #5a6fd6; }
707
+
708
+ /* PDF Viewer Modal */
709
+ .pdf-viewer-overlay {
710
+ position: fixed; top: 0; left: 0; right: 0; bottom: 0;
711
+ background: rgba(0,0,0,0.6); z-index: 9999;
712
+ display: flex; align-items: center; justify-content: center;
713
+ }
714
+ .pdf-viewer-modal {
715
+ background: white; border-radius: 10px;
716
+ width: 90vw; max-width: 860px; height: 88vh;
717
+ display: flex; flex-direction: column;
718
+ box-shadow: 0 20px 60px rgba(0,0,0,0.3);
719
+ overflow: hidden;
720
+ }
721
+ .pdf-viewer-header {
722
+ display: flex; justify-content: space-between; align-items: center;
723
+ padding: 0.6rem 1rem; background: #1e293b; color: white;
724
+ flex-shrink: 0;
725
+ }
726
+ .pdf-viewer-title {
727
+ font-size: 0.85rem; font-weight: 600; text-transform: capitalize;
728
+ overflow: hidden; text-overflow: ellipsis; white-space: nowrap;
729
+ max-width: 40%;
730
+ }
731
+ .pdf-viewer-controls {
732
+ display: flex; align-items: center; gap: 0.5rem;
733
  }
734
+ .pdf-nav-btn {
735
+ background: rgba(255,255,255,0.15); color: white; border: none;
736
+ border-radius: 4px; padding: 0.3rem 0.6rem; font-size: 0.75rem;
737
+ cursor: pointer;
738
+ }
739
+ .pdf-nav-btn:hover { background: rgba(255,255,255,0.25); }
740
+ .pdf-nav-btn:disabled { opacity: 0.3; cursor: default; }
741
+ .pdf-page-info { font-size: 0.75rem; color: #cbd5e1; }
742
+ .pdf-close-btn {
743
+ background: rgba(255,255,255,0.15); color: white; border: none;
744
+ border-radius: 4px; padding: 0.2rem 0.6rem; font-size: 1.1rem;
745
+ cursor: pointer; margin-left: 0.5rem;
746
+ }
747
+ .pdf-close-btn:hover { background: #e53e3e; }
748
+ .pdf-keywords-bar {
749
+ padding: 0.4rem 1rem; background: #fef9c3;
750
+ font-size: 0.73rem; color: #854d0e;
751
+ display: flex; align-items: center; gap: 0.4rem; flex-wrap: wrap;
752
+ flex-shrink: 0;
753
+ }
754
+ .pdf-kw-tag {
755
+ background: #fde047; padding: 0.1rem 0.4rem; border-radius: 8px;
756
+ font-weight: 600; font-size: 0.68rem;
757
+ }
758
+ .pdf-viewer-content {
759
+ flex: 1; overflow-y: auto; display: flex; justify-content: center;
760
+ padding: 1rem; background: #f1f5f9;
761
+ }
762
+ .pdf-loading, .pdf-error {
763
+ padding: 2rem; text-align: center; font-size: 0.9rem; color: #666;
764
  }
765
+ .pdf-error { color: #e53e3e; }
766
+
767
+ /* Keyword highlights on the PDF text layer */
768
+ .pdf-keyword-highlight {
769
+ background: rgba(254, 224, 71, 0.5) !important;
770
+ border-radius: 2px;
771
+ }
772
+
773
+ /* react-pdf text layer base styles */
774
+ .react-pdf__Page__textContent {
775
+ opacity: 0.4;
776
  }
777
+ .react-pdf__Page__textContent span.pdf-keyword-highlight {
778
+ opacity: 1;
779
+ background: rgba(254, 224, 71, 0.6) !important;
780
+ }
781
+
782
+ @media (max-width: 900px) {
783
+ .analytics-grid {
784
+ grid-template-columns: 1fr;
785
+ }
786
  }
787
 
788
  /* Footer */
frontend/src/App.js CHANGED
@@ -5,7 +5,8 @@ import ClusterOverview from './components/ClusterOverview';
5
  import ScatterPlot from './components/ScatterPlot';
6
  import MethodTable from './components/MethodTable';
7
  import DetailPanel from './components/DetailPanel';
8
- import Dendrogram from './components/Dendrogram';
 
9
  import './App.css';
10
 
11
  function App() {
@@ -169,6 +170,7 @@ function App() {
169
  <InsightCard
170
  suggestion={suggestion}
171
  weights={weights}
 
172
  onClose={() => setSuggestion(null)}
173
  />
174
  )}
@@ -222,7 +224,14 @@ function App() {
222
  />
223
  </div>
224
 
225
- <Dendrogram />
 
 
 
 
 
 
 
226
 
227
  <DetailPanel point={selectedPoint} onClose={() => setSelectedPoint(null)} />
228
 
 
5
  import ScatterPlot from './components/ScatterPlot';
6
  import MethodTable from './components/MethodTable';
7
  import DetailPanel from './components/DetailPanel';
8
+ import AnalyticsDashboard from './components/AnalyticsDashboard';
9
+ import QueryExplanation from './components/QueryExplanation';
10
  import './App.css';
11
 
12
  function App() {
 
170
  <InsightCard
171
  suggestion={suggestion}
172
  weights={weights}
173
+ query={query}
174
  onClose={() => setSuggestion(null)}
175
  />
176
  )}
 
224
  />
225
  </div>
226
 
227
+ <QueryExplanation
228
+ suggestion={suggestion}
229
+ query={query}
230
+ data={data}
231
+ clusterStats={clusterStats}
232
+ />
233
+
234
+ <AnalyticsDashboard suggestion={suggestion} />
235
 
236
  <DetailPanel point={selectedPoint} onClose={() => setSelectedPoint(null)} />
237
 
frontend/src/components/AnalyticsDashboard.js ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import Plot from 'react-plotly.js';
3
+ import Tooltip from './Tooltip';
4
+
5
+ // Human-friendly labels for content types and rhetorical roles
6
+ const CONTENT_TYPE_LABELS = {
7
+ theory: 'How It Works',
8
+ implementation: 'How To Build It',
9
+ evaluation: 'How It Performs',
10
+ general: 'General',
11
+ };
12
+
13
+ const ROLE_LABELS = {
14
+ algorithm_description: 'Method Design',
15
+ experimental_setup: 'Experiment Setup',
16
+ result: 'Results & Metrics',
17
+ comparison: 'Comparisons',
18
+ problem_statement: 'Problem Definition',
19
+ limitation: 'Limitations',
20
+ definition: 'Definitions',
21
+ general: 'General',
22
+ };
23
+
24
+ const CONTENT_COLORS = {
25
+ 'How It Works': '#667eea',
26
+ 'How To Build It': '#48bb78',
27
+ 'How It Performs': '#ed8936',
28
+ 'General': '#a0aec0',
29
+ };
30
+
31
+ const ROLE_COLORS = {
32
+ 'Method Design': '#667eea',
33
+ 'Experiment Setup': '#48bb78',
34
+ 'Results & Metrics': '#ed8936',
35
+ 'Comparisons': '#e53e3e',
36
+ 'Problem Definition': '#9f7aea',
37
+ 'Limitations': '#dd6b20',
38
+ 'Definitions': '#38b2ac',
39
+ 'General': '#a0aec0',
40
+ };
41
+
42
+ function MethodRelevanceChart({ methodRelevance }) {
43
+ if (!methodRelevance || methodRelevance.length === 0) return null;
44
+ const top = methodRelevance.slice(0, 10);
45
+ const names = top.map(m => m.name.length > 35 ? m.name.slice(0, 33) + '...' : m.name);
46
+ const scores = top.map(m => m.score);
47
+ const maxScore = Math.max(...scores);
48
+
49
+ return (
50
+ <div className="analytics-card">
51
+ <h3 className="analytics-card-title">
52
+ Query-Method Similarity
53
+ <Tooltip text="Each method's text description was converted to a vector using a sentence-transformer model, then compared to your query vector using cosine similarity. Higher scores mean the method's description is more semantically related to what you asked." wide>
54
+ <span className="chart-help">?</span>
55
+ </Tooltip>
56
+ </h3>
57
+ <p className="analytics-card-subtitle">How closely each method's description matches your query</p>
58
+ <Plot
59
+ data={[{
60
+ type: 'bar',
61
+ x: scores,
62
+ y: names,
63
+ orientation: 'h',
64
+ marker: {
65
+ color: scores.map(s => {
66
+ const ratio = s / maxScore;
67
+ return ratio > 0.9 ? '#667eea' : ratio > 0.7 ? '#7c8ef2' : '#b4bff8';
68
+ }),
69
+ line: { width: 0 },
70
+ },
71
+ text: scores.map(s => (s * 100).toFixed(0) + '%'),
72
+ textposition: 'outside',
73
+ textfont: { size: 11, color: '#555' },
74
+ hovertemplate: '%{y}: %{x:.1%}<extra></extra>',
75
+ }]}
76
+ layout={{
77
+ margin: { l: 210, r: 50, t: 5, b: 25 },
78
+ height: Math.max(180, top.length * 26),
79
+ xaxis: {
80
+ title: { text: 'Cosine Similarity', font: { size: 10, color: '#888' } },
81
+ range: [0, maxScore * 1.2],
82
+ showgrid: true, gridcolor: '#f0f0f0',
83
+ },
84
+ yaxis: { autorange: 'reversed', tickfont: { size: 11 } },
85
+ paper_bgcolor: 'transparent',
86
+ plot_bgcolor: 'transparent',
87
+ }}
88
+ config={{ displayModeBar: false, responsive: true }}
89
+ style={{ width: '100%' }}
90
+ />
91
+ </div>
92
+ );
93
+ }
94
+
95
+ function PaperSourcesChart({ paperSources }) {
96
+ if (!paperSources || paperSources.length === 0) return null;
97
+ const names = paperSources.map(p => p.name.length > 35 ? p.name.slice(0, 33) + '...' : p.name);
98
+ const counts = paperSources.map(p => p.count);
99
+
100
+ return (
101
+ <div className="analytics-card">
102
+ <h3 className="analytics-card-title">
103
+ Papers Referenced
104
+ <Tooltip text="When you ask a question, the system searches a vector database of 1,074 text chunks extracted from 34 research papers. This chart shows which papers had the most passages matching your query. More passages means the paper is more relevant to your question." wide>
105
+ <span className="chart-help">?</span>
106
+ </Tooltip>
107
+ </h3>
108
+ <p className="analytics-card-subtitle">Number of relevant passages retrieved from each paper</p>
109
+ <Plot
110
+ data={[{
111
+ type: 'bar',
112
+ x: counts,
113
+ y: names,
114
+ orientation: 'h',
115
+ marker: { color: '#48bb78', line: { width: 0 } },
116
+ text: counts.map(String),
117
+ textposition: 'outside',
118
+ textfont: { size: 11, color: '#555' },
119
+ hovertemplate: '%{y}: %{x} passages<extra></extra>',
120
+ }]}
121
+ layout={{
122
+ margin: { l: 220, r: 40, t: 5, b: 25 },
123
+ height: Math.max(140, paperSources.length * 32),
124
+ xaxis: {
125
+ title: { text: 'Passages Found', font: { size: 10, color: '#888' } },
126
+ dtick: 1, showgrid: true, gridcolor: '#f0f0f0',
127
+ },
128
+ yaxis: { autorange: 'reversed', tickfont: { size: 11 } },
129
+ paper_bgcolor: 'transparent',
130
+ plot_bgcolor: 'transparent',
131
+ }}
132
+ config={{ displayModeBar: false, responsive: true }}
133
+ style={{ width: '100%' }}
134
+ />
135
+ </div>
136
+ );
137
+ }
138
+
139
+ function DomainTopicsChart({ domainTopics }) {
140
+ if (!domainTopics || domainTopics.length === 0) return null;
141
+ const top = domainTopics.slice(0, 12);
142
+
143
+ return (
144
+ <div className="analytics-card">
145
+ <h3 className="analytics-card-title">
146
+ Key Topics in Evidence
147
+ <Tooltip text="Each retrieved paper passage was scanned for domain-specific technical terms (like 'point cloud', 'gripper', '6-DoF'). Larger, darker tags appear more frequently across the evidence, showing what concepts dominate the retrieved content." wide>
148
+ <span className="chart-help">?</span>
149
+ </Tooltip>
150
+ </h3>
151
+ <p className="analytics-card-subtitle">Technical terms found across retrieved paper passages</p>
152
+ <div className="topic-cloud">
153
+ {top.map((t, i) => {
154
+ const ratio = t.count / top[0].count;
155
+ return (
156
+ <span
157
+ key={i}
158
+ className="topic-tag"
159
+ style={{
160
+ fontSize: `${0.72 + ratio * 0.4}rem`,
161
+ background: ratio > 0.6 ? '#667eea' : ratio > 0.3 ? '#e8ecf1' : '#f7f8fc',
162
+ color: ratio > 0.6 ? 'white' : '#4a5568',
163
+ }}
164
+ >
165
+ {t.topic}
166
+ <span className="topic-count" style={{
167
+ background: ratio > 0.6 ? 'rgba(255,255,255,0.3)' : '#667eea',
168
+ color: 'white',
169
+ }}>{t.count}</span>
170
+ </span>
171
+ );
172
+ })}
173
+ </div>
174
+ </div>
175
+ );
176
+ }
177
+
178
+ function EvidenceTypeChart({ contentTypes, rhetoricalRoles }) {
179
+ if ((!contentTypes || contentTypes.length === 0) &&
180
+ (!rhetoricalRoles || rhetoricalRoles.length === 0)) return null;
181
+
182
+ // Build stacked bar for "What kind of evidence did we find?"
183
+ const typeData = (contentTypes || []).map(c => ({
184
+ label: CONTENT_TYPE_LABELS[c.type] || c.type,
185
+ count: c.count,
186
+ color: CONTENT_COLORS[CONTENT_TYPE_LABELS[c.type]] || '#cbd5e0',
187
+ }));
188
+
189
+ const roleData = (rhetoricalRoles || []).map(r => ({
190
+ label: ROLE_LABELS[r.role] || r.role,
191
+ count: r.count,
192
+ color: ROLE_COLORS[ROLE_LABELS[r.role]] || '#cbd5e0',
193
+ }));
194
+
195
+ const totalChunks = typeData.reduce((sum, d) => sum + d.count, 0) || 1;
196
+
197
+ return (
198
+ <div className="analytics-card">
199
+ <h3 className="analytics-card-title">
200
+ What Kind of Evidence?
201
+ <Tooltip text="Each paper passage is automatically classified by what it describes. 'How It Works' covers algorithms and math. 'How To Build It' covers training details and implementation. 'How It Performs' covers experimental results and benchmarks. This shows what type of content the system found for your query." wide>
202
+ <span className="chart-help">?</span>
203
+ </Tooltip>
204
+ </h3>
205
+ <p className="analytics-card-subtitle">Breakdown of retrieved content by type and purpose</p>
206
+
207
+ {typeData.length > 0 && (
208
+ <div className="evidence-type-section">
209
+ <span className="evidence-type-label">Content Focus</span>
210
+ <div className="stacked-bar">
211
+ {typeData.map((d, i) => (
212
+ <div
213
+ key={i}
214
+ className="stacked-bar-segment"
215
+ style={{
216
+ width: `${(d.count / totalChunks) * 100}%`,
217
+ background: d.color,
218
+ }}
219
+ title={`${d.label}: ${d.count} passages`}
220
+ >
221
+ {d.count / totalChunks > 0.15 && (
222
+ <span className="segment-label">{d.label}</span>
223
+ )}
224
+ </div>
225
+ ))}
226
+ </div>
227
+ <div className="evidence-type-legend">
228
+ {typeData.map((d, i) => (
229
+ <span key={i} className="legend-item">
230
+ <span className="legend-dot" style={{ background: d.color }}></span>
231
+ {d.label} ({d.count})
232
+ </span>
233
+ ))}
234
+ </div>
235
+ </div>
236
+ )}
237
+
238
+ {roleData.length > 0 && (
239
+ <div className="evidence-type-section">
240
+ <span className="evidence-type-label">Paper Section Purpose</span>
241
+ <div className="stacked-bar">
242
+ {roleData.map((d, i) => (
243
+ <div
244
+ key={i}
245
+ className="stacked-bar-segment"
246
+ style={{
247
+ width: `${(d.count / totalChunks) * 100}%`,
248
+ background: d.color,
249
+ }}
250
+ title={`${d.label}: ${d.count} passages`}
251
+ >
252
+ {d.count / totalChunks > 0.15 && (
253
+ <span className="segment-label">{d.label}</span>
254
+ )}
255
+ </div>
256
+ ))}
257
+ </div>
258
+ <div className="evidence-type-legend">
259
+ {roleData.map((d, i) => (
260
+ <span key={i} className="legend-item">
261
+ <span className="legend-dot" style={{ background: d.color }}></span>
262
+ {d.label} ({d.count})
263
+ </span>
264
+ ))}
265
+ </div>
266
+ </div>
267
+ )}
268
+ </div>
269
+ );
270
+ }
271
+
272
+ function CitedReferencesChart({ citedReferences }) {
273
+ if (!citedReferences || citedReferences.length === 0) return null;
274
+
275
+ // Filter to only author-year citations (skip numbered [1], [2] which are noisy)
276
+ const authorCites = citedReferences.filter(r => !r.name.startsWith('['));
277
+ const numbered = citedReferences.filter(r => r.name.startsWith('['));
278
+
279
+ const toShow = authorCites.length > 0 ? authorCites.slice(0, 10) : numbered.slice(0, 10);
280
+ if (toShow.length === 0) return null;
281
+
282
+ const names = toShow.map(r => r.name);
283
+ const counts = toShow.map(r => r.count);
284
+
285
+ return (
286
+ <div className="analytics-card">
287
+ <h3 className="analytics-card-title">
288
+ Cited References in Evidence
289
+ <Tooltip text="These are academic papers that were cited WITHIN the retrieved passages. For example, if a retrieved chunk says 'as shown by (Smith et al., 2022)', that reference is counted here. This reveals which foundational works are most relevant to your query, even papers outside our 56-method dataset." wide>
290
+ <span className="chart-help">?</span>
291
+ </Tooltip>
292
+ </h3>
293
+ <p className="analytics-card-subtitle">Papers referenced inside the retrieved evidence passages</p>
294
+ <Plot
295
+ data={[{
296
+ type: 'bar',
297
+ x: counts,
298
+ y: names,
299
+ orientation: 'h',
300
+ marker: { color: '#9f7aea', line: { width: 0 } },
301
+ text: counts.map(c => `${c}x`),
302
+ textposition: 'outside',
303
+ textfont: { size: 11, color: '#555' },
304
+ hovertemplate: '%{y}: cited %{x} times<extra></extra>',
305
+ }]}
306
+ layout={{
307
+ margin: { l: 180, r: 40, t: 5, b: 25 },
308
+ height: Math.max(140, toShow.length * 28),
309
+ xaxis: {
310
+ title: { text: 'Times Cited', font: { size: 10, color: '#888' } },
311
+ dtick: 1, showgrid: true, gridcolor: '#f0f0f0',
312
+ },
313
+ yaxis: { autorange: 'reversed', tickfont: { size: 11 } },
314
+ paper_bgcolor: 'transparent',
315
+ plot_bgcolor: 'transparent',
316
+ }}
317
+ config={{ displayModeBar: false, responsive: true }}
318
+ style={{ width: '100%' }}
319
+ />
320
+ {toShow.length > 0 && toShow[0].source_papers && (
321
+ <div className="cited-refs-note">
322
+ Found across: {[...new Set(toShow.flatMap(r => r.source_papers))].slice(0, 3).join(', ')}
323
+ {[...new Set(toShow.flatMap(r => r.source_papers))].length > 3 && ' and more'}
324
+ </div>
325
+ )}
326
+ </div>
327
+ );
328
+ }
329
+
330
+ export default function AnalyticsDashboard({ suggestion }) {
331
+ if (!suggestion) return null;
332
+
333
+ const analytics = suggestion.ragAnalytics || {};
334
+ const methodRelevance = suggestion.methodRelevance || [];
335
+ const hasData = methodRelevance.length > 0 ||
336
+ (analytics.paperSources && analytics.paperSources.length > 0);
337
+
338
+ if (!hasData) return null;
339
+
340
+ return (
341
+ <div className="analytics-dashboard">
342
+ <div className="analytics-grid">
343
+ <MethodRelevanceChart methodRelevance={methodRelevance} />
344
+ <CitedReferencesChart citedReferences={analytics.citedReferences} />
345
+ <PaperSourcesChart paperSources={analytics.paperSources} />
346
+ <DomainTopicsChart domainTopics={analytics.domainTopics} />
347
+ <EvidenceTypeChart
348
+ contentTypes={analytics.contentTypes}
349
+ rhetoricalRoles={analytics.rhetoricalRoles}
350
+ />
351
+ </div>
352
+ </div>
353
+ );
354
+ }
frontend/src/components/InsightBullets.js CHANGED
@@ -1,4 +1,101 @@
1
  import React from 'react';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  export default function InsightBullets({ text }) {
4
  if (!text) return null;
@@ -7,10 +104,10 @@ export default function InsightBullets({ text }) {
7
  return (
8
  <ul className="insight-bullets">
9
  {bullets.map((line, i) => (
10
- <li key={i}>{line.replace(/^-\s*/, '')}</li>
11
  ))}
12
  </ul>
13
  );
14
  }
15
- return <p>{text}</p>;
16
  }
 
1
  import React from 'react';
2
+ import Tooltip from './Tooltip';
3
+
4
// ---------------------------------------------------------------------------
// Entity glossaries: technical terms to detect in insight text, each mapped
// to a short tooltip definition. Grouped by category so matched terms can be
// styled differently (entity-architecture / entity-technique / entity-gripper).
// ---------------------------------------------------------------------------

// Neural-network architecture names.
const ARCHITECTURE_TERMS = {
  'PointNet++': 'Hierarchical extension of PointNet that captures local geometric structures at multiple scales',
  'PointNet': 'Neural network that directly processes 3D point clouds for classification and segmentation',
  'ResNet': 'Deep residual network using skip connections for image recognition',
  'VGG': 'Deep convolutional network known for small 3x3 filters',
  'CNN': 'Convolutional Neural Network, processes grid-like data using learned spatial filters',
  'transformer': 'Attention-based architecture that processes sequences in parallel',
  'VAE': 'Variational Autoencoder, generates diverse outputs via a probabilistic latent space',
  'GAN': 'Generative Adversarial Network, generates outputs through adversarial training',
  'U-Net': 'Encoder-decoder with skip connections for segmentation',
  'MLP': 'Multi-Layer Perceptron, a basic feedforward neural network',
  'diffusion model': 'Generative model that reverses a noise process to create diverse samples',
};

// Algorithms, representations, and evaluation techniques.
const TECHNIQUE_TERMS = {
  'UMAP': 'Reduces high-dimensional data to 2D for visualization while preserving structure',
  'HDBSCAN': 'Density-based clustering that finds natural groups without specifying count',
  'TF-IDF': 'Text representation weighting terms by importance across documents',
  'cosine similarity': 'Measures angle between vectors to compare text or feature embeddings',
  'sentence-transformer': 'Neural model converting sentences into vectors capturing meaning',
  '6-DoF': 'Six Degrees of Freedom: 3D position (x,y,z) + orientation (roll,pitch,yaw)',
  '7-DoF': 'Seven Degrees of Freedom: 6-DoF plus gripper width or approach angle',
  'sim-to-real': 'Transferring simulation-trained models to real robots',
  'point cloud': 'Set of 3D points representing object surfaces from depth sensors',
  'TSDF': 'Truncated Signed Distance Function, a volumetric 3D scene representation',
  'RGBD': 'Color image (RGB) + depth channel (D), giving appearance and geometry',
  'antipodal grasp': 'Grasp with two fingers pressing opposite sides along the same force line',
  'grasp quality': 'Metric evaluating how stable and reliable a planned grasp is',
  'ablation': 'Experiment removing components one-by-one to measure contribution',
  'domain randomization': 'Randomizing simulation properties so models generalize to real world',
  'cross-entropy': 'Loss function measuring difference between predicted and true probability distributions',
  'binary cross-entropy': 'Cross-entropy loss for two-class classification problems',
};

// Gripper / end-effector hardware terms.
const GRIPPER_TERMS = {
  'parallel-jaw': 'Simple two-finger gripper opening and closing on a single axis',
  'two-finger': 'Gripper with two opposing fingers, most common in industrial robotics',
  'multi-finger': 'Gripper with 3+ articulated fingers for complex manipulation',
  'dexterous': 'Robot hand with many joints for fine manipulation like a human hand',
  'suction': 'Gripper that picks objects by vacuum seal on flat surfaces',
};

// Build lookup: term -> {type, tooltip}. Keys are lowercased because the
// matcher compares the matched text case-insensitively.
const ENTITY_LOOKUP = {};
Object.entries(ARCHITECTURE_TERMS).forEach(([t, d]) => { ENTITY_LOOKUP[t.toLowerCase()] = { type: 'architecture', tooltip: d }; });
Object.entries(TECHNIQUE_TERMS).forEach(([t, d]) => { ENTITY_LOOKUP[t.toLowerCase()] = { type: 'technique', tooltip: d }; });
Object.entries(GRIPPER_TERMS).forEach(([t, d]) => { ENTITY_LOOKUP[t.toLowerCase()] = { type: 'gripper', tooltip: d }; });

// All entity terms sorted by length (longest first for matching), so e.g.
// 'binary cross-entropy' wins over 'cross-entropy' and 'PointNet++' over
// 'PointNet' in the alternation below.
const ALL_TERMS = [
  ...Object.keys(ARCHITECTURE_TERMS),
  ...Object.keys(TECHNIQUE_TERMS),
  ...Object.keys(GRIPPER_TERMS),
].sort((a, b) => b.length - a.length);

// Case-insensitive alternation over all terms, with regex metacharacters
// escaped. Used with String.split(), so the capture group keeps the matched
// term in the split output.
// NOTE(review): there are no \b word boundaries, so short terms can match
// inside longer words (e.g. 'GAN' inside 'organ') — confirm acceptable.
const TERM_REGEX = new RegExp(
  `(${ALL_TERMS.map(t => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|')})`,
  'gi'
);
63
+
64
+ function renderWithEntities(text) {
65
+ const parts = text.split(TERM_REGEX);
66
+ return parts.map((part, i) => {
67
+ const lookup = ENTITY_LOOKUP[part.toLowerCase()];
68
+ if (lookup) {
69
+ const className = `entity-tag entity-${lookup.type}`;
70
+ if (lookup.tooltip) {
71
+ return (
72
+ <Tooltip key={i} text={lookup.tooltip}>
73
+ <span className={className}>{part}</span>
74
+ </Tooltip>
75
+ );
76
+ }
77
+ return <span key={i} className={className}>{part}</span>;
78
+ }
79
+ return <span key={i}>{part}</span>;
80
+ });
81
+ }
82
+
83
+ function formatBullet(text) {
84
+ // Step 1: Extract and bold quoted paper names (remove the quotes)
85
+ // Match "Paper Name" patterns
86
+ const quoteRegex = /("[^"]{3,}")/g;
87
+ const segments = text.split(quoteRegex);
88
+
89
+ return segments.map((seg, i) => {
90
+ if (seg.startsWith('"') && seg.endsWith('"')) {
91
+ // This is a quoted paper name - bold it, strip quotes
92
+ const paperName = seg.slice(1, -1);
93
+ return <strong key={i} className="entity-tag entity-paper">{paperName}</strong>;
94
+ }
95
+ // For non-quoted text, scan for technical entities
96
+ return <span key={i}>{renderWithEntities(seg)}</span>;
97
+ });
98
+ }
99
 
100
  export default function InsightBullets({ text }) {
101
  if (!text) return null;
 
104
  return (
105
  <ul className="insight-bullets">
106
  {bullets.map((line, i) => (
107
+ <li key={i}>{formatBullet(line.replace(/^-\s*/, ''))}</li>
108
  ))}
109
  </ul>
110
  );
111
  }
112
+ return <p>{formatBullet(text)}</p>;
113
  }
frontend/src/components/InsightCard.js CHANGED
@@ -1,8 +1,196 @@
1
- import React from 'react';
2
  import InsightBullets from './InsightBullets';
 
3
  import { SHORT_NAMES } from '../constants';
 
 
4
 
5
- export default function InsightCard({ suggestion, weights, onClose }) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  const weightDiffs = Object.entries(suggestion.weights)
7
  .filter(([col, val]) => val !== (weights[col] ?? 0))
8
  .map(([col, val]) => ({
@@ -22,6 +210,9 @@ export default function InsightCard({ suggestion, weights, onClose }) {
22
  <div className="insight-body">
23
  <InsightBullets text={suggestion.insight} />
24
  </div>
 
 
 
25
  <div className="insight-actions-summary">
26
  {suggestion.filterMethods && (
27
  <span className="action-chip filter-chip">
@@ -39,15 +230,6 @@ export default function InsightCard({ suggestion, weights, onClose }) {
39
  </span>
40
  )}
41
  </div>
42
- {weightDiffs.length > 0 && (
43
- <div className="insight-weight-details">
44
- {weightDiffs.map(({ col, short, from, to }) => (
45
- <span key={col} className="weight-chip">
46
- {short}: {from} &rarr; <strong>{to}</strong>
47
- </span>
48
- ))}
49
- </div>
50
- )}
51
  {(suggestion.highlightMethods || []).length > 0 && (
52
  <div className="insight-matches">
53
  <span className="matches-label">Best matches:</span>
 
1
+ import React, { useState, useMemo, useEffect, useRef } from 'react';
2
  import InsightBullets from './InsightBullets';
3
+ import PdfViewer from './PdfViewer';
4
  import { SHORT_NAMES } from '../constants';
5
+ import katex from 'katex';
6
+ import 'katex/dist/katex.min.css';
7
 
8
// Common English words (plus a few domain-generic ones like 'method',
// 'approach') excluded when extracting highlight keywords from a user query.
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'can', 'shall', 'to', 'of', 'in', 'for',
  'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during',
  'before', 'after', 'above', 'below', 'between', 'and', 'but', 'or',
  'not', 'no', 'nor', 'so', 'yet', 'both', 'each', 'all', 'any', 'few',
  'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'just',
  'about', 'up', 'out', 'if', 'then', 'also', 'how', 'what', 'which',
  'who', 'when', 'where', 'why', 'this', 'that', 'these', 'those',
  'it', 'its', 'they', 'them', 'their', 'we', 'us', 'our', 'i', 'me',
  'my', 'you', 'your', 'he', 'she', 'him', 'her', 'his', 'used', 'using',
  'across', 'methods', 'method', 'approach', 'based', 'use',
]);
22
+
23
// Extract highlight-worthy keywords from a user query. Returns bigram
// phrases first (both words non-stop-words), then single words (length > 2,
// non-stop-word). The original ran the lowercase/strip/split pipeline twice;
// here it is tokenized once and both lists derive from the same tokens.
function extractKeywords(query) {
  if (!query) return [];

  const tokens = query.toLowerCase()
    .replace(/[?!.,;:'"()]/g, '')
    .split(/\s+/);

  const singles = tokens.filter(t => t.length > 2 && !STOP_WORDS.has(t));

  const bigrams = [];
  for (let i = 0; i + 1 < tokens.length; i++) {
    if (!STOP_WORDS.has(tokens[i]) && !STOP_WORDS.has(tokens[i + 1])) {
      bigrams.push(`${tokens[i]} ${tokens[i + 1]}`);
    }
  }

  return [...bigrams, ...singles];
}
39
+
40
// Repair text extracted from PDFs, where word boundaries are frequently lost
// ("graspNetwork", "3DPoint", "results.The"). Applies a fixed, order-sensitive
// chain of heuristic regex replacements; the order matters because later
// word-splitting rules assume casing/punctuation spacing is already fixed.
// NOTE(review): the common-word rules are heuristics and can over-split
// legitimate words (e.g. 'the' inside 'mother' -> 'mo ther') — confirm this
// trade-off is acceptable for display-only evidence text.
function cleanPdfText(text) {
  if (!text) return '';
  let cleaned = text
    // Add space before uppercase letter following lowercase (e.g., "graspNetwork" -> "grasp Network")
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    // Add space before uppercase following a digit (e.g., "3DPoint" -> "3D Point")
    .replace(/(\d)([A-Z][a-z])/g, '$1 $2')
    // Add space before opening parens/brackets that follow word chars
    .replace(/([a-zA-Z0-9])\(/g, '$1 (')
    .replace(/([a-zA-Z0-9])\[/g, '$1 [')
    // Add space after closing parens/brackets before word chars
    .replace(/\)([a-zA-Z])/g, ') $1')
    .replace(/\]([a-zA-Z])/g, '] $1')
    // Add space after period followed by uppercase (sentence boundary)
    .replace(/\.([A-Z])/g, '. $1')
    // Add space after comma followed by letter
    .replace(/,([a-zA-Z])/g, ', $1')
    // Fix concatenated common English words (lowercase to lowercase)
    .replace(/([a-z])(the|and|for|with|from|that|this|which|our|we|are|is|in|of|to|on|at|by|as|an|or|it|be|do|no|so|if|up|can|has|had|was|not|but|its|may|all|any|use|how|one|two|new|set|see|per|via|get|let|put|run|own|out|off|top|low|few|key|big|old|raw|due|end|aim|way|pre|sub|non)(?=[a-z])/gi, '$1 $2')
    // Fix lowercase-to-lowercase concatenation with common word patterns
    .replace(/([a-z]{3,})(using|based|given|shown|each|over|than|into|also|then|only|such|much|well|very|most|some|both|like|many|more|other|after|about|under|along|above|below|since|while|until|where|there|these|those|their|being|could|would|should|which|every|first|second|third)/gi, '$1 $2')
    // Fix "wordword" where second word starts with common prefixes
    .replace(/([a-z])(approach|method|network|model|object|grasp|robot|point|cloud|image|depth|scene|train|learn|predict|generate|sample|evaluate|compute|estimate|detect|process)/gi, (match, p1, p2) => {
      // Only add space if the first part is 3+ chars
      if (p1.length >= 3) return p1 + ' ' + p2;
      return match;
    })
    // Normalize whitespace
    .replace(/\s+/g, ' ')
    // Fix double spaces around punctuation
    .replace(/\s+([.,;:!?])/g, '$1')
    .trim();
  return cleaned;
}
74
+
75
+ function renderLatex(text) {
76
+ // Find LaTeX patterns: $...$ or \(...\) or common equation patterns
77
+ const latexPattern = /(\$[^$]+\$|\\[\(\[][^\\]+\\[\)\]])/g;
78
+ const parts = text.split(latexPattern);
79
+
80
+ return parts.map((part, i) => {
81
+ if (part.match(/^\$[^$]+\$$/)) {
82
+ const latex = part.slice(1, -1);
83
+ try {
84
+ const html = katex.renderToString(latex, { throwOnError: false, displayMode: false });
85
+ return <span key={i} className="latex-inline" dangerouslySetInnerHTML={{ __html: html }} />;
86
+ } catch { return <span key={i} className="latex-fallback">{part}</span>; }
87
+ }
88
+ if (part.match(/^\\[\(\[][^\\]+\\[\)\]]$/)) {
89
+ const latex = part.slice(2, -2);
90
+ try {
91
+ const html = katex.renderToString(latex, { throwOnError: false, displayMode: true });
92
+ return <span key={i} className="latex-block" dangerouslySetInnerHTML={{ __html: html }} />;
93
+ } catch { return <span key={i} className="latex-fallback">{part}</span>; }
94
+ }
95
+ return part;
96
+ });
97
+ }
98
+
99
+ function HighlightedText({ text, keywords }) {
100
+ const cleanedText = cleanPdfText(text);
101
+
102
+ // First pass: split by keywords for highlighting
103
+ if (!keywords || keywords.length === 0) {
104
+ return <span className="evidence-text-body">{renderLatex(cleanedText)}</span>;
105
+ }
106
+ const sorted = [...keywords].sort((a, b) => b.length - a.length);
107
+ const escaped = sorted.map(k => k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
108
+ const regex = new RegExp(`(${escaped.join('|')})`, 'gi');
109
+ const parts = cleanedText.split(regex);
110
+ return (
111
+ <span className="evidence-text-body">
112
+ {parts.map((part, i) => {
113
+ const isMatch = keywords.some(k => part.toLowerCase() === k.toLowerCase());
114
+ if (isMatch) {
115
+ return <mark key={i} className="rag-highlight">{part}</mark>;
116
+ }
117
+ return <span key={i}>{renderLatex(part)}</span>;
118
+ })}
119
+ </span>
120
+ );
121
+ }
122
+
123
+ function formatPaperId(id) {
124
+ if (!id) return '';
125
+ return id.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase());
126
+ }
127
+
128
+ function PaperEvidencePanel({ citations, query }) {
129
+ const [showAll, setShowAll] = useState(false);
130
+ const [pdfOpen, setPdfOpen] = useState(null);
131
+ const keywords = useMemo(() => extractKeywords(query), [query]);
132
+
133
+ if (!citations || citations.length === 0) return null;
134
+
135
+ // Deduplicate by paper_id, keep best score and earliest page
136
+ const paperMap = {};
137
+ citations.forEach(cit => {
138
+ const key = cit.paper_id;
139
+ if (!paperMap[key] || cit.score > paperMap[key].score) {
140
+ paperMap[key] = { ...cit };
141
+ }
142
+ });
143
+ const papers = Object.values(paperMap).sort((a, b) => b.score - a.score);
144
+ const shown = showAll ? papers : papers.slice(0, 5);
145
+
146
+ return (
147
+ <div className="paper-evidence-panel">
148
+ <div className="evidence-header">
149
+ <span className="evidence-title">Paper Evidence</span>
150
+ <span className="evidence-count">
151
+ {citations.length} passages from {papers.length} papers
152
+ </span>
153
+ </div>
154
+
155
+ <div className="evidence-paper-list">
156
+ {shown.map((paper, i) => (
157
+ <div key={i} className="evidence-paper-row">
158
+ <div className="evidence-paper-info">
159
+ <span className="evidence-paper-name">{formatPaperId(paper.paper_id)}</span>
160
+ <span className="evidence-paper-score">{(paper.score * 100).toFixed(0)}% match</span>
161
+ </div>
162
+ <button
163
+ className="view-pdf-btn"
164
+ onClick={() => setPdfOpen({
165
+ paperId: paper.paper_id,
166
+ page: Math.max(1, paper.page || 1),
167
+ keywords,
168
+ })}
169
+ >
170
+ View PDF
171
+ </button>
172
+ </div>
173
+ ))}
174
+ {papers.length > 5 && !showAll && (
175
+ <button className="evidence-show-more" onClick={() => setShowAll(true)}>
176
+ +{papers.length - 5} more papers
177
+ </button>
178
+ )}
179
+ </div>
180
+
181
+ {pdfOpen && (
182
+ <PdfViewer
183
+ paperId={pdfOpen.paperId}
184
+ page={pdfOpen.page}
185
+ keywords={pdfOpen.keywords}
186
+ onClose={() => setPdfOpen(null)}
187
+ />
188
+ )}
189
+ </div>
190
+ );
191
+ }
192
+
193
+ export default function InsightCard({ suggestion, weights, query, onClose }) {
194
  const weightDiffs = Object.entries(suggestion.weights)
195
  .filter(([col, val]) => val !== (weights[col] ?? 0))
196
  .map(([col, val]) => ({
 
210
  <div className="insight-body">
211
  <InsightBullets text={suggestion.insight} />
212
  </div>
213
+
214
+ <PaperEvidencePanel citations={suggestion.ragCitations} query={query} />
215
+
216
  <div className="insight-actions-summary">
217
  {suggestion.filterMethods && (
218
  <span className="action-chip filter-chip">
 
230
  </span>
231
  )}
232
  </div>
 
 
 
 
 
 
 
 
 
233
  {(suggestion.highlightMethods || []).length > 0 && (
234
  <div className="insight-matches">
235
  <span className="matches-label">Best matches:</span>
frontend/src/components/PdfViewer.js ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useState, useEffect, useCallback, useRef, useMemo } from 'react';
2
+ import { Document, Page, pdfjs } from 'react-pdf';
3
+ import 'react-pdf/dist/Page/AnnotationLayer.css';
4
+ import 'react-pdf/dist/Page/TextLayer.css';
5
+
6
+ // Set up the PDF.js worker
7
+ pdfjs.GlobalWorkerOptions.workerSrc = `//unpkg.com/pdfjs-dist@${pdfjs.version}/build/pdf.worker.min.mjs`;
8
+
9
+ export default function PdfViewer({ paperId, page, keywords, onClose }) {
10
+ const [numPages, setNumPages] = useState(null);
11
+ const [currentPage, setCurrentPage] = useState(page || 1);
12
+ const [loading, setLoading] = useState(true);
13
+ const [error, setError] = useState(null);
14
+ const containerRef = useRef(null);
15
+
16
+ const pdfUrl = `/api/papers/${paperId}`;
17
+
18
+ // Ensure page is at least 1
19
+ useEffect(() => {
20
+ if (page && page >= 1) setCurrentPage(page);
21
+ }, [page]);
22
+
23
+ const onDocumentLoadSuccess = useCallback(({ numPages }) => {
24
+ setNumPages(numPages);
25
+ setLoading(false);
26
+ }, []);
27
+
28
+ const onDocumentLoadError = useCallback((err) => {
29
+ setError(err.message);
30
+ setLoading(false);
31
+ }, []);
32
+
33
+ // After page renders, highlight matching keywords in the text layer
34
+ const highlightKeywords = useCallback(() => {
35
+ if (!keywords || keywords.length === 0) return;
36
+ if (!containerRef.current) return;
37
+
38
+ // Wait for text layer to render
39
+ setTimeout(() => {
40
+ const textLayer = containerRef.current?.querySelector('.react-pdf__Page__textContent');
41
+ if (!textLayer) return;
42
+
43
+ const spans = textLayer.querySelectorAll('span');
44
+ spans.forEach(span => {
45
+ const text = span.textContent.toLowerCase();
46
+ const hasMatch = keywords.some(kw => text.includes(kw.toLowerCase()));
47
+ if (hasMatch) {
48
+ span.classList.add('pdf-keyword-highlight');
49
+ }
50
+ });
51
+ }, 500);
52
+ }, [keywords]);
53
+
54
+ const pageWidth = useMemo(() => {
55
+ if (!containerRef.current) return 700;
56
+ return Math.min(containerRef.current.offsetWidth - 40, 800);
57
+ }, [containerRef.current]);
58
+
59
+ if (!paperId) return null;
60
+
61
+ return (
62
+ <div className="pdf-viewer-overlay" onClick={onClose}>
63
+ <div className="pdf-viewer-modal" onClick={e => e.stopPropagation()} ref={containerRef}>
64
+ <div className="pdf-viewer-header">
65
+ <div className="pdf-viewer-title">{paperId.replace(/-/g, ' ')}</div>
66
+ <div className="pdf-viewer-controls">
67
+ <button
68
+ disabled={currentPage <= 1}
69
+ onClick={() => setCurrentPage(p => Math.max(1, p - 1))}
70
+ className="pdf-nav-btn"
71
+ >
72
+ &larr; Prev
73
+ </button>
74
+ <span className="pdf-page-info">
75
+ Page {currentPage}{numPages ? ` of ${numPages}` : ''}
76
+ </span>
77
+ <button
78
+ disabled={currentPage >= (numPages || 1)}
79
+ onClick={() => setCurrentPage(p => Math.min(numPages || p, p + 1))}
80
+ className="pdf-nav-btn"
81
+ >
82
+ Next &rarr;
83
+ </button>
84
+ <button onClick={onClose} className="pdf-close-btn">&times;</button>
85
+ </div>
86
+ </div>
87
+
88
+ {keywords && keywords.length > 0 && (
89
+ <div className="pdf-keywords-bar">
90
+ Highlighting: {keywords.slice(0, 5).map((kw, i) => (
91
+ <span key={i} className="pdf-kw-tag">{kw}</span>
92
+ ))}
93
+ </div>
94
+ )}
95
+
96
+ <div className="pdf-viewer-content">
97
+ {error && <div className="pdf-error">Failed to load PDF: {error}</div>}
98
+ {loading && !error && <div className="pdf-loading">Loading PDF...</div>}
99
+
100
+ <Document
101
+ file={pdfUrl}
102
+ onLoadSuccess={onDocumentLoadSuccess}
103
+ onLoadError={onDocumentLoadError}
104
+ loading=""
105
+ >
106
+ <Page
107
+ pageNumber={currentPage}
108
+ width={pageWidth}
109
+ onRenderTextLayerSuccess={highlightKeywords}
110
+ renderAnnotationLayer={true}
111
+ renderTextLayer={true}
112
+ />
113
+ </Document>
114
+ </div>
115
+ </div>
116
+ </div>
117
+ );
118
+ }
frontend/src/components/QueryExplanation.js ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import Tooltip from './Tooltip';
3
+
4
+ export default function QueryExplanation({ suggestion, query, data, clusterStats }) {
5
+ if (!suggestion) return null;
6
+
7
+ const nMethods = data ? data.length : 0;
8
+ const nClusters = clusterStats ? clusterStats.length : 0;
9
+ const nHighlights = (suggestion.highlightMethods || []).length;
10
+ const isFiltered = !!suggestion.filterMethods;
11
+ const colorBy = suggestion.colorBy || 'cluster';
12
+ const nCitations = (suggestion.ragCitations || []).length;
13
+ const nPapers = new Set((suggestion.ragCitations || []).map(c => c.paper_title)).size;
14
+
15
+ return (
16
+ <div className="query-explanation">
17
+ <div className="explanation-header">
18
+ <span className="explanation-title">How This View Was Built</span>
19
+ <Tooltip text="This section explains what the system did to answer your query. Every step is deterministic (no AI guessing) except the final insight text." wide>
20
+ <span className="explanation-help">?</span>
21
+ </Tooltip>
22
+ </div>
23
+
24
+ <div className="explanation-steps">
25
+ <div className="explanation-step">
26
+ <div className="step-number">1</div>
27
+ <div className="step-content">
28
+ <span className="step-label">Query Understanding</span>
29
+ <span className="step-detail">
30
+ Your question was converted into a numerical vector using a
31
+ <Tooltip text="A neural network (all-MiniLM-L6-v2) that converts text into 384-dimensional vectors. Similar questions produce similar vectors, enabling mathematical comparison.">
32
+ <span className="inline-term">sentence-transformer</span>
33
+ </Tooltip>
34
+ {' '}model, then compared against all 56 method descriptions to find the most relevant ones.
35
+ </span>
36
+ </div>
37
+ </div>
38
+
39
+ <div className="explanation-step">
40
+ <div className="step-number">2</div>
41
+ <div className="step-content">
42
+ <span className="step-label">
43
+ {nHighlights} Methods Highlighted
44
+ </span>
45
+ <span className="step-detail">
46
+ The
47
+ <Tooltip text="Cosine similarity measures the angle between two vectors. A score of 1.0 means identical direction (perfect match), 0.0 means unrelated. The highlighted methods scored highest against your query.">
48
+ <span className="inline-term">cosine similarity</span>
49
+ </Tooltip>
50
+ {' '}between your query and each method's description determined the {nHighlights} best matches.
51
+ These are shown as larger, brighter points on the scatter plot.
52
+ </span>
53
+ </div>
54
+ </div>
55
+
56
+ {isFiltered && (
57
+ <div className="explanation-step">
58
+ <div className="step-number">3</div>
59
+ <div className="step-content">
60
+ <span className="step-label">Filtered to {nMethods} Methods</span>
61
+ <span className="step-detail">
62
+ Your query implied a specific subset, so only methods matching the criteria are shown.
63
+ The scatter plot and clustering were recomputed for just these methods.
64
+ </span>
65
+ </div>
66
+ </div>
67
+ )}
68
+
69
+ <div className="explanation-step">
70
+ <div className="step-number">{isFiltered ? 4 : 3}</div>
71
+ <div className="step-content">
72
+ <span className="step-label">{nClusters} Groups via HDBSCAN Clustering</span>
73
+ <span className="step-detail">
74
+ <Tooltip text="HDBSCAN (Hierarchical Density-Based Spatial Clustering) automatically finds groups of similar methods without needing to pre-specify how many groups there are. Unlike K-Means, it discovers natural groupings based on data density.">
75
+ <span className="inline-term">HDBSCAN</span>
76
+ </Tooltip>
77
+ {' '}automatically found {nClusters} natural groups among the {nMethods} methods.
78
+ Column weights were adjusted based on your query keywords to emphasize relevant attributes in the
79
+ <Tooltip text="UMAP (Uniform Manifold Approximation and Projection) takes the high-dimensional feature vectors and projects them to 2D coordinates so you can see which methods are similar (close together) or different (far apart).">
80
+ <span className="inline-term">UMAP projection</span>
81
+ </Tooltip>.
82
+ The scatter plot is colored by <strong>{colorBy}</strong>.
83
+ </span>
84
+ </div>
85
+ </div>
86
+
87
+ {nCitations > 0 && (
88
+ <div className="explanation-step">
89
+ <div className="step-number">{isFiltered ? 5 : 4}</div>
90
+ <div className="step-content">
91
+ <span className="step-label">{nCitations} Passages from {nPapers} Papers</span>
92
+ <span className="step-detail">
93
+ The
94
+ <Tooltip text="ChromaDB stores 1,074 text chunks from 34 research papers. Each chunk was embedded using the same sentence-transformer model, so your query can be matched against actual paper content by vector similarity.">
95
+ <span className="inline-term">vector database</span>
96
+ </Tooltip>
97
+ {' '}was searched for passages relevant to your query. These paper excerpts were
98
+ fed to the LLM to generate the grounded insight above.
99
+ </span>
100
+ </div>
101
+ </div>
102
+ )}
103
+ </div>
104
+ </div>
105
+ );
106
+ }
frontend/src/components/Tooltip.js ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useState, useRef, useCallback, useEffect } from 'react';
2
+ import ReactDOM from 'react-dom';
3
+
4
+ export default function Tooltip({ children, text, wide = false }) {
5
+ const [visible, setVisible] = useState(false);
6
+ const [coords, setCoords] = useState({ top: 0, left: 0 });
7
+ const triggerRef = useRef(null);
8
+ const timeoutRef = useRef(null);
9
+
10
+ const updatePosition = useCallback(() => {
11
+ if (!triggerRef.current) return;
12
+ const rect = triggerRef.current.getBoundingClientRect();
13
+ const tooltipWidth = wide ? 320 : 240;
14
+ let left = rect.left + rect.width / 2 - tooltipWidth / 2;
15
+ // Keep tooltip within viewport
16
+ left = Math.max(8, Math.min(left, window.innerWidth - tooltipWidth - 8));
17
+ // Show below if too close to top, otherwise above
18
+ const showBelow = rect.top < 120;
19
+ const top = showBelow ? rect.bottom + 8 : rect.top - 8;
20
+ setCoords({ top, left, showBelow, tooltipWidth });
21
+ }, [wide]);
22
+
23
+ const show = () => {
24
+ clearTimeout(timeoutRef.current);
25
+ timeoutRef.current = setTimeout(() => {
26
+ updatePosition();
27
+ setVisible(true);
28
+ }, 250);
29
+ };
30
+
31
+ const hide = () => {
32
+ clearTimeout(timeoutRef.current);
33
+ setVisible(false);
34
+ };
35
+
36
+ useEffect(() => () => clearTimeout(timeoutRef.current), []);
37
+
38
+ const tooltip = visible ? ReactDOM.createPortal(
39
+ <div
40
+ className={`tooltip-bubble-fixed ${wide ? 'tooltip-wide' : ''}`}
41
+ style={{
42
+ top: coords.showBelow ? coords.top : undefined,
43
+ bottom: coords.showBelow ? undefined : `${window.innerHeight - coords.top}px`,
44
+ left: coords.left,
45
+ width: coords.tooltipWidth,
46
+ }}
47
+ >
48
+ {text}
49
+ </div>,
50
+ document.body
51
+ ) : null;
52
+
53
+ return (
54
+ <span
55
+ ref={triggerRef}
56
+ className="tooltip-wrapper"
57
+ onMouseEnter={show}
58
+ onMouseLeave={hide}
59
+ >
60
+ {children}
61
+ {tooltip}
62
+ </span>
63
+ );
64
+ }
rag_config.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Configuration for the RAG pipeline (paper ingestion into ChromaDB and
# retrieval at query time). NOTE(review): exact consumer semantics for each
# key live in the ingestion/retrieval code, not visible here — confirm there.
project_name: grasp-explorer
# Free-text domain description, presumably injected into LLM prompts for
# context — verify against the prompt-building code.
domain_context: >
  Robotic grasp planning: computing how a robot should position its gripper
  to pick up objects. Methods vary by planning approach (sampling, regression,
  RL, analytical), gripper type, sensor input, scene complexity, and training
  paradigm (sim, real, or both).

# Source dataset: CSV of grasp-planning methods and which columns identify
# each method, its description, and its paper link(s).
csv_path: datasets/csv-gp-combined.csv
name_column: Name
description_column: Description
link_column: "Link(s)"

# Embedding model and vector store. all-MiniLM-L6-v2 is a sentence-transformers
# model producing 384-dimensional vectors; ChromaDB persists to ./chroma_db
# (git-ignored, rebuilt by the ingestion pipeline).
embedding_model: all-MiniLM-L6-v2
embedding_dimensions: 384
chroma_persist_dir: ./chroma_db
collection_name: grasp_papers

# Three-tier chunking (coarse / mid / fine) with token-count bounds per tier.
chunking:
  coarse_max_tokens: 800
  mid_min_tokens: 200
  mid_max_tokens: 800
  mid_overlap_ratio: 0.15      # fraction of overlap between adjacent mid chunks
  fine_min_tokens: 50
  fine_max_tokens: 300
  # Presumably the cosine-similarity cutoff used by the semantic chunking
  # strategy to decide chunk boundaries — confirm in the chunker.
  semantic_similarity_threshold: 0.35
  strategies:
    - semantic
  # Domain vocabulary, presumably used to bias or tag chunks during semantic
  # chunking — verify usage in the ingestion code.
  domain_topics:
    # Grasp planning approaches
    - grasp planning
    - grasp detection
    - grasp synthesis
    - grasp pose
    - 6-DoF
    - 7-DoF
    - antipodal grasp
    - power grasp
    - precision grasp
    - grasp quality
    - grasp success rate
    # Planning methods
    - sampling
    - direct regression
    - reinforcement learning
    - analytical
    - optimization
    - generative model
    - diffusion model
    - VAE
    - GAN
    # Gripper types
    - parallel-jaw
    - two-finger
    - multi-finger
    - dexterous
    - suction
    - gripper
    - end-effector
    # Sensors and input
    - point cloud
    - depth image
    - RGB-D
    - TSDF
    - voxel
    - mesh
    - tactile
    - force-torque
    # Scene types
    - cluttered
    - piled
    - singulated
    - packed
    - bin picking
    # Neural network architectures
    - PointNet
    - PointNet++
    - ResNet
    - VGG
    - transformer
    - CNN
    - encoder-decoder
    - U-Net
    # Training and simulation
    - sim-to-real
    - domain randomization
    - transfer learning
    - self-supervised
    - real-world
    - simulation
    - Isaac
    - MuJoCo
    - PyBullet
    # Robotics concepts
    - contact model
    - collision detection
    - motion planning
    - inverse kinematics
    - workspace
    - reachability
    - robot arm
    - manipulator
    # Evaluation
    - success rate
    - grasp metric
    - clearance
    - coverage
    - ablation

# Retrieval: how many chunks to fetch per tier and the total context cap.
retrieval:
  coarse_top_k: 2
  mid_top_k: 4
  fine_top_k: 4
  token_budget: 3000           # max tokens of retrieved context fed to the LLM
  rerank: false                # reranking of retrieved chunks is disabled

tools_enabled: true
# Presumably the dataset columns exposed to LLM tool calls — confirm against
# the tool definitions in the backend.
dataset_columns:
  - Planning Method
  - Training Data
  - End-effector Hardware
  - Object Configuration
  - Input Data
  - Output Pose
  - Backbone
  - Language