skapoor-wpi commited on
Commit
8a2906d
·
1 Parent(s): 6acb635

RAG pipeline introduced — spruced up the UI and added more insights

Browse files
.gitignore CHANGED
@@ -39,3 +39,13 @@ backend/package-lock.json
39
  # Cached embeddings
40
  backend/.description_embeddings.npy
41
  .env.example
 
 
 
 
 
 
 
 
 
 
 
39
  # Cached embeddings
40
  backend/.description_embeddings.npy
41
  .env.example
42
+
43
+ # ChromaDB vector database (rebuilt from papers via ingestion pipeline)
44
+ chroma_db/
45
+
46
+ # PDF papers (downloaded from arXiv, not checked into git)
47
+ papers/*.pdf
48
+
49
+ # Temporary / scratch
50
+ /tmp/
51
+ *.tmp
backend/app.py CHANGED
@@ -87,13 +87,40 @@ Key distinctions that matter:
87
  Why clustering matters: Methods that cluster together share fundamental design choices. Separation between clusters often reflects genuinely different philosophies (e.g., learning-based vs. analytical, or 2D vs. 3D grasp representations)."""
88
 
89
  # AI Copilot configuration
90
- # Supports: "ollama" (local, no key needed) or "huggingface" (needs HF_API_TOKEN)
91
- AI_PROVIDER = os.environ.get('AI_PROVIDER', 'ollama')
92
  OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434')
93
  OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'llama3.1:8b')
94
  HF_API_TOKEN = os.environ.get('HF_API_TOKEN', os.environ.get('HF_TOKEN', ''))
95
  HF_MODEL = os.environ.get('AI_MODEL', 'Qwen/Qwen2.5-72B-Instruct')
 
 
96
  USE_RAG = os.environ.get('USE_RAG', 'false').lower() == 'true'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
 
99
  def llm_chat(messages, max_tokens=2048, temperature=0.3):
@@ -114,6 +141,19 @@ def llm_chat(messages, max_tokens=2048, temperature=0.3):
114
  with urllib.request.urlopen(req, timeout=120) as resp:
115
  result = json.loads(resp.read().decode('utf-8'))
116
  return result['message']['content'].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  else:
118
  # HuggingFace Inference API
119
  if not HF_API_TOKEN:
@@ -616,15 +656,18 @@ DEFAULT WEIGHTS:
616
  {json.dumps(dict(DEFAULT_WEIGHTS), indent=2)}"""
617
 
618
 
 
 
 
 
 
619
  def build_method_summaries(df):
620
- """Build compact one-line summaries of all methods."""
621
  summaries = []
622
  for _, row in df.iterrows():
623
  name = row.get('Name', '')
624
  parts = []
625
- for col in DEFAULT_WEIGHTS.keys():
626
- if col == 'Description':
627
- continue
628
  val = str(row.get(col, '')) if pd.notna(row.get(col, '')) else ''
629
  if val:
630
  short = SHORT_COLUMN_NAMES.get(col, col)
@@ -633,10 +676,29 @@ def build_method_summaries(df):
633
  return '\n'.join(summaries)
634
 
635
 
636
- def retrieve_relevant_chunks(query):
637
- """Placeholder for RAG retrieval. Returns empty string.
638
- Future: query ChromaDB for relevant paper chunks."""
639
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
 
642
  def build_ai_system_prompt(df, query):
@@ -644,11 +706,18 @@ def build_ai_system_prompt(df, query):
644
  schema = build_schema_context(df)
645
  methods = build_method_summaries(df)
646
 
647
- retrieved = ""
648
- if USE_RAG:
649
- retrieved = retrieve_relevant_chunks(query)
650
- if retrieved:
651
- retrieved = f"\n\nRELEVANT PAPER EXCERPTS:\n{retrieved}"
 
 
 
 
 
 
 
652
 
653
  return f"""You are an AI copilot for the Grasp Planner Explorer, a visualization tool that shows 56 robotic grasp planning methods projected via weighted UMAP.
654
 
@@ -656,7 +725,7 @@ def build_ai_system_prompt(df, query):
656
 
657
  ALL {len(df)} METHODS:
658
  {methods}
659
- {retrieved}
660
 
661
  YOUR TASK (Pass 1 — Configuration):
662
  Given a natural language query from a researcher, respond with a JSON object containing:
@@ -667,6 +736,7 @@ Given a natural language query from a researcher, respond with a JSON object con
667
  - For SEARCH queries ("find methods for X"): highlight the strongest matches for X.
668
  - For COMPARISON queries ("how do X and Y differ?"): highlight representative examples from EACH side — e.g., 3-4 examples of X AND 3-4 examples of Y so the user sees both groups.
669
  - For EXPLORATION queries ("overview of the field"): highlight diverse, well-known methods spanning different clusters.
 
670
 
671
  FILTERING GUIDELINES:
672
  - When the query specifies attributes (e.g., "cluttered scenes"), filter to methods that have those attributes.
@@ -786,122 +856,134 @@ WHAT YOU DID:
786
  {cluster_summary}
787
 
788
  YOUR TASK (Pass 2 — Insight):
789
- Based on the ACTUAL clustering results and your domain knowledge, write concise bullet points. Format as bullet points starting with "- ".
790
 
791
  Write 3-5 bullet points that:
792
- - Explain WHY methods group together using domain knowledge (e.g., "these methods share a sampling-based approach which requires different input representations than direct regression methods")
793
- - Point out meaningful patterns not just what's in each group, but WHY that grouping matters for the researcher's query
794
- - Call attention to the highlighted best-match methods what group did they land in and what does that tell us?
795
- - Note any surprising groupings or trade-offs the researcher should be aware of
 
796
 
797
- IMPORTANT: Do NOT reference cluster numbers (e.g., "Cluster 0", "Cluster 3"). Instead, refer to groups by their defining characteristics (e.g., "the sampling-based group", "the RL + multi-finger group"). Reference specific method names and attribute values. Ground insights in both the data AND domain knowledge.
 
 
 
 
798
 
799
  Respond with ONLY the bullet points, no JSON, no markdown fences, no headers."""
800
 
801
 
802
  @app.route('/api/ai-query', methods=['POST'])
803
  def ai_query():
804
- """Two-pass AI copilot:
805
- Pass 1: LLM decides filter, weights, colorBy, highlights (from raw metadata)
806
- Pass 2: Run UMAP/clustering, feed results back to LLM for grounded insight
 
807
  """
808
- response_text = ''
809
  try:
810
  data = request.get_json() or {}
811
  query = data.get('query', '').strip()
812
  if not query:
813
  return jsonify({'success': False, 'error': 'Empty query'}), 400
814
 
815
- current_weights = data.get('currentWeights', DEFAULT_WEIGHTS)
816
- current_color_by = data.get('currentColorBy', 'cluster')
817
-
818
  df = pd.read_csv(CSV_FILE)
819
- valid_names = set(df['Name'].tolist())
820
-
821
- # ── Pass 1: Decide configuration ──────────────────────────────
822
- print(f"[Pass 1] Query: '{query}'")
823
- system_prompt = build_ai_system_prompt(df, query)
824
- user_message = f"""Current weights: {json.dumps(current_weights)}
825
- Current color-by: {current_color_by}
826
-
827
- Researcher's query: {query}"""
828
-
829
- response_text = llm_chat([
830
- {'role': 'system', 'content': system_prompt},
831
- {'role': 'user', 'content': user_message}
832
- ])
833
-
834
- # Handle potential markdown fences
835
- if response_text.startswith('```'):
836
- lines = response_text.split('\n')
837
- response_text = '\n'.join(lines[1:-1])
838
-
839
- result = json.loads(response_text)
840
-
841
- # Validate required fields (insight no longer required from Pass 1)
842
- required = ['weights', 'colorBy', 'highlightMethods']
843
- for field in required:
844
- if field not in result:
845
- return jsonify({
846
- 'success': False,
847
- 'error': f'AI response missing field: {field}'
848
- }), 500
849
-
850
- # Validate filterMethods
851
- if 'filterMethods' in result:
852
- result['filterMethods'] = [
853
- m for m in result['filterMethods'] if m in valid_names
854
- ]
855
- if not result['filterMethods'] or len(result['filterMethods']) >= len(valid_names):
856
- result['filterMethods'] = None
857
- else:
858
- result['filterMethods'] = None
859
 
860
- result['highlightMethods'] = [
861
- m for m in result['highlightMethods'] if m in valid_names
862
- ]
 
 
 
 
 
 
 
 
 
 
 
863
 
864
- # Clamp weights to 0-20
865
- for col in result['weights']:
866
- result['weights'][col] = max(0, min(20, int(result['weights'][col])))
 
 
867
 
868
- print(f"[Pass 1] Filter: {len(result['filterMethods']) if result['filterMethods'] else 'none'}, "
869
- f"Highlights: {len(result['highlightMethods'])}, ColorBy: {result['colorBy']}")
 
870
 
871
- # ── Run UMAP/Clustering pipeline ──────────────────────────────
872
- print("[Pipeline] Running UMAP + K-Means on AI-configured data...")
873
  response_data, clustering_info, _, _ = run_umap_pipeline(
874
  result['weights'], result['filterMethods']
875
  )
876
  print(f"[Pipeline] Done: {len(response_data)} methods, {clustering_info['n_clusters']} clusters")
877
 
878
- # ── Pass 2: Generate grounded insight ─────────────────────────
879
- print("[Pass 2] Generating insight from clustering results...")
880
- insight_prompt = build_insight_prompt(
881
- query, response_data, clustering_info,
882
- result['weights'], result['colorBy'],
883
- result['highlightMethods'], result['filterMethods']
884
  )
885
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
886
  insight_text = llm_chat([
887
  {'role': 'user', 'content': insight_prompt}
888
  ], max_tokens=1024)
889
- # Clean up any markdown formatting
890
  if insight_text.startswith('```'):
891
  lines = insight_text.split('\n')
892
  insight_text = '\n'.join(lines[1:-1])
893
  result['insight'] = insight_text
894
 
895
- print(f"[Pass 2] Insight generated ({len(insight_text)} chars)")
896
 
897
- # Include the UMAP data and cluster stats in the response
898
- _, cluster_stats = build_cluster_stats(response_data, clustering_info, result['weights'])
899
  result['umapData'] = response_data
900
  result['clustering'] = {
901
  'n_clusters': clustering_info['n_clusters'],
902
  'value_cluster_map': clustering_info['value_cluster_map']
903
  }
904
  result['clusterStats'] = cluster_stats
 
 
 
 
 
 
905
 
906
  return jsonify({'success': True, **result})
907
 
@@ -972,6 +1054,29 @@ Respond with ONLY the bullet points, no JSON, no markdown fences, no headers."""
972
  return jsonify({'success': False, 'error': str(e)}), 500
973
 
974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
975
  @app.route('/api/health')
976
  def health():
977
  """Health check endpoint."""
 
87
  Why clustering matters: Methods that cluster together share fundamental design choices. Separation between clusters often reflects genuinely different philosophies (e.g., learning-based vs. analytical, or 2D vs. 3D grasp representations)."""
88
 
89
  # AI Copilot configuration
90
+ # Supports: "ollama", "huggingface", or "groq"
91
+ AI_PROVIDER = os.environ.get('AI_PROVIDER', 'groq')
92
  OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', 'http://localhost:11434')
93
  OLLAMA_MODEL = os.environ.get('OLLAMA_MODEL', 'llama3.1:8b')
94
  HF_API_TOKEN = os.environ.get('HF_API_TOKEN', os.environ.get('HF_TOKEN', ''))
95
  HF_MODEL = os.environ.get('AI_MODEL', 'Qwen/Qwen2.5-72B-Instruct')
96
+ GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '')
97
+ GROQ_MODEL = os.environ.get('GROQ_MODEL', 'llama-3.3-70b-versatile')
98
  USE_RAG = os.environ.get('USE_RAG', 'false').lower() == 'true'
99
+ USE_TOOLS = os.environ.get('USE_TOOLS', 'true').lower() == 'true'
100
+
101
+ # RAG + Tool calling initialization (lazy-loaded)
102
+ _rag_retriever = None
103
+ _rag_config = None
104
+
105
def _get_rag_config():
    """Return the cached RAG config, loading it on first use.

    Reads rag_config.yaml next to the app (BASE_DIR). When the file is
    absent the cached value stays None and the check repeats on the next
    call, so the config can be dropped in without restarting.
    """
    global _rag_config
    if _rag_config is not None:
        return _rag_config
    cfg_file = os.path.join(BASE_DIR, 'rag_config.yaml')
    if os.path.exists(cfg_file):
        from rag.config import load_config
        _rag_config = load_config(cfg_file)
    return _rag_config
113
+
114
def _get_rag_retriever():
    """Return the cached RAG retriever, building it lazily.

    Returns None when no RAG config is available. The embedder reuses the
    app-level sentence-transformer instance (st_model) so the model is not
    loaded twice.
    """
    global _rag_retriever
    if _rag_retriever is not None:
        return _rag_retriever
    config = _get_rag_config()
    if config:
        from rag.retrieval.retriever import RAGRetriever
        from rag.ingest.embedder import ChunkEmbedder
        shared_embedder = ChunkEmbedder(
            model_name=config.embedding_model, model_instance=st_model
        )
        _rag_retriever = RAGRetriever(config=config, embedder=shared_embedder)
    return _rag_retriever
124
 
125
 
126
  def llm_chat(messages, max_tokens=2048, temperature=0.3):
 
141
  with urllib.request.urlopen(req, timeout=120) as resp:
142
  result = json.loads(resp.read().decode('utf-8'))
143
  return result['message']['content'].strip()
144
+ elif AI_PROVIDER == 'groq':
145
+ # Groq (free, fast, OpenAI-compatible)
146
+ if not GROQ_API_KEY:
147
+ raise ValueError('GROQ_API_KEY not configured. Set it as an environment variable.')
148
+ from groq import Groq
149
+ client = Groq(api_key=GROQ_API_KEY)
150
+ completion = client.chat.completions.create(
151
+ model=GROQ_MODEL,
152
+ messages=messages,
153
+ max_tokens=max_tokens,
154
+ temperature=temperature,
155
+ )
156
+ return completion.choices[0].message.content.strip()
157
  else:
158
  # HuggingFace Inference API
159
  if not HF_API_TOKEN:
 
656
  {json.dumps(dict(DEFAULT_WEIGHTS), indent=2)}"""
657
 
658
 
659
+ SUMMARY_COLUMNS = [
660
+ 'Planning Method', 'End-effector Hardware', 'Input Data',
661
+ 'Training Data', 'Object Configuration',
662
+ ]
663
+
664
  def build_method_summaries(df):
665
+ """Build compact one-line summaries of all methods (key columns only to save tokens)."""
666
  summaries = []
667
  for _, row in df.iterrows():
668
  name = row.get('Name', '')
669
  parts = []
670
+ for col in SUMMARY_COLUMNS:
 
 
671
  val = str(row.get(col, '')) if pd.notna(row.get(col, '')) else ''
672
  if val:
673
  short = SHORT_COLUMN_NAMES.get(col, col)
 
676
  return '\n'.join(summaries)
677
 
678
 
679
def retrieve_relevant_chunks(query, paper_ids=None):
    """Retrieve relevant paper chunks from ChromaDB.

    Returns (prompt_text, citations) where prompt_text is formatted for LLM
    injection and citations is structured data for the frontend.
    """
    # Only consult the retriever when RAG is enabled; otherwise degrade to
    # an empty context so callers need no special-casing.
    retriever = _get_rag_retriever() if USE_RAG else None
    if retriever is None:
        return "", []
    try:
        from rag.retrieval.formatter import format_for_prompt, format_chunk_citations
        config = _get_rag_config()
        chunks = retriever.retrieve(query, paper_ids=paper_ids)
        budget = config.retrieval.token_budget if config else 3000
        prompt_text = format_for_prompt(chunks, token_budget=budget)
        citations = format_chunk_citations(chunks)
        print(f"[RAG] Retrieved {len(chunks)} chunks ({len(prompt_text)} chars)")
        return prompt_text, citations
    except Exception as e:
        # Best-effort: retrieval failures fall back to no context rather
        # than failing the whole request.
        print(f"[RAG] Error: {e}")
        return "", []
702
 
703
 
704
  def build_ai_system_prompt(df, query):
 
706
  schema = build_schema_context(df)
707
  methods = build_method_summaries(df)
708
 
709
+ tools_section = ""
710
+ if USE_TOOLS or USE_RAG:
711
+ try:
712
+ import rag.tools # triggers registration of all tools including search_papers
713
+ from rag.tools.registry import get_tool_prompt_section
714
+ tools_section = "\n\n" + get_tool_prompt_section()
715
+ except Exception:
716
+ pass
717
+
718
+ tools_instruction = ""
719
+ if tools_section:
720
+ tools_instruction = '\n5. "tools" (OPTIONAL) - Array of tool calls if the query needs computed results or paper content. Each: {"name": "tool_name", "arguments": {...}}. Use "search_papers" when the query asks about specific techniques, loss functions, architectures, or anything that requires reading actual paper content.'
721
 
722
  return f"""You are an AI copilot for the Grasp Planner Explorer, a visualization tool that shows 56 robotic grasp planning methods projected via weighted UMAP.
723
 
 
725
 
726
  ALL {len(df)} METHODS:
727
  {methods}
728
+ {tools_section}
729
 
730
  YOUR TASK (Pass 1 — Configuration):
731
  Given a natural language query from a researcher, respond with a JSON object containing:
 
736
  - For SEARCH queries ("find methods for X"): highlight the strongest matches for X.
737
  - For COMPARISON queries ("how do X and Y differ?"): highlight representative examples from EACH side — e.g., 3-4 examples of X AND 3-4 examples of Y so the user sees both groups.
738
  - For EXPLORATION queries ("overview of the field"): highlight diverse, well-known methods spanning different clusters.
739
+ {tools_instruction}
740
 
741
  FILTERING GUIDELINES:
742
  - When the query specifies attributes (e.g., "cluttered scenes"), filter to methods that have those attributes.
 
856
  {cluster_summary}
857
 
858
  YOUR TASK (Pass 2 — Insight):
859
+ Based on the ACTUAL clustering results, paper excerpts (if provided), and computed tool results (if provided), write concise bullet points. Format as bullet points starting with "- ".
860
 
861
  Write 3-5 bullet points that:
862
+ - DIRECTLY ANSWER the researcher's query using specific evidence from the paper excerpts and clustering results
863
+ - When paper excerpts are provided, CITE specific papers by name (e.g., "Contact-GraspNet uses a binary cross-entropy loss on predicted contact points")
864
+ - Reference concrete technical details from the papers, not generic descriptions
865
+ - Point out meaningful patterns relevant to the query, grounded in actual paper content
866
+ - If computed results are provided (e.g., similarity scores, distributions), incorporate the exact numbers
867
 
868
+ IMPORTANT RULES:
869
+ - Do NOT reference cluster numbers (e.g., "Cluster 0", "Cluster 3"). Refer to groups by their defining characteristics.
870
+ - Do NOT give generic overviews of the clusters. Focus on answering the specific query.
871
+ - When paper excerpts are available, prioritize insights derived from actual paper content over general domain knowledge.
872
+ - Reference specific method names and attribute values.
873
 
874
  Respond with ONLY the bullet points, no JSON, no markdown fences, no headers."""
875
 
876
 
877
  @app.route('/api/ai-query', methods=['POST'])
878
  def ai_query():
879
+ """Deterministic pipeline + single LLM call:
880
+ 1. Deterministic: embed query, search vector DB, compute weights/filters/highlights
881
+ 2. Pipeline: run UMAP + HDBSCAN with computed weights
882
+ 3. LLM: interpret results with RAG context (single, small prompt)
883
  """
 
884
  try:
885
  data = request.get_json() or {}
886
  query = data.get('query', '').strip()
887
  if not query:
888
  return jsonify({'success': False, 'error': 'Empty query'}), 400
889
 
 
 
 
890
  df = pd.read_csv(CSV_FILE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
 
892
+ # ── Step 1: Deterministic query analysis ──────────────────────
893
+ print(f"[Query] '{query}'")
894
+ from rag.query_engine import deterministic_query_pipeline
895
+ retriever = _get_rag_retriever()
896
+ pipeline_result = deterministic_query_pipeline(
897
+ query, df, st_model, DEFAULT_WEIGHTS, retriever=retriever
898
+ )
899
+
900
+ result = {
901
+ 'weights': pipeline_result['weights'],
902
+ 'colorBy': pipeline_result['colorBy'],
903
+ 'filterMethods': pipeline_result['filterMethods'],
904
+ 'highlightMethods': pipeline_result['highlightMethods'],
905
+ }
906
 
907
+ rag_text = pipeline_result['rag_text']
908
+ rag_citations = pipeline_result['rag_citations']
909
+ rag_analytics = pipeline_result.get('rag_analytics', {})
910
+ method_relevance = pipeline_result.get('method_relevance', [])
911
+ method_summaries = pipeline_result['relevant_method_summaries']
912
 
913
+ print(f"[Deterministic] Filter: {len(result['filterMethods']) if result['filterMethods'] else 'none'}, "
914
+ f"Highlights: {len(result['highlightMethods'])}, ColorBy: {result['colorBy']}, "
915
+ f"RAG chunks: {len(rag_citations)}")
916
 
917
+ # ── Step 2: Run UMAP/Clustering pipeline ─────────────────────
918
+ print("[Pipeline] Running UMAP + HDBSCAN...")
919
  response_data, clustering_info, _, _ = run_umap_pipeline(
920
  result['weights'], result['filterMethods']
921
  )
922
  print(f"[Pipeline] Done: {len(response_data)} methods, {clustering_info['n_clusters']} clusters")
923
 
924
+ # ── Step 3: Single LLM call (interpret results) ──────────────
925
+ print("[LLM] Generating insight...")
926
+ _, cluster_stats = build_cluster_stats(
927
+ response_data, clustering_info, result['weights']
 
 
928
  )
929
 
930
+ # Build compact cluster summary
931
+ compact_clusters = []
932
+ for cs in cluster_stats:
933
+ compact_clusters.append(f"- {cs['label']} ({cs['size']} methods): {', '.join(cs['methods'][:5])}")
934
+ cluster_text = '\n'.join(compact_clusters)
935
+
936
+ # Build the single, well-structured prompt
937
+ insight_prompt = f"""You are an expert research assistant for a robotic grasp planning visualization tool. A researcher has queried the system and you have access to real data from academic papers and clustering analysis.
938
+
939
+ RESEARCHER'S QUESTION: "{query}"
940
+
941
+ EVIDENCE FROM PAPERS:
942
+ {rag_text if rag_text else '(No paper excerpts available for this query)'}
943
+
944
+ RELEVANT METHODS IN THE DATASET:
945
+ {method_summaries}
946
+
947
+ CLUSTERING RESULTS ({len(response_data)} methods in {len(cluster_stats)} groups):
948
+ {cluster_text}
949
+
950
+ Highlighted methods (most relevant to query): {', '.join(result['highlightMethods'][:6])}
951
+
952
+ INSTRUCTIONS:
953
+ Write exactly 3-5 bullet points that answer the researcher's question. Each bullet must start with "- ".
954
+
955
+ Rules:
956
+ 1. Lead with evidence from the paper excerpts. Quote specific techniques, equations, or results by paper name (e.g., "Contact-GraspNet uses a binary cross-entropy loss on predicted contact points").
957
+ 2. When no paper excerpt covers a point, draw on the method metadata (planning approach, gripper type, etc.) to provide grounded analysis.
958
+ 3. Connect findings to the clustering: explain why methods using similar approaches end up in the same group.
959
+ 4. Be specific and technical. Avoid generic statements like "various methods use different approaches."
960
+ 5. Never reference cluster numbers. Use group names like "the sampling-based parallel-jaw group."
961
+
962
+ Respond with ONLY the bullet points, nothing else."""
963
+
964
  insight_text = llm_chat([
965
  {'role': 'user', 'content': insight_prompt}
966
  ], max_tokens=1024)
 
967
  if insight_text.startswith('```'):
968
  lines = insight_text.split('\n')
969
  insight_text = '\n'.join(lines[1:-1])
970
  result['insight'] = insight_text
971
 
972
+ print(f"[LLM] Insight: {len(insight_text)} chars")
973
 
974
+ # ── Build response ────────────────────────────────────────────
 
975
  result['umapData'] = response_data
976
  result['clustering'] = {
977
  'n_clusters': clustering_info['n_clusters'],
978
  'value_cluster_map': clustering_info['value_cluster_map']
979
  }
980
  result['clusterStats'] = cluster_stats
981
+ if rag_citations:
982
+ result['ragCitations'] = rag_citations
983
+ if rag_analytics:
984
+ result['ragAnalytics'] = rag_analytics
985
+ if method_relevance:
986
+ result['methodRelevance'] = method_relevance
987
 
988
  return jsonify({'success': True, **result})
989
 
 
1054
  return jsonify({'success': False, 'error': str(e)}), 500
1055
 
1056
 
1057
@app.route('/api/papers/<path:paper_id>')
def serve_paper(paper_id):
    """Serve a PDF from the papers directory.

    The id is sanitized to a whitelist of filename characters (letters,
    digits, dot, hyphen, underscore) so path separators can never reach the
    filesystem. Dots must be allowed — /api/papers lists ids such as arXiv
    "2103.14127" which contain one — so ".." is rejected explicitly, and
    send_from_directory re-checks directory containment as a second layer.
    """
    papers_dir = os.path.join(BASE_DIR, 'papers')
    import re as _re
    safe_id = _re.sub(r'[^a-zA-Z0-9.\-_]', '', paper_id.replace('.pdf', ''))
    if not safe_id or '..' in safe_id:
        return jsonify({'error': 'Paper not found'}), 404
    pdf_path = os.path.join(papers_dir, f'{safe_id}.pdf')
    if os.path.isfile(pdf_path):
        return send_from_directory(papers_dir, f'{safe_id}.pdf', mimetype='application/pdf')
    return jsonify({'error': 'Paper not found'}), 404
1068
+
1069
+
1070
@app.route('/api/papers')
def list_papers():
    """List available PDF papers as ids (filenames without the .pdf extension)."""
    papers_dir = os.path.join(BASE_DIR, 'papers')
    if not os.path.isdir(papers_dir):
        return jsonify({'papers': []})
    # splitext strips only the final extension; str.replace('.pdf', '')
    # would also mangle a '.pdf' occurring earlier in the filename.
    pdfs = [os.path.splitext(f)[0]
            for f in sorted(os.listdir(papers_dir))
            if f.endswith('.pdf')]
    return jsonify({'papers': pdfs})
1078
+
1079
+
1080
  @app.route('/api/health')
1081
  def health():
1082
  """Health check endpoint."""
backend/rag/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Domain-agnostic RAG pipeline for academic paper exploration."""
backend/rag/config.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain-agnostic RAG configuration. One YAML file describes any paper collection."""
2
+
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional
6
+
7
+ import yaml
8
+
9
+
10
@dataclass
class ChunkingConfig:
    """Token-size and splitting parameters for the three chunking layers.

    Coarse chunks cover paper-level overviews, mid chunks follow semantic
    topic boundaries within sections (with overlap), and fine chunks are
    small sentence groups for precise retrieval.
    """

    coarse_max_tokens: int = 800
    mid_min_tokens: int = 200
    mid_max_tokens: int = 800
    mid_overlap_ratio: float = 0.15
    fine_min_tokens: int = 50
    fine_max_tokens: int = 300
    semantic_similarity_threshold: float = 0.35
    strategies: list = field(default_factory=lambda: ["semantic"])
    # Domain keyword vocabulary used for topic tagging of chunks.
    domain_topics: list = field(default_factory=list)
21
+
22
+
23
@dataclass
class RetrievalConfig:
    """How many chunks to pull from each layer and the overall token budget."""

    coarse_top_k: int = 2
    mid_top_k: int = 4
    fine_top_k: int = 4
    # Total tokens of retrieved context injected into the LLM prompt.
    token_budget: int = 3000
    rerank: bool = False
30
+
31
+
32
@dataclass
class RAGConfig:
    """Top-level, domain-agnostic RAG settings for one paper collection.

    Maps a dataset CSV (name/description/link columns), an embedding model,
    and a ChromaDB location to the nested chunking and retrieval settings.
    """

    project_name: str = "default"
    domain_context: str = ""
    csv_path: str = ""
    name_column: str = "Name"
    description_column: str = "Description"
    link_column: str = "Link(s)"
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dimensions: int = 384
    chroma_persist_dir: str = "./chroma_db"
    collection_name: str = "papers"
    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)
    tools_enabled: bool = True
    dataset_columns: list = field(default_factory=list)
48
+
49
+
50
def load_config(path: str) -> RAGConfig:
    """Load RAG configuration from a YAML file.

    Unknown keys are filtered out rather than raising, so configs written by
    newer versions of the tool still load. An empty file (safe_load returns
    None) or an explicit ``chunking: null`` / ``retrieval: null`` yields the
    corresponding all-defaults section instead of crashing.
    """
    with open(path, 'r') as f:
        raw = yaml.safe_load(f) or {}  # None for an empty YAML document

    chunking_raw = raw.pop('chunking', {}) or {}
    retrieval_raw = raw.pop('retrieval', {}) or {}

    config = RAGConfig(**{k: v for k, v in raw.items() if k in RAGConfig.__dataclass_fields__})
    config.chunking = ChunkingConfig(**{k: v for k, v in chunking_raw.items() if k in ChunkingConfig.__dataclass_fields__})
    config.retrieval = RetrievalConfig(**{k: v for k, v in retrieval_raw.items() if k in RetrievalConfig.__dataclass_fields__})

    return config
63
+
64
+
65
def create_default_config(project_name: str, csv_path: str, domain_context: str = "") -> RAGConfig:
    """Generate a sensible starting config for a new project."""
    cfg = RAGConfig()
    cfg.project_name = project_name
    cfg.domain_context = domain_context
    cfg.csv_path = csv_path
    return cfg
72
+
73
+
74
def save_config(config: RAGConfig, path: str):
    """Save configuration to YAML.

    Serializes every dataclass field — including the nested chunking and
    retrieval sections — in declaration order, so the file round-trips
    through load_config. Using dataclasses.asdict instead of a hand-written
    field dict means newly added fields are saved automatically instead of
    being silently dropped.
    """
    from dataclasses import asdict  # local: module top imports only dataclass/field
    data = asdict(config)
    with open(path, 'w') as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False)
backend/rag/ingest/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Paper ingestion: PDF parsing, chunking, embedding, and storage."""
backend/rag/ingest/chunker.py ADDED
@@ -0,0 +1,762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hybrid structural + semantic chunker with domain-aware metadata.
2
+
3
+ Three-layer hierarchy:
4
+ - Coarse: paper-level overview (abstract, section summaries, figure captions)
5
+ - Mid: semantic splitting within sections (topic-boundary detection via
6
+ sentence embeddings), with configurable overlap between chunks
7
+ - Fine: granular sentence groups for precise retrieval
8
+
9
+ Each chunk is enriched with:
10
+ - domain_topics: matched keywords from a configurable domain vocabulary
11
+ - rhetorical_role: heuristic classification (algorithm_description, result, etc.)
12
+ - content_type: theory vs implementation vs evaluation
13
+ - chunk_type: abstract, equation, figure, citation, plain, etc.
14
+ """
15
+
16
+ import re
17
+ import numpy as np
18
+ from abc import ABC, abstractmethod
19
+ from collections import Counter
20
+ from dataclasses import dataclass, field
21
+
22
+ from .pdf_parser import ParsedPaper, ParsedSection
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Chunk dataclass
27
+ # ---------------------------------------------------------------------------
28
+
29
@dataclass
class Chunk:
    """One retrievable unit of paper text plus its retrieval metadata."""

    chunk_id: str
    paper_id: str
    paper_title: str
    text: str
    # Hierarchy layer: "coarse", "mid", or "fine".
    layer: str
    # One of: "abstract", "section_summary", "figure_captions", "equation",
    # "citation_context", "semantic_group", "paragraph".
    chunk_type: str
    section: str
    subsection: str = ""
    page: int = 0
    # Normalized position within the paper, 0.0 (start) to 1.0 (end).
    position: float = 0.0
    token_count: int = 0
    domain_topics: list = field(default_factory=list)
    # Heuristic label: algorithm_description, experimental_setup, result, ...
    rhetorical_role: str = ""
    # Coarse classification: theory, implementation, evaluation.
    content_type: str = ""
    metadata: dict = field(default_factory=dict)
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Utility functions
51
+ # ---------------------------------------------------------------------------
52
+
53
def estimate_tokens(text: str) -> int:
    """Rough token estimate: one token per whitespace-separated word."""
    words = text.split()
    return len(words)
56
+
57
+
58
+ def _normalize_section_name(title: str) -> str:
59
+ """Strip leading numbers from section titles."""
60
+ clean = re.sub(r'^\d+\.?\d*\.?\s*', '', title).strip()
61
+ return clean if clean else title
62
+
63
+
64
+ def _split_paragraphs(text: str) -> list:
65
+ """Split on double newlines."""
66
+ paragraphs = re.split(r'\n\s*\n|\n{2,}', text)
67
+ return [p.strip() for p in paragraphs if p.strip()]
68
+
69
+
70
+ def _split_sentences(text: str) -> list:
71
+ """Split at sentence boundaries (period/question/exclamation followed by uppercase)."""
72
+ sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
73
+ return [s.strip() for s in sentences if s.strip()]
74
+
75
+
76
+ SKIP_SECTIONS = {'references', 'acknowledgments', 'acknowledgements', 'bibliography'}
77
+
78
+
79
+ def _should_skip_section(name: str) -> bool:
80
+ return name.lower().strip() in SKIP_SECTIONS
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Domain topic extraction
85
+ # ---------------------------------------------------------------------------
86
+
87
def extract_domain_topics(text: str, domain_keywords: list) -> list:
    """Match chunk text against a domain keyword vocabulary.

    Case-insensitive matching: multi-word keywords use substring matching,
    single-word keywords use word-boundary matching (so "arm" does not
    match "harmonic").

    Args:
        text: Chunk text to scan.
        domain_keywords: Vocabulary of domain terms (original casing kept
            in the result).

    Returns:
        Deduplicated list of matched keywords sorted by frequency of
        occurrence in the text, most frequent first; ties keep vocabulary
        order. (The previous implementation documented frequency sorting
        but returned vocabulary order.)
    """
    if not domain_keywords:
        return []
    text_lower = text.lower()
    counts = {}  # keyword -> occurrence count; insertion order = vocabulary order
    for kw in domain_keywords:
        if kw in counts:
            continue  # deduplicate repeated vocabulary entries
        kw_lower = kw.lower()
        if ' ' in kw_lower:
            n = text_lower.count(kw_lower)
        else:
            n = len(re.findall(r'\b' + re.escape(kw_lower) + r'\b', text_lower))
        if n > 0:
            counts[kw] = n
    # sorted() is stable, so equal-frequency keywords keep vocabulary order.
    return sorted(counts, key=counts.get, reverse=True)
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Rhetorical role and content type classification (heuristic)
111
+ # ---------------------------------------------------------------------------
112
+
113
# Regex cues for each rhetorical role; a role's score is the number of its
# patterns that match the (lowercased) chunk text.
ROLE_PATTERNS = {
    'problem_statement': [
        r'\b(we address|the problem of|challenge of|goal is to|aim to)\b',
    ],
    'algorithm_description': [
        r'\b(we propose|our method|our approach|architecture|pipeline|network|module)\b',
        r'\b(algorithm \d|step \d|procedure)\b',
    ],
    'experimental_setup': [
        r'\b(we evaluate|experiment|setup|dataset|baseline|benchmark|hardware|robot platform)\b',
        r'\b(training details|hyperparameter|implementation detail|we train)\b',
    ],
    'result': [
        r'\b(table \d|figure \d|fig\.\s*\d|results show|we achieve|accuracy|success rate|outperform)\b',
        r'\b(ablation|comparison|performance|improvement|f1|precision|recall)\b',
    ],
    'comparison': [
        r'\b(compared to|in contrast|unlike|whereas|prior work|related work|existing method)\b',
    ],
    'limitation': [
        r'\b(limitation|failure|drawback|future work|open question|cannot|does not)\b',
    ],
    'definition': [
        r'\b(we define|denoted by|let \w+ be|formally|definition)\b',
    ],
}

# Coarse mapping from rhetorical role to content type, used when the
# section name gives no signal.
CONTENT_TYPE_MAP = {
    'algorithm_description': 'theory',
    'definition': 'theory',
    'problem_statement': 'theory',
    'experimental_setup': 'implementation',
    'result': 'evaluation',
    'comparison': 'evaluation',
    'limitation': 'evaluation',
}


def classify_rhetorical_role(text: str) -> str:
    """Pick the rhetorical role whose patterns match the text most often.

    Ties resolve to the role declared earliest in ROLE_PATTERNS; "general"
    when nothing matches.
    """
    lowered = text.lower()
    scores = {
        role: hits
        for role, patterns in ROLE_PATTERNS.items()
        if (hits := sum(1 for p in patterns if re.search(p, lowered))) > 0
    }
    return max(scores, key=scores.get) if scores else "general"


def classify_content_type(rhetorical_role: str, section_name: str) -> str:
    """Derive content_type, letting the section name override the role mapping."""
    sec = section_name.lower()
    # Section-name rules are checked in priority order.
    section_rules = (
        ('evaluation', ('experiment', 'result', 'evaluation', 'ablation')),
        ('theory', ('method', 'approach', 'model', 'architecture', 'algorithm')),
        ('implementation', ('implement', 'training', 'setup', 'detail')),
    )
    for content_type, keywords in section_rules:
        if any(k in sec for k in keywords):
            return content_type
    # Fall back to the rhetorical-role mapping.
    return CONTENT_TYPE_MAP.get(rhetorical_role, 'general')
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Equation and citation detection
180
+ # ---------------------------------------------------------------------------
181
+
182
# LaTeX environments, inline "$...$" math, or bare "x = <long expression>".
EQUATION_RE = re.compile(
    r'(?:'
    r'\\begin\{(?:equation|align|gather)\}.*?\\end\{(?:equation|align|gather)\}'
    r'|[A-Za-z]\s*=\s*[^,\n]{10,}'
    r'|\$[^$]+\$'
    r')',
    re.DOTALL
)

# Numeric-bracket citations "[12, 13]" or author-year "(Smith et al., 2020)".
CITATION_RE = re.compile(
    r'(?:\[[\d,\s\-]+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\))',
)


def detect_chunk_type(text: str) -> str:
    """Classify a chunk as equation-heavy, citation-heavy, or plain text.

    A chunk counts as "equation" with 3+ equation matches or >2% equation
    density, as "citation_context" with 4+ citation matches or >3% density;
    equation wins when both apply. Empty text is "plain".
    """
    equations = len(EQUATION_RE.findall(text))
    citations = len(CITATION_RE.findall(text))
    tokens = estimate_tokens(text)
    if tokens:
        if equations >= 3 or equations / tokens > 0.02:
            return "equation"
        if citations >= 4 or citations / tokens > 0.03:
            return "citation_context"
    return "plain"
209
+
210
+
211
+ # ---------------------------------------------------------------------------
212
+ # Semantic sentence similarity (for topic-boundary detection)
213
+ # ---------------------------------------------------------------------------
214
+
215
+ def _compute_sentence_similarities(sentences: list, model) -> np.ndarray:
216
+ """Embed sentences and compute consecutive cosine similarities.
217
+
218
+ Returns array of shape (n_sentences - 1,) where element i is the
219
+ cosine similarity between sentence i and sentence i+1.
220
+ """
221
+ if len(sentences) < 2:
222
+ return np.array([])
223
+ embeddings = model.encode(sentences, show_progress_bar=False)
224
+ # Normalize
225
+ norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
226
+ norms[norms == 0] = 1
227
+ embeddings = embeddings / norms
228
+ # Consecutive cosine similarities
229
+ sims = np.array([
230
+ np.dot(embeddings[i], embeddings[i + 1])
231
+ for i in range(len(embeddings) - 1)
232
+ ])
233
+ return sims
234
+
235
+
236
+ def _find_semantic_boundaries(sims: np.ndarray, threshold: float) -> list:
237
+ """Find indices where consecutive similarity drops below threshold.
238
+
239
+ Returns list of split points (indices into the similarity array).
240
+ A split at index i means: cut AFTER sentence i.
241
+ """
242
+ boundaries = []
243
+ for i, sim in enumerate(sims):
244
+ if sim < threshold:
245
+ boundaries.append(i)
246
+ return boundaries
247
+
248
+
249
+ def _group_sentences_by_boundaries(sentences: list, boundaries: list) -> list:
250
+ """Group sentences into segments based on boundary indices."""
251
+ groups = []
252
+ start = 0
253
+ for b in sorted(boundaries):
254
+ cut = b + 1 # cut after sentence b
255
+ if cut > start:
256
+ groups.append(sentences[start:cut])
257
+ start = cut
258
+ if start < len(sentences):
259
+ groups.append(sentences[start:])
260
+ return groups
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # Overlap generation
265
+ # ---------------------------------------------------------------------------
266
+
267
def _apply_overlap(chunks: list, overlap_ratio: float) -> list:
    """Add overlap between consecutive same-section mid-level chunks.

    Takes the last N sentences of chunk i and prepends them to chunk i+1,
    where N is determined by overlap_ratio * chunk_i token count.

    Args:
        chunks: Mid-level chunks in document order.
        overlap_ratio: Fraction of the previous chunk's tokens to repeat at
            the start of the next chunk; <= 0 disables overlap.

    Returns:
        New list. Overlapped entries are fresh Chunk objects (the inputs are
        never mutated), flagged with metadata['has_overlap'] = True.
    """
    if overlap_ratio <= 0 or len(chunks) < 2:
        return chunks

    result = [chunks[0]]
    for i in range(1, len(chunks)):
        prev = chunks[i - 1]
        curr = chunks[i]

        # Only overlap within same section — repeating text across a
        # section boundary would blur the structural cut.
        if prev.section != curr.section:
            result.append(curr)
            continue

        overlap_tokens = int(prev.token_count * overlap_ratio)
        # Skip negligible overlaps (< 10 tokens): noise without benefit.
        if overlap_tokens < 10:
            result.append(curr)
            continue

        # Walk backwards over the previous chunk's sentences, collecting
        # whole sentences until the token budget would be exceeded.
        prev_sentences = _split_sentences(prev.text)
        overlap_sents = []
        acc = 0
        for s in reversed(prev_sentences):
            t = estimate_tokens(s)
            if acc + t > overlap_tokens:
                break
            overlap_sents.insert(0, s)  # restore original sentence order
            acc += t

        if overlap_sents:
            overlap_text = ' '.join(overlap_sents)
            new_text = overlap_text + ' ' + curr.text
            # Rebuild rather than mutate so callers holding the original
            # chunk objects are unaffected; token_count is recomputed.
            result.append(Chunk(
                chunk_id=curr.chunk_id,
                paper_id=curr.paper_id,
                paper_title=curr.paper_title,
                text=new_text,
                layer=curr.layer,
                chunk_type=curr.chunk_type,
                section=curr.section,
                subsection=curr.subsection,
                page=curr.page,
                position=curr.position,
                token_count=estimate_tokens(new_text),
                domain_topics=curr.domain_topics,
                rhetorical_role=curr.rhetorical_role,
                content_type=curr.content_type,
                metadata={**curr.metadata, 'has_overlap': True},
            ))
        else:
            # First sentence alone already exceeds the budget: keep as-is.
            result.append(curr)

    return result
326
+
327
+
328
+ # ---------------------------------------------------------------------------
329
+ # Enrichment: apply domain topics, rhetorical role, content type to all chunks
330
+ # ---------------------------------------------------------------------------
331
+
332
def _enrich_chunk(chunk: Chunk, domain_keywords: list) -> Chunk:
    """Add domain_topics, rhetorical_role, and content_type to a chunk.

    Mutates and returns the same Chunk instance.
    """
    chunk.domain_topics = extract_domain_topics(chunk.text, domain_keywords)
    chunk.rhetorical_role = classify_rhetorical_role(chunk.text)
    chunk.content_type = classify_content_type(chunk.rhetorical_role, chunk.section)

    # For generic body chunks only, an equation- or citation-dense body
    # overrides the chunk_type so the retriever can filter on it. Abstract
    # and figure-caption chunks keep their structural types.
    if chunk.chunk_type in ('plain', 'semantic_group', 'paragraph', 'subsection'):
        detected = detect_chunk_type(chunk.text)
        if detected != 'plain':
            chunk.chunk_type = detected

    return chunk
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # Chunking strategies
349
+ # ---------------------------------------------------------------------------
350
+
351
class ChunkingStrategy(ABC):
    """Interface for one chunking layer: turn a parsed paper into Chunk objects."""

    @abstractmethod
    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        # config: chunking configuration with token limits / thresholds.
        # model: optional sentence-transformer used by semantic strategies.
        pass
355
+
356
+
357
class CoarseChunker(ChunkingStrategy):
    """Layer 1: Paper-level overview chunks."""

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Build coarse chunks: title+abstract, per-section summaries, figure captions.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``coarse_max_tokens``.
            model: Unused at this layer (kept for strategy-interface parity).
        """
        chunks = []
        max_tokens = config.coarse_max_tokens

        # 1. Title + Abstract
        if paper.abstract:
            text = f"{paper.title}\n\n{paper.abstract}"
            # Hard-truncate by word count when over budget.
            if estimate_tokens(text) > max_tokens:
                text = ' '.join(text.split()[:max_tokens])
            chunks.append(Chunk(
                chunk_id=f"{paper.paper_id}_coarse_abstract",
                paper_id=paper.paper_id,
                paper_title=paper.title,
                text=text,
                layer="coarse",
                chunk_type="abstract",
                section="Abstract",
                page=0,
                position=0.0,
                token_count=estimate_tokens(text),
            ))

        # 2. Section summaries
        total_sections = len(paper.sections) or 1  # avoid division by zero
        for i, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            tokens = estimate_tokens(section.text)
            if tokens <= max_tokens:
                summary_text = section.text
            else:
                # Cheap extractive summary: first + last paragraph, which
                # typically carry the section's setup and wrap-up.
                paragraphs = _split_paragraphs(section.text)
                if len(paragraphs) >= 2:
                    summary_text = paragraphs[0] + "\n\n" + paragraphs[-1]
                else:
                    summary_text = ' '.join(section.text.split()[:max_tokens])

            chunks.append(Chunk(
                chunk_id=f"{paper.paper_id}_coarse_sec_{i}",
                paper_id=paper.paper_id,
                paper_title=paper.title,
                text=summary_text,
                layer="coarse",
                chunk_type="section_summary",
                section=section_name,
                page=section.page_start,
                position=round(i / total_sections, 2),
                token_count=estimate_tokens(summary_text),
            ))

        # 3. Figure/table captions — all captions merged into one chunk.
        if paper.figures:
            captions = '\n'.join(f.caption for f in paper.figures)
            chunks.append(Chunk(
                chunk_id=f"{paper.paper_id}_coarse_figures",
                paper_id=paper.paper_id,
                paper_title=paper.title,
                text=captions,
                layer="coarse",
                chunk_type="figure_captions",
                section="Figures",
                position=0.5,  # nominal mid-paper position
                token_count=estimate_tokens(captions),
            ))

        return chunks
428
+
429
+
430
class SemanticChunker(ChunkingStrategy):
    """Layer 2: Structural boundaries as hard cuts, semantic similarity for
    soft topic-boundary detection within sections.

    Within each section:
    1. Split into sentences
    2. Embed every sentence with the sentence-transformer
    3. Compute consecutive cosine similarities
    4. Cut where similarity drops below threshold (topic shift)
    5. Group sentences between cuts into chunks
    6. Apply min/max token constraints (merge small groups, split large ones)
    7. Add overlap between consecutive chunks
    """

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Build mid-level "semantic_group" chunks for every content section.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``mid_min_tokens``, ``mid_max_tokens`` and
                ``semantic_similarity_threshold``.
            model: Optional sentence-transformer; when None (or a section
                has fewer than 3 sentences), falls back to paragraph-based
                grouping with no embeddings.
        """
        chunks = []
        min_tokens = config.mid_min_tokens
        max_tokens = config.mid_max_tokens
        threshold = config.semantic_similarity_threshold
        total_sections = len(paper.sections) or 1  # avoid division by zero

        for sec_idx, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            sentences = _split_sentences(section.text)
            if not sentences:
                continue

            # --- Semantic boundary detection ---
            if model is not None and len(sentences) >= 3:
                sims = _compute_sentence_similarities(sentences, model)
                boundaries = _find_semantic_boundaries(sims, threshold)
                groups = _group_sentences_by_boundaries(sentences, boundaries)
            else:
                # Fallback: paragraph-based grouping (no embeddings needed)
                paragraphs = _split_paragraphs(section.text)
                groups = [_split_sentences(p) for p in paragraphs if p.strip()]
                if not groups:
                    groups = [sentences]

            # --- Enforce min/max token constraints ---
            # Greedily pack sentence groups into a buffer up to max_tokens;
            # undersized buffers absorb the next group or merge backwards.
            merged_groups = []
            buffer = []
            buffer_tokens = 0

            for group in groups:
                group_text = ' '.join(group)
                group_tokens = estimate_tokens(group_text)

                if buffer_tokens + group_tokens <= max_tokens:
                    # Group fits: keep accumulating.
                    buffer.extend(group)
                    buffer_tokens += group_tokens
                else:
                    if buffer and buffer_tokens >= min_tokens:
                        # Buffer is big enough to stand alone; emit it.
                        merged_groups.append(buffer)
                    elif buffer:
                        # Buffer too small, absorb this group into it.
                        # NOTE(review): if the combined size were still below
                        # min_tokens these sentences would be dropped; with a
                        # sane config (max_tokens >= min_tokens) the combined
                        # size always exceeds max_tokens >= min_tokens.
                        buffer.extend(group)
                        buffer_tokens += group_tokens
                        if buffer_tokens >= min_tokens:
                            merged_groups.append(buffer)
                        buffer = []
                        buffer_tokens = 0
                        continue

                    # Start new buffer
                    if group_tokens > max_tokens:
                        # Split oversized group at sentence granularity,
                        # emitting full sub-buffers as we go.
                        sub_buffer = []
                        sub_tokens = 0
                        for s in group:
                            st = estimate_tokens(s)
                            if sub_tokens + st > max_tokens and sub_buffer:
                                merged_groups.append(sub_buffer)
                                sub_buffer = []
                                sub_tokens = 0
                            sub_buffer.append(s)
                            sub_tokens += st
                        # The trailing sub-buffer seeds the next iteration.
                        buffer = sub_buffer
                        buffer_tokens = sub_tokens
                    else:
                        buffer = list(group)
                        buffer_tokens = group_tokens

            # Flush remaining buffer: emit it if big enough (or it is the
            # only content), otherwise merge it into the previous group.
            if buffer:
                if buffer_tokens >= min_tokens or not merged_groups:
                    merged_groups.append(buffer)
                elif merged_groups:
                    merged_groups[-1].extend(buffer)

            # --- Create chunk objects ---
            for chunk_idx, group in enumerate(merged_groups):
                text = ' '.join(group)
                chunks.append(Chunk(
                    chunk_id=f"{paper.paper_id}_mid_{sec_idx}_{chunk_idx}",
                    paper_id=paper.paper_id,
                    paper_title=paper.title,
                    text=text,
                    layer="mid",
                    chunk_type="semantic_group",
                    section=section_name,
                    subsection=section.title,
                    page=section.page_start,
                    position=round(sec_idx / total_sections, 2),
                    token_count=estimate_tokens(text),
                ))

        return chunks
541
+
542
+
543
class StructuralChunker(ChunkingStrategy):
    """Layer 2 fallback: paragraph-based grouping within sections (no embeddings needed)."""

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Pack whole paragraphs into mid-level "subsection" chunks.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``mid_min_tokens`` and ``mid_max_tokens``.
            model: Unused (kept for strategy-interface parity).
        """
        chunks = []
        min_tokens = config.mid_min_tokens
        max_tokens = config.mid_max_tokens
        total_sections = len(paper.sections) or 1  # avoid division by zero

        for sec_idx, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            paragraphs = _split_paragraphs(section.text)
            if not paragraphs:
                continue

            current_text = []
            current_tokens = 0
            chunk_idx = 0

            for para in paragraphs:
                para_tokens = estimate_tokens(para)

                # Emit the accumulated paragraphs before this one would
                # push the chunk over budget.
                if current_tokens + para_tokens > max_tokens and current_text:
                    text = '\n\n'.join(current_text)
                    chunks.append(Chunk(
                        chunk_id=f"{paper.paper_id}_mid_{sec_idx}_{chunk_idx}",
                        paper_id=paper.paper_id,
                        paper_title=paper.title,
                        text=text,
                        layer="mid",
                        chunk_type="subsection",
                        section=section_name,
                        subsection=section.title,
                        page=section.page_start,
                        position=round(sec_idx / total_sections, 2),
                        token_count=estimate_tokens(text),
                    ))
                    chunk_idx += 1
                    current_text = []
                    current_tokens = 0

                current_text.append(para)
                current_tokens += para_tokens

            # Flush the section's trailing paragraphs.
            if current_text:
                text = '\n\n'.join(current_text)
                if estimate_tokens(text) >= min_tokens or chunk_idx == 0:
                    # Big enough, or the section's only chunk: emit as-is.
                    chunks.append(Chunk(
                        chunk_id=f"{paper.paper_id}_mid_{sec_idx}_{chunk_idx}",
                        paper_id=paper.paper_id,
                        paper_title=paper.title,
                        text=text,
                        layer="mid",
                        chunk_type="subsection",
                        section=section_name,
                        subsection=section.title,
                        page=section.page_start,
                        position=round(sec_idx / total_sections, 2),
                        token_count=estimate_tokens(text),
                    ))
                elif chunks:
                    # Undersized tail: merge into the previous chunk, which
                    # belongs to this same section (chunk_idx > 0 here).
                    prev = chunks[-1]
                    merged = prev.text + '\n\n' + text
                    chunks[-1] = Chunk(
                        chunk_id=prev.chunk_id,
                        paper_id=prev.paper_id,
                        paper_title=prev.paper_title,
                        text=merged,
                        layer=prev.layer,
                        chunk_type=prev.chunk_type,
                        section=prev.section,
                        subsection=prev.subsection,
                        page=prev.page,
                        position=prev.position,
                        token_count=estimate_tokens(merged),
                    )

        return chunks
624
+
625
+
626
class FineChunker(ChunkingStrategy):
    """Layer 3: Sentence-level fine chunks for precise retrieval."""

    def chunk(self, paper: ParsedPaper, config, model=None) -> list:
        """Emit one fine chunk per paragraph, splitting oversized paragraphs
        into sentence groups.

        Args:
            paper: Parsed paper structure.
            config: Must provide ``fine_min_tokens`` and ``fine_max_tokens``.
            model: Unused (kept for strategy-interface parity).

        Paragraphs below ``fine_min_tokens`` are dropped entirely at this
        layer (they are still covered by the coarse/mid layers).
        """
        chunks = []
        max_tokens = config.fine_max_tokens
        min_tokens = config.fine_min_tokens
        total_sections = len(paper.sections) or 1  # avoid division by zero

        for sec_idx, section in enumerate(paper.sections):
            section_name = _normalize_section_name(section.title)
            if _should_skip_section(section_name):
                continue

            paragraphs = _split_paragraphs(section.text)
            chunk_idx = 0

            for para in paragraphs:
                tokens = estimate_tokens(para)
                if tokens < min_tokens:
                    continue  # too short to be a useful fine chunk

                if tokens > max_tokens:
                    # Oversized paragraph: greedily pack sentences up to the
                    # token budget, emitting a chunk each time it fills.
                    sentences = _split_sentences(para)
                    current = []
                    current_tokens = 0
                    for sent in sentences:
                        st = estimate_tokens(sent)
                        if current_tokens + st > max_tokens and current:
                            text = ' '.join(current)
                            chunks.append(Chunk(
                                chunk_id=f"{paper.paper_id}_fine_{sec_idx}_{chunk_idx}",
                                paper_id=paper.paper_id,
                                paper_title=paper.title,
                                text=text,
                                layer="fine",
                                chunk_type="paragraph",
                                section=section_name,
                                subsection=section.title,
                                page=section.page_start,
                                position=round(sec_idx / total_sections, 2),
                                token_count=estimate_tokens(text),
                            ))
                            chunk_idx += 1
                            current = []
                            current_tokens = 0
                        current.append(sent)
                        current_tokens += st
                    # Flush the trailing sentence group if big enough.
                    if current:
                        text = ' '.join(current)
                        if estimate_tokens(text) >= min_tokens:
                            chunks.append(Chunk(
                                chunk_id=f"{paper.paper_id}_fine_{sec_idx}_{chunk_idx}",
                                paper_id=paper.paper_id,
                                paper_title=paper.title,
                                text=text,
                                layer="fine",
                                chunk_type="paragraph",
                                section=section_name,
                                subsection=section.title,
                                page=section.page_start,
                                position=round(sec_idx / total_sections, 2),
                                token_count=estimate_tokens(text),
                            ))
                            chunk_idx += 1
                else:
                    # Paragraph fits within budget: one chunk as-is.
                    chunks.append(Chunk(
                        chunk_id=f"{paper.paper_id}_fine_{sec_idx}_{chunk_idx}",
                        paper_id=paper.paper_id,
                        paper_title=paper.title,
                        text=para,
                        layer="fine",
                        chunk_type="paragraph",
                        section=section_name,
                        subsection=section.title,
                        page=section.page_start,
                        position=round(sec_idx / total_sections, 2),
                        token_count=tokens,
                    ))
                    chunk_idx += 1

        return chunks
708
+
709
+
710
+ # ---------------------------------------------------------------------------
711
+ # Strategy registry
712
+ # ---------------------------------------------------------------------------
713
+
714
# Registry mapping strategy names (as used in config.strategies) to classes.
STRATEGIES = {
    "coarse": CoarseChunker,
    "semantic": SemanticChunker,
    "structural": StructuralChunker,
    "fine": FineChunker,
}
720
+
721
+
722
+ # ---------------------------------------------------------------------------
723
+ # Main entry point
724
+ # ---------------------------------------------------------------------------
725
+
726
def chunk_paper(paper: ParsedPaper, config, model=None) -> list:
    """Chunk a parsed paper using the configured strategy with full metadata enrichment.

    Args:
        paper: ParsedPaper from the PDF parser.
        config: ChunkingConfig with token limits, thresholds, domain keywords.
        model: Optional SentenceTransformer for semantic chunking. If None and
            strategy is 'semantic', falls back to structural chunking.

    Returns:
        List of Chunk objects across all three layers, enriched with domain
        topics, rhetorical roles, and content types.
    """
    all_chunks = []
    # domain_topics may be absent from older configs; default to no vocabulary.
    domain_keywords = getattr(config, 'domain_topics', [])

    # Layer 1: Coarse (always)
    all_chunks.extend(CoarseChunker().chunk(paper, config, model))

    # Layer 2: Mid-level (semantic or structural based on config).
    # Only the FIRST configured strategy selects the mid layer.
    strategy_name = config.strategies[0] if config.strategies else "semantic"
    if strategy_name == "semantic":
        # SemanticChunker itself falls back to paragraph grouping per
        # section when model is None, so this is safe without an embedder.
        mid_chunks = SemanticChunker().chunk(paper, config, model)
    else:
        mid_chunks = StructuralChunker().chunk(paper, config, model)

    # Apply overlap between consecutive same-section mid-level chunks.
    mid_chunks = _apply_overlap(mid_chunks, config.mid_overlap_ratio)
    all_chunks.extend(mid_chunks)

    # Layer 3: Fine
    all_chunks.extend(FineChunker().chunk(paper, config, model))

    # Enrich all chunks with domain topics, rhetorical role, content type.
    # Runs after overlap so the enrichment sees the final chunk text.
    all_chunks = [_enrich_chunk(c, domain_keywords) for c in all_chunks]

    return all_chunks
backend/rag/ingest/embedder.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Embedding wrapper for chunk and query encoding.
2
+
3
+ Pluggable model via config. Prepends section context to chunk text
4
+ before embedding to steer vectors toward the right semantic neighborhood.
5
+ """
6
+
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ from .chunker import Chunk
11
+
12
+
13
class ChunkEmbedder:
    """Thin wrapper around a SentenceTransformer for chunk and query embedding."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", model_instance=None):
        """Initialize embedder.

        Args:
            model_name: HuggingFace model name for sentence-transformers.
            model_instance: Optional pre-loaded SentenceTransformer to reuse
                (avoids loading the model twice at runtime).
        """
        if model_instance is not None:
            self.model = model_instance
        else:
            self.model = SentenceTransformer(model_name)
        # NOTE(review): when model_instance is supplied, model_name may not
        # describe the actual model — it is recorded as given.
        self.model_name = model_name

    def _prepare_text(self, chunk: Chunk) -> str:
        """Prepend "[Section: Subsection] " context to steer the embedding.

        The subsection is included only when it differs from the section.
        """
        prefix = f"[{chunk.section}"
        if chunk.subsection and chunk.subsection != chunk.section:
            prefix += f": {chunk.subsection}"
        prefix += "] "
        return prefix + chunk.text

    def embed_chunks(self, chunks: list, batch_size: int = 32) -> np.ndarray:
        """Embed a list of chunks. Returns array of shape (n_chunks, dim);
        empty array for an empty input."""
        if not chunks:
            return np.array([])
        texts = [self._prepare_text(c) for c in chunks]
        embeddings = self.model.encode(texts, batch_size=batch_size, show_progress_bar=True)
        return np.array(embeddings)

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a single query string (no section prefix is added)."""
        return self.model.encode(query)
backend/rag/ingest/pdf_parser.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF parsing with pdfplumber + heuristic section detection.
2
+
3
+ Domain-agnostic: uses font-size changes and numbering patterns to detect
4
+ section boundaries. Falls back to known academic header keywords when available.
5
+ """
6
+
7
+ import re
8
+ from dataclasses import dataclass, field
9
+
10
+ import pdfplumber
11
+
12
+
13
# Canonical lowercase section titles common in academic papers; matched
# after leading numbers are stripped from a candidate header line.
KNOWN_HEADERS = {
    'abstract', 'introduction', 'related work', 'background',
    'method', 'methods', 'methodology', 'approach',
    'experiment', 'experiments', 'results', 'evaluation',
    'discussion', 'conclusion', 'conclusions',
    'acknowledgments', 'acknowledgements', 'references', 'appendix',
}

# "3. Method" / "IV. Results" style numbered headers.
NUMBERED_HEADER_RE = re.compile(r'^(\d+\.?\s+|[IVXLC]+\.?\s+)[A-Z]')
# "A. Setup" style lettered sub-headers.
LETTERED_HEADER_RE = re.compile(r'^[A-Z]\.\s+[A-Z]')
23
+
24
+
25
@dataclass
class ParsedSection:
    """One detected section of a paper."""
    title: str  # Header line as it appeared in the PDF
    level: int  # 0=paper, 1=section, 2=subsection
    text: str   # Body text (header line excluded)
    page_start: int
    page_end: int
32
+
33
+
34
@dataclass
class ParsedFigure:
    """A figure/table/algorithm caption extracted from a page."""
    caption: str
    page: int
    nearby_text: str = ""  # Reserved for surrounding context; not populated by _extract_figures
39
+
40
+
41
@dataclass
class ParsedPaper:
    """Structured result of parsing one PDF."""
    paper_id: str
    title: str
    abstract: str
    sections: list = field(default_factory=list)  # list of ParsedSection
    figures: list = field(default_factory=list)   # list of ParsedFigure
    raw_text: str = ""                            # Full concatenated page text
49
+
50
+
51
+ def _estimate_median_font_size(page):
52
+ """Get median font size from a page's character data."""
53
+ chars = page.chars
54
+ if not chars:
55
+ return 10.0
56
+ sizes = [c.get('size', 10.0) for c in chars]
57
+ sizes.sort()
58
+ return sizes[len(sizes) // 2]
59
+
60
+
61
+ def _line_font_size(page, line_text, line_top):
62
+ """Estimate font size for a specific line by matching characters near its y-position."""
63
+ chars = page.chars
64
+ if not chars or not line_text.strip():
65
+ return None
66
+ line_chars = [c for c in chars if abs(c.get('top', 0) - line_top) < 3]
67
+ if not line_chars:
68
+ return None
69
+ sizes = [c.get('size', 10.0) for c in line_chars]
70
+ return sum(sizes) / len(sizes)
71
+
72
+
73
def _is_header_line(line: str, font_size: float, median_size: float) -> tuple:
    """Determine if a line is a section header.

    Combines three signals: known academic header keywords, numbering
    patterns ("3. Method", "IV. Results", "A. Setup"), and a font-size
    boost relative to the page median.

    Args:
        line: Candidate line text.
        font_size: Estimated font size for this line; may be None.
        median_size: Median font size of the page.

    Returns:
        (is_header, level): level is 2 for "3.1"-style subsections, 1 for
        sections, 0 when the line is not a header.
    """
    stripped = line.strip()
    # Headers are short; anything over 100 chars is body text.
    if not stripped or len(stripped) > 100:
        return False, 0

    # Check font size (headers are typically larger). font_size may be
    # None, in which case size_boost is falsy.
    size_boost = font_size and median_size and font_size > median_size * 1.1

    # Check known academic headers
    lower = stripped.lower().rstrip(':').strip()
    # Remove leading numbers for matching
    clean = re.sub(r'^\d+\.?\s*', '', lower).strip()
    is_known = clean in KNOWN_HEADERS

    # Check numbering pattern (e.g., "3. Method", "IV. Results")
    has_number = bool(NUMBERED_HEADER_RE.match(stripped)) or bool(LETTERED_HEADER_RE.match(stripped))

    # Subsection pattern (e.g., "3.1 Dynamics Model")
    is_subsection = bool(re.match(r'^\d+\.\d+\.?\s+', stripped))

    # A known keyword alone suffices; a numbering pattern needs the font
    # boost to back it up (numbers also start ordinary sentences).
    if is_known or (has_number and size_boost):
        level = 2 if is_subsection else 1
        return True, level
    # Last resort: a short, larger-font line that does not end a sentence.
    if size_boost and len(stripped) < 60 and not stripped.endswith('.'):
        level = 2 if is_subsection else 1
        return True, level

    return False, 0
102
+
103
+
104
def _extract_figures(page, page_num: int) -> list:
    """Extract figure/table captions from a page.

    Captions start with "Figure N", "Fig. N", "Table N" or "Algorithm N"
    (case-insensitive) and are captured up to a blank line, the start of
    the next paragraph, or end of text. Captions of 20 characters or fewer
    are discarded as likely false positives (e.g. in-text references).
    """
    text = page.extract_text() or ""
    figures = []
    caption_re = re.compile(
        r'((?:Figure|Fig\.|Table|Algorithm)\s*\d+[.:]\s*.+?)(?:\n\n|\n(?=[A-Z0-9])|\Z)',
        re.IGNORECASE | re.DOTALL
    )
    for match in caption_re.finditer(text):
        caption = match.group(1).strip()
        if len(caption) > 20:
            figures.append(ParsedFigure(caption=caption, page=page_num))
    return figures
117
+
118
+
119
def parse_pdf(pdf_path: str, paper_id: str = None) -> ParsedPaper:
    """Extract structured text from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        paper_id: Identifier for the paper. Defaults to filename stem.

    Returns:
        ParsedPaper with title, abstract, sections, figures, and raw text.
    """
    import os
    if paper_id is None:
        paper_id = os.path.splitext(os.path.basename(pdf_path))[0]

    all_text_lines = []
    page_lines = []  # (line_text, page_num, line_top, font_size, median_size)
    figures = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            median_size = _estimate_median_font_size(page)

            lines = text.split('\n')
            # extract_text drops per-line y-coordinates, so track an
            # approximate running y-position to match lines back to
            # character font sizes.
            current_top = 0
            for line in lines:
                fs = _line_font_size(page, line, current_top)
                page_lines.append((line, page_num, current_top, fs, median_size))
                current_top += 12  # approximate line height in points

            all_text_lines.extend(lines)
            figures.extend(_extract_figures(page, page_num))

    raw_text = '\n'.join(all_text_lines)

    # Title: first non-empty line of the document.
    title = ""
    for line, pn, top, fs, ms in page_lines:
        if line.strip():
            title = line.strip()
            break

    # Abstract: everything between the "Abstract" marker and the next header.
    in_abstract = False
    abstract_lines = []
    for line, pn, top, fs, ms in page_lines:
        stripped = line.strip()
        lower = stripped.lower()
        if lower.startswith('abstract') and not in_abstract:
            in_abstract = True
            # Remove the "Abstract" prefix itself; keep any trailing text.
            remainder = re.sub(r'^abstract[:\s\-]*', '', stripped, flags=re.IGNORECASE).strip()
            if remainder:
                abstract_lines.append(remainder)
            continue
        if in_abstract:
            is_hdr, _ = _is_header_line(stripped, fs, ms)
            if is_hdr and abstract_lines:
                break
            if stripped:
                abstract_lines.append(stripped)
    abstract = ' '.join(abstract_lines)

    # Build sections by scanning for header lines.
    sections = []
    current_section = None
    current_level = 1  # header level of the section currently accumulating
    current_lines = []
    current_page_start = 0

    for line, pn, top, fs, ms in page_lines:
        stripped = line.strip()
        is_hdr, level = _is_header_line(stripped, fs, ms)

        if is_hdr and stripped.lower().rstrip(':').strip() != 'abstract':
            # Close the previous section. Fix: record it with the level
            # captured when ITS header was seen (current_level), not the
            # level of the header that terminates it.
            if current_section is not None:
                section_text = '\n'.join(current_lines).strip()
                if section_text:
                    sections.append(ParsedSection(
                        title=current_section,
                        level=current_level,
                        text=section_text,
                        page_start=current_page_start,
                        page_end=pn,
                    ))
            current_section = stripped
            current_level = level
            current_lines = []
            current_page_start = pn
        elif current_section is not None:
            current_lines.append(stripped)

    # Close the final section.
    if current_section and current_lines:
        section_text = '\n'.join(current_lines).strip()
        if section_text:
            sections.append(ParsedSection(
                title=current_section,
                level=current_level,
                text=section_text,
                page_start=current_page_start,
                page_end=page_lines[-1][1] if page_lines else 0,
            ))

    # Fallback: no headers detected -> one section spanning the whole text.
    if not sections and raw_text.strip():
        sections.append(ParsedSection(
            title="Full Text",
            level=1,
            text=raw_text,
            page_start=0,
            page_end=0,
        ))

    return ParsedPaper(
        paper_id=paper_id,
        title=title,
        abstract=abstract,
        sections=sections,
        figures=figures,
        raw_text=raw_text,
    )
backend/rag/ingest/pipeline.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ingestion pipeline CLI: parse PDFs, chunk, embed, and store in ChromaDB.
2
+
3
+ Usage:
4
+ python -m rag.ingest.pipeline --papers-dir ./papers/ --config rag_config.yaml
5
+
6
+ Scans the papers directory for PDFs, matches them to dataset rows by filename,
7
+ and runs the full parse -> chunk -> embed -> store pipeline.
8
+ """
9
+
10
+ import argparse
11
+ import os
12
+ import sys
13
+ import time
14
+
15
+ from ..config import load_config, RAGConfig
16
+ from .pdf_parser import parse_pdf
17
+ from .chunker import chunk_paper
18
+ from .embedder import ChunkEmbedder
19
+ from .store import get_client, create_or_get_collection, upsert_chunks, delete_paper, get_collection_stats
20
+
21
+
22
def find_pdfs(papers_dir: str) -> list:
    """Return sorted full paths of every PDF (case-insensitive) in papers_dir."""
    entries = sorted(os.listdir(papers_dir))
    return [os.path.join(papers_dir, entry)
            for entry in entries
            if entry.lower().endswith('.pdf')]
29
+
30
+
31
def paper_id_from_path(pdf_path: str) -> str:
    """Derive a slug-style paper_id from the PDF filename (extension dropped)."""
    stem, _ = os.path.splitext(os.path.basename(pdf_path))
    # Slugify: lowercase, spaces and underscores become hyphens
    slug = stem.lower().strip()
    for separator in (' ', '_'):
        slug = slug.replace(separator, '-')
    return slug
38
+
39
+
40
def ingest_single(pdf_path: str, config: RAGConfig, embedder: ChunkEmbedder, collection) -> dict:
    """Ingest one PDF end-to-end: parse -> chunk -> embed -> store.

    Args:
        pdf_path: Path to the PDF file.
        config: RAG configuration (supplies chunking parameters).
        embedder: Shared ChunkEmbedder; its underlying model is also reused
            for semantic chunking.
        collection: Target ChromaDB collection.

    Returns:
        Stats dict: paper_id, status ("success" or "empty"), n_chunks, and on
        success also n_sections plus per-layer chunk counts.
    """
    paper_id = paper_id_from_path(pdf_path)
    print(f"\n Parsing: {os.path.basename(pdf_path)} (id={paper_id})")

    # Parse
    paper = parse_pdf(pdf_path, paper_id=paper_id)
    print(f" Title: {paper.title[:80]}")
    print(f" Sections: {len(paper.sections)}, Figures: {len(paper.figures)}")
    print(f" Abstract: {len(paper.abstract)} chars")

    # Chunk (pass embedder's model for semantic chunking)
    chunks = chunk_paper(paper, config.chunking, model=embedder.model)
    coarse = sum(1 for c in chunks if c.layer == "coarse")
    mid = sum(1 for c in chunks if c.layer == "mid")
    fine = sum(1 for c in chunks if c.layer == "fine")
    # Count enrichment stats (topics / rhetorical roles attached by the chunker)
    with_topics = sum(1 for c in chunks if c.domain_topics)
    roles = set(c.rhetorical_role for c in chunks if c.rhetorical_role)
    print(f" Chunks: {len(chunks)} total (coarse={coarse}, mid={mid}, fine={fine})")
    print(f" Enrichment: {with_topics} chunks with domain topics, roles: {roles}")

    if not chunks:
        print(f" WARNING: No chunks produced, skipping")
        return {"paper_id": paper_id, "status": "empty", "n_chunks": 0}

    # Delete existing chunks for this paper (idempotent re-ingestion)
    delete_paper(collection, paper_id)

    # Embed
    embeddings = embedder.embed_chunks(chunks)
    print(f" Embeddings: {embeddings.shape}")

    # Store
    upsert_chunks(collection, chunks, embeddings)
    print(f" Stored in ChromaDB")

    return {
        "paper_id": paper_id,
        "status": "success",
        "n_chunks": len(chunks),
        "n_sections": len(paper.sections),
        "layers": {"coarse": coarse, "mid": mid, "fine": fine},
    }
84
+
85
+
86
def run_ingestion(papers_dir: str, config: RAGConfig) -> dict:
    """Run the full ingestion pipeline over every PDF in a directory.

    Papers that raise are reported in `errors` and excluded from `results`;
    the loop continues with the remaining PDFs.

    Args:
        papers_dir: Directory containing PDF files.
        config: RAG configuration.

    Returns:
        Summary dict with stats: n_papers, n_chunks, elapsed_seconds,
        errors, results.
    """
    pdfs = find_pdfs(papers_dir)
    if not pdfs:
        print(f"No PDF files found in {papers_dir}")
        # NOTE(review): this early return omits the "elapsed_seconds" and
        # "results" keys present in the normal return — confirm callers
        # tolerate the narrower dict.
        return {"n_papers": 0, "n_chunks": 0, "errors": []}

    print(f"Found {len(pdfs)} PDFs in {papers_dir}")
    print(f"Embedding model: {config.embedding_model}")
    print(f"ChromaDB path: {config.chroma_persist_dir}")

    # Initialize shared resources once — loading the embedding model is the
    # expensive step, so it must not happen per PDF.
    embedder = ChunkEmbedder(model_name=config.embedding_model)
    client = get_client(config)
    collection = create_or_get_collection(config, client)

    results = []
    errors = []
    start = time.time()

    for pdf_path in pdfs:
        try:
            result = ingest_single(pdf_path, config, embedder, collection)
            results.append(result)
        except Exception as e:
            # Best-effort batch: one bad PDF must not abort the rest.
            error_msg = f"{os.path.basename(pdf_path)}: {str(e)}"
            print(f" ERROR: {error_msg}")
            errors.append(error_msg)

    elapsed = time.time() - start
    total_chunks = sum(r.get("n_chunks", 0) for r in results)

    stats = get_collection_stats(collection)

    print(f"\n{'='*60}")
    print(f"Ingestion complete in {elapsed:.1f}s")
    print(f" Papers processed: {len(results)}")
    print(f" Total chunks: {total_chunks}")
    print(f" Errors: {len(errors)}")
    print(f" Collection total: {stats['total_chunks']} chunks")

    return {
        "n_papers": len(results),
        "n_chunks": total_chunks,
        "elapsed_seconds": round(elapsed, 1),
        "errors": errors,
        "results": results,
    }
142
+
143
+
144
def main():
    """CLI entry point: validate arguments, then run the ingestion pipeline.

    Exits non-zero on bad arguments or if any paper failed to ingest.
    """
    arg_parser = argparse.ArgumentParser(description="Ingest academic papers into ChromaDB")
    arg_parser.add_argument("--papers-dir", required=True, help="Directory containing PDF files")
    arg_parser.add_argument("--config", required=True, help="Path to rag_config.yaml")
    cli = arg_parser.parse_args()

    # Fail fast on bad paths before loading any models.
    if not os.path.isdir(cli.papers_dir):
        print(f"Error: {cli.papers_dir} is not a directory")
        sys.exit(1)
    if not os.path.isfile(cli.config):
        print(f"Error: {cli.config} not found")
        sys.exit(1)

    summary = run_ingestion(cli.papers_dir, load_config(cli.config))

    if summary["errors"]:
        print(f"\nErrors encountered:")
        for err in summary["errors"]:
            print(f" - {err}")
        sys.exit(1)


if __name__ == "__main__":
    main()
backend/rag/ingest/store.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ChromaDB storage operations for paper chunks.
2
+
3
+ Handles collection creation, chunk upserting, and deletion.
4
+ Uses PersistentClient so the index survives restarts.
5
+ """
6
+
7
+ import numpy as np
8
+ import chromadb
9
+
10
+ from .chunker import Chunk
11
+ from ..config import RAGConfig
12
+
13
+
14
def get_client(config: RAGConfig) -> chromadb.ClientAPI:
    """Create a persistent ChromaDB client rooted at config.chroma_persist_dir,
    so the index survives process restarts."""
    return chromadb.PersistentClient(path=config.chroma_persist_dir)
17
+
18
+
19
def create_or_get_collection(config: RAGConfig, client: chromadb.ClientAPI = None):
    """Return the paper-chunks collection, creating it if absent.

    The collection is configured for cosine distance; a fresh persistent
    client is created when none is supplied.
    """
    active_client = client if client is not None else get_client(config)
    return active_client.get_or_create_collection(
        name=config.collection_name,
        metadata={"hnsw:space": "cosine"}
    )
27
+
28
+
29
def upsert_chunks(collection, chunks: list, embeddings: np.ndarray):
    """Batch upsert chunks with embeddings and metadata into ChromaDB.

    Args:
        collection: ChromaDB collection.
        chunks: List of Chunk objects.
        embeddings: numpy array of shape (n_chunks, dim), row-aligned with chunks.
    """
    if not chunks:
        return

    def _metadata(c):
        # Chroma metadata values must be scalars: list fields are joined
        # with ", " and optional fields fall back to "".
        return {
            "paper_id": c.paper_id,
            "paper_title": c.paper_title,
            "layer": c.layer,
            "chunk_type": c.chunk_type,
            "section": c.section,
            "subsection": c.subsection or "",
            "page": c.page,
            "position": c.position,
            "token_count": c.token_count,
            "domain_topics": ", ".join(c.domain_topics) if c.domain_topics else "",
            "rhetorical_role": c.rhetorical_role or "",
            "content_type": c.content_type or "",
        }

    # ChromaDB has a batch limit; send at most 500 records per call.
    step = 500
    for start in range(0, len(chunks), step):
        group = chunks[start:start + step]
        vectors = embeddings[start:start + step]
        collection.upsert(
            ids=[c.chunk_id for c in group],
            embeddings=[v.tolist() for v in vectors],
            documents=[c.text for c in group],
            metadatas=[_metadata(c) for c in group],
        )
65
+
66
+
67
def delete_paper(collection, paper_id: str):
    """Remove every chunk whose metadata paper_id matches.

    Called before re-ingesting a paper so ingestion stays idempotent.
    """
    collection.delete(where={"paper_id": paper_id})
70
+
71
+
72
def get_collection_stats(collection) -> dict:
    """Summarize the collection: total chunk count plus a few sample ids."""
    total = collection.count()
    if total > 0:
        sample_ids = collection.peek(limit=5).get("ids", [])
    else:
        sample_ids = []
    return {"total_chunks": total, "sample_ids": sample_ids}
backend/rag/query_engine.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic query engine: replaces LLM Pass 1 with ML-based decisions.
2
+
3
+ Given a natural language query, this module:
4
+ 1. Embeds the query with sentence-transformer
5
+ 2. Searches ChromaDB for relevant paper chunks
6
+ 3. Computes query-to-method similarity to find relevant methods
7
+ 4. Adjusts column weights based on query-column similarity
8
+ 5. Picks color-by and highlight methods deterministically
9
+
10
+ The LLM is only used for Pass 2: interpreting results.
11
+ """
12
+
13
+ import re
14
+ import numpy as np
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ from collections import Counter, defaultdict
17
+
18
+
19
# Column keywords for deterministic weight boosting.
# Keys must match the dataset's column headers exactly — including the
# trailing space in 'Metric(s) Used ' and the long parenthesized names;
# presumably these mirror the source spreadsheet headers (confirm against
# the dataset CSV). Keywords are matched as substrings of the lowercased query.
COLUMN_KEYWORDS = {
    'Planning Method': [
        'planning', 'sampling', 'regression', 'analytical', 'optimization',
        'reinforcement learning', 'rl', 'generative', 'diffusion', 'vae',
    ],
    'Training Data': [
        'training', 'sim', 'real', 'sim-to-real', 'transfer', 'dataset',
        'synthetic', 'self-supervised', 'supervised',
    ],
    'End-effector Hardware': [
        'gripper', 'two-finger', 'parallel-jaw', 'multi-finger', 'dexterous',
        'suction', 'hand', 'end-effector',
    ],
    'Object Configuration': [
        'cluttered', 'piled', 'singulated', 'packed', 'bin picking',
        'scene', 'objects', 'stacked',
    ],
    'Input Data': [
        'point cloud', 'depth', 'rgb', 'rgbd', 'image', 'voxel', 'tsdf',
        'tactile', 'sensor', 'camera',
    ],
    'Output Pose': [
        '6-dof', '7-dof', 'grasp pose', 'pose', 'configuration',
        'rectangle', 'quality',
    ],
    'Backbone': [
        'pointnet', 'resnet', 'vgg', 'transformer', 'cnn', 'architecture',
        'network', 'encoder', 'decoder',
    ],
    'Metric(s) Used ': [
        'metric', 'loss', 'loss function', 'success rate', 'accuracy',
        'precision', 'recall', 'evaluation',
    ],
    'Corresponding Dataset (see repository linked above)': [
        'dataset', 'benchmark', 'acronym', 'graspnet', 'ycb', 'shapenet',
    ],
    'Simulator (see repository linked above)': [
        'simulator', 'simulation', 'isaac', 'mujoco', 'pybullet', 'gazebo',
    ],
    'Camera Position(s)': [
        'camera', 'overhead', 'eye-in-hand', 'multi-view', 'viewpoint',
    ],
    'Language': [
        'pytorch', 'tensorflow', 'python', 'framework', 'implementation',
    ],
    'Description': [
        'describe', 'overview', 'summary', 'about', 'explain',
    ],
}

# Color-by mapping: query keywords -> best column to color by.
# Consumed by pick_color_by(), which scores each entry by substring hits
# and keeps the first column with the strictly highest score, so earlier
# entries win ties.
COLOR_BY_KEYWORDS = {
    'Planning Method': ['planning', 'sampling', 'regression', 'rl', 'approach', 'method type'],
    'Training Data': ['training', 'sim', 'real', 'sim-to-real', 'transfer'],
    'End-effector Hardware': ['gripper', 'finger', 'dexterous', 'suction', 'end-effector', 'hand'],
    'Object Configuration': ['cluttered', 'piled', 'scene', 'objects', 'singulated', 'bin'],
    'Input Data': ['point cloud', 'depth', 'rgb', 'image', 'sensor', 'input'],
    'Backbone': ['architecture', 'network', 'pointnet', 'transformer', 'cnn', 'backbone'],
    'Learning Paradigm': ['learning', 'paradigm', 'classical', 'hybrid'],
    'Sensor Complexity': ['sensor', 'modality', 'multimodal', '3d', '2d'],
    'Scene Difficulty': ['difficulty', 'easy', 'hard', 'complex'],
    'Gripper Type': ['gripper type', 'parallel-jaw', 'dexterous', 'suction'],
    'Method Era': ['year', 'era', 'recent', 'old', 'modern', 'pioneer'],
}
84
+
85
+
86
def compute_query_column_relevance(query: str, model) -> dict:
    """Score each dataset column's relevance to the query by keyword matching.

    Counts how many of a column's COLUMN_KEYWORDS entries appear as
    substrings of the lowercased query. This is purely keyword-based —
    despite the original docstring, no embeddings are involved; the `model`
    parameter is currently unused (presumably reserved for an
    embedding-based scorer — confirm before removing).

    Returns:
        Dict of column_name -> keyword match count (0 when nothing matched).
    """
    query_lower = query.lower()

    # Keyword-based scoring (fast, deterministic)
    scores = {}
    for col, keywords in COLUMN_KEYWORDS.items():
        score = sum(1 for kw in keywords if kw in query_lower)
        scores[col] = score

    return scores
100
+
101
+
102
def compute_weights_from_query(query: str, default_weights: dict, model=None) -> dict:
    """Adjust column weights based on query relevance.

    Boosts columns whose keywords appear in the query by +3 per hit, capped
    at +10 extra; the resulting weight is clamped to 20. Columns absent from
    default_weights, or with no hits, keep their default. Input dict is not
    mutated.
    """
    relevance = compute_query_column_relevance(query, model)

    weights = dict(default_weights)
    for col, score in relevance.items():
        if col in weights and score > 0:
            # +3 per keyword match, at most +10 boost; final weight clamped to 20
            boost = min(score * 3, 10)
            weights[col] = min(20, weights[col] + boost)

    return weights
114
+
115
+
116
def pick_color_by(query: str) -> str:
    """Deterministically choose the color-by column whose keywords best match the query.

    Ties are broken by COLOR_BY_KEYWORDS dict order (first wins); with no
    keyword hits at all, the generic 'cluster' coloring is returned.
    """
    lowered = query.lower()
    winner = 'cluster'
    winning_hits = 0

    for column, keywords in COLOR_BY_KEYWORDS.items():
        hits = sum(kw in lowered for kw in keywords)
        if hits > winning_hits:
            winner, winning_hits = column, hits

    return winner
129
+
130
+
131
def find_relevant_methods(query: str, df, model, top_k: int = 10) -> list:
    """Find methods most relevant to the query using embedding similarity.

    Builds one text per method (Description concatenated with key attribute
    columns for broader matching), embeds the query and all method texts,
    and ranks methods by cosine similarity. Review fixes: the unused
    `descriptions` local is removed, each cell is fetched once instead of
    three times, and cosine similarity is computed directly with numpy.

    Args:
        query: Natural language query.
        df: DataFrame with 'Name', 'Description' and attribute columns.
        model: Sentence-embedding model exposing `.encode(...)`.
        top_k: Maximum number of results.

    Returns:
        List of (method_name, similarity) tuples, best first, length <= top_k.
    """
    query_embedding = np.asarray(model.encode(query), dtype=float).reshape(1, -1)

    names = df['Name'].tolist()

    # One combined text per method: Description plus key columns
    combined = []
    for _, row in df.iterrows():
        parts = [str(row.get('Description', ''))]
        for col in ['Planning Method', 'End-effector Hardware', 'Input Data',
                    'Object Configuration', 'Training Data']:
            raw = row.get(col, '')
            if isinstance(raw, float) and np.isnan(raw):
                continue  # NaN cells contribute nothing
            val = str(raw)
            if val:
                parts.append(val)
        combined.append(' '.join(parts))

    method_embeddings = np.asarray(
        model.encode(combined, show_progress_bar=False), dtype=float)

    # Cosine similarity via numpy: row-normalize, then dot with the query.
    # Small epsilon guards against zero-norm rows.
    q = query_embedding / (np.linalg.norm(query_embedding, axis=1, keepdims=True) + 1e-12)
    m = method_embeddings / (np.linalg.norm(method_embeddings, axis=1, keepdims=True) + 1e-12)
    sims = (m @ q.T).ravel()

    # Sort by similarity, best first
    ranked = sorted(zip(names, sims), key=lambda x: x[1], reverse=True)

    return ranked[:top_k]
164
+
165
+
166
def should_filter(query: str) -> bool:
    """Decide whether the query implies narrowing the view to a subset of methods.

    Comparison/exploration queries return False (show the whole landscape);
    explicit "find/show me methods ..." phrasing returns True; anything else
    defaults to False.

    Fix: 'vs' is now matched as a standalone word. The previous plain
    substring test matched inside words like "canvas", wrongly disabling
    filtering.
    """
    query_lower = query.lower()
    # Comparison and exploration queries should NOT filter
    no_filter_signals = ['compare', 'overview', 'all methods', 'landscape', 'field',
                         'difference between', 'versus', 'how do', 'survey']
    if any(s in query_lower for s in no_filter_signals):
        return False
    # 'vs' / 'vs.' only as a whole word
    if re.search(r'\bvs\.?\b', query_lower):
        return False
    # Filter signals
    filter_signals = ['which methods', 'find methods', 'methods for', 'methods that',
                      'best for', 'suitable for', 'show me', 'i need']
    return any(s in query_lower for s in filter_signals)
178
+
179
+
180
def extract_citations_from_chunks(chunks) -> list:
    """Extract academic citations referenced within retrieved chunk text.

    Only author-year citations are collected, e.g.:
        (Smith et al., 2022), (Smith and Jones, 2020), (Smith, 2019)

    Numbered citations like [1] or [3, 7] are deliberately NOT extracted:
    their meaning is paper-specific, so counting them across papers is
    meaningless. (Previously they were counted and then discarded at the
    end, which both wasted work and crowded real author-year references out
    of the most_common window.)

    Args:
        chunks: Objects exposing `.text` and `.paper_title`.

    Returns:
        Up to 15 dicts {name, count, source_papers} ordered by frequency.
    """
    # Pattern for author-year citations: (Author et al., YYYY) or (Author and Author, YYYY)
    author_year_re = re.compile(
        r'\(([A-Z][a-z]+(?:\s+(?:et\s+al\.|and\s+[A-Z][a-z]+))?)[.,]?\s*(\d{4})\)'
    )

    citation_counts = Counter()
    citation_sources = defaultdict(set)  # which source paper mentioned this citation

    for chunk in chunks:
        text = chunk.text
        source = chunk.paper_title

        for match in author_year_re.finditer(text):
            author = match.group(1).strip()
            year = match.group(2)
            ref_name = f"{author}, {year}"
            citation_counts[ref_name] += 1
            citation_sources[ref_name].add(source)

    return [
        {
            'name': ref,
            'count': count,
            'source_papers': list(citation_sources[ref]),
        }
        for ref, count in citation_counts.most_common(15)
    ]
233
+
234
+
235
def deterministic_query_pipeline(query: str, df, model, default_weights: dict,
                                 retriever=None) -> dict:
    """Full deterministic query pipeline. Replaces LLM Pass 1.

    Steps 1-5 are pure keyword/embedding heuristics over the dataset;
    step 6 (optional, when `retriever` is given) pulls supporting passages
    and analytics from the RAG vector store; steps 7-8 prepare compact
    context for the LLM interpretation pass.

    Returns dict with:
        weights, colorBy, filterMethods, highlightMethods,
        rag_text, rag_citations, rag_analytics,
        relevant_method_summaries, ranked_methods, method_relevance
    """
    # 1. Compute weights from query
    weights = compute_weights_from_query(query, default_weights, model)

    # 2. Pick color-by
    color_by = pick_color_by(query)

    # 3. Find relevant methods via embedding similarity
    ranked_methods = find_relevant_methods(query, df, model, top_k=15)

    # 4. Decide filtering
    filter_methods = None
    if should_filter(query):
        # Filter to methods with similarity > threshold or top 15.
        # Only apply when it yields a meaningful proper subset (>= 3 methods,
        # fewer than the whole dataset).
        threshold = 0.15
        relevant = [name for name, sim in ranked_methods if sim > threshold]
        if 3 <= len(relevant) < len(df):
            filter_methods = relevant

    # 5. Highlights: top 5-8 most relevant methods
    highlight_methods = [name for name, sim in ranked_methods[:min(8, len(ranked_methods))]]

    # 6. RAG retrieval from vector DB — best-effort: any failure here simply
    # degrades the response to dataset-only results.
    rag_text = ""
    rag_citations = []
    rag_analytics = {}
    if retriever is not None:
        try:
            from .retrieval.formatter import format_for_prompt, format_chunk_citations
            from .ingest.store import get_client, create_or_get_collection

            chunks = retriever.retrieve(query)
            rag_text = format_for_prompt(chunks, token_budget=1500)
            rag_citations = format_chunk_citations(chunks)

            # Build analytics from retrieved chunk metadata
            config = retriever.config
            client = get_client(config)
            col = create_or_get_collection(config, client)

            # Fetch full metadata for retrieved chunks
            if chunks:
                chunk_ids = [c.chunk_id for c in chunks]
                meta_result = col.get(ids=chunk_ids, include=['metadatas'])
                metas = meta_result.get('metadatas', [])

                # Paper source distribution (use paper_id slug as display name since parsed titles can be garbled)
                def format_paper_id(pid):
                    return pid.replace('-', ' ').title()
                paper_counts = Counter(format_paper_id(c.paper_id) for c in chunks)
                # Domain topic frequency across retrieved chunks
                # (metadata stores topics as a ", "-joined string; see store.upsert_chunks)
                topic_counts = Counter()
                for m in metas:
                    topics_str = m.get('domain_topics', '')
                    if topics_str:
                        for t in topics_str.split(', '):
                            if t.strip():
                                topic_counts[t.strip()] += 1
                # Rhetorical role distribution
                role_counts = Counter(m.get('rhetorical_role', 'unknown') for m in metas)
                # Content type distribution
                content_type_counts = Counter(m.get('content_type', 'unknown') for m in metas)
                # Section distribution
                section_counts = Counter(m.get('section', 'unknown') for m in metas)

                # Extract cited references from chunk text
                cited_refs = extract_citations_from_chunks(chunks)

                rag_analytics = {
                    'paperSources': [{'name': k, 'count': v} for k, v in paper_counts.most_common(10)],
                    'domainTopics': [{'topic': k, 'count': v} for k, v in topic_counts.most_common(15)],
                    'rhetoricalRoles': [{'role': k, 'count': v} for k, v in role_counts.most_common()],
                    'contentTypes': [{'type': k, 'count': v} for k, v in content_type_counts.most_common()],
                    'sections': [{'section': k, 'count': v} for k, v in section_counts.most_common()],
                    'citedReferences': cited_refs,
                }
        except Exception as e:
            print(f"[RAG] Error: {e}")
            import traceback
            traceback.print_exc()

    # 7. Build compact summaries for only the relevant methods (for LLM context)
    relevant_names = set(name for name, _ in ranked_methods[:10])
    method_summaries = []
    for _, row in df.iterrows():
        name = row.get('Name', '')
        if name in relevant_names:
            desc = str(row.get('Description', ''))[:150]
            plan = str(row.get('Planning Method', ''))
            hw = str(row.get('End-effector Hardware', ''))
            inp = str(row.get('Input Data', ''))
            method_summaries.append(f"- {name}: {plan}; {hw}; {inp}; {desc}")

    # 8. Method relevance scores for visualization
    method_relevance = [
        {'name': name, 'score': round(float(sim), 4)}
        for name, sim in ranked_methods
    ]

    return {
        'weights': weights,
        'colorBy': color_by,
        'filterMethods': filter_methods,
        'highlightMethods': highlight_methods,
        'rag_text': rag_text,
        'rag_citations': rag_citations,
        'rag_analytics': rag_analytics,
        'relevant_method_summaries': '\n'.join(method_summaries),
        'ranked_methods': ranked_methods,
        'method_relevance': method_relevance,
    }
backend/rag/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Chunk retrieval: query routing, vector search, and formatting."""
backend/rag/retrieval/formatter.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Format retrieved chunks for LLM prompt injection and frontend display."""
2
+
3
+ from .retriever import RetrievedChunk
4
+
5
+
6
def estimate_tokens(text: str) -> int:
    """Rough token estimate: one token per whitespace-separated word."""
    return sum(1 for _ in text.split())
9
+
10
+
11
def format_for_prompt(chunks: list, token_budget: int = 3000) -> str:
    """Format retrieved chunks as text for LLM prompt injection.

    Walks chunks in their given (relevance) order, emitting each one with a
    provenance header, and stops once the approximate token budget would be
    exceeded. The first chunk is always emitted even if it alone exceeds the
    budget, so the prompt is never empty when chunks exist.

    Args:
        chunks: List of RetrievedChunk sorted by score descending.
        token_budget: Maximum tokens (approximate, whitespace words) to include.

    Returns:
        Formatted string for injection into the LLM prompt ("" if no chunks).
    """
    if not chunks:
        return ""

    rendered = []
    spent = 0

    for chunk in chunks:
        # Same approximation as estimate_tokens(): whitespace-separated words.
        cost = len(chunk.text.split())
        if rendered and spent + cost > token_budget:
            break

        where = chunk.section
        if chunk.subsection and chunk.subsection != chunk.section:
            where += f" > {chunk.subsection}"
        rendered.append(f'--- From "{chunk.paper_title}" ({where}, relevance: {chunk.score:.2f}) ---')
        rendered.append(chunk.text)
        rendered.append("")

        spent += cost

    return '\n'.join(rendered).strip()
+
48
+
49
+ def format_chunk_citations(chunks: list) -> list:
50
+ """Format chunks as structured data for the frontend.
51
+
52
+ Returns a list of citation dicts for rendering in the InsightCard.
53
+ """
54
+ citations = []
55
+ for chunk in chunks:
56
+ # Return full text for top chunks so frontend can do keyword highlighting
57
+ text = chunk.text
58
+ snippet = text[:300] + "..." if len(text) > 300 else text
59
+
60
+ citations.append({
61
+ "paper_title": chunk.paper_title,
62
+ "paper_id": chunk.paper_id,
63
+ "section": chunk.section,
64
+ "subsection": chunk.subsection or "",
65
+ "layer": chunk.layer,
66
+ "score": chunk.score,
67
+ "snippet": snippet,
68
+ "full_text": text,
69
+ "page": getattr(chunk, 'page', 0),
70
+ })
71
+ return citations
backend/rag/retrieval/retriever.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG retriever: query ChromaDB with intent-based routing and multi-layer mixing."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import numpy as np
6
+
7
+ from ..config import RAGConfig
8
+ from ..ingest.embedder import ChunkEmbedder
9
+ from ..ingest.store import create_or_get_collection, get_client
10
+ from .router import classify_intent, build_metadata_filter, QueryIntent
11
+
12
+
13
@dataclass
class RetrievedChunk:
    """A single ChromaDB search hit, flattened for formatting and display."""
    chunk_id: str      # unique id used for cross-layer deduplication
    text: str          # chunk document text
    paper_id: str      # slug identifying the source paper
    paper_title: str   # parsed paper title (may be garbled; see query_engine)
    section: str       # section the chunk came from
    subsection: str    # subsection ("" if none)
    layer: str         # chunking layer: "coarse" | "mid" | "fine"
    chunk_type: str    # chunk content kind, from ingestion metadata
    score: float       # cosine similarity (1 - ChromaDB distance)
    page: int = 0      # source page number (0 when unknown)
    rank: int = 0      # 1-based rank assigned after merging/sorting
26
+
27
+
28
class RAGRetriever:
    """Queries ChromaDB for paper chunks with intent-based layer routing."""

    def __init__(self, config: RAGConfig, embedder: ChunkEmbedder = None):
        self.config = config
        self.embedder = embedder or ChunkEmbedder(model_name=config.embedding_model)
        self._client = get_client(config)
        self._collection = create_or_get_collection(config, self._client)

    def retrieve(self, query: str, paper_ids: list = None, intent: QueryIntent = None) -> list:
        """Retrieve relevant chunks for a query.

        Args:
            query: Natural language query string.
            paper_ids: Optional list of paper IDs to restrict search to.
            intent: Optional pre-classified intent (auto-classified if None).

        Returns:
            List of RetrievedChunk objects sorted by relevance, deduplicated,
            with 1-based `rank` assigned.
        """
        if self._collection.count() == 0:
            return []

        if intent is None:
            intent = classify_intent(query)

        query_embedding = self.embedder.embed_query(query)

        # Multi-layer search: query each target layer separately, then merge
        chunks = self._multi_layer_search(query_embedding, intent, paper_ids)

        # Deduplicate by chunk_id (same chunk might match across queries)
        seen = set()
        unique = []
        for chunk in chunks:
            if chunk.chunk_id not in seen:
                seen.add(chunk.chunk_id)
                unique.append(chunk)

        # Sort by score descending
        unique.sort(key=lambda c: c.score, reverse=True)

        # Assign ranks
        for i, chunk in enumerate(unique):
            chunk.rank = i + 1

        return unique

    @staticmethod
    def _chunks_from_result(query_result) -> list:
        """Convert one ChromaDB query result into RetrievedChunk objects.

        Shared by the layer-filtered and fallback searches so both produce
        identical records. (Fix: the old fallback path omitted `page`,
        silently losing page provenance for broad-search results.)
        """
        chunks = []
        if not query_result or not query_result.get("ids") or not query_result["ids"][0]:
            return chunks

        ids = query_result["ids"][0]
        docs = query_result["documents"][0]
        metas = query_result["metadatas"][0]
        distances = query_result["distances"][0]

        for j in range(len(ids)):
            # ChromaDB returns cosine distance; convert to similarity
            meta = metas[j]
            chunks.append(RetrievedChunk(
                chunk_id=ids[j],
                text=docs[j],
                paper_id=meta.get("paper_id", ""),
                paper_title=meta.get("paper_title", ""),
                section=meta.get("section", ""),
                subsection=meta.get("subsection", ""),
                layer=meta.get("layer", ""),
                chunk_type=meta.get("chunk_type", ""),
                score=round(1.0 - distances[j], 4),
                page=meta.get("page", 0),
            ))
        return chunks

    def _multi_layer_search(self, query_embedding: np.ndarray, intent: QueryIntent, paper_ids: list = None) -> list:
        """Query ChromaDB per target layer, then merge. Falls back to broad search."""
        from .router import INTENT_SECTIONS

        routing = INTENT_SECTIONS[intent]
        target_layers = routing["layers"]
        results = []
        retrieval_cfg = self.config.retrieval
        # Per-layer result counts (loop-invariant, so built once)
        layer_top_k = {
            "coarse": retrieval_cfg.coarse_top_k,
            "mid": retrieval_cfg.mid_top_k,
            "fine": retrieval_cfg.fine_top_k,
        }
        total_top_k = (retrieval_cfg.coarse_top_k +
                       retrieval_cfg.mid_top_k +
                       retrieval_cfg.fine_top_k)

        # Strategy: try layer-filtered search first, fall back to broad search.
        # Section filtering is skipped because parsed section names may not
        # match the canonical names in the routing table.
        for layer in target_layers:
            top_k = layer_top_k.get(layer, 4)

            conditions = [{"layer": layer}]
            if paper_ids:
                conditions.append({"paper_id": {"$in": paper_ids}})
            where_filter = conditions[0] if len(conditions) == 1 else {"$and": conditions}

            try:
                query_result = self._collection.query(
                    query_embeddings=[query_embedding.tolist()],
                    n_results=top_k,
                    where=where_filter,
                    include=["documents", "metadatas", "distances"],
                )
            except Exception:
                # A bad filter or empty layer must not abort the other layers.
                continue

            results.extend(self._chunks_from_result(query_result))

        # Fallback: if layer-filtered search returned nothing, do a broad search
        if not results:
            where_filter = {"paper_id": {"$in": paper_ids}} if paper_ids else None
            try:
                query_result = self._collection.query(
                    query_embeddings=[query_embedding.tolist()],
                    n_results=total_top_k,
                    where=where_filter,
                    include=["documents", "metadatas", "distances"],
                )
                results.extend(self._chunks_from_result(query_result))
            except Exception:
                pass

        return results
backend/rag/retrieval/router.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Query intent classifier for routing retrieval to the right chunks.
2
+
3
+ Keyword-based (no ML model). Maps query intent to ChromaDB metadata
4
+ filters so we search the right sections and layers.
5
+ """
6
+
7
+ from enum import Enum
8
+
9
+
10
class QueryIntent(Enum):
    """High-level categories of user questions, used to route retrieval.

    Each intent maps (via INTENT_SECTIONS) to the chunk layers and paper
    sections most likely to answer that kind of question.
    """
    BROAD = "broad"            # overview/survey questions; default when no keywords match
    TECHNICAL = "technical"    # architecture / loss / training details
    EVALUATION = "evaluation"  # benchmarks, datasets, experimental results
    COMPARISON = "comparison"  # method-vs-method contrasts
    LIMITATION = "limitation"  # failure modes, gaps, future work
16
+
17
+
18
# Substring triggers per intent, matched against the lowercased query in
# classify_intent(). More hits -> stronger vote for that intent.
# QueryIntent.BROAD has no entry: it is the fallback when nothing matches.
INTENT_KEYWORDS = {
    QueryIntent.TECHNICAL: [
        "equation", "loss", "reward", "objective", "architecture", "algorithm",
        "network", "model", "training", "backbone", "policy", "dynamics",
        "controller", "optimization", "gradient", "inference", "pipeline",
    ],
    QueryIntent.EVALUATION: [
        "benchmark", "dataset", "result", "accuracy", "success rate",
        "real-world", "experiment", "ablation", "baseline", "metric",
        "performance", "evaluation", "table", "figure", "demo",
    ],
    QueryIntent.COMPARISON: [
        "compare", "comparison", "differ", "difference", "vs", "versus",
        "better", "worse", "advantage", "disadvantage", "trade-off",
    ],
    QueryIntent.LIMITATION: [
        "limitation", "failure", "gap", "future", "weakness", "drawback",
        "challenge", "issue", "problem", "cannot", "unable",
    ],
}
38
+
39
# Routing table: which chunk layers and paper sections to search per intent.
# "layers" maps to the ingestion pipeline's granularity levels
# (coarse/mid/fine); "sections" of None means no section filter is applied.
# Consumed by build_metadata_filter() to produce a ChromaDB where-clause.
INTENT_SECTIONS = {
    QueryIntent.BROAD: {
        "layers": ["coarse"],
        "sections": None,  # No section filter
    },
    QueryIntent.TECHNICAL: {
        "layers": ["mid", "fine"],
        "sections": ["Method", "Methods", "Methodology", "Approach", "Background"],
    },
    QueryIntent.EVALUATION: {
        "layers": ["mid", "fine"],
        "sections": ["Experiments", "Results", "Evaluation", "Figures"],
    },
    QueryIntent.COMPARISON: {
        "layers": ["coarse", "mid"],
        "sections": ["Related Work", "Introduction", "Discussion"],
    },
    QueryIntent.LIMITATION: {
        "layers": ["mid"],
        "sections": ["Discussion", "Conclusion", "Conclusions"],
    },
}
61
+
62
+
63
def classify_intent(query: str) -> QueryIntent:
    """Pick the intent whose keyword list best matches *query*.

    Counts how many of each intent's keywords occur as substrings of the
    lowercased query; the first intent reaching the highest count wins.
    Falls back to BROAD when no keyword matches at all.
    """
    text = query.lower()

    best_intent = QueryIntent.BROAD
    best_hits = 0
    for intent, keywords in INTENT_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in text)
        # Strict '>' keeps the first intent on ties, matching dict order.
        if hits > best_hits:
            best_intent, best_hits = intent, hits

    return best_intent
81
+
82
+
83
def build_metadata_filter(intent: QueryIntent, paper_ids: list = None) -> dict:
    """Translate an intent (plus optional paper scoping) into a ChromaDB where-clause.

    Up to three sub-conditions are combined: the intent's chunk layers, the
    intent's target sections, and an optional paper_id restriction. A single
    condition is returned unwrapped; multiple conditions are joined with
    "$and"; no conditions yields {}.
    """
    routing = INTENT_SECTIONS[intent]

    clauses = []
    if routing["layers"]:
        clauses.append({"layer": {"$in": routing["layers"]}})
    if routing.get("sections"):
        clauses.append({"section": {"$in": routing["sections"]}})
    if paper_ids:
        clauses.append({"paper_id": {"$in": paper_ids}})

    if not clauses:
        return {}
    return clauses[0] if len(clauses) == 1 else {"$and": clauses}
backend/rag/tools/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Tool calling: registry and domain-agnostic statistical/ML/data/RAG tools."""
2
+
3
+ # Import all tool modules to trigger @register_tool decorators
4
+ from . import statistical
5
+ from . import ml_tools
6
+ from . import data_tools
7
+ from . import rag_tool
backend/rag/tools/data_tools.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data grounding tools: filtering, aggregation, cross-tabulation."""
2
+
3
+ from collections import Counter
4
+
5
+ from .registry import register_tool, ToolContext
6
+
7
+
8
@register_tool(
    name="filter_and_count",
    description="Filter the dataset by column values and return matching method names and count",
    parameters={
        "type": "object",
        "properties": {
            "filters": {
                "type": "object",
                "description": "Column-value pairs to filter by, e.g. {\"Planning Method\": \"Sampling\", \"Training Data\": \"Sim\"}",
            },
        },
        "required": ["filters"],
    },
    category="data",
)
def filter_and_count_tool(context: ToolContext, filters: dict) -> dict:
    """Return the method names whose columns contain the given filter values.

    Each filter value is matched as a case-insensitive *literal* substring of
    the stringified cell. Columns not present in the dataset are skipped
    (rather than erroring) so an LLM-supplied filter with a typo degrades
    gracefully.

    Args:
        context: Shared ToolContext; only context.df is used. The first
            column is assumed to hold the method name.
        filters: Mapping of column name -> value substring to require.

    Returns:
        dict with the filters actually applied, the match count, and the
        matching method names.
    """
    df = context.df
    name_col = df.columns[0]
    mask = [True] * len(df)

    applied = []
    for col, value in filters.items():
        if col not in df.columns:
            continue  # tolerate hallucinated/mistyped column names
        # Bug fix: regex=False forces a literal match. The previous default
        # regex mode broke on filter values containing metacharacters such
        # as "C++" or "(RL)". str(value) guards against non-string values.
        col_mask = df[col].fillna('').astype(str).str.contains(
            str(value), case=False, na=False, regex=False
        )
        mask = [m and c for m, c in zip(mask, col_mask)]
        applied.append(f"{col}={value}")

    matching = df.loc[mask, name_col].tolist()
    return {
        "filters_applied": applied,
        "count": len(matching),
        "methods": matching,
    }
+ }
42
+
43
+
44
@register_tool(
    name="cross_tabulate",
    description="Create a contingency table of two columns showing co-occurrence counts",
    parameters={
        "type": "object",
        "properties": {
            "column_a": {"type": "string", "description": "First column name"},
            "column_b": {"type": "string", "description": "Second column name"},
        },
        "required": ["column_a", "column_b"],
    },
    category="data",
)
def cross_tabulate_tool(context: ToolContext, column_a: str, column_b: str) -> dict:
    """Build a nested co-occurrence table between two (possibly multi-valued) columns.

    Cells containing comma-separated values ("A, B") contribute one count per
    value pair, so a row with 2 values in column_a and 3 in column_b adds six
    co-occurrences.

    Raises:
        ValueError: if either column is missing from the dataset.
    """
    df = context.df
    if column_a not in df.columns:
        raise ValueError(f"Column '{column_a}' not found")
    if column_b not in df.columns:
        raise ValueError(f"Column '{column_b}' not found")

    table = {}
    for _, row in df.iterrows():
        left = [v.strip() for v in str(row.get(column_a, '')).split(',') if v.strip()]
        right = [v.strip() for v in str(row.get(column_b, '')).split(',') if v.strip()]
        for lv in left:
            bucket = table.setdefault(lv, {})
            for rv in right:
                bucket[rv] = bucket.get(rv, 0) + 1

    return {
        "column_a": column_a,
        "column_b": column_b,
        "table": table,
    }
+
80
+
81
@register_tool(
    name="value_distribution",
    description="Get value counts for a column, properly handling multi-value cells",
    parameters={
        "type": "object",
        "properties": {
            "column": {"type": "string", "description": "Column name to analyze"},
        },
        "required": ["column"],
    },
    category="data",
)
def value_distribution_tool(context: ToolContext, column: str) -> dict:
    """Frequency of each individual value in *column*.

    Comma-separated cells are split so "Sampling, Analytical" counts once for
    each value. Counts are returned in descending frequency order.

    Raises:
        ValueError: if the column does not exist.
    """
    if column not in context.df.columns:
        raise ValueError(f"Column '{column}' not found")

    tokens = [
        piece.strip()
        for cell in context.df[column].fillna('').astype(str)
        for piece in cell.split(',')
        if piece.strip()
    ]

    counts = dict(Counter(tokens).most_common())
    return {
        "column": column,
        "total_entries": len(context.df),
        "total_values": len(tokens),
        "unique_values": len(counts),
        "distribution": counts,
    }
+ }
backend/rag/tools/ml_tools.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ML grounding tools: nearest neighbors, cluster analysis, feature importance."""
2
+
3
+ import numpy as np
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
+ from .registry import register_tool, ToolContext
7
+
8
+
9
def _get_method_index(context: ToolContext, method_name: str) -> int:
    """Locate the dataframe row index of *method_name*.

    Tries an exact match on the first (name) column, then a case-insensitive
    match. Raises ValueError when neither finds a row.
    """
    name_col = context.df.columns[0]
    names = context.df[name_col]

    hits = context.df[names == method_name].index
    if len(hits) == 0:
        hits = context.df[names.str.lower() == method_name.lower()].index
    if len(hits) == 0:
        raise ValueError(f"Method '{method_name}' not found")
    return hits[0]
18
+
19
+
20
@register_tool(
    name="nearest_neighbors",
    description="Find the k most similar methods to a given method based on feature embeddings",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the target method"},
            "k": {"type": "integer", "description": "Number of neighbors (default 5)"},
        },
        "required": ["method"],
    },
    category="ml",
)
def nearest_neighbors_tool(context: ToolContext, method: str, k: int = 5) -> dict:
    """Top-k most similar methods by cosine similarity of feature embeddings.

    Raises:
        ValueError: if no feature matrix is available or the method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    target = _get_method_index(context, method)
    sims = cosine_similarity(
        context.feature_matrix[target].reshape(1, -1), context.feature_matrix
    )[0]

    name_col = context.df.columns[0]
    neighbors = []
    # Walk candidates from most to least similar, skipping the method itself.
    for cand in np.argsort(sims)[::-1]:
        if cand == target:
            continue
        neighbors.append({
            "name": context.df.iloc[cand][name_col],
            "similarity": round(float(sims[cand]), 4),
        })
        if len(neighbors) >= k:
            break

    return {"method": method, "k": k, "neighbors": neighbors}
56
+
57
+
58
@register_tool(
    name="cluster_membership",
    description="Get the cluster assignment for a method, including its co-members and cluster characteristics",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the method"},
        },
        "required": ["method"],
    },
    category="ml",
)
def cluster_membership_tool(context: ToolContext, method: str) -> dict:
    """Report which cluster *method* belongs to and who shares it.

    Raises:
        ValueError: if cluster labels are unavailable or the method is unknown.
    """
    if context.cluster_labels is None:
        raise ValueError("Cluster labels not available")

    target = _get_method_index(context, method)
    target_cluster = context.cluster_labels[target]
    name_col = context.df.columns[0]

    co_members = [
        context.df.iloc[i][name_col]
        for i, label in enumerate(context.cluster_labels)
        if label == target_cluster and i != target
    ]

    return {
        "method": method,
        "cluster_id": int(target_cluster),
        "cluster_size": len(co_members) + 1,  # +1 counts the method itself
        "co_members": co_members,
    }
89
+
90
+
91
@register_tool(
    name="feature_importance",
    description="Identify which feature dimensions most distinguish a method from the dataset average",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the method"},
            "top_n": {"type": "integer", "description": "Number of top features to return (default 10)"},
        },
        "required": ["method"],
    },
    category="ml",
)
def feature_importance_tool(context: ToolContext, method: str, top_n: int = 10) -> dict:
    """Feature dimensions where *method* deviates most (by |z-score|) from the dataset mean.

    Raises:
        ValueError: if no feature matrix is available or the method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    row = context.feature_matrix[_get_method_index(context, method)]
    mu = context.feature_matrix.mean(axis=0)
    sigma = context.feature_matrix.std(axis=0)
    sigma[sigma == 0] = 1.0  # constant dimensions: avoid division by zero

    z = (row - mu) / sigma
    ranked = np.argsort(np.abs(z))[::-1][:top_n]

    features = [
        {
            "dimension": int(d),
            "z_score": round(float(z[d]), 3),
            "value": round(float(row[d]), 4),
            "mean": round(float(mu[d]), 4),
            "direction": "above average" if z[d] > 0 else "below average",
        }
        for d in ranked
    ]

    return {"method": method, "top_features": features}
129
+
130
+
131
@register_tool(
    name="outlier_score",
    description="Compute how atypical a method is relative to the dataset (average distance to all other methods)",
    parameters={
        "type": "object",
        "properties": {
            "method": {"type": "string", "description": "Name of the method"},
        },
        "required": ["method"],
    },
    category="ml",
)
def outlier_score_tool(context: ToolContext, method: str) -> dict:
    """Score how atypical *method* is versus every other method.

    A method's typicality is its mean cosine similarity to all other rows;
    the percentile says what fraction of methods are *more* typical. The
    interpretation buckets are: <70th percentile "typical", 70-90 "somewhat
    unusual", >=90 "outlier".

    Performance fix: the similarity matrix is computed once (a single n x n
    cosine_similarity call) instead of the previous per-row loop that
    launched n separate 1 x n computations.

    Raises:
        ValueError: if no feature matrix is available, fewer than two
            methods exist, or the method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    n = len(context.feature_matrix)
    if n < 2:
        # Robustness: with a single method, "similarity to others" is undefined
        # (previously produced NaN via a mean over an empty array).
        raise ValueError("Outlier score requires at least two methods")

    idx = _get_method_index(context, method)
    sim_matrix = cosine_similarity(context.feature_matrix)

    sims = sim_matrix[idx]
    other_sims = np.concatenate([sims[:idx], sims[idx + 1:]])
    avg_sim = float(other_sims.mean())
    min_sim = float(other_sims.min())

    # Average similarity-to-others for every method (excluding self),
    # read straight off the precomputed matrix.
    all_avg_sims = (sim_matrix.sum(axis=1) - sim_matrix.diagonal()) / (n - 1)

    percentile = float(np.sum(all_avg_sims > avg_sim) / n * 100)
    interpretation = "typical" if percentile < 70 else "somewhat unusual" if percentile < 90 else "outlier"

    return {
        "method": method,
        "avg_similarity_to_others": round(avg_sim, 4),
        "min_similarity": round(min_sim, 4),
        "outlier_percentile": round(percentile, 1),
        "interpretation": interpretation,
    }
backend/rag/tools/rag_tool.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG search as a tool: lets the LLM request paper content on demand."""
2
+
3
+ from .registry import register_tool, ToolContext
4
+
5
+
6
@register_tool(
    name="search_papers",
    description="Search the academic paper corpus for relevant passages. Use this when the query asks about specific techniques, loss functions, architectures, training details, experimental results, or anything requiring actual paper content.",
    parameters={
        "type": "object",
        "properties": {
            "search_query": {
                "type": "string",
                "description": "What to search for in the papers (e.g., 'loss function for grasp quality', 'sim-to-real transfer', 'PointNet architecture')",
            },
        },
        "required": ["search_query"],
    },
    category="rag",
)
def search_papers_tool(context: ToolContext, search_query: str) -> dict:
    """Search ChromaDB for relevant paper chunks.

    Builds a RAGRetriever from rag_config.yaml (reusing the app's
    sentence-transformer via context.st_model so the model is not loaded a
    second time), retrieves chunks for *search_query*, and returns both a
    prompt-ready text block for the LLM and structured citations for the
    frontend.

    Returns:
        dict with:
            found: number of chunks retrieved (0 when nothing matched)
            formatted: text block to inject into the LLM prompt
            citations: structured citation list (present only when found > 0;
                a miss instead carries an empty "excerpts" list)
    """
    import os  # local import keeps tool modules cheap to load (unused `sys` removed)

    # base_dir resolves to the backend/ directory (tools/ -> rag/ -> backend/);
    # the config file sits one level above it, at the repo root.
    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    config_path = os.path.join(base_dir, '..', 'rag_config.yaml')

    # Deferred imports: the RAG stack (ChromaDB etc.) is only needed when
    # this tool is actually invoked.
    from ..config import load_config
    from ..retrieval.retriever import RAGRetriever
    from ..ingest.embedder import ChunkEmbedder
    from ..retrieval.formatter import format_for_prompt, format_chunk_citations

    config = load_config(config_path)
    embedder = ChunkEmbedder(model_name=config.embedding_model, model_instance=context.st_model)
    retriever = RAGRetriever(config=config, embedder=embedder)

    chunks = retriever.retrieve(search_query)
    if not chunks:
        return {"found": 0, "excerpts": [], "formatted": "No relevant paper content found."}

    prompt_text = format_for_prompt(chunks, token_budget=config.retrieval.token_budget)
    citations = format_chunk_citations(chunks)

    # Return both formatted text (for LLM) and structured citations (for frontend)
    return {
        "found": len(chunks),
        "formatted": prompt_text,
        "citations": citations,
    }
backend/rag/tools/registry.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool registry with decorator-based registration.
2
+
3
+ Tools are Python functions that the LLM can request via its JSON response.
4
+ The registry generates JSON schemas for prompt injection and dispatches
5
+ tool calls safely with error handling.
6
+ """
7
+
8
+ import json
9
+ from dataclasses import dataclass, field
10
+ from typing import Callable
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+
16
@dataclass
class ToolContext:
    """Shared context passed to all tool functions.

    Only `df` is mandatory; the remaining fields are optional capabilities —
    tools that need one raise ValueError when it is None.
    """
    df: pd.DataFrame                   # dataset, one row per method; first column is the method name
    feature_matrix: np.ndarray = None  # weighted feature embeddings, row-aligned with df
    cluster_labels: list = None        # HDBSCAN-style cluster id per row, aligned with df
    weights: dict = None               # per-column feature weights
    st_model: object = None  # SentenceTransformer instance
24
+
25
+
26
@dataclass
class ToolSpec:
    """One registered tool: its LLM-facing schema plus the Python callable."""
    name: str         # unique tool identifier used in LLM tool-call requests
    description: str  # shown to the LLM so it knows when to use the tool
    parameters: dict  # JSON-schema-style argument description
    function: Callable  # implementation; called as function(context=..., **arguments)
    category: str     # grouping label (e.g. "statistical", "ml", "data", "rag")
33
+
34
+
35
# Global registry of ToolSpec objects, keyed by tool name.
_TOOL_REGISTRY: dict = {}


def register_tool(name: str, description: str, parameters: dict, category: str = "general"):
    """Decorator factory that records a callable in the global tool registry.

    The decorated function is returned unchanged, so it stays directly
    callable in addition to being LLM-dispatchable via execute_tool().
    """
    def decorator(fn):
        spec = ToolSpec(
            name=name,
            description=description,
            parameters=parameters,
            function=fn,
            category=category,
        )
        _TOOL_REGISTRY[name] = spec
        return fn

    return decorator
51
+
52
+
53
def get_tool_schemas() -> list:
    """Return JSON-schema descriptions of all registered tools (for LLM prompt)."""
    schemas = []
    for spec in _TOOL_REGISTRY.values():
        schemas.append({
            "name": spec.name,
            "description": spec.description,
            "parameters": spec.parameters,
            "category": spec.category,
        })
    return schemas
64
+
65
+
66
def get_tool_prompt_section() -> str:
    """Render the registered tools as a plain-text section for the LLM system prompt.

    Returns "" when no tools are registered, so callers can drop the section
    entirely.
    """
    schemas = get_tool_schemas()
    if not schemas:
        return ""

    out = [
        "AVAILABLE TOOLS:",
        "You may request computations by including a \"tools\" array in your JSON response.",
        "Each tool call: {\"name\": \"tool_name\", \"arguments\": {...}}",
        "Only request tools when the query genuinely needs computed results. Most queries don't need tools.",
        "",
    ]

    for schema in schemas:
        out.append(f"- {schema['name']}: {schema['description']}")
        required = schema["parameters"].get("required", [])
        # One indented line per parameter, flagged required/optional.
        for pname, pdef in schema["parameters"].get("properties", {}).items():
            flag = "(required)" if pname in required else "(optional)"
            out.append(f"  {pname}: {pdef.get('description', pdef.get('type', ''))} {flag}")
        out.append("")

    return '\n'.join(out)
92
+
93
+
94
def execute_tool(name: str, arguments: dict, context: ToolContext) -> dict:
    """Dispatch one tool call; never raises.

    Returns {"success", "result", "error"} where exactly one of result/error
    is populated.
    """
    spec = _TOOL_REGISTRY.get(name)
    if spec is None:
        return {"success": False, "result": None, "error": f"Unknown tool: {name}"}
    try:
        output = spec.function(context=context, **arguments)
    except Exception as exc:  # a buggy tool must not crash the whole request
        return {"success": False, "result": None, "error": str(exc)}
    return {"success": True, "result": output, "error": None}
104
+
105
+
106
def execute_tool_calls(tool_calls: list, context: ToolContext, max_calls: int = 5) -> list:
    """Execute a list of tool calls from the LLM response.

    Args:
        tool_calls: List of {"name": str, "arguments": dict}; entries beyond
            max_calls are silently dropped (safety cap on LLM requests).
        context: Shared ToolContext with dataset and features.
        max_calls: Safety limit on number of tool calls per query.

    Returns:
        List of {"name", "arguments", "success", "result", "error"}.
    """
    outcomes = []
    for call in tool_calls[:max_calls]:
        tool_name = call.get("name", "")
        tool_args = call.get("arguments", {})
        record = {"name": tool_name, "arguments": tool_args}
        record.update(execute_tool(tool_name, tool_args, context))
        outcomes.append(record)
    return outcomes
backend/rag/tools/statistical.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Statistical grounding tools.
2
+
3
+ These let the LLM request real computations instead of hallucinating numbers.
4
+ """
5
+
6
+ import numpy as np
7
+ from collections import Counter
8
+ from sklearn.metrics.pairwise import cosine_similarity as sk_cosine
9
+
10
+ from .registry import register_tool, ToolContext
11
+
12
+
13
def _get_method_index(context: ToolContext, method_name: str) -> int:
    """Find row index for a method name. Raises ValueError if not found.

    An exact match on the first (name) column is tried first, then a
    case-insensitive one.
    """
    name_col = context.df.columns[0]
    names = context.df[name_col]

    hits = context.df[names == method_name].index
    if len(hits) == 0:
        hits = context.df[names.str.lower() == method_name.lower()].index
    if len(hits) == 0:
        raise ValueError(f"Method '{method_name}' not found in dataset")
    return hits[0]
+
24
+
25
@register_tool(
    name="cosine_similarity",
    description="Compute cosine similarity between two methods based on their weighted feature embeddings",
    parameters={
        "type": "object",
        "properties": {
            "method_a": {"type": "string", "description": "Name of first method"},
            "method_b": {"type": "string", "description": "Name of second method"},
        },
        "required": ["method_a", "method_b"],
    },
    category="statistical",
)
def cosine_similarity_tool(context: ToolContext, method_a: str, method_b: str) -> dict:
    """Cosine similarity between two methods' weighted feature vectors.

    The interpretation buckets are: >0.8 "very similar", >0.5 "moderately
    similar", otherwise "dissimilar".

    Raises:
        ValueError: if no feature matrix is available or a method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    row_a = context.feature_matrix[_get_method_index(context, method_a)].reshape(1, -1)
    row_b = context.feature_matrix[_get_method_index(context, method_b)].reshape(1, -1)
    sim = float(sk_cosine(row_a, row_b)[0, 0])

    if sim > 0.8:
        interpretation = "very similar"
    elif sim > 0.5:
        interpretation = "moderately similar"
    else:
        interpretation = "dissimilar"

    return {
        "method_a": method_a,
        "method_b": method_b,
        "cosine_similarity": round(sim, 4),
        "interpretation": interpretation,
    }
+
54
+
55
@register_tool(
    name="pairwise_distances",
    description="Compute pairwise cosine distances between a set of methods",
    parameters={
        "type": "object",
        "properties": {
            "methods": {"type": "array", "items": {"type": "string"}, "description": "List of method names (2-10)"},
        },
        "required": ["methods"],
    },
    category="statistical",
)
def pairwise_distances_tool(context: ToolContext, methods: list) -> dict:
    """Cosine distances for every unordered pair among up to 10 methods.

    Pairs are returned sorted by ascending distance (closest first).

    Raises:
        ValueError: if no feature matrix is available or a method is unknown.
    """
    if context.feature_matrix is None:
        raise ValueError("Feature matrix not available")

    methods = methods[:10]  # safety cap, matches the schema's 2-10 guidance

    rows = [_get_method_index(context, m) for m in methods]
    dist_matrix = 1.0 - sk_cosine(context.feature_matrix[rows])

    pairs = [
        {
            "method_a": methods[i],
            "method_b": methods[j],
            "distance": round(float(dist_matrix[i, j]), 4),
        }
        for i in range(len(methods))
        for j in range(i + 1, len(methods))
    ]
    pairs.sort(key=lambda p: p["distance"])

    return {"methods": methods, "pairs": pairs}
+ return {"methods": methods, "pairs": pairs}
89
+
90
+
91
@register_tool(
    name="distribution_stats",
    description="Get value distribution for a dataset column, with optional grouping",
    parameters={
        "type": "object",
        "properties": {
            "column": {"type": "string", "description": "Column name to analyze"},
            "group_by": {"type": "string", "description": "Optional column to group by"},
        },
        "required": ["column"],
    },
    category="statistical",
)
def distribution_stats_tool(context: ToolContext, column: str, group_by: str = None) -> dict:
    """Value counts for *column* (splitting comma-separated cells), optionally grouped.

    Top-level counts keep the 20 most common values; per-group counts keep
    the top 5. An unknown group_by column is ignored.

    Raises:
        ValueError: if the main column does not exist.
    """
    if column not in context.df.columns:
        raise ValueError(f"Column '{column}' not found. Available: {list(context.df.columns)}")

    tokens = [
        piece.strip()
        for cell in context.df[column].fillna('').astype(str)
        for piece in cell.split(',')
        if piece.strip()
    ]

    result = {
        "column": column,
        "total_values": len(tokens),
        "unique_values": len(set(tokens)),
        "distribution": dict(Counter(tokens).most_common(20)),
    }

    if group_by and group_by in context.df.columns:
        grouped = {}
        for _, row in context.df.iterrows():
            key = str(row.get(group_by, ''))
            for piece in str(row.get(column, '')).split(','):
                token = piece.strip()
                if token:
                    grouped.setdefault(key, []).append(token)
        result["grouped"] = {k: dict(Counter(v).most_common(5)) for k, v in grouped.items()}

    return result
+ return result
backend/requirements.txt CHANGED
@@ -8,3 +8,6 @@ hdbscan>=0.8.33
8
  umap-learn==0.5.5
9
  sentence-transformers>=2.2.0
10
  huggingface_hub>=0.20.0
 
 
 
 
8
  umap-learn==0.5.5
9
  sentence-transformers>=2.2.0
10
  huggingface_hub>=0.20.0
11
+ chromadb>=0.5.0
12
+ pdfplumber>=0.10.0
13
+ pyyaml>=6.0
docs/pipeline-architecture.md ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grasp Explorer: Full Pipeline Architecture
2
+
3
+ ## What This Product Does
4
+
5
+ The Grasp Explorer is an interactive dashboard for the NSF-funded COMPARE ecosystem (robot-manipulation.org) that lets researchers explore relationships among 56+ robotic grasp planning methods. Instead of manually comparing papers in a spreadsheet, researchers type natural language questions ("How do point cloud methods compare to depth image approaches for cluttered bin picking?") and the system:
6
+
7
+ 1. Finds the most relevant methods and papers using vector similarity
8
+ 2. Adjusts the visualization to emphasize the attributes that matter for the question
9
+ 3. Clusters methods into natural groups using density-based clustering
10
+ 4. Retrieves actual passages from the research papers via a vector database
11
+ 5. Generates grounded, citation-backed insights using a single LLM call
12
+ 6. Displays interactive visualizations showing method similarity, evidence breakdown, and topic distribution
13
+
14
+ The key design principle is **deterministic computation + grounded LLM interpretation**. The system never asks the LLM to guess at data or make decisions that can be computed. Every number, every grouping, every similarity score is computed by the pipeline. The LLM only interprets results that have already been computed.
15
+
16
+ ---
17
+
18
+ ## Layer 1: Feature Engineering
19
+
20
+ ### Raw Dataset
21
+
22
+ The dataset is a CSV (`datasets/csv-gp-combined.csv`) with 56 rows (one per grasp planning method) and 20 columns. 13 of these columns are weighted features used for computing method similarity.
23
+
24
+ ### Categorical Feature Processing (TF-IDF)
25
+
26
+ Each categorical column (Planning Method, End-effector Hardware, Input Data, etc.) is converted to a numerical vector using TF-IDF (Term Frequency-Inverse Document Frequency).
27
+
28
+ - **Vectorizer settings**: `max_features=50`, `ngram_range=(1, 2)` to capture both single terms and bigrams
29
+ - **Multi-value handling**: Many cells contain comma-separated values (e.g., "Sampling, Direct regression"). The `smart_split()` function parses these respecting quoted fields, so "6-DoF grasp pose (x, y, z, r, p, y)" is treated as one value, not six.
30
+ - **Normalization**: Multi-value cells are sorted alphabetically before TF-IDF so that "Sampling, Analytical" and "Analytical, Sampling" produce identical vectors.
31
+ - **Result**: Each column becomes a sparse matrix of shape (56, up to 50).
32
+
33
+ ### Description Embeddings (Sentence-Transformer)
34
+
35
+ The free-text Description column is processed differently from categorical columns:
36
+
37
+ 1. Each description is encoded using `all-MiniLM-L6-v2` (a sentence-transformer model) producing a 384-dimensional dense vector.
38
+ 2. PCA reduces these to 50 dimensions (`n_components=min(50, n_rows - 1)`, `random_state=42`).
39
+ 3. This prevents the description embeddings (384 dims) from dominating the feature matrix over TF-IDF columns (~5-50 dims each).
40
+ 4. Full-dataset embeddings are cached to `.description_embeddings.npy` to avoid recomputation on every request.
41
+
42
+ ### Weight Application
43
+
44
+ Each column's feature matrix is scaled by the square root of its weight:
45
+
46
+ ```
47
+ weighted_features = features * sqrt(weight)
48
+ ```
49
+
50
+ Default weights reflect domain importance:
51
+ - **Weight 10**: Planning Method, Object Configuration, Output Pose (most important for distinguishing methods)
52
+ - **Weight 8**: Training Data
53
+ - **Weight 7**: Description
54
+ - **Weight 6**: End-effector Hardware, Input Data
55
+ - **Weight 5**: Backbone, Metrics, Dataset
56
+ - **Weight 4**: Camera Position, Language
57
+ - **Weight 3**: Simulator
58
+
59
+ When a researcher asks a question, the deterministic query engine boosts weights for columns that match the query keywords. For example, "cluttered bin picking" boosts Object Configuration weight.
60
+
61
+ ### Combined Feature Matrix
62
+
63
+ All weighted feature matrices are concatenated horizontally:
64
+
65
+ ```
66
+ combined = np.hstack([tfidf_col1 * sqrt(w1), tfidf_col2 * sqrt(w2), ..., desc_pca * sqrt(w_desc)])
67
+ ```
68
+
69
+ Result: a matrix of shape (56 methods, ~379 features).
70
+
71
+ ### Derived Features
72
+
73
+ Seven higher-level features are computed from raw columns for the frontend UI (color-by, detail panel). These are NOT fed into the embedding matrix.
74
+
75
+ | Feature | Derived From | Categories |
76
+ |---------|-------------|------------|
77
+ | Grasp Dimensionality | Output Pose | 6-DoF, 7-DoF, 2D, Policy, Evaluation, Other |
78
+ | Learning Paradigm | Planning Method + Training Data | Classical, Learning-based, RL-based, Hybrid |
79
+ | Sensor Complexity | Input Data | Multimodal, 3D, 2.5D, 2D, Other |
80
+ | Scene Difficulty | Object Configuration | Singulated, Structured, Cluttered, Packed, Piled (ordinal, takes max) |
81
+ | Gripper Type | End-effector Hardware | Parallel-jaw, Dexterous, Suction, Multi-gripper |
82
+ | ML Framework | Language | PyTorch, TensorFlow, Keras, None |
83
+ | Method Era | Year | Pioneer (2016-2018), Growth (2019-2021), Modern (2022+) |
84
+
85
+ ---
86
+
87
+ ## Layer 2: Dimensionality Reduction (UMAP)
88
+
89
+ ### How UMAP Projects to 2D
90
+
91
+ The combined feature matrix is high-dimensional (~379 features). UMAP reduces this to 2D coordinates for the scatter plot.
92
+
93
+ 1. **Pairwise cosine distances** are computed between all 56 methods: `pairwise_distances(features, metric='cosine')`. This produces a 56x56 distance matrix.
94
+ 2. **UMAP** takes this precomputed distance matrix (not the raw features) and finds a 2D layout that preserves both local neighborhoods and global structure.
95
+
96
+ UMAP is agnostic to whether the input was categorical or embedding. All the mixing and weighting happened before UMAP ever sees the data.
97
+
98
+ ### UMAP Parameters
99
+
100
+ | Parameter | Value | Why |
101
+ |-----------|-------|-----|
102
+ | n_neighbors | 15 (capped to n_methods-1 for small sets) | Balances local vs global structure |
103
+ | min_dist | 0.1 | Allows some overlap, not too spread out |
104
+ | metric | precomputed (cosine distances) | Cosine works well for sparse TF-IDF + dense embeddings |
105
+ | random_state | 42 | Reproducible projections |
106
+ | n_jobs | 1 | Avoids macOS OpenMP segfault |
107
+
108
+ ### Small Dataset Handling
109
+
110
+ - 1 method: placed at origin [0, 0]
111
+ - 2-3 methods: PCA fallback (UMAP needs at least 4 points)
112
+ - 4+ methods: standard UMAP
113
+
114
+ ---
115
+
116
+ ## Layer 3: Clustering (HDBSCAN)
117
+
118
+ ### Why HDBSCAN
119
+
120
+ HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) was chosen over K-Means because:
121
+ - It discovers the natural number of clusters (no need to specify k)
122
+ - It handles non-spherical cluster shapes
123
+ - It identifies outliers (noise points) rather than forcing every method into a cluster
124
+
125
+ ### HDBSCAN Parameters
126
+
127
+ | Parameter | Value | Why |
128
+ |-----------|-------|-----|
129
+ | min_cluster_size | max(3, n_methods // 15) | Scales with dataset size, capping the cluster count at roughly 15; minimum cluster size of 3 |
130
+ | min_samples | 1 | Allows small clusters |
131
+ | metric | euclidean | On 2D UMAP coordinates |
132
+ | cluster_selection_method | eom (excess of mass) | Finds the most persistent clusters |
133
+
134
+ ### Noise Point Reassignment
135
+
136
+ Since all 56 methods are relevant papers (none should be excluded), noise points (labeled -1 by HDBSCAN) are reassigned to their nearest real cluster:
137
+
138
+ 1. Identify all points with label -1
139
+ 2. Compute pairwise distances between noise points and all real cluster members
140
+ 3. Assign each noise point to the cluster of its nearest neighbor
141
+ 4. If all points are noise (edge case): assign everyone to cluster 0
142
+
143
+ ### Descriptive Cluster Labels
144
+
145
+ Instead of meaningless IDs like "Cluster 0", each cluster gets a descriptive label derived from its dominant attributes:
146
+
147
+ 1. For each cluster, count the most common values in Planning Method, End-effector Hardware, and Object Configuration
148
+ 2. Take the top value from each column
149
+ 3. Join as: "Sampling / Two-finger / Piled"
150
+
151
+ A `value_cluster_map` is also built, mapping each attribute value to its dominant cluster. This is used for the cluster legend.
152
+
153
+ ---
154
+
155
+ ## Layer 4: RAG Pipeline (Retrieval-Augmented Generation)
156
+
157
+ ### Overview
158
+
159
+ The RAG pipeline indexes 34 research papers (1,074 text chunks) in a ChromaDB vector database. When a researcher asks a question, relevant paper passages are retrieved and fed to the LLM so it can cite actual paper content instead of hallucinating.
160
+
161
+ ### 4a. PDF Parsing
162
+
163
+ Papers are stored as PDFs in `papers/`. The parser (`pdf_parser.py`) uses pdfplumber to extract text with font metadata.
164
+
165
+ **Section detection** uses a scoring system:
166
+ - Lines with larger font size (>1.1x median) are header candidates
167
+ - Lines matching known academic headers (Abstract, Introduction, Method, etc.) get a boost
168
+ - Lines matching numbering patterns ("3.1 ", "IV. ") are classified as section/subsection headers
169
+ - Fallback: if no sections detected, the entire document becomes a single "Full Text" section
170
+
171
+ **Extracted structure**:
172
+ - `ParsedPaper`: title, abstract, sections (list of ParsedSection), figures (captions), raw_text
173
+ - `ParsedSection`: title, level (1=section, 2=subsection), text, page_start, page_end
174
+
175
+ ### 4b. Chunking (3-Layer Hybrid: Structural + Semantic)
176
+
177
+ The chunker produces chunks at three granularity levels, all enriched with domain-aware metadata.
178
+
179
+ **Layer 1: Coarse (paper-level overview)**
180
+ - Title + Abstract chunk (max 800 tokens)
181
+ - One summary chunk per major section (first + last paragraph if section exceeds token budget)
182
+ - All figure/table captions concatenated into one chunk
183
+ - Purpose: retrieved for broad "what is this paper about?" queries
184
+
185
+ **Layer 2: Mid-level (semantic topic boundaries within sections)**
186
+ 1. Each section is split into sentences
187
+ 2. Every sentence is embedded with the sentence-transformer
188
+ 3. Consecutive sentence similarities are computed (cosine similarity between adjacent embeddings)
189
+ 4. Where similarity drops below 0.35, a topic boundary is detected
190
+ 5. Sentences between boundaries are grouped into chunks
191
+ 6. Token constraints enforced: merge groups under 200 tokens, split groups over 800 tokens
192
+ 7. 15% overlap added between consecutive same-section chunks (last N sentences of chunk i prepended to chunk i+1)
193
+ - Purpose: captures natural topic shifts within a section (e.g., "grasp representation" to "network architecture" to "training procedure")
194
+
195
+ **Layer 3: Fine (paragraph-level)**
196
+ - Individual paragraphs, split by sentences if over 300 tokens
197
+ - Minimum 50 tokens (skip very short fragments)
198
+ - Purpose: precise retrieval for specific technical questions
199
+
200
+ **Why semantic chunking over fixed-window?** Fixed-size chunking (e.g., every 512 tokens) breaks arguments mid-sentence and mixes unrelated content. Semantic chunking detects real topic shifts so each chunk is about one coherent idea.
201
+
202
+ ### 4c. Chunk Metadata Enrichment
203
+
204
+ Every chunk gets rich metadata for retrieval filtering and visualization:
205
+
206
+ **Domain Topic Extraction**: Each chunk is scanned against a configurable vocabulary of 80+ domain-specific keywords (e.g., "point cloud", "6-DoF", "sim-to-real", "parallel-jaw", "PointNet"). Matched terms are stored as `domain_topics`. The vocabulary is defined in `rag_config.yaml` and can be swapped for any domain.
207
+
208
+ **Rhetorical Role Classification**: Heuristic keyword patterns classify each chunk's communicative purpose:
209
+ - `algorithm_description`: "we propose", "our method", "architecture"
210
+ - `experimental_setup`: "we evaluate", "dataset", "baseline"
211
+ - `result`: "table 1", "success rate", "outperform"
212
+ - `comparison`: "compared to", "unlike", "prior work"
213
+ - `problem_statement`: "we address", "the problem of"
214
+ - `limitation`: "limitation", "failure", "future work"
215
+ - `definition`: "we define", "denoted by"
216
+
217
+ **Content Type**: Derived from rhetorical role + section name:
218
+ - `theory` = how the method works (algorithms, math)
219
+ - `implementation` = how to build it (hyperparameters, training details)
220
+ - `evaluation` = how it performs (benchmarks, results)
221
+
222
+ **Chunk Type Detection**: Auto-detects equation-heavy chunks (LaTeX patterns) and citation-dense chunks.
223
+
224
+ ### 4d. Embedding and Storage
225
+
226
+ - **Embedding model**: Same `all-MiniLM-L6-v2` as the main app (reuses the loaded model instance at runtime)
227
+ - **Section-prefix strategy**: Before embedding, each chunk's text is prepended with `[SectionName: Subsection]` to ground the embedding in document structure
228
+ - **Storage**: ChromaDB with cosine distance (HNSW index), persistent at `./chroma_db`
229
+ - **Metadata**: All chunk metadata stored as ChromaDB metadata fields for filtered search
230
+
231
+ ### 4e. Retrieval
232
+
233
+ When a query comes in, retrieval happens in stages:
234
+
235
+ 1. **Intent classification**: Keywords in the query determine intent (BROAD, TECHNICAL, EVALUATION, COMPARISON, LIMITATION)
236
+ 2. **Layer routing**: Each intent maps to target layers and sections. A "loss function" query routes to mid/fine chunks in Methods sections.
237
+ 3. **Multi-layer search**: ChromaDB is queried once per target layer with appropriate top-k (2 coarse + 4 mid + 4 fine)
238
+ 4. **Fallback**: If layer-filtered search returns nothing, a broad search across all layers is performed
239
+ 5. **Deduplication and ranking**: Results merged, deduplicated by chunk_id, sorted by cosine similarity score
240
+
241
+ **Token budget**: Retrieved chunks are formatted for the LLM prompt with a 1500-token budget. Chunks are added in order of relevance until the budget is exhausted.
242
+
243
+ ---
244
+
245
+ ## Layer 5: Deterministic Query Engine
246
+
247
+ When a researcher submits a query, the system does NOT ask the LLM to decide how to configure the visualization. Instead, a deterministic pipeline handles all decisions:
248
+
249
+ ### Step 1: Weight Adjustment
250
+
251
+ The query is scanned against keyword dictionaries for each column. Keywords like "cluttered", "piled", "bin picking" match Object Configuration. Keywords like "PointNet", "transformer", "CNN" match Backbone.
252
+
253
+ Each keyword match adds a boost: `new_weight = min(20, default_weight + min(matches * 3, 10))`.
254
+
255
+ ### Step 2: Method Relevance
256
+
257
+ The query is embedded with the sentence-transformer, and cosine similarity is computed against every method's description + key columns (concatenated and embedded). The top 15 methods by similarity are returned as candidates.
258
+
259
+ ### Step 3: Filter Decision
260
+
261
+ Keyword signals determine whether to filter the scatter plot:
262
+ - "compare", "overview", "vs" = show all methods (no filter)
263
+ - "which methods", "find methods", "best for" = filter to relevant subset
264
+
265
+ If filtering: methods with similarity > 0.15 are kept, as long as the filtered set has 3+ methods.
266
+
267
+ ### Step 4: Color-by Selection
268
+
269
+ Query keywords are matched against a color-by mapping. "gripper" or "finger" selects End-effector Hardware. "training" or "sim" selects Training Data. Default: "cluster".
270
+
271
+ ### Step 5: Highlight Selection
272
+
273
+ Top 8 methods by query similarity are highlighted (larger, brighter points on the scatter plot).
274
+
275
+ ### Step 6: RAG Retrieval
276
+
277
+ ChromaDB is searched for relevant paper passages (see Layer 4e above).
278
+
279
+ ### Step 7: Analytics Computation
280
+
281
+ From the retrieved chunks:
282
+ - **Paper source distribution**: Which papers contributed the most chunks
283
+ - **Domain topic frequency**: Which technical terms appear most in the evidence
284
+ - **Rhetorical role distribution**: How much of the evidence is "method design" vs "results" vs "experiment setup"
285
+ - **Content type breakdown**: Theory vs implementation vs evaluation
286
+ - **Cited references extraction**: Author-year citations found within chunk text (e.g., "Smith et al., 2022"), counted across all retrieved passages
287
+
288
+ ---
289
+
290
+ ## Layer 6: LLM Inference (Single Call)
291
+
292
+ After all deterministic computation is complete, a single LLM call generates the insight text. The prompt is structured as:
293
+
294
+ ```
295
+ You are an expert research assistant for a robotic grasp planning visualization tool.
296
+
297
+ RESEARCHER'S QUESTION: "{query}"
298
+
299
+ EVIDENCE FROM PAPERS:
300
+ --- From "Paper Title" (Section, relevance: 0.65) ---
301
+ [actual text from the paper]
302
+
303
+ RELEVANT METHODS IN THE DATASET:
304
+ - Method1: Planning=Sampling; Gripper=Two-finger; Input=Point cloud
305
+ - Method2: Planning=Direct regression; Gripper=Multi-finger; Input=RGBD
306
+
307
+ CLUSTERING RESULTS (56 methods in 7 groups):
308
+ - Sampling / Two-finger / Piled (12 methods): Method1, Method2, ...
309
+ - RL / Multi-finger / Singulated (5 methods): Method3, Method4, ...
310
+
311
+ Highlighted methods: Method1, Method5, Method8
312
+
313
+ INSTRUCTIONS:
314
+ Write exactly 3-5 bullet points that answer the researcher's question.
315
+ Rules:
316
+ 1. Lead with evidence from the paper excerpts. Cite specific papers by name.
317
+ 2. Reference concrete technical details from the papers.
318
+ 3. Connect findings to the clustering.
319
+ 4. Be specific. Avoid generic statements.
320
+ 5. Never reference cluster numbers. Use descriptive group names.
321
+ ```
322
+
323
+ ### Why One LLM Call, Not Two
324
+
325
+ The original design used two LLM calls (Pass 1: decide config, Pass 2: interpret results). This was replaced because:
326
+ - Pass 1 decisions (weights, filters, highlights) can all be computed deterministically via embedding similarity and keyword matching
327
+ - Removing Pass 1 cuts the prompt size in half and eliminates a point of failure
328
+ - The LLM's only job is now interpretation of already-computed results, which is what it's best at
329
+
330
+ ### LLM Provider
331
+
332
+ - **Primary**: Groq (Llama 3.3 70B, free tier, fastest inference available)
333
+ - **Fallback**: HuggingFace Inference API (Qwen2.5-72B-Instruct)
334
+ - **Local**: Ollama (disabled, crashes the development laptop)
335
+ - **Temperature**: 0.3 (deterministic-leaning but allows slight phrasing variation)
336
+ - **Max tokens**: 1024 for insight generation
337
+
338
+ ### Grounding
339
+
340
+ The LLM cannot hallucinate patterns because:
341
+ 1. It only sees real paper excerpts (from ChromaDB), not its training data
342
+ 2. It only sees real clustering results (from HDBSCAN), not imagined groupings
343
+ 3. It only sees real method metadata (from the CSV), not hallucinated attributes
344
+ 4. The prompt explicitly instructs it to cite papers and reference specific numbers
345
+
346
+ ---
347
+
348
+ ## Layer 7: Frontend Visualization
349
+
350
+ ### Scatter Plot (ScatterPlot.js)
351
+
352
+ UMAP 2D projection rendered with Plotly.js. Each point is a grasp planning method.
353
+ - **Color**: by cluster (discrete colors) or by any column value (continuous colorscale)
354
+ - **Size**: 10px normal, 16px highlighted, 20px hovered
355
+ - **Opacity**: 0.35 for non-highlighted methods when a query is active, 0.9 normal
356
+ - **Labels**: shown only for highlighted methods to avoid clutter
357
+
358
+ ### Method Table (MethodTable.js)
359
+
360
+ Sortable table showing all method metadata. Highlighted methods are sorted to the top. Max height 520px with scroll.
361
+
362
+ ### Insight Card (InsightCard.js)
363
+
364
+ Displays the LLM-generated bullet points with:
365
+ - **Entity highlighting**: Paper names are bold purple. Technical terms are color-coded by category (blue for architectures like PointNet, green for techniques like UMAP, yellow for gripper types). Hover any highlighted term for a plain-English definition.
366
+ - **Paper Evidence panel**: List of source papers with "View PDF" buttons. Clicking opens the actual PDF in a modal viewer with keyword highlights on the text layer.
367
+
368
+ ### Query Explanation (QueryExplanation.js)
369
+
370
+ A numbered step-by-step section explaining what the system did to answer the query:
371
+ 1. How the query was embedded and compared to methods
372
+ 2. How many methods were highlighted and why
373
+ 3. Whether filtering was applied
374
+ 4. How HDBSCAN found N clusters with M methods
375
+ 5. How many paper passages were retrieved from the vector database
376
+
377
+ Every technical term (sentence-transformer, cosine similarity, HDBSCAN, UMAP, vector database) has a hover tooltip with a plain-English explanation. Designed for robotics experts who may not be familiar with ML terminology.
378
+
379
+ ### Analytics Dashboard (AnalyticsDashboard.js)
380
+
381
+ Five visualization cards that appear after a query:
382
+
383
+ 1. **Query-Method Similarity**: Horizontal bar chart showing the cosine similarity between the query and each method's description. Shows which methods are most semantically related to the question.
384
+
385
+ 2. **Cited References in Evidence**: Author-year citations extracted from the retrieved paper passages (e.g., "Smith et al., 2022" found 3x). Shows which foundational works are most relevant, including papers outside the 56-method dataset.
386
+
387
+ 3. **Papers Referenced**: Bar chart showing how many passages were retrieved from each paper. Indicates which papers have the most content relevant to the query.
388
+
389
+ 4. **Key Topics in Evidence**: Tag cloud of domain-specific terms found in retrieved passages (e.g., "point cloud", "gripper", "6-DoF"). Larger tags appear more frequently.
390
+
391
+ 5. **What Kind of Evidence?**: Stacked bar showing the breakdown of retrieved content by type ("How It Works" / "How To Build It" / "How It Performs") and by purpose ("Method Design" / "Results" / "Experiment Setup").
392
+
393
+ All chart headings have ? tooltips explaining what the chart shows and why, written for non-ML audiences.
394
+
395
+ ### PDF Viewer (PdfViewer.js)
396
+
397
+ Full PDF viewer using react-pdf (PDF.js wrapper). Opens as a modal overlay when the user clicks "View PDF" on a citation. Features:
398
+ - Page navigation (previous/next)
399
+ - Auto-opens to the page where the retrieved chunk was found
400
+ - Keyword highlights overlaid on the PDF text layer
401
+ - Yellow bar showing which search terms are being highlighted
402
+
403
+ ---
404
+
405
+ ## Domain-Agnostic Design
406
+
407
+ The entire pipeline is configured via `rag_config.yaml`. To use this system for a different paper collection:
408
+
409
+ 1. Replace the CSV with your dataset
410
+ 2. Place your PDFs in `papers/`
411
+ 3. Update `rag_config.yaml`:
412
+ - `domain_context`: describe your domain
413
+ - `name_column`, `description_column`: map to your CSV columns
414
+ - `domain_topics`: list your domain's keyword vocabulary
415
+ 4. Run `python -m rag.ingest.pipeline --papers-dir ./papers/ --config rag_config.yaml`
416
+
417
+ No code changes needed. The chunking, embedding, retrieval, and visualization pipeline adapts to any collection of academic papers.
418
+
419
+ ---
420
+
421
+ ## Data Flow Summary
422
+
423
+ ```
424
+ User Query
425
+ |
426
+ v
427
+ [Deterministic Query Engine]
428
+ |--- Embed query (sentence-transformer)
429
+ |--- Compute method similarity (cosine)
430
+ |--- Adjust weights (keyword matching)
431
+ |--- Pick color-by, highlights, filter
432
+ |--- Search ChromaDB (intent-routed, multi-layer)
433
+ |--- Extract citations from chunks
434
+ |--- Compute analytics (topics, roles, content types)
435
+ |
436
+ v
437
+ [UMAP + HDBSCAN Pipeline]
438
+ |--- Build weighted feature matrix (TF-IDF + embeddings)
439
+ |--- Compute cosine distance matrix
440
+ |--- UMAP project to 2D
441
+ |--- HDBSCAN cluster with noise reassignment
442
+ |--- Generate descriptive cluster labels
443
+ |
444
+ v
445
+ [Single LLM Call]
446
+ |--- Prompt: query + paper excerpts + method summaries + cluster results
447
+ |--- Output: 3-5 grounded bullet points citing specific papers
448
+ |
449
+ v
450
+ [Frontend Dashboard]
451
+ |--- Scatter plot (UMAP projection)
452
+ |--- Insight card (entity-highlighted bullets + paper evidence + PDF viewer)
453
+ |--- Query explanation (step-by-step pipeline walkthrough)
454
+ |--- Analytics dashboard (similarity, citations, topics, evidence types)
455
+ ```
frontend/package-lock.json CHANGED
@@ -1,20 +1,22 @@
1
  {
2
- "name": "frontend-copilot",
3
  "version": "0.1.0",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
- "name": "frontend-copilot",
9
  "version": "0.1.0",
10
  "dependencies": {
11
  "@testing-library/dom": "^10.4.1",
12
  "@testing-library/jest-dom": "^6.9.1",
13
  "@testing-library/react": "^16.3.2",
14
  "@testing-library/user-event": "^13.5.0",
 
15
  "plotly.js": "^3.4.0",
16
  "react": "^19.2.4",
17
  "react-dom": "^19.2.4",
 
18
  "react-plotly.js": "^2.6.0",
19
  "react-scripts": "5.0.1",
20
  "web-vitals": "^2.1.4"
@@ -2995,6 +2997,271 @@
2995
  "integrity": "sha512-gRa9gwYU3ECmQYv3lslts5hxuIa90veaEcxDYuu3QGOIAEM2mOZkVHp48ANJuu1CURtRdHKUBY5Lm1tHV+sD4g==",
2996
  "license": "ISC"
2997
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2998
  "node_modules/@nicolo-ribaudo/eslint-scope-5-internals": {
2999
  "version": "5.1.1-v1",
3000
  "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/eslint-scope-5-internals/-/eslint-scope-5-internals-5.1.1-v1.tgz",
@@ -6013,6 +6280,15 @@
6013
  "wrap-ansi": "^7.0.0"
6014
  }
6015
  },
 
 
 
 
 
 
 
 
 
6016
  "node_modules/co": {
6017
  "version": "4.6.0",
6018
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@@ -12409,6 +12685,22 @@
12409
  "node": ">=4.0"
12410
  }
12411
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12412
  "node_modules/kdbush": {
12413
  "version": "4.0.2",
12414
  "resolved": "https://registry.npmjs.org/kdbush/-/kdbush-4.0.2.tgz",
@@ -12639,6 +12931,15 @@
12639
  "sourcemap-codec": "^1.4.8"
12640
  }
12641
  },
 
 
 
 
 
 
 
 
 
12642
  "node_modules/make-dir": {
12643
  "version": "3.1.0",
12644
  "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
@@ -12663,6 +12964,15 @@
12663
  "semver": "bin/semver.js"
12664
  }
12665
  },
 
 
 
 
 
 
 
 
 
12666
  "node_modules/makeerror": {
12667
  "version": "1.0.12",
12668
  "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz",
@@ -12917,6 +13227,23 @@
12917
  "url": "https://github.com/sponsors/sindresorhus"
12918
  }
12919
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12920
  "node_modules/merge-stream": {
12921
  "version": "2.0.0",
12922
  "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
@@ -13803,6 +14130,18 @@
13803
  "pbf": "bin/pbf"
13804
  }
13805
  },
 
 
 
 
 
 
 
 
 
 
 
 
13806
  "node_modules/performance-now": {
13807
  "version": "2.1.0",
13808
  "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
@@ -15703,6 +16042,35 @@
15703
  "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
15704
  "license": "MIT"
15705
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15706
  "node_modules/react-plotly.js": {
15707
  "version": "2.6.0",
15708
  "resolved": "https://registry.npmjs.org/react-plotly.js/-/react-plotly.js-2.6.0.tgz",
@@ -18026,6 +18394,12 @@
18026
  "integrity": "sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA==",
18027
  "license": "MIT"
18028
  },
 
 
 
 
 
 
18029
  "node_modules/tinycolor2": {
18030
  "version": "1.6.0",
18031
  "resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz",
@@ -18705,6 +19079,15 @@
18705
  "makeerror": "1.0.12"
18706
  }
18707
  },
 
 
 
 
 
 
 
 
 
18708
  "node_modules/watchpack": {
18709
  "version": "2.5.1",
18710
  "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
 
1
  {
2
+ "name": "grasp-explorer",
3
  "version": "0.1.0",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
7
  "": {
8
+ "name": "grasp-explorer",
9
  "version": "0.1.0",
10
  "dependencies": {
11
  "@testing-library/dom": "^10.4.1",
12
  "@testing-library/jest-dom": "^6.9.1",
13
  "@testing-library/react": "^16.3.2",
14
  "@testing-library/user-event": "^13.5.0",
15
+ "katex": "^0.16.39",
16
  "plotly.js": "^3.4.0",
17
  "react": "^19.2.4",
18
  "react-dom": "^19.2.4",
19
+ "react-pdf": "^10.4.1",
20
  "react-plotly.js": "^2.6.0",
21
  "react-scripts": "5.0.1",
22
  "web-vitals": "^2.1.4"
 
2997
  "integrity": "sha512-gRa9gwYU3ECmQYv3lslts5hxuIa90veaEcxDYuu3QGOIAEM2mOZkVHp48ANJuu1CURtRdHKUBY5Lm1tHV+sD4g==",
2998
  "license": "ISC"
2999
  },
3000
+ "node_modules/@napi-rs/canvas": {
3001
+ "version": "0.1.97",
3002
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.97.tgz",
3003
+ "integrity": "sha512-8cFniXvrIEnVwuNSRCW9wirRZbHvrD3JVujdS2P5n5xiJZNZMOZcfOvJ1pb66c7jXMKHHglJEDVJGbm8XWFcXQ==",
3004
+ "license": "MIT",
3005
+ "optional": true,
3006
+ "workspaces": [
3007
+ "e2e/*"
3008
+ ],
3009
+ "engines": {
3010
+ "node": ">= 10"
3011
+ },
3012
+ "funding": {
3013
+ "type": "github",
3014
+ "url": "https://github.com/sponsors/Brooooooklyn"
3015
+ },
3016
+ "optionalDependencies": {
3017
+ "@napi-rs/canvas-android-arm64": "0.1.97",
3018
+ "@napi-rs/canvas-darwin-arm64": "0.1.97",
3019
+ "@napi-rs/canvas-darwin-x64": "0.1.97",
3020
+ "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.97",
3021
+ "@napi-rs/canvas-linux-arm64-gnu": "0.1.97",
3022
+ "@napi-rs/canvas-linux-arm64-musl": "0.1.97",
3023
+ "@napi-rs/canvas-linux-riscv64-gnu": "0.1.97",
3024
+ "@napi-rs/canvas-linux-x64-gnu": "0.1.97",
3025
+ "@napi-rs/canvas-linux-x64-musl": "0.1.97",
3026
+ "@napi-rs/canvas-win32-arm64-msvc": "0.1.97",
3027
+ "@napi-rs/canvas-win32-x64-msvc": "0.1.97"
3028
+ }
3029
+ },
3030
+ "node_modules/@napi-rs/canvas-android-arm64": {
3031
+ "version": "0.1.97",
3032
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.97.tgz",
3033
+ "integrity": "sha512-V1c/WVw+NzH8vk7ZK/O8/nyBSCQimU8sfMsB/9qeSvdkGKNU7+mxy/bIF0gTgeBFmHpj30S4E9WHMSrxXGQuVQ==",
3034
+ "cpu": [
3035
+ "arm64"
3036
+ ],
3037
+ "license": "MIT",
3038
+ "optional": true,
3039
+ "os": [
3040
+ "android"
3041
+ ],
3042
+ "engines": {
3043
+ "node": ">= 10"
3044
+ },
3045
+ "funding": {
3046
+ "type": "github",
3047
+ "url": "https://github.com/sponsors/Brooooooklyn"
3048
+ }
3049
+ },
3050
+ "node_modules/@napi-rs/canvas-darwin-arm64": {
3051
+ "version": "0.1.97",
3052
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.97.tgz",
3053
+ "integrity": "sha512-ok+SCEF4YejcxuJ9Rm+WWunHHpf2HmiPxfz6z1a/NFQECGXtsY7A4B8XocK1LmT1D7P174MzwPF9Wy3AUAwEPw==",
3054
+ "cpu": [
3055
+ "arm64"
3056
+ ],
3057
+ "license": "MIT",
3058
+ "optional": true,
3059
+ "os": [
3060
+ "darwin"
3061
+ ],
3062
+ "engines": {
3063
+ "node": ">= 10"
3064
+ },
3065
+ "funding": {
3066
+ "type": "github",
3067
+ "url": "https://github.com/sponsors/Brooooooklyn"
3068
+ }
3069
+ },
3070
+ "node_modules/@napi-rs/canvas-darwin-x64": {
3071
+ "version": "0.1.97",
3072
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.97.tgz",
3073
+ "integrity": "sha512-PUP6e6/UGlclUvAQNnuXCcnkpdUou6VYZfQOQxExLp86epOylmiwLkqXIvpFmjoTEDmPmXrI+coL/9EFU1gKPA==",
3074
+ "cpu": [
3075
+ "x64"
3076
+ ],
3077
+ "license": "MIT",
3078
+ "optional": true,
3079
+ "os": [
3080
+ "darwin"
3081
+ ],
3082
+ "engines": {
3083
+ "node": ">= 10"
3084
+ },
3085
+ "funding": {
3086
+ "type": "github",
3087
+ "url": "https://github.com/sponsors/Brooooooklyn"
3088
+ }
3089
+ },
3090
+ "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
3091
+ "version": "0.1.97",
3092
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.97.tgz",
3093
+ "integrity": "sha512-XyXH2L/cic8eTNtbrXCcvqHtMX/nEOxN18+7rMrAM2XtLYC/EB5s0wnO1FsLMWmK+04ZSLN9FBGipo7kpIkcOw==",
3094
+ "cpu": [
3095
+ "arm"
3096
+ ],
3097
+ "license": "MIT",
3098
+ "optional": true,
3099
+ "os": [
3100
+ "linux"
3101
+ ],
3102
+ "engines": {
3103
+ "node": ">= 10"
3104
+ },
3105
+ "funding": {
3106
+ "type": "github",
3107
+ "url": "https://github.com/sponsors/Brooooooklyn"
3108
+ }
3109
+ },
3110
+ "node_modules/@napi-rs/canvas-linux-arm64-gnu": {
3111
+ "version": "0.1.97",
3112
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.97.tgz",
3113
+ "integrity": "sha512-Kuq/M3djq0K8ktgz6nPlK7Ne5d4uWeDxPpyKWOjWDK2RIOhHVtLtyLiJw2fuldw7Vn4mhw05EZXCEr4Q76rs9w==",
3114
+ "cpu": [
3115
+ "arm64"
3116
+ ],
3117
+ "libc": [
3118
+ "glibc"
3119
+ ],
3120
+ "license": "MIT",
3121
+ "optional": true,
3122
+ "os": [
3123
+ "linux"
3124
+ ],
3125
+ "engines": {
3126
+ "node": ">= 10"
3127
+ },
3128
+ "funding": {
3129
+ "type": "github",
3130
+ "url": "https://github.com/sponsors/Brooooooklyn"
3131
+ }
3132
+ },
3133
+ "node_modules/@napi-rs/canvas-linux-arm64-musl": {
3134
+ "version": "0.1.97",
3135
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.97.tgz",
3136
+ "integrity": "sha512-kKmSkQVnWeqg7qdsiXvYxKhAFuHz3tkBjW/zyQv5YKUPhotpaVhpBGv5LqCngzyuRV85SXoe+OFj+Tv0a0QXkQ==",
3137
+ "cpu": [
3138
+ "arm64"
3139
+ ],
3140
+ "libc": [
3141
+ "musl"
3142
+ ],
3143
+ "license": "MIT",
3144
+ "optional": true,
3145
+ "os": [
3146
+ "linux"
3147
+ ],
3148
+ "engines": {
3149
+ "node": ">= 10"
3150
+ },
3151
+ "funding": {
3152
+ "type": "github",
3153
+ "url": "https://github.com/sponsors/Brooooooklyn"
3154
+ }
3155
+ },
3156
+ "node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
3157
+ "version": "0.1.97",
3158
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.97.tgz",
3159
+ "integrity": "sha512-Jc7I3A51jnEOIAXeLsN/M/+Z28LUeakcsXs07FLq9prXc0eYOtVwsDEv913Gr+06IRo34gJJVgT0TXvmz+N2VA==",
3160
+ "cpu": [
3161
+ "riscv64"
3162
+ ],
3163
+ "libc": [
3164
+ "glibc"
3165
+ ],
3166
+ "license": "MIT",
3167
+ "optional": true,
3168
+ "os": [
3169
+ "linux"
3170
+ ],
3171
+ "engines": {
3172
+ "node": ">= 10"
3173
+ },
3174
+ "funding": {
3175
+ "type": "github",
3176
+ "url": "https://github.com/sponsors/Brooooooklyn"
3177
+ }
3178
+ },
3179
+ "node_modules/@napi-rs/canvas-linux-x64-gnu": {
3180
+ "version": "0.1.97",
3181
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.97.tgz",
3182
+ "integrity": "sha512-iDUBe7AilfuBSRbSa8/IGX38Mf+iCSBqoVKLSQ5XaY2JLOaqz1TVyPFEyIck7wT6mRQhQt5sN6ogfjIDfi74tg==",
3183
+ "cpu": [
3184
+ "x64"
3185
+ ],
3186
+ "libc": [
3187
+ "glibc"
3188
+ ],
3189
+ "license": "MIT",
3190
+ "optional": true,
3191
+ "os": [
3192
+ "linux"
3193
+ ],
3194
+ "engines": {
3195
+ "node": ">= 10"
3196
+ },
3197
+ "funding": {
3198
+ "type": "github",
3199
+ "url": "https://github.com/sponsors/Brooooooklyn"
3200
+ }
3201
+ },
3202
+ "node_modules/@napi-rs/canvas-linux-x64-musl": {
3203
+ "version": "0.1.97",
3204
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.97.tgz",
3205
+ "integrity": "sha512-AKLFd/v0Z5fvgqBDqhvqtAdx+fHMJ5t9JcUNKq4FIZ5WH+iegGm8HPdj00NFlCSnm83Fp3Ln8I2f7uq1aIiWaA==",
3206
+ "cpu": [
3207
+ "x64"
3208
+ ],
3209
+ "libc": [
3210
+ "musl"
3211
+ ],
3212
+ "license": "MIT",
3213
+ "optional": true,
3214
+ "os": [
3215
+ "linux"
3216
+ ],
3217
+ "engines": {
3218
+ "node": ">= 10"
3219
+ },
3220
+ "funding": {
3221
+ "type": "github",
3222
+ "url": "https://github.com/sponsors/Brooooooklyn"
3223
+ }
3224
+ },
3225
+ "node_modules/@napi-rs/canvas-win32-arm64-msvc": {
3226
+ "version": "0.1.97",
3227
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.97.tgz",
3228
+ "integrity": "sha512-u883Yr6A6fO7Vpsy9YE4FVCIxzzo5sO+7pIUjjoDLjS3vQaNMkVzx5bdIpEL+ob+gU88WDK4VcxYMZ6nmnoX9A==",
3229
+ "cpu": [
3230
+ "arm64"
3231
+ ],
3232
+ "license": "MIT",
3233
+ "optional": true,
3234
+ "os": [
3235
+ "win32"
3236
+ ],
3237
+ "engines": {
3238
+ "node": ">= 10"
3239
+ },
3240
+ "funding": {
3241
+ "type": "github",
3242
+ "url": "https://github.com/sponsors/Brooooooklyn"
3243
+ }
3244
+ },
3245
+ "node_modules/@napi-rs/canvas-win32-x64-msvc": {
3246
+ "version": "0.1.97",
3247
+ "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.97.tgz",
3248
+ "integrity": "sha512-sWtD2EE3fV0IzN+iiQUqr/Q1SwqWhs2O1FKItFlxtdDkikpEj5g7DKQpY3x55H/MAOnL8iomnlk3mcEeGiUMoQ==",
3249
+ "cpu": [
3250
+ "x64"
3251
+ ],
3252
+ "license": "MIT",
3253
+ "optional": true,
3254
+ "os": [
3255
+ "win32"
3256
+ ],
3257
+ "engines": {
3258
+ "node": ">= 10"
3259
+ },
3260
+ "funding": {
3261
+ "type": "github",
3262
+ "url": "https://github.com/sponsors/Brooooooklyn"
3263
+ }
3264
+ },
3265
  "node_modules/@nicolo-ribaudo/eslint-scope-5-internals": {
3266
  "version": "5.1.1-v1",
3267
  "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/eslint-scope-5-internals/-/eslint-scope-5-internals-5.1.1-v1.tgz",
 
6280
  "wrap-ansi": "^7.0.0"
6281
  }
6282
  },
6283
+ "node_modules/clsx": {
6284
+ "version": "2.1.1",
6285
+ "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz",
6286
+ "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==",
6287
+ "license": "MIT",
6288
+ "engines": {
6289
+ "node": ">=6"
6290
+ }
6291
+ },
6292
  "node_modules/co": {
6293
  "version": "4.6.0",
6294
  "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
 
12685
  "node": ">=4.0"
12686
  }
12687
  },
12688
+ "node_modules/katex": {
12689
+ "version": "0.16.39",
12690
+ "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.39.tgz",
12691
+ "integrity": "sha512-FR2f6y85+81ZLO0GPhyQ+EJl/E5ILNWltJhpAeOTzRny952Z13x2867lTFDmvMZix//Ux3CuMQ2VkLXRbUwOFg==",
12692
+ "funding": [
12693
+ "https://opencollective.com/katex",
12694
+ "https://github.com/sponsors/katex"
12695
+ ],
12696
+ "license": "MIT",
12697
+ "dependencies": {
12698
+ "commander": "^8.3.0"
12699
+ },
12700
+ "bin": {
12701
+ "katex": "cli.js"
12702
+ }
12703
+ },
12704
  "node_modules/kdbush": {
12705
  "version": "4.0.2",
12706
  "resolved": "https://registry.npmjs.org/kdbush/-/kdbush-4.0.2.tgz",
 
12931
  "sourcemap-codec": "^1.4.8"
12932
  }
12933
  },
12934
+ "node_modules/make-cancellable-promise": {
12935
+ "version": "2.0.0",
12936
+ "resolved": "https://registry.npmjs.org/make-cancellable-promise/-/make-cancellable-promise-2.0.0.tgz",
12937
+ "integrity": "sha512-3SEQqTpV9oqVsIWqAcmDuaNeo7yBO3tqPtqGRcKkEo0lrzD3wqbKG9mkxO65KoOgXqj+zH2phJ2LiAsdzlogSw==",
12938
+ "license": "MIT",
12939
+ "funding": {
12940
+ "url": "https://github.com/wojtekmaj/make-cancellable-promise?sponsor=1"
12941
+ }
12942
+ },
12943
  "node_modules/make-dir": {
12944
  "version": "3.1.0",
12945
  "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz",
 
12964
  "semver": "bin/semver.js"
12965
  }
12966
  },
12967
+ "node_modules/make-event-props": {
12968
+ "version": "2.0.0",
12969
+ "resolved": "https://registry.npmjs.org/make-event-props/-/make-event-props-2.0.0.tgz",
12970
+ "integrity": "sha512-G/hncXrl4Qt7mauJEXSg3AcdYzmpkIITTNl5I+rH9sog5Yw0kK6vseJjCaPfOXqOqQuPUP89Rkhfz5kPS8ijtw==",
12971
+ "license": "MIT",
12972
+ "funding": {
12973
+ "url": "https://github.com/wojtekmaj/make-event-props?sponsor=1"
12974
+ }
12975
+ },
12976
  "node_modules/makeerror": {
12977
  "version": "1.0.12",
12978
  "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz",
 
13227
  "url": "https://github.com/sponsors/sindresorhus"
13228
  }
13229
  },
13230
+ "node_modules/merge-refs": {
13231
+ "version": "2.0.0",
13232
+ "resolved": "https://registry.npmjs.org/merge-refs/-/merge-refs-2.0.0.tgz",
13233
+ "integrity": "sha512-3+B21mYK2IqUWnd2EivABLT7ueDhb0b8/dGK8LoFQPrU61YITeCMn14F7y7qZafWNZhUEKb24cJdiT5Wxs3prg==",
13234
+ "license": "MIT",
13235
+ "funding": {
13236
+ "url": "https://github.com/wojtekmaj/merge-refs?sponsor=1"
13237
+ },
13238
+ "peerDependencies": {
13239
+ "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
13240
+ },
13241
+ "peerDependenciesMeta": {
13242
+ "@types/react": {
13243
+ "optional": true
13244
+ }
13245
+ }
13246
+ },
13247
  "node_modules/merge-stream": {
13248
  "version": "2.0.0",
13249
  "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
 
14130
  "pbf": "bin/pbf"
14131
  }
14132
  },
14133
+ "node_modules/pdfjs-dist": {
14134
+ "version": "5.4.296",
14135
+ "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
14136
+ "integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==",
14137
+ "license": "Apache-2.0",
14138
+ "engines": {
14139
+ "node": ">=20.16.0 || >=22.3.0"
14140
+ },
14141
+ "optionalDependencies": {
14142
+ "@napi-rs/canvas": "^0.1.80"
14143
+ }
14144
+ },
14145
  "node_modules/performance-now": {
14146
  "version": "2.1.0",
14147
  "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
 
16042
  "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==",
16043
  "license": "MIT"
16044
  },
16045
+ "node_modules/react-pdf": {
16046
+ "version": "10.4.1",
16047
+ "resolved": "https://registry.npmjs.org/react-pdf/-/react-pdf-10.4.1.tgz",
16048
+ "integrity": "sha512-kS/35staVCBqS29verTQJQZXw7RfsRCPO3fdJoW1KXylcv7A9dw6DZ3vJXC2w+bIBgLw5FN4pOFvKSQtkQhPfA==",
16049
+ "license": "MIT",
16050
+ "dependencies": {
16051
+ "clsx": "^2.0.0",
16052
+ "dequal": "^2.0.3",
16053
+ "make-cancellable-promise": "^2.0.0",
16054
+ "make-event-props": "^2.0.0",
16055
+ "merge-refs": "^2.0.0",
16056
+ "pdfjs-dist": "5.4.296",
16057
+ "tiny-invariant": "^1.0.0",
16058
+ "warning": "^4.0.0"
16059
+ },
16060
+ "funding": {
16061
+ "url": "https://github.com/wojtekmaj/react-pdf?sponsor=1"
16062
+ },
16063
+ "peerDependencies": {
16064
+ "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0",
16065
+ "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0",
16066
+ "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
16067
+ },
16068
+ "peerDependenciesMeta": {
16069
+ "@types/react": {
16070
+ "optional": true
16071
+ }
16072
+ }
16073
+ },
16074
  "node_modules/react-plotly.js": {
16075
  "version": "2.6.0",
16076
  "resolved": "https://registry.npmjs.org/react-plotly.js/-/react-plotly.js-2.6.0.tgz",
 
18394
  "integrity": "sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA==",
18395
  "license": "MIT"
18396
  },
18397
+ "node_modules/tiny-invariant": {
18398
+ "version": "1.3.3",
18399
+ "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz",
18400
+ "integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==",
18401
+ "license": "MIT"
18402
+ },
18403
  "node_modules/tinycolor2": {
18404
  "version": "1.6.0",
18405
  "resolved": "https://registry.npmjs.org/tinycolor2/-/tinycolor2-1.6.0.tgz",
 
19079
  "makeerror": "1.0.12"
19080
  }
19081
  },
19082
+ "node_modules/warning": {
19083
+ "version": "4.0.3",
19084
+ "resolved": "https://registry.npmjs.org/warning/-/warning-4.0.3.tgz",
19085
+ "integrity": "sha512-rpJyN222KWIvHJ/F53XSZv0Zl/accqHR8et1kpaMTD/fLCRxtV8iX8czMzY7sVZupTI3zcUTg8eycS2kNF9l6w==",
19086
+ "license": "MIT",
19087
+ "dependencies": {
19088
+ "loose-envify": "^1.0.0"
19089
+ }
19090
+ },
19091
  "node_modules/watchpack": {
19092
  "version": "2.5.1",
19093
  "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-2.5.1.tgz",
frontend/package.json CHANGED
@@ -7,9 +7,11 @@
7
  "@testing-library/jest-dom": "^6.9.1",
8
  "@testing-library/react": "^16.3.2",
9
  "@testing-library/user-event": "^13.5.0",
 
10
  "plotly.js": "^3.4.0",
11
  "react": "^19.2.4",
12
  "react-dom": "^19.2.4",
 
13
  "react-plotly.js": "^2.6.0",
14
  "react-scripts": "5.0.1",
15
  "web-vitals": "^2.1.4"
 
7
  "@testing-library/jest-dom": "^6.9.1",
8
  "@testing-library/react": "^16.3.2",
9
  "@testing-library/user-event": "^13.5.0",
10
+ "katex": "^0.16.39",
11
  "plotly.js": "^3.4.0",
12
  "react": "^19.2.4",
13
  "react-dom": "^19.2.4",
14
+ "react-pdf": "^10.4.1",
15
  "react-plotly.js": "^2.6.0",
16
  "react-scripts": "5.0.1",
17
  "web-vitals": "^2.1.4"
frontend/src/App.css CHANGED
@@ -117,6 +117,7 @@ body {
117
  gap: 0.75rem;
118
  margin-bottom: 0.75rem;
119
  min-height: 480px;
 
120
  }
121
 
122
  .scatter-panel {
@@ -199,6 +200,86 @@ body {
199
  color: #777; font-size: 0.68rem; line-height: 1.3;
200
  }
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  /* Insight bullet points */
203
  .insight-bullets {
204
  margin: 0; padding: 0 0 0 1.2rem;
@@ -218,6 +299,137 @@ body {
218
  font-size: 0.85rem;
219
  }
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  .table-panel {
222
  width: 480px;
223
  flex-shrink: 0;
@@ -227,6 +439,7 @@ body {
227
  display: flex;
228
  flex-direction: column;
229
  overflow: hidden;
 
230
  }
231
 
232
  .table-panel-header {
@@ -390,29 +603,186 @@ body {
390
  @keyframes spin { to { transform: rotate(360deg); } }
391
  .error-screen { text-align: center; padding: 2rem; color: #991b1b; }
392
 
393
- /* Dendrogram */
394
- .dendro-card {
 
 
 
 
 
 
 
 
395
  background: white;
396
  border-radius: 8px;
397
  box-shadow: 0 1px 3px rgba(0,0,0,0.06);
398
- margin-bottom: 0.75rem;
399
  overflow: hidden;
400
  }
401
- .dendro-header {
402
- display: flex; align-items: center; justify-content: space-between;
403
- padding: 0.5rem 0.75rem;
404
- background: #f8fafc;
405
- border-bottom: 1px solid #e5e7eb;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  }
407
- .dendro-title {
408
- font-size: 0.82rem; font-weight: 600; color: #444;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  }
410
- .dendro-subtitle {
411
- font-size: 0.72rem; color: #888;
 
 
 
 
 
 
 
 
 
412
  }
413
- .dendro-loading {
414
- padding: 2rem; text-align: center;
415
- font-size: 0.85rem; color: #667eea; font-style: italic;
 
 
 
 
 
 
416
  }
417
 
418
  /* Footer */
 
117
  gap: 0.75rem;
118
  margin-bottom: 0.75rem;
119
  min-height: 480px;
120
+ align-items: flex-start;
121
  }
122
 
123
  .scatter-panel {
 
200
  color: #777; font-size: 0.68rem; line-height: 1.3;
201
  }
202
 
203
+ /* Tooltip (fixed position, renders via portal to body) */
204
+ .tooltip-wrapper {
205
+ position: relative; display: inline;
206
+ }
207
+ .tooltip-bubble-fixed {
208
+ position: fixed;
209
+ background: #1e293b; color: #f1f5f9;
210
+ font-size: 0.73rem; line-height: 1.5; font-weight: 400;
211
+ padding: 0.6rem 0.8rem; border-radius: 8px;
212
+ white-space: normal;
213
+ box-shadow: 0 8px 24px rgba(0,0,0,0.25);
214
+ z-index: 10000; pointer-events: none;
215
+ max-width: 90vw;
216
+ }
217
+ .tooltip-bubble-fixed.tooltip-wide { width: 320px; }
218
+
219
+ /* Entity color coding in insights */
220
+ .entity-tag { font-weight: 600; border-radius: 2px; cursor: default; }
221
+ .entity-paper { color: #4338ca; }
222
+ .entity-architecture {
223
+ color: #0369a1; background: #e0f2fe; padding: 0 0.15rem; border-radius: 3px;
224
+ }
225
+ .entity-technique {
226
+ color: #065f46; background: #d1fae5; padding: 0 0.15rem; border-radius: 3px;
227
+ }
228
+ .entity-gripper {
229
+ color: #92400e; background: #fef3c7; padding: 0 0.15rem; border-radius: 3px;
230
+ }
231
+
232
+ /* Chart help button */
233
+ .chart-help, .explanation-help {
234
+ display: inline-flex; align-items: center; justify-content: center;
235
+ width: 16px; height: 16px; border-radius: 50%;
236
+ background: #e8ecf1; color: #667eea; font-size: 0.6rem; font-weight: 700;
237
+ margin-left: 0.4rem; cursor: help; vertical-align: middle;
238
+ }
239
+ .chart-help:hover, .explanation-help:hover { background: #667eea; color: white; }
240
+
241
+ /* Query Explanation Section */
242
+ .query-explanation {
243
+ background: white; border-radius: 8px;
244
+ box-shadow: 0 1px 3px rgba(0,0,0,0.06);
245
+ padding: 1.2rem 1.5rem; margin: 0.75rem 0;
246
+ border-left: 4px solid #667eea;
247
+ }
248
+ .explanation-header {
249
+ display: flex; align-items: center; margin-bottom: 0.75rem;
250
+ }
251
+ .explanation-title {
252
+ font-size: 0.95rem; font-weight: 700; color: #222;
253
+ }
254
+ .explanation-steps {
255
+ display: flex; flex-direction: column; gap: 0.4rem;
256
+ }
257
+ .explanation-step {
258
+ display: flex; gap: 0.6rem; align-items: flex-start;
259
+ }
260
+ .step-number {
261
+ width: 22px; height: 22px; border-radius: 50%;
262
+ background: #667eea; color: white;
263
+ font-size: 0.68rem; font-weight: 700;
264
+ display: flex; align-items: center; justify-content: center;
265
+ flex-shrink: 0; margin-top: 0.05rem;
266
+ }
267
+ .step-content {
268
+ flex: 1;
269
+ }
270
+ .step-label {
271
+ font-size: 0.88rem; font-weight: 600; color: #333;
272
+ display: block; margin-bottom: 0.15rem;
273
+ }
274
+ .step-detail {
275
+ font-size: 0.82rem; color: #555; line-height: 1.6;
276
+ }
277
+ .inline-term {
278
+ color: #065f46; background: #d1fae5; padding: 0.05rem 0.25rem;
279
+ border-radius: 3px; font-weight: 600; font-size: 0.82rem;
280
+ cursor: help; border-bottom: 1px dashed #065f46;
281
+ }
282
+
283
  /* Insight bullet points */
284
  .insight-bullets {
285
  margin: 0; padding: 0 0 0 1.2rem;
 
299
  font-size: 0.85rem;
300
  }
301
 
302
+ /* Paper references in insights */
303
+ .paper-reference {
304
+ color: #4338ca; font-weight: 700;
305
+ }
306
+
307
+ /* Paper Evidence Panel */
308
+ .paper-evidence-panel {
309
+ margin-top: 0.6rem;
310
+ border-top: 1px solid #e8ecf1;
311
+ padding-top: 0.5rem;
312
+ }
313
+ .evidence-header {
314
+ display: flex; justify-content: space-between; align-items: center;
315
+ margin-bottom: 0.5rem;
316
+ }
317
+ .evidence-title {
318
+ font-size: 0.82rem; font-weight: 700; color: #333;
319
+ }
320
+ .evidence-count {
321
+ font-size: 0.7rem; color: #888;
322
+ }
323
+ .evidence-paper-list {
324
+ display: flex; flex-direction: column; gap: 0.35rem;
325
+ }
326
+ .evidence-paper-row {
327
+ display: flex; align-items: center; justify-content: space-between;
328
+ padding: 0.5rem 0.7rem; border-radius: 6px;
329
+ border: 1px solid #e8ecf1; background: #fafaff;
330
+ transition: border-color 0.15s;
331
+ }
332
+ .evidence-paper-row:hover { border-color: #667eea; }
333
+ .evidence-paper-info {
334
+ display: flex; align-items: center; gap: 0.6rem; flex: 1; min-width: 0;
335
+ }
336
+ .evidence-paper-name {
337
+ font-size: 0.8rem; font-weight: 600; color: #333;
338
+ white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
339
+ }
340
+ .evidence-paper-score {
341
+ font-size: 0.72rem; font-weight: 600; color: #667eea;
342
+ white-space: nowrap; background: #f0f2ff; padding: 0.1rem 0.4rem;
343
+ border-radius: 8px; flex-shrink: 0;
344
+ }
345
+ .evidence-detail {
346
+ flex: 1; border-radius: 8px;
347
+ padding: 0.8rem 1rem; border: 1px solid #e2e6f0;
348
+ overflow-y: auto; max-height: 320px;
349
+ }
350
+ .evidence-detail.paper-style {
351
+ background: #fffef8;
352
+ border: 1px solid #d4d0c8;
353
+ box-shadow: 2px 2px 8px rgba(0,0,0,0.06);
354
+ }
355
+ .paper-title-bar {
356
+ display: flex; justify-content: space-between; align-items: flex-start;
357
+ margin-bottom: 0.3rem;
358
+ }
359
+ .paper-title-text {
360
+ font-family: 'Georgia', 'Times New Roman', serif;
361
+ font-size: 0.95rem; font-weight: 700; color: #1a1a1a; line-height: 1.3;
362
+ max-width: 75%;
363
+ }
364
+ .paper-relevance-badge {
365
+ font-size: 0.68rem; font-weight: 600; color: #667eea;
366
+ white-space: nowrap; background: #f0f2ff; padding: 0.15rem 0.5rem;
367
+ border-radius: 10px; flex-shrink: 0;
368
+ }
369
+ .paper-section-tag {
370
+ font-size: 0.72rem; color: #666; font-style: italic;
371
+ margin-bottom: 0.3rem;
372
+ }
373
+ .paper-divider {
374
+ height: 1px; background: #c8c4b8; margin: 0.4rem 0 0.6rem 0;
375
+ }
376
+ .evidence-detail-text {
377
+ font-size: 0.82rem; color: #333; line-height: 1.75;
378
+ word-break: break-word;
379
+ }
380
+ .evidence-text-body {
381
+ font-family: 'Georgia', 'Times New Roman', serif;
382
+ letter-spacing: 0.01em;
383
+ }
384
+ .rag-highlight {
385
+ background: linear-gradient(120deg, #fef08a 0%, #fde047 100%);
386
+ padding: 0.08rem 0.2rem; border-radius: 3px;
387
+ font-weight: 600; color: #333;
388
+ box-shadow: 0 1px 2px rgba(0,0,0,0.05);
389
+ }
390
+ .latex-inline { display: inline; vertical-align: baseline; }
391
+ .latex-block {
392
+ display: block; text-align: center;
393
+ margin: 0.5rem 0; padding: 0.3rem;
394
+ background: #f8f9fa; border-radius: 4px;
395
+ }
396
+ .latex-fallback {
397
+ font-family: 'Courier New', monospace; font-size: 0.78rem;
398
+ background: #f0f0f0; padding: 0.1rem 0.3rem; border-radius: 3px;
399
+ }
400
+ .evidence-keywords {
401
+ display: flex; flex-wrap: wrap; gap: 0.3rem; margin-top: 0.5rem;
402
+ padding-top: 0.4rem; border-top: 1px solid #e8ecf1;
403
+ align-items: center;
404
+ }
405
+ .evidence-keywords-label {
406
+ font-size: 0.68rem; color: #888; font-weight: 500; margin-right: 0.2rem;
407
+ }
408
+ .evidence-keyword-tag {
409
+ font-size: 0.67rem; background: #fef9c3; color: #854d0e;
410
+ padding: 0.12rem 0.45rem; border-radius: 10px; font-weight: 500;
411
+ }
412
+ .evidence-show-more {
413
+ background: none; border: none; color: #667eea; font-size: 0.72rem;
414
+ cursor: pointer; padding: 0.3rem 0; font-weight: 500; text-align: left;
415
+ }
416
+ .evidence-show-more:hover { text-decoration: underline; }
417
+
418
+ /* Tool Results */
419
+ .tool-results-list { margin-top: 0.3rem; }
420
+ .tool-result-item {
421
+ background: #f7f8fc; border-radius: 6px; padding: 0.4rem 0.6rem;
422
+ margin-bottom: 0.3rem; border-left: 3px solid #48bb78;
423
+ }
424
+ .tool-result-name {
425
+ font-size: 0.75rem; font-weight: 600; color: #333; margin-bottom: 0.2rem;
426
+ }
427
+ .tool-result-data {
428
+ font-size: 0.7rem; color: #555; line-height: 1.4;
429
+ margin: 0; white-space: pre-wrap; max-height: 6rem; overflow-y: auto;
430
+ background: #fff; padding: 0.3rem; border-radius: 4px;
431
+ }
432
+
433
  .table-panel {
434
  width: 480px;
435
  flex-shrink: 0;
 
439
  display: flex;
440
  flex-direction: column;
441
  overflow: hidden;
442
+ max-height: 520px;
443
  }
444
 
445
  .table-panel-header {
 
603
  @keyframes spin { to { transform: rotate(360deg); } }
604
  .error-screen { text-align: center; padding: 2rem; color: #991b1b; }
605
 
606
+ /* Analytics Dashboard */
607
+ .analytics-dashboard {
608
+ margin: 0.75rem 0;
609
+ }
610
+ .analytics-grid {
611
+ display: grid;
612
+ grid-template-columns: 1fr 1fr;
613
+ gap: 0.75rem;
614
+ }
615
+ .analytics-card {
616
  background: white;
617
  border-radius: 8px;
618
  box-shadow: 0 1px 3px rgba(0,0,0,0.06);
619
+ padding: 0.85rem 1rem;
620
  overflow: hidden;
621
  }
622
+ .analytics-card-title {
623
+ font-size: 0.82rem;
624
+ font-weight: 700;
625
+ color: #222;
626
+ margin: 0 0 0.15rem 0;
627
+ }
628
+ .analytics-card-subtitle {
629
+ font-size: 0.7rem;
630
+ color: #888;
631
+ margin: 0 0 0.5rem 0;
632
+ line-height: 1.3;
633
+ }
634
+ .topic-cloud {
635
+ display: flex;
636
+ flex-wrap: wrap;
637
+ gap: 0.4rem;
638
+ padding: 0.3rem 0;
639
+ }
640
+ .topic-tag {
641
+ padding: 0.3rem 0.7rem;
642
+ border-radius: 14px;
643
+ font-weight: 500;
644
+ display: inline-flex;
645
+ align-items: center;
646
+ gap: 0.3rem;
647
+ transition: transform 0.1s;
648
+ }
649
+ .topic-tag:hover { transform: scale(1.05); }
650
+ .topic-count {
651
+ font-size: 0.6rem;
652
+ padding: 0.08rem 0.35rem;
653
+ border-radius: 8px;
654
+ font-weight: 700;
655
+ }
656
+ /* Stacked bar for evidence types */
657
+ .evidence-type-section {
658
+ margin-bottom: 0.7rem;
659
+ }
660
+ .evidence-type-label {
661
+ font-size: 0.72rem; font-weight: 600; color: #555;
662
+ display: block; margin-bottom: 0.3rem;
663
+ }
664
+ .stacked-bar {
665
+ display: flex; height: 28px; border-radius: 6px; overflow: hidden;
666
+ box-shadow: inset 0 1px 2px rgba(0,0,0,0.05);
667
+ }
668
+ .stacked-bar-segment {
669
+ display: flex; align-items: center; justify-content: center;
670
+ min-width: 4px; transition: opacity 0.15s;
671
+ }
672
+ .stacked-bar-segment:hover { opacity: 0.85; }
673
+ .segment-label {
674
+ font-size: 0.65rem; color: white; font-weight: 600;
675
+ text-shadow: 0 1px 2px rgba(0,0,0,0.2);
676
+ white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
677
+ padding: 0 0.3rem;
678
+ }
679
+ .evidence-type-legend {
680
+ display: flex; flex-wrap: wrap; gap: 0.5rem; margin-top: 0.3rem;
681
+ }
682
+ .legend-item {
683
+ font-size: 0.68rem; color: #555;
684
+ display: flex; align-items: center; gap: 0.25rem;
685
+ }
686
+ .legend-dot {
687
+ width: 8px; height: 8px; border-radius: 2px; flex-shrink: 0;
688
+ }
689
+
690
+ .cited-refs-note {
691
+ font-size: 0.68rem; color: #888; margin-top: 0.3rem;
692
+ font-style: italic;
693
+ }
694
+
695
+ /* View PDF button */
696
+ .evidence-actions-row {
697
+ display: flex; justify-content: space-between; align-items: flex-start;
698
+ margin-top: 0.5rem; padding-top: 0.4rem; border-top: 1px solid #e8ecf1;
699
+ flex-wrap: wrap; gap: 0.4rem;
700
+ }
701
+ .view-pdf-btn {
702
+ background: #667eea; color: white; border: none; border-radius: 6px;
703
+ padding: 0.35rem 0.8rem; font-size: 0.75rem; font-weight: 600;
704
+ cursor: pointer; white-space: nowrap; flex-shrink: 0;
705
+ }
706
+ .view-pdf-btn:hover { background: #5a6fd6; }
707
+
708
+ /* PDF Viewer Modal */
709
+ .pdf-viewer-overlay {
710
+ position: fixed; top: 0; left: 0; right: 0; bottom: 0;
711
+ background: rgba(0,0,0,0.6); z-index: 9999;
712
+ display: flex; align-items: center; justify-content: center;
713
+ }
714
+ .pdf-viewer-modal {
715
+ background: white; border-radius: 10px;
716
+ width: 90vw; max-width: 860px; height: 88vh;
717
+ display: flex; flex-direction: column;
718
+ box-shadow: 0 20px 60px rgba(0,0,0,0.3);
719
+ overflow: hidden;
720
+ }
721
+ .pdf-viewer-header {
722
+ display: flex; justify-content: space-between; align-items: center;
723
+ padding: 0.6rem 1rem; background: #1e293b; color: white;
724
+ flex-shrink: 0;
725
+ }
726
+ .pdf-viewer-title {
727
+ font-size: 0.85rem; font-weight: 600; text-transform: capitalize;
728
+ overflow: hidden; text-overflow: ellipsis; white-space: nowrap;
729
+ max-width: 40%;
730
+ }
731
+ .pdf-viewer-controls {
732
+ display: flex; align-items: center; gap: 0.5rem;
733
  }
734
+ .pdf-nav-btn {
735
+ background: rgba(255,255,255,0.15); color: white; border: none;
736
+ border-radius: 4px; padding: 0.3rem 0.6rem; font-size: 0.75rem;
737
+ cursor: pointer;
738
+ }
739
+ .pdf-nav-btn:hover { background: rgba(255,255,255,0.25); }
740
+ .pdf-nav-btn:disabled { opacity: 0.3; cursor: default; }
741
+ .pdf-page-info { font-size: 0.75rem; color: #cbd5e1; }
742
+ .pdf-close-btn {
743
+ background: rgba(255,255,255,0.15); color: white; border: none;
744
+ border-radius: 4px; padding: 0.2rem 0.6rem; font-size: 1.1rem;
745
+ cursor: pointer; margin-left: 0.5rem;
746
+ }
747
+ .pdf-close-btn:hover { background: #e53e3e; }
748
+ .pdf-keywords-bar {
749
+ padding: 0.4rem 1rem; background: #fef9c3;
750
+ font-size: 0.73rem; color: #854d0e;
751
+ display: flex; align-items: center; gap: 0.4rem; flex-wrap: wrap;
752
+ flex-shrink: 0;
753
+ }
754
+ .pdf-kw-tag {
755
+ background: #fde047; padding: 0.1rem 0.4rem; border-radius: 8px;
756
+ font-weight: 600; font-size: 0.68rem;
757
+ }
758
+ .pdf-viewer-content {
759
+ flex: 1; overflow-y: auto; display: flex; justify-content: center;
760
+ padding: 1rem; background: #f1f5f9;
761
+ }
762
+ .pdf-loading, .pdf-error {
763
+ padding: 2rem; text-align: center; font-size: 0.9rem; color: #666;
764
  }
765
+ .pdf-error { color: #e53e3e; }
766
+
767
+ /* Keyword highlights on the PDF text layer */
768
+ .pdf-keyword-highlight {
769
+ background: rgba(254, 224, 71, 0.5) !important;
770
+ border-radius: 2px;
771
+ }
772
+
773
+ /* react-pdf text layer base styles */
774
+ .react-pdf__Page__textContent {
775
+ opacity: 0.4;
776
  }
777
+ .react-pdf__Page__textContent span.pdf-keyword-highlight {
778
+ opacity: 1;
779
+ background: rgba(254, 224, 71, 0.6) !important;
780
+ }
781
+
782
+ @media (max-width: 900px) {
783
+ .analytics-grid {
784
+ grid-template-columns: 1fr;
785
+ }
786
  }
787
 
788
  /* Footer */
frontend/src/App.js CHANGED
@@ -5,7 +5,8 @@ import ClusterOverview from './components/ClusterOverview';
5
  import ScatterPlot from './components/ScatterPlot';
6
  import MethodTable from './components/MethodTable';
7
  import DetailPanel from './components/DetailPanel';
8
- import Dendrogram from './components/Dendrogram';
 
9
  import './App.css';
10
 
11
  function App() {
@@ -169,6 +170,7 @@ function App() {
169
  <InsightCard
170
  suggestion={suggestion}
171
  weights={weights}
 
172
  onClose={() => setSuggestion(null)}
173
  />
174
  )}
@@ -222,7 +224,14 @@ function App() {
222
  />
223
  </div>
224
 
225
- <Dendrogram />
 
 
 
 
 
 
 
226
 
227
  <DetailPanel point={selectedPoint} onClose={() => setSelectedPoint(null)} />
228
 
 
5
  import ScatterPlot from './components/ScatterPlot';
6
  import MethodTable from './components/MethodTable';
7
  import DetailPanel from './components/DetailPanel';
8
+ import AnalyticsDashboard from './components/AnalyticsDashboard';
9
+ import QueryExplanation from './components/QueryExplanation';
10
  import './App.css';
11
 
12
  function App() {
 
170
  <InsightCard
171
  suggestion={suggestion}
172
  weights={weights}
173
+ query={query}
174
  onClose={() => setSuggestion(null)}
175
  />
176
  )}
 
224
  />
225
  </div>
226
 
227
+ <QueryExplanation
228
+ suggestion={suggestion}
229
+ query={query}
230
+ data={data}
231
+ clusterStats={clusterStats}
232
+ />
233
+
234
+ <AnalyticsDashboard suggestion={suggestion} />
235
 
236
  <DetailPanel point={selectedPoint} onClose={() => setSelectedPoint(null)} />
237
 
frontend/src/components/AnalyticsDashboard.js ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import Plot from 'react-plotly.js';
3
+ import Tooltip from './Tooltip';
4
+
5
+ // Human-friendly labels for content types and rhetorical roles
6
+ const CONTENT_TYPE_LABELS = {
7
+ theory: 'How It Works',
8
+ implementation: 'How To Build It',
9
+ evaluation: 'How It Performs',
10
+ general: 'General',
11
+ };
12
+
13
+ const ROLE_LABELS = {
14
+ algorithm_description: 'Method Design',
15
+ experimental_setup: 'Experiment Setup',
16
+ result: 'Results & Metrics',
17
+ comparison: 'Comparisons',
18
+ problem_statement: 'Problem Definition',
19
+ limitation: 'Limitations',
20
+ definition: 'Definitions',
21
+ general: 'General',
22
+ };
23
+
24
+ const CONTENT_COLORS = {
25
+ 'How It Works': '#667eea',
26
+ 'How To Build It': '#48bb78',
27
+ 'How It Performs': '#ed8936',
28
+ 'General': '#a0aec0',
29
+ };
30
+
31
+ const ROLE_COLORS = {
32
+ 'Method Design': '#667eea',
33
+ 'Experiment Setup': '#48bb78',
34
+ 'Results & Metrics': '#ed8936',
35
+ 'Comparisons': '#e53e3e',
36
+ 'Problem Definition': '#9f7aea',
37
+ 'Limitations': '#dd6b20',
38
+ 'Definitions': '#38b2ac',
39
+ 'General': '#a0aec0',
40
+ };
41
+
42
+ function MethodRelevanceChart({ methodRelevance }) {
43
+ if (!methodRelevance || methodRelevance.length === 0) return null;
44
+ const top = methodRelevance.slice(0, 10);
45
+ const names = top.map(m => m.name.length > 35 ? m.name.slice(0, 33) + '...' : m.name);
46
+ const scores = top.map(m => m.score);
47
+ const maxScore = Math.max(...scores);
48
+
49
+ return (
50
+ <div className="analytics-card">
51
+ <h3 className="analytics-card-title">
52
+ Query-Method Similarity
53
+ <Tooltip text="Each method's text description was converted to a vector using a sentence-transformer model, then compared to your query vector using cosine similarity. Higher scores mean the method's description is more semantically related to what you asked." wide>
54
+ <span className="chart-help">?</span>
55
+ </Tooltip>
56
+ </h3>
57
+ <p className="analytics-card-subtitle">How closely each method's description matches your query</p>
58
+ <Plot
59
+ data={[{
60
+ type: 'bar',
61
+ x: scores,
62
+ y: names,
63
+ orientation: 'h',
64
+ marker: {
65
+ color: scores.map(s => {
66
+ const ratio = s / maxScore;
67
+ return ratio > 0.9 ? '#667eea' : ratio > 0.7 ? '#7c8ef2' : '#b4bff8';
68
+ }),
69
+ line: { width: 0 },
70
+ },
71
+ text: scores.map(s => (s * 100).toFixed(0) + '%'),
72
+ textposition: 'outside',
73
+ textfont: { size: 11, color: '#555' },
74
+ hovertemplate: '%{y}: %{x:.1%}<extra></extra>',
75
+ }]}
76
+ layout={{
77
+ margin: { l: 210, r: 50, t: 5, b: 25 },
78
+ height: Math.max(180, top.length * 26),
79
+ xaxis: {
80
+ title: { text: 'Cosine Similarity', font: { size: 10, color: '#888' } },
81
+ range: [0, maxScore * 1.2],
82
+ showgrid: true, gridcolor: '#f0f0f0',
83
+ },
84
+ yaxis: { autorange: 'reversed', tickfont: { size: 11 } },
85
+ paper_bgcolor: 'transparent',
86
+ plot_bgcolor: 'transparent',
87
+ }}
88
+ config={{ displayModeBar: false, responsive: true }}
89
+ style={{ width: '100%' }}
90
+ />
91
+ </div>
92
+ );
93
+ }
94
+
95
+ function PaperSourcesChart({ paperSources }) {
96
+ if (!paperSources || paperSources.length === 0) return null;
97
+ const names = paperSources.map(p => p.name.length > 35 ? p.name.slice(0, 33) + '...' : p.name);
98
+ const counts = paperSources.map(p => p.count);
99
+
100
+ return (
101
+ <div className="analytics-card">
102
+ <h3 className="analytics-card-title">
103
+ Papers Referenced
104
+ <Tooltip text="When you ask a question, the system searches a vector database of 1,074 text chunks extracted from 34 research papers. This chart shows which papers had the most passages matching your query. More passages means the paper is more relevant to your question." wide>
105
+ <span className="chart-help">?</span>
106
+ </Tooltip>
107
+ </h3>
108
+ <p className="analytics-card-subtitle">Number of relevant passages retrieved from each paper</p>
109
+ <Plot
110
+ data={[{
111
+ type: 'bar',
112
+ x: counts,
113
+ y: names,
114
+ orientation: 'h',
115
+ marker: { color: '#48bb78', line: { width: 0 } },
116
+ text: counts.map(String),
117
+ textposition: 'outside',
118
+ textfont: { size: 11, color: '#555' },
119
+ hovertemplate: '%{y}: %{x} passages<extra></extra>',
120
+ }]}
121
+ layout={{
122
+ margin: { l: 220, r: 40, t: 5, b: 25 },
123
+ height: Math.max(140, paperSources.length * 32),
124
+ xaxis: {
125
+ title: { text: 'Passages Found', font: { size: 10, color: '#888' } },
126
+ dtick: 1, showgrid: true, gridcolor: '#f0f0f0',
127
+ },
128
+ yaxis: { autorange: 'reversed', tickfont: { size: 11 } },
129
+ paper_bgcolor: 'transparent',
130
+ plot_bgcolor: 'transparent',
131
+ }}
132
+ config={{ displayModeBar: false, responsive: true }}
133
+ style={{ width: '100%' }}
134
+ />
135
+ </div>
136
+ );
137
+ }
138
+
139
+ function DomainTopicsChart({ domainTopics }) {
140
+ if (!domainTopics || domainTopics.length === 0) return null;
141
+ const top = domainTopics.slice(0, 12);
142
+
143
+ return (
144
+ <div className="analytics-card">
145
+ <h3 className="analytics-card-title">
146
+ Key Topics in Evidence
147
+ <Tooltip text="Each retrieved paper passage was scanned for domain-specific technical terms (like 'point cloud', 'gripper', '6-DoF'). Larger, darker tags appear more frequently across the evidence, showing what concepts dominate the retrieved content." wide>
148
+ <span className="chart-help">?</span>
149
+ </Tooltip>
150
+ </h3>
151
+ <p className="analytics-card-subtitle">Technical terms found across retrieved paper passages</p>
152
+ <div className="topic-cloud">
153
+ {top.map((t, i) => {
154
+ const ratio = t.count / top[0].count;
155
+ return (
156
+ <span
157
+ key={i}
158
+ className="topic-tag"
159
+ style={{
160
+ fontSize: `${0.72 + ratio * 0.4}rem`,
161
+ background: ratio > 0.6 ? '#667eea' : ratio > 0.3 ? '#e8ecf1' : '#f7f8fc',
162
+ color: ratio > 0.6 ? 'white' : '#4a5568',
163
+ }}
164
+ >
165
+ {t.topic}
166
+ <span className="topic-count" style={{
167
+ background: ratio > 0.6 ? 'rgba(255,255,255,0.3)' : '#667eea',
168
+ color: 'white',
169
+ }}>{t.count}</span>
170
+ </span>
171
+ );
172
+ })}
173
+ </div>
174
+ </div>
175
+ );
176
+ }
177
+
178
+ function EvidenceTypeChart({ contentTypes, rhetoricalRoles }) {
179
+ if ((!contentTypes || contentTypes.length === 0) &&
180
+ (!rhetoricalRoles || rhetoricalRoles.length === 0)) return null;
181
+
182
+ // Build stacked bar for "What kind of evidence did we find?"
183
+ const typeData = (contentTypes || []).map(c => ({
184
+ label: CONTENT_TYPE_LABELS[c.type] || c.type,
185
+ count: c.count,
186
+ color: CONTENT_COLORS[CONTENT_TYPE_LABELS[c.type]] || '#cbd5e0',
187
+ }));
188
+
189
+ const roleData = (rhetoricalRoles || []).map(r => ({
190
+ label: ROLE_LABELS[r.role] || r.role,
191
+ count: r.count,
192
+ color: ROLE_COLORS[ROLE_LABELS[r.role]] || '#cbd5e0',
193
+ }));
194
+
195
+ const totalChunks = typeData.reduce((sum, d) => sum + d.count, 0) || 1;
196
+
197
+ return (
198
+ <div className="analytics-card">
199
+ <h3 className="analytics-card-title">
200
+ What Kind of Evidence?
201
+ <Tooltip text="Each paper passage is automatically classified by what it describes. 'How It Works' covers algorithms and math. 'How To Build It' covers training details and implementation. 'How It Performs' covers experimental results and benchmarks. This shows what type of content the system found for your query." wide>
202
+ <span className="chart-help">?</span>
203
+ </Tooltip>
204
+ </h3>
205
+ <p className="analytics-card-subtitle">Breakdown of retrieved content by type and purpose</p>
206
+
207
+ {typeData.length > 0 && (
208
+ <div className="evidence-type-section">
209
+ <span className="evidence-type-label">Content Focus</span>
210
+ <div className="stacked-bar">
211
+ {typeData.map((d, i) => (
212
+ <div
213
+ key={i}
214
+ className="stacked-bar-segment"
215
+ style={{
216
+ width: `${(d.count / totalChunks) * 100}%`,
217
+ background: d.color,
218
+ }}
219
+ title={`${d.label}: ${d.count} passages`}
220
+ >
221
+ {d.count / totalChunks > 0.15 && (
222
+ <span className="segment-label">{d.label}</span>
223
+ )}
224
+ </div>
225
+ ))}
226
+ </div>
227
+ <div className="evidence-type-legend">
228
+ {typeData.map((d, i) => (
229
+ <span key={i} className="legend-item">
230
+ <span className="legend-dot" style={{ background: d.color }}></span>
231
+ {d.label} ({d.count})
232
+ </span>
233
+ ))}
234
+ </div>
235
+ </div>
236
+ )}
237
+
238
+ {roleData.length > 0 && (
239
+ <div className="evidence-type-section">
240
+ <span className="evidence-type-label">Paper Section Purpose</span>
241
+ <div className="stacked-bar">
242
+ {roleData.map((d, i) => (
243
+ <div
244
+ key={i}
245
+ className="stacked-bar-segment"
246
+ style={{
247
+ width: `${(d.count / totalChunks) * 100}%`,
248
+ background: d.color,
249
+ }}
250
+ title={`${d.label}: ${d.count} passages`}
251
+ >
252
+ {d.count / totalChunks > 0.15 && (
253
+ <span className="segment-label">{d.label}</span>
254
+ )}
255
+ </div>
256
+ ))}
257
+ </div>
258
+ <div className="evidence-type-legend">
259
+ {roleData.map((d, i) => (
260
+ <span key={i} className="legend-item">
261
+ <span className="legend-dot" style={{ background: d.color }}></span>
262
+ {d.label} ({d.count})
263
+ </span>
264
+ ))}
265
+ </div>
266
+ </div>
267
+ )}
268
+ </div>
269
+ );
270
+ }
271
+
272
+ function CitedReferencesChart({ citedReferences }) {
273
+ if (!citedReferences || citedReferences.length === 0) return null;
274
+
275
+ // Filter to only author-year citations (skip numbered [1], [2] which are noisy)
276
+ const authorCites = citedReferences.filter(r => !r.name.startsWith('['));
277
+ const numbered = citedReferences.filter(r => r.name.startsWith('['));
278
+
279
+ const toShow = authorCites.length > 0 ? authorCites.slice(0, 10) : numbered.slice(0, 10);
280
+ if (toShow.length === 0) return null;
281
+
282
+ const names = toShow.map(r => r.name);
283
+ const counts = toShow.map(r => r.count);
284
+
285
+ return (
286
+ <div className="analytics-card">
287
+ <h3 className="analytics-card-title">
288
+ Cited References in Evidence
289
+ <Tooltip text="These are academic papers that were cited WITHIN the retrieved passages. For example, if a retrieved chunk says 'as shown by (Smith et al., 2022)', that reference is counted here. This reveals which foundational works are most relevant to your query, even papers outside our 56-method dataset." wide>
290
+ <span className="chart-help">?</span>
291
+ </Tooltip>
292
+ </h3>
293
+ <p className="analytics-card-subtitle">Papers referenced inside the retrieved evidence passages</p>
294
+ <Plot
295
+ data={[{
296
+ type: 'bar',
297
+ x: counts,
298
+ y: names,
299
+ orientation: 'h',
300
+ marker: { color: '#9f7aea', line: { width: 0 } },
301
+ text: counts.map(c => `${c}x`),
302
+ textposition: 'outside',
303
+ textfont: { size: 11, color: '#555' },
304
+ hovertemplate: '%{y}: cited %{x} times<extra></extra>',
305
+ }]}
306
+ layout={{
307
+ margin: { l: 180, r: 40, t: 5, b: 25 },
308
+ height: Math.max(140, toShow.length * 28),
309
+ xaxis: {
310
+ title: { text: 'Times Cited', font: { size: 10, color: '#888' } },
311
+ dtick: 1, showgrid: true, gridcolor: '#f0f0f0',
312
+ },
313
+ yaxis: { autorange: 'reversed', tickfont: { size: 11 } },
314
+ paper_bgcolor: 'transparent',
315
+ plot_bgcolor: 'transparent',
316
+ }}
317
+ config={{ displayModeBar: false, responsive: true }}
318
+ style={{ width: '100%' }}
319
+ />
320
+ {toShow.length > 0 && toShow[0].source_papers && (
321
+ <div className="cited-refs-note">
322
+ Found across: {[...new Set(toShow.flatMap(r => r.source_papers))].slice(0, 3).join(', ')}
323
+ {[...new Set(toShow.flatMap(r => r.source_papers))].length > 3 && ' and more'}
324
+ </div>
325
+ )}
326
+ </div>
327
+ );
328
+ }
329
+
330
+ export default function AnalyticsDashboard({ suggestion }) {
331
+ if (!suggestion) return null;
332
+
333
+ const analytics = suggestion.ragAnalytics || {};
334
+ const methodRelevance = suggestion.methodRelevance || [];
335
+ const hasData = methodRelevance.length > 0 ||
336
+ (analytics.paperSources && analytics.paperSources.length > 0);
337
+
338
+ if (!hasData) return null;
339
+
340
+ return (
341
+ <div className="analytics-dashboard">
342
+ <div className="analytics-grid">
343
+ <MethodRelevanceChart methodRelevance={methodRelevance} />
344
+ <CitedReferencesChart citedReferences={analytics.citedReferences} />
345
+ <PaperSourcesChart paperSources={analytics.paperSources} />
346
+ <DomainTopicsChart domainTopics={analytics.domainTopics} />
347
+ <EvidenceTypeChart
348
+ contentTypes={analytics.contentTypes}
349
+ rhetoricalRoles={analytics.rhetoricalRoles}
350
+ />
351
+ </div>
352
+ </div>
353
+ );
354
+ }
frontend/src/components/InsightBullets.js CHANGED
@@ -1,4 +1,101 @@
1
  import React from 'react';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  export default function InsightBullets({ text }) {
4
  if (!text) return null;
@@ -7,10 +104,10 @@ export default function InsightBullets({ text }) {
7
  return (
8
  <ul className="insight-bullets">
9
  {bullets.map((line, i) => (
10
- <li key={i}>{line.replace(/^-\s*/, '')}</li>
11
  ))}
12
  </ul>
13
  );
14
  }
15
- return <p>{text}</p>;
16
  }
 
1
  import React from 'react';
2
+ import Tooltip from './Tooltip';
3
+
4
// ---------------------------------------------------------------------------
// Entity glossaries: technical terms to detect in insight text, each mapped
// to a short tooltip definition. Grouped by category so matched terms can be
// styled differently (entity-architecture / entity-technique / entity-gripper).
// ---------------------------------------------------------------------------

// Neural-network architecture names.
const ARCHITECTURE_TERMS = {
  'PointNet++': 'Hierarchical extension of PointNet that captures local geometric structures at multiple scales',
  'PointNet': 'Neural network that directly processes 3D point clouds for classification and segmentation',
  'ResNet': 'Deep residual network using skip connections for image recognition',
  'VGG': 'Deep convolutional network known for small 3x3 filters',
  'CNN': 'Convolutional Neural Network, processes grid-like data using learned spatial filters',
  'transformer': 'Attention-based architecture that processes sequences in parallel',
  'VAE': 'Variational Autoencoder, generates diverse outputs via a probabilistic latent space',
  'GAN': 'Generative Adversarial Network, generates outputs through adversarial training',
  'U-Net': 'Encoder-decoder with skip connections for segmentation',
  'MLP': 'Multi-Layer Perceptron, a basic feedforward neural network',
  'diffusion model': 'Generative model that reverses a noise process to create diverse samples',
};

// Algorithms, representations, and evaluation techniques.
const TECHNIQUE_TERMS = {
  'UMAP': 'Reduces high-dimensional data to 2D for visualization while preserving structure',
  'HDBSCAN': 'Density-based clustering that finds natural groups without specifying count',
  'TF-IDF': 'Text representation weighting terms by importance across documents',
  'cosine similarity': 'Measures angle between vectors to compare text or feature embeddings',
  'sentence-transformer': 'Neural model converting sentences into vectors capturing meaning',
  '6-DoF': 'Six Degrees of Freedom: 3D position (x,y,z) + orientation (roll,pitch,yaw)',
  '7-DoF': 'Seven Degrees of Freedom: 6-DoF plus gripper width or approach angle',
  'sim-to-real': 'Transferring simulation-trained models to real robots',
  'point cloud': 'Set of 3D points representing object surfaces from depth sensors',
  'TSDF': 'Truncated Signed Distance Function, a volumetric 3D scene representation',
  'RGBD': 'Color image (RGB) + depth channel (D), giving appearance and geometry',
  'antipodal grasp': 'Grasp with two fingers pressing opposite sides along the same force line',
  'grasp quality': 'Metric evaluating how stable and reliable a planned grasp is',
  'ablation': 'Experiment removing components one-by-one to measure contribution',
  'domain randomization': 'Randomizing simulation properties so models generalize to real world',
  'cross-entropy': 'Loss function measuring difference between predicted and true probability distributions',
  'binary cross-entropy': 'Cross-entropy loss for two-class classification problems',
};

// Gripper / end-effector hardware terms.
const GRIPPER_TERMS = {
  'parallel-jaw': 'Simple two-finger gripper opening and closing on a single axis',
  'two-finger': 'Gripper with two opposing fingers, most common in industrial robotics',
  'multi-finger': 'Gripper with 3+ articulated fingers for complex manipulation',
  'dexterous': 'Robot hand with many joints for fine manipulation like a human hand',
  'suction': 'Gripper that picks objects by vacuum seal on flat surfaces',
};

// Build lookup: term -> {type, tooltip}. Keys are lowercased because the
// matcher compares the matched text case-insensitively.
const ENTITY_LOOKUP = {};
Object.entries(ARCHITECTURE_TERMS).forEach(([t, d]) => { ENTITY_LOOKUP[t.toLowerCase()] = { type: 'architecture', tooltip: d }; });
Object.entries(TECHNIQUE_TERMS).forEach(([t, d]) => { ENTITY_LOOKUP[t.toLowerCase()] = { type: 'technique', tooltip: d }; });
Object.entries(GRIPPER_TERMS).forEach(([t, d]) => { ENTITY_LOOKUP[t.toLowerCase()] = { type: 'gripper', tooltip: d }; });

// All entity terms sorted by length (longest first for matching), so e.g.
// 'binary cross-entropy' wins over 'cross-entropy' and 'PointNet++' over
// 'PointNet' in the alternation below.
const ALL_TERMS = [
  ...Object.keys(ARCHITECTURE_TERMS),
  ...Object.keys(TECHNIQUE_TERMS),
  ...Object.keys(GRIPPER_TERMS),
].sort((a, b) => b.length - a.length);

// Case-insensitive alternation over all terms, with regex metacharacters
// escaped. Used with String.split(), so the capture group keeps the matched
// term in the split output.
// NOTE(review): there are no \b word boundaries, so short terms can match
// inside longer words (e.g. 'GAN' inside 'organ') — confirm acceptable.
const TERM_REGEX = new RegExp(
  `(${ALL_TERMS.map(t => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|')})`,
  'gi'
);
63
+
64
+ function renderWithEntities(text) {
65
+ const parts = text.split(TERM_REGEX);
66
+ return parts.map((part, i) => {
67
+ const lookup = ENTITY_LOOKUP[part.toLowerCase()];
68
+ if (lookup) {
69
+ const className = `entity-tag entity-${lookup.type}`;
70
+ if (lookup.tooltip) {
71
+ return (
72
+ <Tooltip key={i} text={lookup.tooltip}>
73
+ <span className={className}>{part}</span>
74
+ </Tooltip>
75
+ );
76
+ }
77
+ return <span key={i} className={className}>{part}</span>;
78
+ }
79
+ return <span key={i}>{part}</span>;
80
+ });
81
+ }
82
+
83
+ function formatBullet(text) {
84
+ // Step 1: Extract and bold quoted paper names (remove the quotes)
85
+ // Match "Paper Name" patterns
86
+ const quoteRegex = /("[^"]{3,}")/g;
87
+ const segments = text.split(quoteRegex);
88
+
89
+ return segments.map((seg, i) => {
90
+ if (seg.startsWith('"') && seg.endsWith('"')) {
91
+ // This is a quoted paper name - bold it, strip quotes
92
+ const paperName = seg.slice(1, -1);
93
+ return <strong key={i} className="entity-tag entity-paper">{paperName}</strong>;
94
+ }
95
+ // For non-quoted text, scan for technical entities
96
+ return <span key={i}>{renderWithEntities(seg)}</span>;
97
+ });
98
+ }
99
 
100
  export default function InsightBullets({ text }) {
101
  if (!text) return null;
 
104
  return (
105
  <ul className="insight-bullets">
106
  {bullets.map((line, i) => (
107
+ <li key={i}>{formatBullet(line.replace(/^-\s*/, ''))}</li>
108
  ))}
109
  </ul>
110
  );
111
  }
112
+ return <p>{formatBullet(text)}</p>;
113
  }
frontend/src/components/InsightCard.js CHANGED
@@ -1,8 +1,196 @@
1
- import React from 'react';
2
  import InsightBullets from './InsightBullets';
 
3
  import { SHORT_NAMES } from '../constants';
 
 
4
 
5
- export default function InsightCard({ suggestion, weights, onClose }) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  const weightDiffs = Object.entries(suggestion.weights)
7
  .filter(([col, val]) => val !== (weights[col] ?? 0))
8
  .map(([col, val]) => ({
@@ -22,6 +210,9 @@ export default function InsightCard({ suggestion, weights, onClose }) {
22
  <div className="insight-body">
23
  <InsightBullets text={suggestion.insight} />
24
  </div>
 
 
 
25
  <div className="insight-actions-summary">
26
  {suggestion.filterMethods && (
27
  <span className="action-chip filter-chip">
@@ -39,15 +230,6 @@ export default function InsightCard({ suggestion, weights, onClose }) {
39
  </span>
40
  )}
41
  </div>
42
- {weightDiffs.length > 0 && (
43
- <div className="insight-weight-details">
44
- {weightDiffs.map(({ col, short, from, to }) => (
45
- <span key={col} className="weight-chip">
46
- {short}: {from} &rarr; <strong>{to}</strong>
47
- </span>
48
- ))}
49
- </div>
50
- )}
51
  {(suggestion.highlightMethods || []).length > 0 && (
52
  <div className="insight-matches">
53
  <span className="matches-label">Best matches:</span>
 
1
+ import React, { useState, useMemo, useEffect, useRef } from 'react';
2
  import InsightBullets from './InsightBullets';
3
+ import PdfViewer from './PdfViewer';
4
  import { SHORT_NAMES } from '../constants';
5
+ import katex from 'katex';
6
+ import 'katex/dist/katex.min.css';
7
 
8
// Common English words (plus a few domain-generic ones like 'method',
// 'approach') excluded when extracting highlight keywords from a user query.
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'can', 'shall', 'to', 'of', 'in', 'for',
  'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during',
  'before', 'after', 'above', 'below', 'between', 'and', 'but', 'or',
  'not', 'no', 'nor', 'so', 'yet', 'both', 'each', 'all', 'any', 'few',
  'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'just',
  'about', 'up', 'out', 'if', 'then', 'also', 'how', 'what', 'which',
  'who', 'when', 'where', 'why', 'this', 'that', 'these', 'those',
  'it', 'its', 'they', 'them', 'their', 'we', 'us', 'our', 'i', 'me',
  'my', 'you', 'your', 'he', 'she', 'him', 'her', 'his', 'used', 'using',
  'across', 'methods', 'method', 'approach', 'based', 'use',
]);
22
+
23
// Extract highlight-worthy keywords from a user query. Returns bigram
// phrases first (both words non-stop-words), then single words (length > 2,
// non-stop-word). The original ran the lowercase/strip/split pipeline twice;
// here it is tokenized once and both lists derive from the same tokens.
function extractKeywords(query) {
  if (!query) return [];

  const tokens = query.toLowerCase()
    .replace(/[?!.,;:'"()]/g, '')
    .split(/\s+/);

  const singles = tokens.filter(t => t.length > 2 && !STOP_WORDS.has(t));

  const bigrams = [];
  for (let i = 0; i + 1 < tokens.length; i++) {
    if (!STOP_WORDS.has(tokens[i]) && !STOP_WORDS.has(tokens[i + 1])) {
      bigrams.push(`${tokens[i]} ${tokens[i + 1]}`);
    }
  }

  return [...bigrams, ...singles];
}
39
+
40
// Repair text extracted from PDFs, where word boundaries are frequently lost
// ("graspNetwork", "3DPoint", "results.The"). Applies a fixed, order-sensitive
// chain of heuristic regex replacements; the order matters because later
// word-splitting rules assume casing/punctuation spacing is already fixed.
// NOTE(review): the common-word rules are heuristics and can over-split
// legitimate words (e.g. 'the' inside 'mother' -> 'mo ther') — confirm this
// trade-off is acceptable for display-only evidence text.
function cleanPdfText(text) {
  if (!text) return '';
  let cleaned = text
    // Add space before uppercase letter following lowercase (e.g., "graspNetwork" -> "grasp Network")
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    // Add space before uppercase following a digit (e.g., "3DPoint" -> "3D Point")
    .replace(/(\d)([A-Z][a-z])/g, '$1 $2')
    // Add space before opening parens/brackets that follow word chars
    .replace(/([a-zA-Z0-9])\(/g, '$1 (')
    .replace(/([a-zA-Z0-9])\[/g, '$1 [')
    // Add space after closing parens/brackets before word chars
    .replace(/\)([a-zA-Z])/g, ') $1')
    .replace(/\]([a-zA-Z])/g, '] $1')
    // Add space after period followed by uppercase (sentence boundary)
    .replace(/\.([A-Z])/g, '. $1')
    // Add space after comma followed by letter
    .replace(/,([a-zA-Z])/g, ', $1')
    // Fix concatenated common English words (lowercase to lowercase)
    .replace(/([a-z])(the|and|for|with|from|that|this|which|our|we|are|is|in|of|to|on|at|by|as|an|or|it|be|do|no|so|if|up|can|has|had|was|not|but|its|may|all|any|use|how|one|two|new|set|see|per|via|get|let|put|run|own|out|off|top|low|few|key|big|old|raw|due|end|aim|way|pre|sub|non)(?=[a-z])/gi, '$1 $2')
    // Fix lowercase-to-lowercase concatenation with common word patterns
    .replace(/([a-z]{3,})(using|based|given|shown|each|over|than|into|also|then|only|such|much|well|very|most|some|both|like|many|more|other|after|about|under|along|above|below|since|while|until|where|there|these|those|their|being|could|would|should|which|every|first|second|third)/gi, '$1 $2')
    // Fix "wordword" where second word starts with common prefixes
    .replace(/([a-z])(approach|method|network|model|object|grasp|robot|point|cloud|image|depth|scene|train|learn|predict|generate|sample|evaluate|compute|estimate|detect|process)/gi, (match, p1, p2) => {
      // Only add space if the first part is 3+ chars
      if (p1.length >= 3) return p1 + ' ' + p2;
      return match;
    })
    // Normalize whitespace
    .replace(/\s+/g, ' ')
    // Fix double spaces around punctuation
    .replace(/\s+([.,;:!?])/g, '$1')
    .trim();
  return cleaned;
}
74
+
75
+ function renderLatex(text) {
76
+ // Find LaTeX patterns: $...$ or \(...\) or common equation patterns
77
+ const latexPattern = /(\$[^$]+\$|\\[\(\[][^\\]+\\[\)\]])/g;
78
+ const parts = text.split(latexPattern);
79
+
80
+ return parts.map((part, i) => {
81
+ if (part.match(/^\$[^$]+\$$/)) {
82
+ const latex = part.slice(1, -1);
83
+ try {
84
+ const html = katex.renderToString(latex, { throwOnError: false, displayMode: false });
85
+ return <span key={i} className="latex-inline" dangerouslySetInnerHTML={{ __html: html }} />;
86
+ } catch { return <span key={i} className="latex-fallback">{part}</span>; }
87
+ }
88
+ if (part.match(/^\\[\(\[][^\\]+\\[\)\]]$/)) {
89
+ const latex = part.slice(2, -2);
90
+ try {
91
+ const html = katex.renderToString(latex, { throwOnError: false, displayMode: true });
92
+ return <span key={i} className="latex-block" dangerouslySetInnerHTML={{ __html: html }} />;
93
+ } catch { return <span key={i} className="latex-fallback">{part}</span>; }
94
+ }
95
+ return part;
96
+ });
97
+ }
98
+
99
+ function HighlightedText({ text, keywords }) {
100
+ const cleanedText = cleanPdfText(text);
101
+
102
+ // First pass: split by keywords for highlighting
103
+ if (!keywords || keywords.length === 0) {
104
+ return <span className="evidence-text-body">{renderLatex(cleanedText)}</span>;
105
+ }
106
+ const sorted = [...keywords].sort((a, b) => b.length - a.length);
107
+ const escaped = sorted.map(k => k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
108
+ const regex = new RegExp(`(${escaped.join('|')})`, 'gi');
109
+ const parts = cleanedText.split(regex);
110
+ return (
111
+ <span className="evidence-text-body">
112
+ {parts.map((part, i) => {
113
+ const isMatch = keywords.some(k => part.toLowerCase() === k.toLowerCase());
114
+ if (isMatch) {
115
+ return <mark key={i} className="rag-highlight">{part}</mark>;
116
+ }
117
+ return <span key={i}>{renderLatex(part)}</span>;
118
+ })}
119
+ </span>
120
+ );
121
+ }
122
+
123
+ function formatPaperId(id) {
124
+ if (!id) return '';
125
+ return id.replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase());
126
+ }
127
+
128
+ function PaperEvidencePanel({ citations, query }) {
129
+ const [showAll, setShowAll] = useState(false);
130
+ const [pdfOpen, setPdfOpen] = useState(null);
131
+ const keywords = useMemo(() => extractKeywords(query), [query]);
132
+
133
+ if (!citations || citations.length === 0) return null;
134
+
135
+ // Deduplicate by paper_id, keep best score and earliest page
136
+ const paperMap = {};
137
+ citations.forEach(cit => {
138
+ const key = cit.paper_id;
139
+ if (!paperMap[key] || cit.score > paperMap[key].score) {
140
+ paperMap[key] = { ...cit };
141
+ }
142
+ });
143
+ const papers = Object.values(paperMap).sort((a, b) => b.score - a.score);
144
+ const shown = showAll ? papers : papers.slice(0, 5);
145
+
146
+ return (
147
+ <div className="paper-evidence-panel">
148
+ <div className="evidence-header">
149
+ <span className="evidence-title">Paper Evidence</span>
150
+ <span className="evidence-count">
151
+ {citations.length} passages from {papers.length} papers
152
+ </span>
153
+ </div>
154
+
155
+ <div className="evidence-paper-list">
156
+ {shown.map((paper, i) => (
157
+ <div key={i} className="evidence-paper-row">
158
+ <div className="evidence-paper-info">
159
+ <span className="evidence-paper-name">{formatPaperId(paper.paper_id)}</span>
160
+ <span className="evidence-paper-score">{(paper.score * 100).toFixed(0)}% match</span>
161
+ </div>
162
+ <button
163
+ className="view-pdf-btn"
164
+ onClick={() => setPdfOpen({
165
+ paperId: paper.paper_id,
166
+ page: Math.max(1, paper.page || 1),
167
+ keywords,
168
+ })}
169
+ >
170
+ View PDF
171
+ </button>
172
+ </div>
173
+ ))}
174
+ {papers.length > 5 && !showAll && (
175
+ <button className="evidence-show-more" onClick={() => setShowAll(true)}>
176
+ +{papers.length - 5} more papers
177
+ </button>
178
+ )}
179
+ </div>
180
+
181
+ {pdfOpen && (
182
+ <PdfViewer
183
+ paperId={pdfOpen.paperId}
184
+ page={pdfOpen.page}
185
+ keywords={pdfOpen.keywords}
186
+ onClose={() => setPdfOpen(null)}
187
+ />
188
+ )}
189
+ </div>
190
+ );
191
+ }
192
+
193
+ export default function InsightCard({ suggestion, weights, query, onClose }) {
194
  const weightDiffs = Object.entries(suggestion.weights)
195
  .filter(([col, val]) => val !== (weights[col] ?? 0))
196
  .map(([col, val]) => ({
 
210
  <div className="insight-body">
211
  <InsightBullets text={suggestion.insight} />
212
  </div>
213
+
214
+ <PaperEvidencePanel citations={suggestion.ragCitations} query={query} />
215
+
216
  <div className="insight-actions-summary">
217
  {suggestion.filterMethods && (
218
  <span className="action-chip filter-chip">
 
230
  </span>
231
  )}
232
  </div>
 
 
 
 
 
 
 
 
 
233
  {(suggestion.highlightMethods || []).length > 0 && (
234
  <div className="insight-matches">
235
  <span className="matches-label">Best matches:</span>
frontend/src/components/PdfViewer.js ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useState, useEffect, useCallback, useRef, useMemo } from 'react';
2
+ import { Document, Page, pdfjs } from 'react-pdf';
3
+ import 'react-pdf/dist/Page/AnnotationLayer.css';
4
+ import 'react-pdf/dist/Page/TextLayer.css';
5
+
6
+ // Set up the PDF.js worker
7
+ pdfjs.GlobalWorkerOptions.workerSrc = `//unpkg.com/pdfjs-dist@${pdfjs.version}/build/pdf.worker.min.mjs`;
8
+
9
+ export default function PdfViewer({ paperId, page, keywords, onClose }) {
10
+ const [numPages, setNumPages] = useState(null);
11
+ const [currentPage, setCurrentPage] = useState(page || 1);
12
+ const [loading, setLoading] = useState(true);
13
+ const [error, setError] = useState(null);
14
+ const containerRef = useRef(null);
15
+
16
+ const pdfUrl = `/api/papers/${paperId}`;
17
+
18
+ // Ensure page is at least 1
19
+ useEffect(() => {
20
+ if (page && page >= 1) setCurrentPage(page);
21
+ }, [page]);
22
+
23
+ const onDocumentLoadSuccess = useCallback(({ numPages }) => {
24
+ setNumPages(numPages);
25
+ setLoading(false);
26
+ }, []);
27
+
28
+ const onDocumentLoadError = useCallback((err) => {
29
+ setError(err.message);
30
+ setLoading(false);
31
+ }, []);
32
+
33
+ // After page renders, highlight matching keywords in the text layer
34
+ const highlightKeywords = useCallback(() => {
35
+ if (!keywords || keywords.length === 0) return;
36
+ if (!containerRef.current) return;
37
+
38
+ // Wait for text layer to render
39
+ setTimeout(() => {
40
+ const textLayer = containerRef.current?.querySelector('.react-pdf__Page__textContent');
41
+ if (!textLayer) return;
42
+
43
+ const spans = textLayer.querySelectorAll('span');
44
+ spans.forEach(span => {
45
+ const text = span.textContent.toLowerCase();
46
+ const hasMatch = keywords.some(kw => text.includes(kw.toLowerCase()));
47
+ if (hasMatch) {
48
+ span.classList.add('pdf-keyword-highlight');
49
+ }
50
+ });
51
+ }, 500);
52
+ }, [keywords]);
53
+
54
+ const pageWidth = useMemo(() => {
55
+ if (!containerRef.current) return 700;
56
+ return Math.min(containerRef.current.offsetWidth - 40, 800);
57
+ }, [containerRef.current]);
58
+
59
+ if (!paperId) return null;
60
+
61
+ return (
62
+ <div className="pdf-viewer-overlay" onClick={onClose}>
63
+ <div className="pdf-viewer-modal" onClick={e => e.stopPropagation()} ref={containerRef}>
64
+ <div className="pdf-viewer-header">
65
+ <div className="pdf-viewer-title">{paperId.replace(/-/g, ' ')}</div>
66
+ <div className="pdf-viewer-controls">
67
+ <button
68
+ disabled={currentPage <= 1}
69
+ onClick={() => setCurrentPage(p => Math.max(1, p - 1))}
70
+ className="pdf-nav-btn"
71
+ >
72
+ &larr; Prev
73
+ </button>
74
+ <span className="pdf-page-info">
75
+ Page {currentPage}{numPages ? ` of ${numPages}` : ''}
76
+ </span>
77
+ <button
78
+ disabled={currentPage >= (numPages || 1)}
79
+ onClick={() => setCurrentPage(p => Math.min(numPages || p, p + 1))}
80
+ className="pdf-nav-btn"
81
+ >
82
+ Next &rarr;
83
+ </button>
84
+ <button onClick={onClose} className="pdf-close-btn">&times;</button>
85
+ </div>
86
+ </div>
87
+
88
+ {keywords && keywords.length > 0 && (
89
+ <div className="pdf-keywords-bar">
90
+ Highlighting: {keywords.slice(0, 5).map((kw, i) => (
91
+ <span key={i} className="pdf-kw-tag">{kw}</span>
92
+ ))}
93
+ </div>
94
+ )}
95
+
96
+ <div className="pdf-viewer-content">
97
+ {error && <div className="pdf-error">Failed to load PDF: {error}</div>}
98
+ {loading && !error && <div className="pdf-loading">Loading PDF...</div>}
99
+
100
+ <Document
101
+ file={pdfUrl}
102
+ onLoadSuccess={onDocumentLoadSuccess}
103
+ onLoadError={onDocumentLoadError}
104
+ loading=""
105
+ >
106
+ <Page
107
+ pageNumber={currentPage}
108
+ width={pageWidth}
109
+ onRenderTextLayerSuccess={highlightKeywords}
110
+ renderAnnotationLayer={true}
111
+ renderTextLayer={true}
112
+ />
113
+ </Document>
114
+ </div>
115
+ </div>
116
+ </div>
117
+ );
118
+ }
frontend/src/components/QueryExplanation.js ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import Tooltip from './Tooltip';
3
+
4
+ export default function QueryExplanation({ suggestion, query, data, clusterStats }) {
5
+ if (!suggestion) return null;
6
+
7
+ const nMethods = data ? data.length : 0;
8
+ const nClusters = clusterStats ? clusterStats.length : 0;
9
+ const nHighlights = (suggestion.highlightMethods || []).length;
10
+ const isFiltered = !!suggestion.filterMethods;
11
+ const colorBy = suggestion.colorBy || 'cluster';
12
+ const nCitations = (suggestion.ragCitations || []).length;
13
+ const nPapers = new Set((suggestion.ragCitations || []).map(c => c.paper_title)).size;
14
+
15
+ return (
16
+ <div className="query-explanation">
17
+ <div className="explanation-header">
18
+ <span className="explanation-title">How This View Was Built</span>
19
+ <Tooltip text="This section explains what the system did to answer your query. Every step is deterministic (no AI guessing) except the final insight text." wide>
20
+ <span className="explanation-help">?</span>
21
+ </Tooltip>
22
+ </div>
23
+
24
+ <div className="explanation-steps">
25
+ <div className="explanation-step">
26
+ <div className="step-number">1</div>
27
+ <div className="step-content">
28
+ <span className="step-label">Query Understanding</span>
29
+ <span className="step-detail">
30
+ Your question was converted into a numerical vector using a
31
+ <Tooltip text="A neural network (all-MiniLM-L6-v2) that converts text into 384-dimensional vectors. Similar questions produce similar vectors, enabling mathematical comparison.">
32
+ <span className="inline-term">sentence-transformer</span>
33
+ </Tooltip>
34
+ {' '}model, then compared against all 56 method descriptions to find the most relevant ones.
35
+ </span>
36
+ </div>
37
+ </div>
38
+
39
+ <div className="explanation-step">
40
+ <div className="step-number">2</div>
41
+ <div className="step-content">
42
+ <span className="step-label">
43
+ {nHighlights} Methods Highlighted
44
+ </span>
45
+ <span className="step-detail">
46
+ The
47
+ <Tooltip text="Cosine similarity measures the angle between two vectors. A score of 1.0 means identical direction (perfect match), 0.0 means unrelated. The highlighted methods scored highest against your query.">
48
+ <span className="inline-term">cosine similarity</span>
49
+ </Tooltip>
50
+ {' '}between your query and each method's description determined the {nHighlights} best matches.
51
+ These are shown as larger, brighter points on the scatter plot.
52
+ </span>
53
+ </div>
54
+ </div>
55
+
56
+ {isFiltered && (
57
+ <div className="explanation-step">
58
+ <div className="step-number">3</div>
59
+ <div className="step-content">
60
+ <span className="step-label">Filtered to {nMethods} Methods</span>
61
+ <span className="step-detail">
62
+ Your query implied a specific subset, so only methods matching the criteria are shown.
63
+ The scatter plot and clustering were recomputed for just these methods.
64
+ </span>
65
+ </div>
66
+ </div>
67
+ )}
68
+
69
+ <div className="explanation-step">
70
+ <div className="step-number">{isFiltered ? 4 : 3}</div>
71
+ <div className="step-content">
72
+ <span className="step-label">{nClusters} Groups via HDBSCAN Clustering</span>
73
+ <span className="step-detail">
74
+ <Tooltip text="HDBSCAN (Hierarchical Density-Based Spatial Clustering) automatically finds groups of similar methods without needing to pre-specify how many groups there are. Unlike K-Means, it discovers natural groupings based on data density.">
75
+ <span className="inline-term">HDBSCAN</span>
76
+ </Tooltip>
77
+ {' '}automatically found {nClusters} natural groups among the {nMethods} methods.
78
+ Column weights were adjusted based on your query keywords to emphasize relevant attributes in the
79
+ <Tooltip text="UMAP (Uniform Manifold Approximation and Projection) takes the high-dimensional feature vectors and projects them to 2D coordinates so you can see which methods are similar (close together) or different (far apart).">
80
+ <span className="inline-term">UMAP projection</span>
81
+ </Tooltip>.
82
+ The scatter plot is colored by <strong>{colorBy}</strong>.
83
+ </span>
84
+ </div>
85
+ </div>
86
+
87
+ {nCitations > 0 && (
88
+ <div className="explanation-step">
89
+ <div className="step-number">{isFiltered ? 5 : 4}</div>
90
+ <div className="step-content">
91
+ <span className="step-label">{nCitations} Passages from {nPapers} Papers</span>
92
+ <span className="step-detail">
93
+ The
94
+ <Tooltip text="ChromaDB stores 1,074 text chunks from 34 research papers. Each chunk was embedded using the same sentence-transformer model, so your query can be matched against actual paper content by vector similarity.">
95
+ <span className="inline-term">vector database</span>
96
+ </Tooltip>
97
+ {' '}was searched for passages relevant to your query. These paper excerpts were
98
+ fed to the LLM to generate the grounded insight above.
99
+ </span>
100
+ </div>
101
+ </div>
102
+ )}
103
+ </div>
104
+ </div>
105
+ );
106
+ }
frontend/src/components/Tooltip.js ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useState, useRef, useCallback, useEffect } from 'react';
2
+ import ReactDOM from 'react-dom';
3
+
4
+ export default function Tooltip({ children, text, wide = false }) {
5
+ const [visible, setVisible] = useState(false);
6
+ const [coords, setCoords] = useState({ top: 0, left: 0 });
7
+ const triggerRef = useRef(null);
8
+ const timeoutRef = useRef(null);
9
+
10
+ const updatePosition = useCallback(() => {
11
+ if (!triggerRef.current) return;
12
+ const rect = triggerRef.current.getBoundingClientRect();
13
+ const tooltipWidth = wide ? 320 : 240;
14
+ let left = rect.left + rect.width / 2 - tooltipWidth / 2;
15
+ // Keep tooltip within viewport
16
+ left = Math.max(8, Math.min(left, window.innerWidth - tooltipWidth - 8));
17
+ // Show below if too close to top, otherwise above
18
+ const showBelow = rect.top < 120;
19
+ const top = showBelow ? rect.bottom + 8 : rect.top - 8;
20
+ setCoords({ top, left, showBelow, tooltipWidth });
21
+ }, [wide]);
22
+
23
+ const show = () => {
24
+ clearTimeout(timeoutRef.current);
25
+ timeoutRef.current = setTimeout(() => {
26
+ updatePosition();
27
+ setVisible(true);
28
+ }, 250);
29
+ };
30
+
31
+ const hide = () => {
32
+ clearTimeout(timeoutRef.current);
33
+ setVisible(false);
34
+ };
35
+
36
+ useEffect(() => () => clearTimeout(timeoutRef.current), []);
37
+
38
+ const tooltip = visible ? ReactDOM.createPortal(
39
+ <div
40
+ className={`tooltip-bubble-fixed ${wide ? 'tooltip-wide' : ''}`}
41
+ style={{
42
+ top: coords.showBelow ? coords.top : undefined,
43
+ bottom: coords.showBelow ? undefined : `${window.innerHeight - coords.top}px`,
44
+ left: coords.left,
45
+ width: coords.tooltipWidth,
46
+ }}
47
+ >
48
+ {text}
49
+ </div>,
50
+ document.body
51
+ ) : null;
52
+
53
+ return (
54
+ <span
55
+ ref={triggerRef}
56
+ className="tooltip-wrapper"
57
+ onMouseEnter={show}
58
+ onMouseLeave={hide}
59
+ >
60
+ {children}
61
+ {tooltip}
62
+ </span>
63
+ );
64
+ }
rag_config.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Configuration for the RAG pipeline (paper ingestion into ChromaDB and
# retrieval at query time). NOTE(review): exact consumer semantics for each
# key live in the ingestion/retrieval code, not visible here — confirm there.
project_name: grasp-explorer
# Free-text domain description, presumably injected into LLM prompts for
# context — verify against the prompt-building code.
domain_context: >
  Robotic grasp planning: computing how a robot should position its gripper
  to pick up objects. Methods vary by planning approach (sampling, regression,
  RL, analytical), gripper type, sensor input, scene complexity, and training
  paradigm (sim, real, or both).

# Source dataset: CSV of grasp-planning methods and which columns identify
# each method, its description, and its paper link(s).
csv_path: datasets/csv-gp-combined.csv
name_column: Name
description_column: Description
link_column: "Link(s)"

# Embedding model and vector store. all-MiniLM-L6-v2 is a sentence-transformers
# model producing 384-dimensional vectors; ChromaDB persists to ./chroma_db
# (git-ignored, rebuilt by the ingestion pipeline).
embedding_model: all-MiniLM-L6-v2
embedding_dimensions: 384
chroma_persist_dir: ./chroma_db
collection_name: grasp_papers

# Three-tier chunking (coarse / mid / fine) with token-count bounds per tier.
chunking:
  coarse_max_tokens: 800
  mid_min_tokens: 200
  mid_max_tokens: 800
  mid_overlap_ratio: 0.15      # fraction of overlap between adjacent mid chunks
  fine_min_tokens: 50
  fine_max_tokens: 300
  # Presumably the cosine-similarity cutoff used by the semantic chunking
  # strategy to decide chunk boundaries — confirm in the chunker.
  semantic_similarity_threshold: 0.35
  strategies:
    - semantic
  # Domain vocabulary, presumably used to bias or tag chunks during semantic
  # chunking — verify usage in the ingestion code.
  domain_topics:
    # Grasp planning approaches
    - grasp planning
    - grasp detection
    - grasp synthesis
    - grasp pose
    - 6-DoF
    - 7-DoF
    - antipodal grasp
    - power grasp
    - precision grasp
    - grasp quality
    - grasp success rate
    # Planning methods
    - sampling
    - direct regression
    - reinforcement learning
    - analytical
    - optimization
    - generative model
    - diffusion model
    - VAE
    - GAN
    # Gripper types
    - parallel-jaw
    - two-finger
    - multi-finger
    - dexterous
    - suction
    - gripper
    - end-effector
    # Sensors and input
    - point cloud
    - depth image
    - RGB-D
    - TSDF
    - voxel
    - mesh
    - tactile
    - force-torque
    # Scene types
    - cluttered
    - piled
    - singulated
    - packed
    - bin picking
    # Neural network architectures
    - PointNet
    - PointNet++
    - ResNet
    - VGG
    - transformer
    - CNN
    - encoder-decoder
    - U-Net
    # Training and simulation
    - sim-to-real
    - domain randomization
    - transfer learning
    - self-supervised
    - real-world
    - simulation
    - Isaac
    - MuJoCo
    - PyBullet
    # Robotics concepts
    - contact model
    - collision detection
    - motion planning
    - inverse kinematics
    - workspace
    - reachability
    - robot arm
    - manipulator
    # Evaluation
    - success rate
    - grasp metric
    - clearance
    - coverage
    - ablation

# Retrieval: how many chunks to fetch per tier and the total context cap.
retrieval:
  coarse_top_k: 2
  mid_top_k: 4
  fine_top_k: 4
  token_budget: 3000           # max tokens of retrieved context fed to the LLM
  rerank: false                # reranking of retrieved chunks is disabled

tools_enabled: true
# Presumably the dataset columns exposed to LLM tool calls — confirm against
# the tool definitions in the backend.
dataset_columns:
  - Planning Method
  - Training Data
  - End-effector Hardware
  - Object Configuration
  - Input Data
  - Output Pose
  - Backbone
  - Language