Spaces:

htohfa
/

foto

Running

App Files Community

htohfa committed 13 days ago

Commit

2502a3c

1 Parent(s): 6cc68be

adding pathfinder capabilities

Browse files

Files changed (5) hide show

.DS_Store +0 -0
app.py +53 -16
foto/pathfinder_search.py +118 -0
foto/search.py +20 -4
requirements.txt +4 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

app.py CHANGED Viewed

@@ -68,11 +68,15 @@ html, body, [class*="css"] { font-family: 'Inter', sans-serif; font-weight: 300;
 div[data-testid="stSelectbox"] label,
 div[data-testid="stTextInput"] label,
 div[data-testid="stCheckbox"] label { font-family: 'DM Mono', monospace; font-size: 0.75rem; letter-spacing: 0.08em; text-transform: uppercase; color: #888; }
 </style>
 """, unsafe_allow_html=True)
-# Session state
 for key, default in {
     "pdf_cache": {},
     "results": None,
@@ -107,7 +111,7 @@ with col_left:
     api_key = st.text_input("Anthropic API Key", type="password", label_visibility="collapsed", placeholder="sk-ant-...")
     st.markdown('<p class="section-label" style="margin-top:0.8rem;">Semantic Scholar Key (optional)</p>', unsafe_allow_html=True)
-    s2_key = st.text_input("S2 Key", type="password", label_visibility="collapsed", placeholder="(Recommended)")
     st.markdown('<p class="section-label" style="margin-top:1.5rem;">Describe the figure</p>', unsafe_allow_html=True)
     user_text = st.text_area(
@@ -118,6 +122,34 @@ with col_left:
     st.markdown('<p class="section-label" style="margin-top:0.8rem;">Upload a sketch (optional)</p>', unsafe_allow_html=True)
     sketch_file = st.file_uploader("Sketch", type=["png", "jpg", "jpeg", "webp"], label_visibility="collapsed")
     run_verify = st.checkbox("Secondary verification — recommended, adds ~$0.05", value=True)
     st.markdown('<p style="font-size:0.78rem;color:#888;margin-top:-0.8rem;margin-left:1.8rem;">Uses a smarter model to double-check top matches. Best results, small extra cost.</p>', unsafe_allow_html=True)
     num_papers = st.slider("Papers to search", min_value=5, max_value=50, value=20, step=5)
@@ -137,10 +169,12 @@ with col_right:
 """, unsafe_allow_html=True)
-# Pipeline
 if run_btn:
     if not api_key:
         st.error("Please enter your Anthropic API key.")
     elif not user_text and not sketch_file:
         st.error("Please enter a description or upload a sketch (or both).")
     else:
@@ -166,7 +200,7 @@ if run_btn:
             st.session_state.log = []
             try:
-                # Parse input
                 log("⟳ Parsing your description...")
                 parser = InputParser(client, model_cfg.smart, tracker)
                 spec = parser.parse(text=user_text or None, sketch_bytes=sketch_bytes)
@@ -175,13 +209,16 @@ if run_btn:
                 if spec.get("plot_type"):
                     log(f"  Plot type: {spec['plot_type']}")
-                # Search
                 searcher = PaperSearcher(s2_key=s2_key or None)
-                all_papers = searcher.expanded_search(
-                    query, client, model_cfg.smart, tracker, log=log)
                 log(f"✓ {len(all_papers)} unique papers found")
-                # Triage
                 log("⟳ Triaging with Claude...")
                 triager = PaperTriager(client, model_cfg.cheap, tracker)
                 triaged = triager.triage(all_papers, spec)
@@ -189,7 +226,7 @@ if run_btn:
                 log(f"✓ {len(top)} papers passed triage")
                 paper_lookup = {p["paperId"]: p for p in top}
-                # Fetch PDFs
                 log("⟳ Fetching PDFs...")
                 downloaded = []
                 for i, paper in enumerate(top):
@@ -205,7 +242,7 @@ if run_btn:
                 progress_placeholder.empty()
                 log(f"✓ {len(downloaded)} PDFs ready")
-                # Extract figures
                 log("⟳ Extracting figures...")
                 extractor = FigureExtractor()
                 all_figures = []
@@ -219,7 +256,7 @@ if run_btn:
                 filtered = extractor.caption_filter(all_figures, query)
                 log(f"  {len(filtered)} figures after caption filter (from {len(all_figures)} total)")
-                # Primary scoring
                 log(f"⟳ Scoring {len(filtered)} figures...")
                 scorer = FigureScorer(client, model_cfg.cheap, tracker)
                 primary_matches = []
@@ -231,7 +268,7 @@ if run_btn:
                 progress_placeholder.empty()
                 log(f"✓ {len(primary_matches)} primary matches")
-                # Verification
                 verified = primary_matches
                 if run_verify and primary_matches:
                     log(f"⟳ Verifying {len(primary_matches)} matches...")
@@ -270,7 +307,7 @@ if run_btn:
         st.rerun()
-# Results
 if st.session_state.results:
     res = st.session_state.results
     matches = res["matches"]
@@ -331,7 +368,7 @@ if st.session_state.results:
                 st.markdown("---")
-# Feedback
 if st.session_state.results and st.session_state.results.get("matches"):
     st.markdown("""
 <div class="feedback-box">
@@ -351,7 +388,7 @@ if st.session_state.results and st.session_state.results.get("matches"):
         st.success("Thanks!")
-# Persistent tally
 stats = st.session_state.global_stats
 n_ratings = len(stats["ratings"])
 avg = sum(stats["ratings"]) / n_ratings if n_ratings else 0
@@ -365,4 +402,4 @@ st.markdown(f"""
     <div class="stat-item"><div class="tally-num">{"—" if not n_ratings else f"{avg:.1f}"}</div><div class="tally-label">Avg score</div></div>
   </div>
 </div>
-""", unsafe_allow_html=True)

 div[data-testid="stSelectbox"] label,
 div[data-testid="stTextInput"] label,
 div[data-testid="stCheckbox"] label { font-family: 'DM Mono', monospace; font-size: 0.75rem; letter-spacing: 0.08em; text-transform: uppercase; color: #888; }
+.pathfinder-row { display: flex; align-items: center; gap: 0.5rem; }
+.pathfinder-cite { font-family: 'DM Mono', monospace; font-size: 0.7rem; color: #888; }
+.pathfinder-cite a { color: #888; text-decoration: underline; }
 </style>
 """, unsafe_allow_html=True)
+# Per-session state for caching, run status, log buffer, and feedback tally
 for key, default in {
     "pdf_cache": {},
     "results": None,
     api_key = st.text_input("Anthropic API Key", type="password", label_visibility="collapsed", placeholder="sk-ant-...")
     st.markdown('<p class="section-label" style="margin-top:0.8rem;">Semantic Scholar Key (optional)</p>', unsafe_allow_html=True)
+    s2_key = st.text_input("S2 Key", type="password", label_visibility="collapsed", placeholder="(Recommended for keyword fallback)")
     st.markdown('<p class="section-label" style="margin-top:1.5rem;">Describe the figure</p>', unsafe_allow_html=True)
     user_text = st.text_area(
     st.markdown('<p class="section-label" style="margin-top:0.8rem;">Upload a sketch (optional)</p>', unsafe_allow_html=True)
     sketch_file = st.file_uploader("Sketch", type=["png", "jpg", "jpeg", "webp"], label_visibility="collapsed")
+    # Pathfinder toggle + inline citation link
+    pf_col1, pf_col2 = st.columns([2, 3])
+    with pf_col1:
+        use_pathfinder = st.checkbox("Use Pathfinder", value=True)
+    with pf_col2:
+        st.markdown(
+            '<div class="pathfinder-cite" style="padding-top:0.55rem;">'
+            'based on <a href="https://arxiv.org/abs/2408.01556" target="_blank">arXiv:2408.01556</a>'
+            '</div>',
+            unsafe_allow_html=True,
+        )
+    # OpenAI key only needed when Pathfinder is active (used to embed queries
+    # with text-embedding-3-small against the Pathfinder corpus)
+    openai_key = None
+    if use_pathfinder:
+        st.markdown('<p class="section-label" style="margin-top:0.6rem;">OpenAI API Key</p>', unsafe_allow_html=True)
+        openai_key = st.text_input(
+            "OpenAI Key", type="password", label_visibility="collapsed",
+            placeholder="sk-...",
+        )
+        st.markdown(
+            '<p style="font-size:0.78rem;color:#888;margin-top:-0.4rem;">'
+            'Used to embed queries with text-embedding-3-small (~$0.40 per million queries).'
+            '</p>',
+            unsafe_allow_html=True,
+        )
     run_verify = st.checkbox("Secondary verification — recommended, adds ~$0.05", value=True)
     st.markdown('<p style="font-size:0.78rem;color:#888;margin-top:-0.8rem;margin-left:1.8rem;">Uses a smarter model to double-check top matches. Best results, small extra cost.</p>', unsafe_allow_html=True)
     num_papers = st.slider("Papers to search", min_value=5, max_value=50, value=20, step=5)
 """, unsafe_allow_html=True)
+# Full pipeline runs on button press
 if run_btn:
     if not api_key:
         st.error("Please enter your Anthropic API key.")
+    elif use_pathfinder and not openai_key:
+        st.error("Pathfinder is checked — please enter your OpenAI API key, or uncheck Pathfinder to use keyword search.")
     elif not user_text and not sketch_file:
         st.error("Please enter a description or upload a sketch (or both).")
     else:
             st.session_state.log = []
             try:
+                # Parse text + optional sketch into a structured search spec
                 log("⟳ Parsing your description...")
                 parser = InputParser(client, model_cfg.smart, tracker)
                 spec = parser.parse(text=user_text or None, sketch_bytes=sketch_bytes)
                 if spec.get("plot_type"):
                     log(f"  Plot type: {spec['plot_type']}")
+                # Pathfinder semantic retrieval, or legacy keyword expansion as fallback
                 searcher = PaperSearcher(s2_key=s2_key or None)
+                if use_pathfinder:
+                    all_papers = searcher.expanded_search_pathfinder(query, openai_key, log=log)
+                else:
+                    all_papers = searcher.expanded_search(
+                        query, client, model_cfg.smart, tracker, log=log)
                 log(f"✓ {len(all_papers)} unique papers found")
+                # Abstract-level relevance filter to cut downstream cost
                 log("⟳ Triaging with Claude...")
                 triager = PaperTriager(client, model_cfg.cheap, tracker)
                 triaged = triager.triage(all_papers, spec)
                 log(f"✓ {len(top)} papers passed triage")
                 paper_lookup = {p["paperId"]: p for p in top}
+                # PDF fetch with arxiv-first URL preference, polite spacing
                 log("⟳ Fetching PDFs...")
                 downloaded = []
                 for i, paper in enumerate(top):
                 progress_placeholder.empty()
                 log(f"✓ {len(downloaded)} PDFs ready")
+                # Pull raster figures + captions from each PDF, then caption pre-filter
                 log("⟳ Extracting figures...")
                 extractor = FigureExtractor()
                 all_figures = []
                 filtered = extractor.caption_filter(all_figures, query)
                 log(f"  {len(filtered)} figures after caption filter (from {len(all_figures)} total)")
+                # Cheap vision pass: score every surviving figure against the spec
                 log(f"⟳ Scoring {len(filtered)} figures...")
                 scorer = FigureScorer(client, model_cfg.cheap, tracker)
                 primary_matches = []
                 progress_placeholder.empty()
                 log(f"✓ {len(primary_matches)} primary matches")
+                # Optional smart-model verification on figures that passed primary scoring
                 verified = primary_matches
                 if run_verify and primary_matches:
                     log(f"⟳ Verifying {len(primary_matches)} matches...")
         st.rerun()
+# Render results: stats row, downloadable zip, then per-figure cards with metadata
 if st.session_state.results:
     res = st.session_state.results
     matches = res["matches"]
                 st.markdown("---")
+# Post-search feedback slider; submission logs to persistence layer
 if st.session_state.results and st.session_state.results.get("matches"):
     st.markdown("""
 <div class="feedback-box">
         st.success("Thanks!")
+# Aggregate stats across all sessions, loaded from persistence
 stats = st.session_state.global_stats
 n_ratings = len(stats["ratings"])
 avg = sum(stats["ratings"]) / n_ratings if n_ratings else 0
     <div class="stat-item"><div class="tally-num">{"—" if not n_ratings else f"{avg:.1f}"}</div><div class="tally-label">Avg score</div></div>
   </div>
 </div>
+""", unsafe_allow_html=True)

foto/pathfinder_search.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""Semantic search over the Pathfinder astronomy corpus (Iyer et al. 2024,
+arXiv:2408.01556).
+The corpus ships with pre-computed text-embedding-3-small vectors for each
+paper. Queries are embedded with the same OpenAI model and matched via FAISS
+on the embedding column. Output format matches the rest of foto's search
+layer so it slots in interchangeably with keyword search.
+"""
+import os
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import streamlit as st
+from datasets import load_from_disk, load_dataset
+DATASET_NAME = "kiyer/pathfinder_arxiv_data"
+EMBEDDING_MODEL = "text-embedding-3-small"
+# First-run download lands here; subsequent runs load_from_disk straight from cache
+DATA_DIR = Path.home() / ".cache" / "foto" / "pathfinder_data"
+@st.cache_resource(show_spinner="Loading Pathfinder corpus (~5 GB on first run)...")
+def load_pathfinder_corpus():
+    """Returns the dataset with a FAISS index attached to the embed column.
+    Downloads from HF on first call, reuses local cache afterward."""
+    if not DATA_DIR.exists():
+        DATA_DIR.mkdir(parents=True, exist_ok=True)
+        ds = load_dataset(DATASET_NAME, split="train")
+        ds.save_to_disk(str(DATA_DIR))
+    else:
+        ds = load_from_disk(str(DATA_DIR))
+    if not ds.is_index_initialized("embed"):
+        ds.add_faiss_index(column="embed")
+    return ds
+def make_embedder(openai_key: str):
+    """Returns a function that embeds text into a 1536-dim vector with
+    text-embedding-3-small. Key is user-supplied so this is not cached."""
+    from openai import OpenAI
+    if not openai_key:
+        raise RuntimeError(
+            "Pathfinder uses text-embedding-3-small from OpenAI. Set an "
+            "OpenAI API key in the sidebar (get one at platform.openai.com)."
+        )
+    client = OpenAI(api_key=openai_key)
+    def embed(text: str) -> np.ndarray:
+        resp = client.embeddings.create(model=EMBEDDING_MODEL, input=[text])
+        return np.array(resp.data[0].embedding, dtype=np.float32)
+    return embed
+def _row_to_paper(row: dict, similarity: float) -> dict:
+    """Map a Pathfinder dataset row into foto's paper dict shape."""
+    arxiv_id = row.get("arxiv_id") or ""
+    year = None
+    d = row.get("date")
+    if d is not None:
+        try:
+            year = d.year
+        except AttributeError:
+            s = str(d)
+            year = int(s[:4]) if s[:4].isdigit() else None
+    # Pathfinder stores authors as a list of strings; foto wants [{"name": ...}, ...]
+    raw_authors = row.get("authors") or []
+    if raw_authors and isinstance(raw_authors[0], str):
+        authors = [{"name": a} for a in raw_authors]
+    else:
+        authors = raw_authors
+    return {
+        "paperId": f"arxiv_{arxiv_id}" if arxiv_id else f"ads_{row.get('ads_id', '')}",
+        "title": row.get("title", ""),
+        "abstract": row.get("abstract", ""),
+        "year": year,
+        "authors": authors,
+        "externalIds": {"ArXiv": arxiv_id} if arxiv_id else {},
+        "openAccessPdf": {"url": f"https://arxiv.org/pdf/{arxiv_id}"} if arxiv_id else {},
+        "citationCount": row.get("cites", 0) or 0,
+        "_source": "pathfinder",
+        "_pathfinder_score": similarity,
+    }
+class PathfinderSearcher:
+    """Semantic retrieval over the Pathfinder corpus.
+    Output format matches PaperSearcher.search_s2 so downstream code is unchanged."""
+    def __init__(self, openai_key: str):
+        self.dataset = load_pathfinder_corpus()
+        self.embed = make_embedder(openai_key)
+    def search(self, query: str, limit: int = 50) -> list[dict]:
+        query_vec = self.embed(query)
+        tmp = self.dataset.search("embed", query_vec, k=limit)
+        results = []
+        for idx, dist in zip(tmp.indices, tmp.scores):
+            row = self.dataset[int(idx)]
+            # Skip papers with no arxiv_id — the download step needs it
+            if not row.get("arxiv_id"):
+                continue
+            # Convert FAISS distance to similarity, matching Pathfinder's convention
+            similarity = 1.0 / (1.0 + float(dist))
+            results.append(_row_to_paper(row, similarity))
+        return results

foto/search.py CHANGED Viewed

@@ -146,6 +146,8 @@ class PaperSearcher:
         return scored[:top_n]
     def expanded_search(self, query: str, client, model: str, tracker, log=None) -> list[dict]:
         def _log(msg):
             if log:
                 log(msg)
@@ -153,7 +155,7 @@ class PaperSearcher:
         seen_ids, seen_titles = set(), set()
         all_results = []
-        # Round 1: expanded queries from Claude
         try:
             response = client.messages.create(
                 model=model, max_tokens=200,
@@ -175,7 +177,7 @@ class PaperSearcher:
         _log(f"  {len(all_results)} papers after round 1")
-        # Round 2: author search
         top_authors = self._top_authors(all_results)
         if top_authors:
             for author in top_authors:
@@ -185,7 +187,7 @@ class PaperSearcher:
                 time.sleep(1)
             _log(f"  {len(all_results)} papers after author search")
-        # Round 3: adjacent topics from landmarks
         landmarks = self._landmarks(all_results)
         if landmarks:
             titles = "\n".join(f"- {p['title']}" for p in landmarks)
@@ -210,6 +212,20 @@ class PaperSearcher:
         all_results.sort(key=lambda p: -(p.get("citationCount", 0) or 0))
         return all_results
 class PaperTriager:
     def __init__(self, client, model: str, tracker):
@@ -243,4 +259,4 @@ class PaperTriager:
         scored.sort(key=lambda p: -(p.get("citationCount") or 0) * 0.1
                                   - p["_triage"].get("confidence", 0))
-        return scored

         return scored[:top_n]
     def expanded_search(self, query: str, client, model: str, tracker, log=None) -> list[dict]:
+        """Keyword-based retrieval with 3 rounds of LLM-driven query expansion.
+        Used as fallback when Pathfinder semantic search is disabled."""
         def _log(msg):
             if log:
                 log(msg)
         seen_ids, seen_titles = set(), set()
         all_results = []
+        # Round 1: LLM expands the original query into subfield-specific variants
         try:
             response = client.messages.create(
                 model=model, max_tokens=200,
         _log(f"  {len(all_results)} papers after round 1")
+        # Round 2: pull recent papers from top citation-weighted authors
         top_authors = self._top_authors(all_results)
         if top_authors:
             for author in top_authors:
                 time.sleep(1)
             _log(f"  {len(all_results)} papers after author search")
+        # Round 3: LLM proposes adjacent subfields based on landmark titles
         landmarks = self._landmarks(all_results)
         if landmarks:
             titles = "\n".join(f"- {p['title']}" for p in landmarks)
         all_results.sort(key=lambda p: -(p.get("citationCount", 0) or 0))
         return all_results
+    def expanded_search_pathfinder(self, query: str, openai_key: str, log=None) -> list[dict]:
+        """Semantic retrieval against the Pathfinder corpus (Iyer et al. 2024,
+        arXiv:2408.01556). Embeds query with text-embedding-3-small, returns
+        top-K papers by FAISS similarity over precomputed embeddings."""
+        from .pathfinder_search import PathfinderSearcher
+        if log:
+            log("⟳ Semantic search via Pathfinder corpus...")
+        searcher = PathfinderSearcher(openai_key=openai_key)
+        results = searcher.search(query, limit=50)
+        if log:
+            log(f"  ✓ {len(results)} papers retrieved (semantic similarity)")
+        return results
 class PaperTriager:
     def __init__(self, client, model: str, tracker):
         scored.sort(key=lambda p: -(p.get("citationCount") or 0) * 0.1
                                   - p["_triage"].get("confidence", 0))
+        return scored

requirements.txt CHANGED Viewed

@@ -1,6 +1,10 @@
 streamlit>=1.35.0
 anthropic>=0.25.0
 requests>=2.31.0
 PyMuPDF>=1.24.0
 Pillow>=10.0.0
 gspread>=6.0.0

 streamlit>=1.35.0
 anthropic>=0.25.0
+openai>=1.0.0
 requests>=2.31.0
 PyMuPDF>=1.24.0
 Pillow>=10.0.0
 gspread>=6.0.0
+google-auth>=2.0.0
+datasets>=2.18.0
+faiss-cpu>=1.7.4