Sophie committed on
Commit 947c57d · 1 Parent(s): 878f0ee

integrated pgvector; updated SQL calls to reference new papers table; minor refactoring
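In brief: the app now registers the pgvector adapter on each connection and lets Postgres rank theorems by vector distance, instead of scoring stored embeddings client-side with torch. A minimal sketch of that pattern (not part of this diff; the connection details and the random query vector are placeholders, while the table and column names follow the diff below):

    import numpy as np
    import psycopg2
    from pgvector.psycopg2 import register_vector

    # Placeholder connection; the app builds its own via get_rds_connection().
    conn = psycopg2.connect(host="localhost", dbname="papers", user="app", password="secret")
    register_vector(conn)  # adapt numpy arrays to/from the Postgres vector type

    query_vec = np.random.rand(1024).astype(np.float32)  # stand-in; must match the vector column's dimension
    cur = conn.cursor()
    # <#> is pgvector's negative-inner-product operator; smaller is more similar,
    # so ascending order returns the closest embeddings first.
    cur.execute(
        "SELECT slogan_id FROM theorem_embedding_qwen "
        "ORDER BY embedding <#> %s::vector LIMIT 5",
        (query_vec,),
    )
    print(cur.fetchall())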

Files changed (2)
  1. requirements.txt +2 -0
  2. src/streamlit_app.py +215 -195
requirements.txt CHANGED
@@ -1,3 +1,5 @@
+requests
+pgvector
 streamlit==1.39.0
 sentence-transformers>=3.0.0
 numpy
src/streamlit_app.py CHANGED
@@ -1,12 +1,12 @@
 import streamlit as st
 import json
 import numpy as np
-from sentence_transformers import SentenceTransformer, util
+from sentence_transformers import SentenceTransformer
 import os
 import boto3
 import psycopg2
 from psycopg2.extensions import connection
-import torch
+from pgvector.psycopg2 import register_vector
 import re
 import requests
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -16,6 +16,7 @@ from latex_clean import clean_latex_for_display
 # Config
 load_dotenv()
 
+
 def get_rds_connection() -> connection:
     region = os.getenv("AWS_REGION")
     secret_arn = os.getenv("RDS_SECRET_ARN")
@@ -34,8 +35,10 @@ def get_rds_connection() -> connection:
         password=secret_dict["password"],
         sslmode="require",
     )
+    register_vector(conn)
     return conn
 
+
 AVAILABLE_TAGS = {
     "arXiv": [
         "math.AC", "math.AG", "math.AP", "math.AT", "math.CA", "math.CO",
@@ -51,7 +54,7 @@ AVAILABLE_TAGS = {
 }
 
 ALLOWED_TYPES = [
-    "theorem", "lemma", "proposition", "corollary", "definition", "remark", "assumption"
+    "theorem", "lemma", "proposition"
 ]
 
 ARXIV_ID_RE = re.compile(
@@ -59,52 +62,63 @@ ARXIV_ID_RE = re.compile(
     re.IGNORECASE
 )
 
+EMBED_TABLE = "theorem_embedding_qwen"
+
+
 # Load the Embedding Model
 @st.cache_resource
 def load_model():
-    """
-    Loads the specialized math embedding model from Hugging Face.
-    """
     try:
-        model = SentenceTransformer('math-similarity/Bert-MLM_arXiv-MP-class_zbMath')
+        model = SentenceTransformer('Qwen/Qwen3-Embedding-0.6B')
         return model
     except Exception as e:
         st.error(f"Error loading the embedding model: {e}")
         return None
 
+def infer_type(name: str) -> str:
+    if not name:
+        return "theorem"
+    lower = name.lower()
+    for t in ["theorem", "lemma", "proposition"]:
+        if t in lower:
+            return t
+    return "theorem"
+
 # Load Data from RDS
 @st.cache_data
 def load_papers_from_rds():
     """
-    Loads theorem data from the RDS database and prepares it for embedding.
+    Loads the theorem data from the RDS database.
    Returns a list of theorem dictionaries with all necessary fields.
    """
    try:
        conn = get_rds_connection()
        cur = conn.cursor()

-        # Fetch all papers with their theorems and embeddings
+        # Fetch all papers with their theorems
        cur.execute("""
-            SELECT
-                tm.paper_id,
-                tm.title,
-                tm.authors,
-                tm.link,
-                tm.last_updated,
-                tm.summary,
-                tm.journal_ref,
-                tm.primary_category,
-                tm.categories,
-                tm.global_notations,
-                tm.global_definitions,
-                tm.global_assumptions,
-                te.theorem_name,
-                te.theorem_slogan,
-                te.theorem_body,
-                te.embedding
-            FROM theorem_metadata tm
-            JOIN theorem_embedding te ON tm.paper_id = te.paper_id
-            ORDER BY tm.paper_id, te.theorem_name;
+            WITH latest_slogan AS (
+                SELECT DISTINCT ON (ts.theorem_id)
+                    ts.theorem_id, ts.slogan_id, ts.slogan
+                FROM theorem_slogan ts
+                ORDER BY ts.theorem_id, ts.slogan_id DESC
+            )
+            SELECT p.paper_id,
+                   p.title,
+                   p.authors,
+                   p.link,
+                   p.last_updated,
+                   p.summary,
+                   p.journal_ref,
+                   p.primary_category,
+                   p.categories,
+                   t.name AS theorem_name,
+                   ls.slogan AS theorem_slogan,
+                   t.body AS theorem_body
+            FROM paper p
+            JOIN theorem t ON t.paper_id = p.paper_id
+            LEFT JOIN latest_slogan ls ON ls.theorem_id = t.theorem_id
+            ORDER BY p.paper_id, t.name;
        """)

        rows = cur.fetchall()
@@ -115,27 +129,7 @@ def load_papers_from_rds():
        for row in rows:
            (paper_id, title, authors, link, last_updated, summary,
             journal_ref, primary_category, categories,
-             global_notations, global_definitions, global_assumptions,
-             theorem_name, theorem_slogan, theorem_body, embedding) = row
-
-            # Build global context
-            global_context_parts = []
-            if global_notations:
-                global_context_parts.append(f"**Global Notations:**\n{global_notations}")
-            if global_definitions:
-                global_context_parts.append(f"**Global Definitions:**\n{global_definitions}")
-            if global_assumptions:
-                global_context_parts.append(f"**Global Assumptions:**\n{global_assumptions}")
-
-            global_context = "\n\n".join(global_context_parts)
-
-            # Convert embedding to a numpy float array
-            if isinstance(embedding, str):
-                embedding = json.loads(embedding)
-            if isinstance(embedding, list):
-                embedding = np.array(embedding, dtype=np.float32)
-            elif isinstance(embedding, np.ndarray):
-                embedding = embedding.astype(np.float32)
+             theorem_name, theorem_slogan, theorem_body) = row

            # Determine source from url
            link_str = link or ""
@@ -145,15 +139,6 @@ def load_papers_from_rds():
                source = "Stacks Project"

            # Determine type from name
-            def infer_type(name: str) -> str:
-                if not name:
-                    return "theorem"
-                lower = name.lower()
-                for t in ["theorem", "lemma", "proposition", "corollary", "definition", "remark", "assumption"]:
-                    if t in lower:
-                        return t
-                return "theorem"
-
            inferred_type = infer_type(theorem_name or "")

            all_theorems_data.append({
@@ -170,8 +155,6 @@ def load_papers_from_rds():
                "theorem_name": theorem_name,
                "theorem_slogan": theorem_slogan,
                "theorem_body": theorem_body,
-                "global_context": global_context,
-                "stored_embedding": embedding,
            })

        return all_theorems_data
@@ -272,7 +255,7 @@ def extract_arxiv_id(s: str) -> str | None:
 def normalize_title(s: str) -> str:
     return (s or "").casefold().strip()
 
-def parse_paper_filter_input(raw: str) -> dict:
+def parse_paper_filter(raw: str) -> dict:
     """
     Parse user input into two sets: arxiv_ids and title substrings.
     Multiple entries may be comma-separated.
@@ -289,165 +272,197 @@ def parse_paper_filter(raw: str) -> dict:
            titles.add(normalize_title(token))
    return {"ids": ids, "titles": titles}

-def item_matches_paper_filter(item: dict, paper_filter: dict) -> bool:
-    """
-    True if the item matches at least one requested arXiv ID or one title substring.
-    If paper_filter is empty (both sets empty), always True.
-    """
-    ids = paper_filter.get("ids", set())
-    titles = paper_filter.get("titles", set())
-    if not ids and not titles:
-        return True
-
-    # Compare IDs (extract once from url)
-    url = item.get("paper_url") or ""
-    item_id = extract_arxiv_id(url)
-    if item_id and item_id.lower() in ids:
-        return True
-
-    # Compare titles (substring, case-insensitive)
-    t = normalize_title(item.get("paper_title"))
-    if t and any(sub in t for sub in titles):
-        return True
-
-    return False
+def compute_score(similarity: float, citations: int, weight: float) -> float:
+    c = int(citations) if citations is not None else 0
+    if c == 0:
+        return float(similarity)
+    return float(similarity) + float(weight) * np.log(c)

 # --- Search and Display ---
-def search_and_display_with_filters(query, model, theorems_data, embeddings_db, filters):
+def search_and_display(query: str, model, filters: dict):
    if not filters['sources']:
        st.warning("Please select at least one source.")
        return

-    if query:
-        query_embedding = model.encode(query, convert_to_tensor=True)
-        cosine_scores = util.cos_sim(query_embedding, embeddings_db)[0]
-    else:
-        cosine_scores = torch.zeros(len(theorems_data))
-
-    low, high = filters['citation_range']
-
-    # Get a larger pool to filter from
-    top_k_pool = min(200, len(theorems_data))
-    top_indices = torch.topk(cosine_scores, k=top_k_pool, sorted=True).indices
-    top_indices = top_indices.tolist()
-
-    paper_filter = filters.get("paper_filter", {"ids": set(), "titles": set()})
-    matched_indices = []
-    if paper_filter and (paper_filter.get("ids") or paper_filter.get("titles")):
-        for i, it in enumerate(theorems_data):
-            if item_matches_paper_filter(it, paper_filter):
-                matched_indices.append(i)
-
-    pool_indices = list(dict.fromkeys(top_indices + matched_indices))
-    pool = [(i, theorems_data[i]) for i in pool_indices]
-
-    # Fetch citations in parallel
-    if ('arXiv' in filters['sources']):
-        add_citations([it for _, it in pool])
-
-    results = []
-
-    # Filter results
-    for idx, item in pool:
-        type_match = (not filters['types']) or (item.get('type','').lower() in filters['types'])
-        tag_match = (not filters['tags']) or (item.get('primary_category') in filters['tags'])
-        author_match = (not filters['authors']) or any(a in (item.get('authors') or []) for a in filters['authors'])
-        source_match = item.get('source') in filters['sources']
-        paper_match = item_matches_paper_filter(item, filters['paper_filter'])
-
-        # Citations & year & journal only for arXiv
-        citations = item.get('citations')
-        log_cit = np.log1p(int(citations)) if citations is not None else 0.0
-        if citations is None:
-            if not filters['include_unknown_citations']:
-                continue
-            citation_match = True
-        else:
-            citation_match = (low <= int(citations) <= high)
-
-        year_match = True
-        if filters['year_range'] and item.get('source') == 'arXiv':
-            y = item.get('year') or 0
-            yr0, yr1 = filters['year_range']
-            year_match = (yr0 <= y <= yr1)
-
-        journal_match = True
-        if item.get('source') == 'arXiv':
-            status = filters['journal_status']
-            jp = bool(item.get('journal_published'))
-            if status == "Journal Article":
-                journal_match = jp
-            elif status == "Preprint Only":
-                journal_match = not jp
-
-        if all([type_match, tag_match, author_match, source_match, paper_match, citation_match, year_match, journal_match]):
-            # Similarity = cosine_similary + citation_weight * log(citation_count)
-            similarity = float(cosine_scores[idx].item()) + filters['citation_weight'] * log_cit
-            results.append({"idx": idx, "info": item, "similarity": similarity})
-            if len(results) >= filters['top_k']:
-                break
-
-    results.sort(key=lambda r: r["similarity"], reverse=True)
-    results = results[:filters['top_k']]
-
-    st.subheader(f"Found {len(results)} Matching Results")
-    if not results:
+    # Encode query to numpy array
+    query_vec = model.encode(query or "", normalize_embeddings=True, convert_to_numpy=True)
+
+    where = []
+    params = []
+
+    # Source
+    if filters['sources']:
+        src_cases = []
+        if 'arXiv' in filters['sources']:
+            src_cases.append(" (p.link ILIKE '%%arxiv.org%%') ")
+        if 'Stacks Project' in filters['sources']:
+            src_cases.append(" (p.link NOT ILIKE '%%arxiv.org%%') ")
+        if src_cases:
+            where.append("(" + " OR ".join(src_cases) + ")")
+
+    # Authors
+    if filters['authors']:
+        where.append(" p.authors && %s ")
+        params.append(filters['authors'])
+
+    # Tag/category
+    if filters['tags']:
+        where.append(" p.primary_category = ANY(%s) ")
+        params.append(filters['tags'])
+
+    # Year (arXiv only)
+    if filters['year_range']:
+        yr0, yr1 = filters['year_range']
+        where.append("""
+            ( (p.link ILIKE '%%arxiv.org%%' AND EXTRACT(YEAR FROM p.last_updated) BETWEEN %s AND %s)
+              OR (p.link NOT ILIKE '%%arxiv.org%%') )
+        """)
+        params.extend([yr0, yr1])
+
+    # Journal status (arXiv only)
+    if filters['journal_status'] != "All":
+        if filters['journal_status'] == "Journal Article":
+            where.append(" (p.link ILIKE '%%arxiv.org%%' AND p.journal_ref IS NOT NULL) ")
+        elif filters['journal_status'] == "Preprint Only":
+            where.append(" (p.link ILIKE '%%arxiv.org%%' AND p.journal_ref IS NULL) ")
+
+    # Paper filter: arXiv id in link or title substring(s)
+    pf = filters.get("paper_filter", {"ids": set(), "titles": set()})
+    id_patterns = [f"%{i}%" for i in pf.get("ids", set())]
+    title_patterns = [f"%{t}%" for t in pf.get("titles", set())]
+    pf_clauses = []
+    if id_patterns:
+        pf_clauses.append(" p.link ILIKE ANY(%s) ")
+        params.append(id_patterns)
+    if title_patterns:
+        pf_clauses.append(" p.title ILIKE ANY(%s) ")
+        params.append(title_patterns)
+    if pf_clauses:
+        where.append("(" + " OR ".join(pf_clauses) + ")")
+
+    # Filter in SQL
+    if filters['types']:
+        like_any = [f"%{t}%" for t in filters['types']]
+        where.append(" lower(t.name) ILIKE ANY(%s) ")
+        params.append(like_any)
+
+    sql = f"""
+        WITH latest_slogan AS (
+            SELECT DISTINCT ON (ts.theorem_id)
+                ts.theorem_id, ts.slogan_id, ts.slogan, ts.model
+            FROM theorem_slogan ts
+            ORDER BY ts.theorem_id, ts.slogan_id DESC
+        )
+        SELECT
+            p.paper_id, p.title, p.authors, p.link, p.last_updated, p.summary,
+            p.journal_ref, p.primary_category, p.categories,
+            t.theorem_id, t.name AS theorem_name, t.body AS theorem_body,
+            ls.slogan AS theorem_slogan,
+            (1.0 - (e.embedding <#> %s::vector)) AS similarity
+        FROM paper p
+        JOIN theorem t ON t.paper_id = p.paper_id
+        JOIN latest_slogan ls ON ls.theorem_id = t.theorem_id
+        JOIN {EMBED_TABLE} e ON e.slogan_id = ls.slogan_id
+        {'WHERE ' + ' AND '.join(where) if where else ''}
+        ORDER BY e.embedding <#> %s::vector ASC
+        LIMIT %s
+    """
+    exec_params = [query_vec, *params, query_vec, int(filters['top_k'])]
+
+    conn = get_rds_connection()
+    cur = conn.cursor()
+    cur.execute(sql, exec_params)
+    rows = cur.fetchall()
+    cur.close()
+    conn.close()
+
+    # Populate result fields
+    items = []
+    for (paper_id, title, authors, link, last_updated, summary, journal_ref,
+         primary_category, categories, theorem_id, theorem_name, theorem_body,
+         theorem_slogan, similarity) in rows:
+
+        # Determine source from url
+        link_str = link or ""
+        source = "arXiv" if link_str.startswith(
+            ("http://arxiv.org", "https://arxiv.org")) or "arxiv.org" in link_str else "Stacks Project"
+
+        inferred_type = infer_type(theorem_name or "")
+
+        items.append({
+            "paper_id": paper_id,
+            "authors": authors,
+            "paper_title": title,
+            "paper_url": link,
+            "year": last_updated.year,
+            "primary_category": primary_category,
+            "source": source,
+            "type": inferred_type,
+            "journal_published": bool(journal_ref),
+            "citations": None,
+            "theorem_name": theorem_name,
+            "theorem_slogan": theorem_slogan,
+            "theorem_body": theorem_body,
+            "similarity": float(similarity),
+        })
+
+    # Citations
+    if 'arXiv' in filters['sources']:
+        with st.spinner("Fetching citations..."):
+            add_citations(items)
+    for it in items:
+        # Compute weighted score if applicable
+        it["score"] = compute_score(it["similarity"], it.get("citations"), citation_weight)
+
+    # Sort results by weighted score, then cosine similarity, then paper id
+    items.sort(key=lambda x: (x["score"], x["similarity"], str(x.get("paper_id"))), reverse=True)
+
+    # Display results
+    st.subheader(f"Found {len(items)} Matching Results")
+    if not items:
        st.warning("No results found for the current filters.")
        return

-    for i, r in enumerate(results):
-        info = r["info"]
-        expander_title = f"**Result {i+1} | Similarity: {r['similarity']:.4f} | Type: {info.get('type','').title()}**"
+    for i, info in enumerate(items):
+        expander_title = f"**Result {i + 1} | Similarity: {info['score']:.4f} | {info.get('type', '').title()}**"
        with st.expander(expander_title, expanded=True):
-            st.markdown(f"**Paper:** *{info.get('paper_title','Unknown')}*")
+            st.markdown(f"**Paper:** *{info.get('paper_title', 'Unknown')}*")
            st.markdown(f"**Authors:** {', '.join(info.get('authors') or []) or 'N/A'}")
            st.markdown(f"**Source:** {info.get('source')} ({info.get('paper_url')})")
            citations = info.get("citations")
            cit_str = "Unknown" if citations is None else str(citations)
            st.markdown(
-                f"**Math Tag:** `{info.get('primary_category')}` | "
+                f"**Tag:** `{info.get('primary_category')}` | "
                f"**Citations:** {cit_str} | "
                f"**Year:** {info.get('year', 'N/A')}"
            )
-            # Testing only
-            if filters['citation_weight'] > 0:
-                base = float(cosine_scores[r["idx"]].item())
-                log_cit = np.log1p(int(citations)) if citations is not None else 0.0
-                st.caption(
-                    f"base_cosine={base:.4f} | log(citations)={log_cit:.4f} | weight={filters['citation_weight']:.2f}")
            st.markdown("---")
-
            if info.get("theorem_slogan"):
                st.markdown(f"**Slogan:** {info['theorem_slogan']}\n")

-            if info.get("global_context"):
-                cleaned_ctx = clean_latex_for_display(info["global_context"])
-                st.markdown("> " + cleaned_ctx.replace("\n", "\n> ") )
-
            cleaned_content = clean_latex_for_display(info['theorem_body'])
            st.markdown(f"**{info['theorem_name'] or 'Theorem Body.'}**")
            st.markdown(cleaned_content)
-            st.markdown("**Paper ID:**")
-            st.markdown(info['paper_id'])
-
-            # Testing only
-            st.markdown('**Paper ID (testing only)**')
-            st.markdown(info['paper_id'])
+            st.markdown("---")
+            # FOR TESTING ONLY
+            st.caption(f"Paper ID: {info['paper_id']}")
+            if info['citations'] is None or info['citations'] == 0:
+                log = 0
+            else:
+                log = np.log(info['citations'])
+            st.caption(
+                f"base_cosine={info['similarity']:.4f} | log(cit)={log:.4f} | weight={filters['citation_weight']:.2f}")

 # --- Main App Interface ---
 st.set_page_config(page_title="Theorem Search Demo", layout="wide")
-st.title("📚 Semantic Theorem Search")
-st.write("This demo uses a specialized mathematical language model to find theorems semantically similar to your query.")
+st.title("Math Theorem Search")
+st.write("This demo finds mathematical theorems that are semantically similar to your query.")
 
 model = load_model()
 theorems_data = load_papers_from_rds()
 
 if model and theorems_data:
-    with st.spinner("Preparing embeddings from database..."):
-        corpus_embeddings = np.array([item['stored_embedding'] for item in theorems_data])
-
    st.success(f"Successfully loaded {len(theorems_data)} theorems from arXiv and the Stacks Project. Ready to search!")
-
    # --- Sidebar filters ---
    with st.sidebar:
        st.header("Search Filters")
@@ -461,6 +476,7 @@ if model and theorems_data:
        )

        selected_authors, selected_types, selected_tags = [], [], []
+        paper_filter = ""
        year_range, journal_status = None, "All"
        citation_range = (0, 1000)
        citation_weight = 0.0
@@ -479,16 +495,20 @@ if model and theorems_data:
        for it in theorems_data:
            tags_per_source[it['source']].add(it.get('primary_category'))
        union_tags = sorted({t for s in selected_sources for t in tags_per_source.get(s, set()) if t})
-        selected_tags = st.multiselect("Filter by Math Tag/Category:", union_tags)
-        paper_filter_raw = st.text_input("Filter by Paper",
+        selected_tags = st.multiselect("Filter by Tag/Category:", union_tags)
+        paper_filter = st.text_input("Filter by Paper",
                                     value="",
                                     placeholder="e.g., 2401.12345, Finite Hilbert stability",
                                     help="Filter by title substring or arXiv ID/URL. Use commas for multiple.")
        if 'arXiv' in selected_sources:
            year_range = st.slider("Filter by Year:", 1991, 2025, (1991, 2025))
-            journal_status = st.radio("Publication Status:", ["All", "Journal Article", "Preprint Only"], horizontal=True)
-            citation_range = st.slider("Filter by Citations:", 0, 1000, (0, 1000))
-            citation_weight = st.slider("Citation Weight:", 0.0, 1.0, 0.0, step=0.01)
+            journal_status = st.radio("Publication Status:",
+                                      ["All", "Journal Article", "Preprint Only"],
+                                      horizontal=True)
+            citation_range = st.slider("Filter by Citations:", 0, 1000, 1000, step=10)
+            citation_weight = st.slider("Citation Weight:", 0.0, 1.0, 0.0, step=0.01,
+                                        help="If nonzero, results are ranked by base_score $+$ weight $\\times$ "
+                                             "$\\log($citations$)$.")
        include_unknown_citations = st.checkbox(
            "Include entries with unknown citation counts",
            value=True,
@@ -501,7 +521,7 @@ if model and theorems_data:
        "types": [t.lower() for t in selected_types],
        "tags": selected_tags,
        "sources": selected_sources,
-        "paper_filter": parse_paper_filter_input(paper_filter_raw),
+        "paper_filter": parse_paper_filter(paper_filter),
        "year_range": year_range,
        "journal_status": journal_status,
        "citation_range": citation_range,
@@ -512,6 +532,6 @@ if model and theorems_data:
 
     user_query = st.text_input("Enter your query:", "")
     if st.button("Search") or user_query:
-        search_and_display_with_filters(user_query, model, theorems_data, corpus_embeddings, filters)
+        search_and_display(user_query, model, filters)
 else:
-    st.error("Could not load the model or data from RDS. Please check your RDS database connection and credentials.")
+    st.error("Could not load the model or data from RDS. Please check your RDS database connection and credentials.")