Sophie committed
Commit ab99a3d · 1 Parent(s): cb6c277

better latex parsing when displaying theorems (not perfect)

Files changed (2)
  1. src/latex_clean.py +186 -0
  2. src/streamlit_app.py +39 -36
src/latex_clean.py ADDED
@@ -0,0 +1,186 @@
+ import re
+
+ _MATH_ENVS = [
+     # display / alignment
+     "align", "equation", "gather", "multline", "flalign", "dmath",
+     "aligned", "alignedat", "split",
+     # arrays & matrices
+     "array", "matrix", "pmatrix", "bmatrix", "Bmatrix", "vmatrix", "Vmatrix", "smallmatrix", "cases",
+ ]
+
+ def _fix_truncated_end_braces(s: str) -> str:
+     # Add the missing "}" to a truncated "\end{env" at a whitespace boundary or end of string
+     return re.sub(r'(\\end\{[A-Za-z]+(?:\*)?)(?=\s|$)', r'\1}', s)
+
+ def _close_unclosed_envs(s: str) -> str:
+     token = re.compile(
+         r'\\begin\{(?P<b_env>[A-Za-z]+)(?P<b_star>\*)?\}'
+         r'|\\end\{(?P<e_env>[A-Za-z]+)(?P<e_star>\*)?}?',
+         re.DOTALL
+     )
+
+     stack = []
+     for m in token.finditer(s):
+         if m.group('b_env'):
+             env = m.group('b_env')
+             star = m.group('b_star') or ''
+             if env in _MATH_ENVS:
+                 stack.append((env, star))
+         else:
+             env = m.group('e_env')
+             star = m.group('e_star') or ''
+             if stack and stack[-1] == (env, star):
+                 stack.pop()
+
+     if not stack:
+         return s
+
+     # Append missing delimiters in reverse order (innermost environment first)
+     closers = ''.join(f'\n\\end{{{env}{star}}}' for env, star in reversed(stack))
+     return s + closers
+
+ def _balance_math_fences(s: str) -> str:
+     # $$ blocks
+     if s.count('$$') % 2 == 1:
+         s = s.rstrip() + '\n$$'
+     # \[ \]
+     if len(re.findall(r'\\\[', s)) > len(re.findall(r'\\\]', s)):
+         s = s.rstrip() + '\n\\]'
+     # \( \)
+     if len(re.findall(r'\\\(', s)) > len(re.findall(r'\\\)', s)):
+         s = s.rstrip() + '\\)'
+
+     return s
+
+ def _repair_unbalanced_math(text: str) -> str:
+     # normalize newlines
+     text = text.replace('\r\n', '\n').replace('\r', '\n')
+     # fix truncated \end{env
+     text = _fix_truncated_end_braces(text)
+     # append closing \end{...} for any unclosed math envs we care about
+     text = _close_unclosed_envs(text)
+     # make sure $$ / \[ / \( are closed
+     text = _balance_math_fences(text)
+     return text
+
+ def clean_latex_for_display(text: str) -> str:
+     """Cleans raw LaTeX for display in Streamlit."""
+     if not text:
+         return text
+
+     # Fix potential truncation errors
+     text = _repair_unbalanced_math(text)
+
+     # Remove macro definitions and other commands that should not be displayed
+     text = re.sub(
+         r"""
+         \\(?:DeclareMathOperator|newcommand|renewcommand)\*?   # command
+         \s*\{[^{}]+\}                                          # {name}
+         (?:\s*\[\d+\])?                                        # optional [n]
+         (?:\s*\[[^\]]*\])?                                     # optional [default]
+         \s*\{[^{}]*\}                                          # {body} (no nesting)
+         """,
+         "",
+         text,
+         flags=re.VERBOSE | re.DOTALL,
+     )
+
+     text = re.sub(r'\\(label|ref|eqref|cite|footnote|footnotetext|alert)\{[^}]*\}', '', text)
+
+     # align/align* normalization
+     def _normalize_align_blocks(s: str) -> str:
+         out, i, n = [], 0, len(s)
+         begin_pat = re.compile(r'\\begin\{align(\*)?\}', re.DOTALL)
+
+         while i < n:
+             m = begin_pat.search(s, i)
+             if not m:
+                 out.append(s[i:])
+                 break
+
+             # Copy everything before this block
+             out.append(s[i:m.start()])
+
+             star = m.group(1) or ""  # "" or "*"
+             body_start = m.end()
+             rest = s[body_start:]
+
+             # Try exact end: \end{align*} or \end{align}
+             exact_end = re.search(rf'\\end\{{align{re.escape(star)}\}}', rest)
+             if exact_end:
+                 end_start_in_rest = exact_end.start()
+                 end_consumed = exact_end.end()
+             else:
+                 # Fallback: accept a truncated end like "\end{align*"
+                 trunc = re.search(rf'\\end\{{align{re.escape(star)}', rest)
+                 if not trunc:
+                     out.append(s[m.start():])
+                     break
+                 end_start_in_rest = trunc.start()
+                 end_consumed = trunc.end() + (1 if rest[trunc.end():].startswith('}') else 0)
+
+             body = rest[:end_start_in_rest]
+
+             # Clean the body
+             body = re.sub(r'\\tag\{[^}]*\}', '', body)
+             body = re.sub(r'\\(?:nonumber|notag)\b', '', body)
+             body = re.sub(r'\\label\{[^}]*\}', '', body)
+
+             # Trim the trailing "\\" on the final line
+             lines = [ln.rstrip() for ln in body.strip().split('\n')]
+             if lines and lines[-1].endswith(r'\\'):
+                 lines[-1] = lines[-1][:-2].rstrip()
+             cleaned = '\n'.join(lines).strip()
+
+             # Emit a single aligned block
+             out.append(f"$$\n\\begin{{aligned}}\n{cleaned}\n\\end{{aligned}}\n$$")
+
+             # Advance past the end tag (exact or truncated)
+             i = body_start + end_consumed
+
+         return ''.join(out)
+
+     text = _normalize_align_blocks(text)
+
+     text = re.sub(r'\\\[\s*(.*?)\s*\\\]', r'$$\n\1\n$$', text, flags=re.DOTALL)
+     text = re.sub(r'\\\(\s*(.*?)\s*\\\)', r'$\1$', text, flags=re.DOTALL)
+
+     # Turn \item into Markdown bullets
+     text = re.sub(r'\\begin\{(?:enumerate|itemize)\}', '', text)
+     text = re.sub(r'\\end\{(?:enumerate|itemize)\}', '', text)
+     text = re.sub(r'^[ \t]*\\item[ \t]*', r'- ', text, flags=re.MULTILINE)
+
+     # Wrap "&"-aligned single lines that sit outside existing $$...$$ blocks
+     parts = re.split(r'(\$\$[\s\S]*?\$\$)', text)  # keep math blocks intact
+     for i in range(0, len(parts), 2):
+         segment = parts[i]
+         lines = segment.split('\n')
+         for j, ln in enumerate(lines):
+             if '&' in ln and not ln.strip().startswith(('-', '$')):
+                 lines[j] = f"$$\n\\begin{{aligned}}\n{ln}\n\\end{{aligned}}\n$$"
+         parts[i] = '\n'.join(lines)
+     text = ''.join(parts)
+
+     def _isolate_display_math(s: str) -> str:
+         """Ensure each $$...$$ block is on its own lines with padding blank lines."""
+         parts = re.split(r'(\$\$[\s\S]*?\$\$)', s)  # keep the $$...$$ blocks
+         for i in range(1, len(parts), 2):  # only the $$ blocks (odd indices)
+             block = parts[i]  # starts with $$, ends with $$
+             # normalize interior newlines: $$\n ... \n$$
+             if not block.startswith('$$\n'):
+                 block = '$$\n' + block[2:].lstrip()
+             if not block.endswith('\n$$'):
+                 block = block[:-2].rstrip() + '\n$$'
+             parts[i] = block
+
+             # ensure a blank line before and after the block
+             if i - 1 >= 0:
+                 parts[i - 1] = parts[i - 1].rstrip() + '\n\n'
+             if i + 1 < len(parts):
+                 parts[i + 1] = '\n\n' + parts[i + 1].lstrip()
+         return ''.join(parts)
+     text = _isolate_display_math(text)
+
+     # Collapse runs of blank lines and trim
+     text = re.sub(r'\n{3,}', '\n\n', text).strip()
+     return text
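
As a sanity check on the new module, here is a minimal smoke test (the input strings are invented examples of truncated LaTeX; the commented output is what the pipeline above should produce for them, modulo exact whitespace):

```python
from latex_clean import clean_latex_for_display

# A theorem body cut off mid-\end, as truncated extractions often are
raw = "Let $x, y, z$ be reals.\n\\begin{align*}\nx &= y \\\\\ny &= z\n\\end{align*"
print(clean_latex_for_display(raw))
# Let $x, y, z$ be reals.
#
# $$
# \begin{aligned}
# x &= y \\
# y &= z
# \end{aligned}
# $$

# Unbalanced display math gets closed off as well
print(clean_latex_for_display("We have\n$$\nE = mc^2"))
# We have
#
# $$
# E = mc^2
# $$
```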
src/streamlit_app.py CHANGED
@@ -3,13 +3,13 @@ import json
  import numpy as np
  from sentence_transformers import SentenceTransformer, util
  import os
- import re
  import boto3
  import psycopg2
  from psycopg2.extensions import connection
  from dotenv import load_dotenv
+ from latex_clean import clean_latex_for_display

- # --- 0. Config ---
+ # Config
  load_dotenv()

  def get_rds_connection() -> connection:
@@ -32,7 +32,25 @@ def get_rds_connection() -> connection:
      )
      return conn

- # --- 1. Load the Embedding Model ---
+ AVAILABLE_TAGS = {
+     "arXiv": [
+         "math.AC", "math.AG", "math.AP", "math.AT", "math.CA", "math.CO",
+         "math.CT", "math.CV", "math.DG", "math.DS", "math.FA", "math.GM",
+         "math.GN", "math.GR", "math.GT", "math.HO", "math.IT", "math.KT",
+         "math.LO", "math.MG", "math.MP", "math.NA", "math.NT", "math.OA",
+         "math.OC", "math.PR", "math.QA", "math.RA", "math.RT", "math.SG",
+         "math.SP", "math.ST", "Statistics Theory"
+     ],
+     "Stacks Project": [
+         "Sets", "Schemes", "Algebraic Stacks", "Étale Cohomology"
+     ]
+ }
+
+ ALLOWED_TYPES = [
+     "theorem", "lemma", "proposition", "corollary", "definition", "remark", "assumption"
+ ]
+
+ # Load the Embedding Model
  @st.cache_resource
  def load_model():
      """
@@ -46,7 +64,7 @@ def load_model():
      return None


- # --- 2. Load Data from RDS ---
+ # Load Data from RDS
  @st.cache_data
  def load_papers_from_rds():
      """
@@ -113,13 +131,15 @@ def load_papers_from_rds():

          all_theorems_data.append({
              "paper_id": paper_id,
+             "authors": authors,
              "paper_title": title,
              "paper_url": link,
+             "year": last_updated.year,
+             "primary_category": primary_category,
              "theorem_name": theorem_name,
              "theorem_slogan": theorem_slogan,
              "theorem_body": theorem_body,
              "global_context": global_context,
-             "text_to_embed": f"{global_context}\n\n**Theorem ({theorem_name}):**\n{theorem_body}",
              "stored_embedding": embedding
          })

@@ -133,7 +153,7 @@ def load_papers_from_rds():
  # --- 3. The Search Function ---
  def search_theorems(query, model, theorems_data, embeddings_db):
      """
-     Takes a user query and finds the top 5 most similar theorems.
+     Takes a user query and finds the top 10 most similar theorems.
      """
      if not query:
          st.info("Please enter a search query.")
@@ -141,7 +161,7 @@ def search_theorems(query, model, theorems_data, embeddings_db):

      query_embedding = model.encode(query, convert_to_tensor=True)
      cosine_scores = util.cos_sim(query_embedding, embeddings_db)[0]
-     top_results_indices = np.argsort(-cosine_scores.cpu())[:5]
+     top_results_indices = np.argsort(-cosine_scores.cpu())[:10]

-     st.subheader("Top 5 Most Similar Theorems")
+     st.subheader("Top 10 Most Similar Theorems")

@@ -154,67 +174,50 @@ def search_theorems(query, model, theorems_data, embeddings_db):
          similarity = cosine_scores[idx].item()
          theorem_info = theorems_data[idx]

-         # Use an expander for each result to keep the main view clean
          expander_title = f"**Result {i+1} | Similarity: {similarity:.4f}**"
          if theorem_info.get("theorem_name"):
              expander_title += f" | {theorem_info['theorem_name']}"

          with st.expander(expander_title):
              st.markdown(f"**Paper:** {theorem_info.get('paper_title', 'Unknown')}")
+             st.markdown(f"**Authors:** {', '.join(theorem_info['authors']) if theorem_info['authors'] else 'N/A'}")
              st.markdown(f"**Source:** [{theorem_info['paper_url']}]({theorem_info['paper_url']})")
+             st.markdown(
+                 f"**Math Tag:** `{theorem_info['primary_category']}` | **Year:** {theorem_info.get('year', 'N/A')}")
+             st.markdown("---")

-             # Display theorem slogan if available
              if theorem_info.get("theorem_slogan"):
                  st.markdown(f"**Slogan:** {theorem_info['theorem_slogan']}")
                  st.write("")

-             # Display global context in a more readable blockquote
              if theorem_info["global_context"]:
-                 blockquote_context = "> " + theorem_info["global_context"].replace("\n", "\n> ")
-                 st.markdown(blockquote_context)
+                 cleaned_ctx = clean_latex_for_display(theorem_info["global_context"])
+                 st.markdown("> " + cleaned_ctx.replace("\n", "\n> "))
                  st.write("")

-             # Clean and display theorem body
-             content = theorem_info['theorem_body']
-
-             # Remove labels, citations, and other disruptive commands
-             cleaned_content = re.sub(r'\\(label|cite|eqref)\{.*?\}', '', content)
-
-             # Convert math delimiters to $$
-             cleaned_content = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', cleaned_content)
-             cleaned_content = re.sub(r'\\\((.*?)\\\)', r'$\1$', cleaned_content)
-
-             # Remove common environment wrappers like \begin{...} and \end{...}
-             cleaned_content = re.sub(r'\\label\{.*?\}', r'', cleaned_content)
-             cleaned_content = re.sub(r'\\begin\{.*?\}', r'', cleaned_content)
-             cleaned_content = re.sub(r'\\end\{.*?\}', r'', cleaned_content)
-
-             # Remove extra formatting like newlines and tabs
-             cleaned_content = cleaned_content.replace('\n', ' ').replace('\t', ' ').strip()
-
-             # Use st.markdown() to render the cleaned, mixed text and LaTeX
+             cleaned_content = clean_latex_for_display(theorem_info['theorem_body'])
              st.markdown(f"**Theorem Body:**")
              st.markdown(cleaned_content)

-
  # --- Main App Interface ---
  st.set_page_config(page_title="Theorem Search Demo", layout="wide")
  st.title("📚 Semantic Theorem Search")
  st.write("This demo uses a specialized mathematical language model to find theorems semantically similar to your query.")
+ st.markdown("*Note: Linking to a specific page within an arXiv PDF is not directly possible.*",
+             help="arXiv links redirect to the paper's abstract, not a specific page in the PDF.")

  model = load_model()
  theorems_data = load_papers_from_rds()

  if model and theorems_data:
      with st.spinner("Preparing embeddings from database..."):
-         # Use stored embeddings from database - already numpy arrays
          corpus_embeddings = np.array([item['stored_embedding'] for item in theorems_data])

-     st.success(f"Successfully loaded {len(theorems_data)} theorems from RDS. Ready to search!")
+     st.success(f"Successfully loaded {len(theorems_data)} theorems from arXiv. Ready to search!")

-     user_query = st.text_input("Enter your query:", "The Jones polynomial is a link invariant")
+     user_query = st.text_input("Enter your query:", "")

      if st.button("Search") or user_query:
          search_theorems(user_query, model, theorems_data, corpus_embeddings)
  else:
-     st.error("Could not load the model or data from RDS. Please check your database connection and credentials.")
+     st.error("Could not load the model or data from RDS. Please check your RDS database connection and credentials.")
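
For context when reading the search changes: the retrieval step is plain cosine-similarity top-k over the stored embeddings, now with k = 10. A self-contained sketch of that step (the model name and toy corpus here are placeholders, not what the app uses):

```python
import numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model, not the app's
corpus = [
    "The Jones polynomial is an invariant of oriented links.",
    "Every ideal in a Noetherian ring is finitely generated.",
]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode("knot invariants", convert_to_tensor=True)
scores = util.cos_sim(query_embedding, corpus_embeddings)[0].cpu().numpy()

top_indices = np.argsort(-scores)[:10]  # highest similarity first
for rank, idx in enumerate(top_indices, start=1):
    print(f"{rank}. score={scores[idx]:.4f} | {corpus[idx]}")
```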