\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage[margin=1in]{geometry}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{algpseudocode}
\hypersetup{
colorlinks=true,
linkcolor=blue!70!black,
citecolor=green!50!black,
urlcolor=blue!70!black,
}
\title{Knowledge-Graph-Guided Fine-Tuning of Embedding Models\\
for Mathematical Document Retrieval}
\author{Robin Langer\thanks{The author thanks Claude (Anthropic) for assistance with code development and manuscript preparation.}}
\date{}
\begin{document}
\maketitle
\begin{abstract}
We present a method for improving semantic search over mathematical research
papers by fine-tuning embedding models using contrastive learning, guided by
a knowledge graph extracted from the corpus. General-purpose embedding models
(e.g., OpenAI's \texttt{text-embedding-3-small}) and even scientific embedding
models (SPECTER2, SciNCL) perform poorly on mathematical retrieval tasks because
they lack understanding of the semantic relationships between mathematical
concepts. Our approach exploits an existing knowledge graph --- whose nodes are
mathematical concepts and whose edges encode relationships such as
\emph{generalizes}, \emph{proves}, and \emph{is\_instance\_of} --- to
automatically generate training data for contrastive fine-tuning. We benchmark
baseline models against our fine-tuned model on a retrieval task over 4,794
paper chunks spanning 75 papers in algebraic combinatorics, and demonstrate
that the domain-adapted model significantly outperforms all baselines.
The method is general: given any corpus of mathematical papers and a
knowledge graph over their concepts, the same pipeline produces a
domain-adapted embedding model.
\end{abstract}
\section{Introduction}
The increasing volume of mathematical literature makes automated retrieval
tools indispensable for researchers. A common approach is
\emph{retrieval-augmented generation} (RAG): chunk papers into passages, embed
them in a vector space, and retrieve relevant passages via nearest-neighbor
search over embeddings. The quality of retrieval depends critically on the
embedding model's ability to capture \emph{mathematical semantic similarity}
--- the idea that a query like ``Rogers--Ramanujan identities'' should retrieve
not only passages containing that exact phrase but also passages discussing
Bailey's lemma, $q$-series transformations, and partition identities.
General-purpose embedding models are trained on broad web text and lack this
kind of domain knowledge. Scientific embedding models such as SPECTER2
\cite{specter2} and SciNCL \cite{scincl} are trained on citation graphs from
Semantic Scholar, but mathematics is underrepresented in their training data,
and they are optimized for \emph{paper-to-paper} similarity rather than
\emph{concept-to-passage} retrieval.
We address this gap by fine-tuning an embedding model specifically for
mathematical concept retrieval. Our key insight is that a \textbf{knowledge
graph} (KG) extracted from the corpus provides exactly the supervision signal
needed for contrastive learning:
\begin{itemize}[nosep]
\item Each KG concept (e.g., ``Macdonald polynomials'') maps to specific
papers, and hence to specific text chunks. These form
\emph{positive pairs} for contrastive training.
\item KG edges (e.g., ``Bailey's lemma \emph{generalizes}
Rogers--Ramanujan identities'') provide \emph{cross-concept
positives} that teach the model about mathematical relationships.
\item In-batch negatives from unrelated concepts provide the contrastive
signal automatically.
\end{itemize}
This paper makes the following contributions:
\begin{enumerate}[nosep]
\item A benchmark comparing general-purpose and scientific embedding
models on mathematical concept retrieval (Section~\ref{sec:benchmark}).
\item A method for automatically generating contrastive training data from
a knowledge graph (Section~\ref{sec:training-data}).
\item A fine-tuned embedding model that outperforms all baselines on our
benchmark (Section~\ref{sec:finetuning}).
\item An open-source pipeline\footnote{Code available at
\url{https://github.com/RaggedR/embeddings}. Model available at
\url{https://huggingface.co/RobBobin/math-embed}.} that can be applied to any
mathematical corpus with an associated knowledge graph.
\end{enumerate}
\section{Related Work}
\paragraph{Scientific document embeddings.}
SPECTER \cite{specter} introduced citation-based contrastive learning for
scientific document embeddings, training on (paper, cited paper, non-cited
paper) triplets. SPECTER2 \cite{specter2} extended this to 6 million citation
triplets across 23 fields of study and introduced task-specific adapters
(proximity, classification, regression). SciNCL \cite{scincl} improved on
SPECTER by using citation graph \emph{neighborhood} sampling for harder
negatives. All three models use SciBERT \cite{scibert} as their backbone and
produce 768-dimensional embeddings.
\paragraph{Mathematics-specific models.}
MathBERT \cite{mathbert} pre-trained BERT on mathematical curricula and arXiv
abstracts, but only with masked language modeling --- it was not contrastively
trained for retrieval. To our knowledge, no widely adopted embedding model is
specifically trained for mathematical semantic similarity.
\paragraph{Contrastive fine-tuning.}
The sentence-transformers framework \cite{sbert} provides
\texttt{MultipleNegativesRankingLoss} (MNRL), which treats all other examples
in a batch as negatives. Matryoshka Representation Learning \cite{matryoshka}
trains embeddings so that any prefix of the full vector is itself a useful
embedding, enabling flexible dimensionality--quality tradeoffs at inference.
\section{Data}
\label{sec:data}
\subsection{Corpus}
Our corpus consists of 75 research papers in algebraic combinatorics,
$q$-series, and related areas, sourced from arXiv. Papers are chunked into
passages of up to 1,500 characters with 200-character overlap, yielding
\textbf{4,794 chunks}. The chunks are stored in a ChromaDB vector database
with embeddings from OpenAI's \texttt{text-embedding-3-small} (1536-dim).
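For reference, a minimal Python sketch of this character-window chunking (the
function name and boundary handling are ours; the production pipeline may
additionally respect paragraph boundaries):
\begin{verbatim}
def chunk_text(text: str, size: int = 1500, overlap: int = 200) -> list[str]:
    """Split a document into overlapping character windows.

    With size=1500 and overlap=200, consecutive chunks share
    200 characters, matching the corpus construction above.
    """
    step = size - overlap  # 1300 characters of fresh text per chunk
    return [text[i:i + size]
            for i in range(0, max(len(text) - overlap, 1), step)]
\end{verbatim}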
\subsection{Knowledge graph}
A knowledge graph was constructed by having GPT-4o-mini extract concepts and
relationships from representative chunks of each paper (the first two and the
last two) \cite{kg-extraction}; a sketch of the extraction call appears after
the list below. After normalization and deduplication, the KG contains:
\begin{itemize}[nosep]
\item \textbf{559 concepts} (218 objects, 92 theorems, 77 definitions,
56 techniques, 28 persons, 26 formulas, 25 identities, 11
conjectures, and others)
\item \textbf{486 edges} with typed relationships (\emph{related\_to}:
110, \emph{uses}: 78, \emph{generalizes}: 54,
\emph{is\_instance\_of}: 45, \emph{implies}: 40, \emph{defines}: 39,
and others)
\item Coverage of all 75 papers
\end{itemize}
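For concreteness, a Python sketch of the extraction call is given below; the
prompt and JSON schema are illustrative stand-ins, not the exact ones used in
\cite{kg-extraction}:
\begin{verbatim}
import json
from openai import OpenAI

client = OpenAI()

PROMPT = ("Extract mathematical concepts and typed relationships from "
          "the passage. Return JSON with keys 'concepts' (name, type, "
          "description) and 'edges' (source, relation, target).")

def extract_kg(chunk: str) -> dict:
    """One call per representative chunk (first/last two per paper)."""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": PROMPT},
                  {"role": "user", "content": chunk}],
        response_format={"type": "json_object"},  # force valid JSON
    )
    return json.loads(resp.choices[0].message.content)
\end{verbatim}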
\section{Benchmark}
\label{sec:benchmark}
\subsection{Ground truth construction}
We construct a retrieval benchmark from the KG. For each concept $c$ matched
to at least $\text{min\_degree} = 2$ papers in the corpus, we form:
\begin{itemize}[nosep]
\item \textbf{Query}: the concept's display name (e.g., ``Rogers--Ramanujan
identities'')
\item \textbf{Relevant documents}: all chunks from the concept's source
papers
\end{itemize}
This yields \textbf{108 queries}. The ground truth is approximate --- not
every chunk in a relevant paper directly discusses the concept --- but this
bias is consistent across models, making relative comparisons valid.
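A Python sketch of this construction, assuming simple dictionary shapes for
the KG and corpus (the shapes and names are ours):
\begin{verbatim}
def build_benchmark(concepts: dict, chunk_to_paper: dict,
                    min_degree: int = 2) -> list[dict]:
    """One query per concept with >= min_degree matched papers.

    concepts:       concept_id -> {"name": str, "papers": set}
    chunk_to_paper: chunk_id -> paper_id
    """
    corpus_papers = set(chunk_to_paper.values())
    queries = []
    for c in concepts.values():
        matched = c["papers"] & corpus_papers
        if len(matched) < min_degree:
            continue  # skip thinly supported concepts
        relevant = {cid for cid, pid in chunk_to_paper.items()
                    if pid in matched}
        queries.append({"query": c["name"], "relevant": relevant})
    return queries
\end{verbatim}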
\subsection{Metrics}
We report:
\begin{itemize}[nosep]
    \item \textbf{MRR} (Mean Reciprocal Rank): the mean, over queries, of the
          reciprocal rank of the first relevant result.
\item \textbf{NDCG@$k$} (Normalized Discounted Cumulative Gain): measures
ranking quality with position-dependent discounting.
\item \textbf{Recall@$k$}: fraction of relevant documents retrieved in the
top $k$. Note that Recall@$k$ appears low because relevant sets are
large (often 100+ chunks per concept); MRR and NDCG are the
meaningful comparison metrics.
\end{itemize}
All metrics are computed using a Rust implementation with rayon parallelism
for batch kNN and metric aggregation \cite{rust-metrics}.
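The Rust code is not reproduced here; the following Python sketch states the
binary-relevance definitions we report (MRR is the mean of
\texttt{reciprocal\_rank} over all 108 queries):
\begin{verbatim}
import math

def reciprocal_rank(ranked: list, relevant: set) -> float:
    """1/rank of the first relevant hit; 0 if none retrieved."""
    for i, doc in enumerate(ranked, start=1):
        if doc in relevant:
            return 1.0 / i
    return 0.0

def recall_at_k(ranked: list, relevant: set, k: int) -> float:
    return len(set(ranked[:k]) & relevant) / len(relevant)

def ndcg_at_k(ranked: list, relevant: set, k: int) -> float:
    """Binary-relevance NDCG with log2 position discounting."""
    dcg = sum(1.0 / math.log2(i + 1)
              for i, doc in enumerate(ranked[:k], start=1)
              if doc in relevant)
    idcg = sum(1.0 / math.log2(i + 1)
               for i in range(1, min(k, len(relevant)) + 1))
    return dcg / idcg if idcg else 0.0
\end{verbatim}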
\subsection{Baseline results}
\begin{table}[h]
\centering
\caption{Baseline embedding model comparison on mathematical concept retrieval.
All models evaluated on 108 queries over 4,794 chunks. Best results in bold.}
\label{tab:baselines}
\begin{tabular}{lcccccc}
\toprule
Model & Dim & R@5 & R@10 & R@20 & MRR & NDCG@10 \\
\midrule
\texttt{openai-small} & 1536 & 0.010 & 0.019 & 0.037 & \textbf{0.461} & \textbf{0.324} \\
SPECTER2 (proximity) & 768 & 0.007 & 0.013 & 0.024 & 0.360 & 0.225 \\
SciNCL & 768 & 0.006 & 0.012 & 0.024 & 0.306 & 0.205 \\
\bottomrule
\end{tabular}
\end{table}
The general-purpose OpenAI model outperforms both scientific models by a wide
margin (28\% higher MRR than SPECTER2, 51\% higher than SciNCL). This is
notable because SPECTER2 was trained on 6 million scientific citation triplets
--- yet it underperforms a model with no scientific specialization. We
attribute this to two factors:
\begin{enumerate}[nosep]
\item \textbf{Dimensionality}: OpenAI's 1536-dim space has more capacity
than the 768-dim BERT-based models.
\item \textbf{Task mismatch}: SPECTER2 and SciNCL were trained for
paper-to-paper similarity (title + abstract), not concept-to-chunk
retrieval. A query like ``Rogers--Ramanujan identities'' is not a
paper title --- it is a mathematical concept name, and retrieving
relevant passages requires understanding what that concept means.
\end{enumerate}
\section{Training Data from Knowledge Graphs}
\label{sec:training-data}
We generate contrastive training data automatically from the KG and corpus.
\subsection{Direct pairs}
For each concept $c$ with papers $P_1, \ldots, P_m$ in the KG, and each
paper $P_j$ with chunks $\{d_{j,1}, \ldots, d_{j,n_j}\}$ in the corpus:
\begin{align}
\text{Pairs}_{\text{name}}(c) &= \{(\texttt{name}(c),\; d_{j,k}) :
j \in [m],\; k \in [n_j]\} \\
\text{Pairs}_{\text{desc}}(c) &= \{(\texttt{desc}(c),\; d_{j,k}) :
j \in [m],\; k \in [n_j]\}
\end{align}
Using both the concept name and its description as anchors provides anchor
diversity: short anchors (e.g., ``Macdonald polynomials'') train exact-match
retrieval, while longer descriptions (e.g., ``A family of orthogonal
symmetric polynomials generalizing Schur functions'') train paraphrase
retrieval.
We cap at 20 chunks per concept to prevent over-representation of
high-degree concepts.
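In code, the direct-pair construction reads as follows (the data shapes are
assumed; the slice implements the 20-chunk cap):
\begin{verbatim}
def direct_pairs(concept: dict, chunks_by_paper: dict,
                 cap: int = 20) -> list[tuple[str, str]]:
    """(anchor, chunk) positives from both name and description."""
    chunks = [d for p in concept["papers"]
                for d in chunks_by_paper.get(p, [])][:cap]
    pairs = [(concept["name"], d) for d in chunks]
    if concept.get("description"):  # description anchor, when present
        pairs += [(concept["description"], d) for d in chunks]
    return pairs
\end{verbatim}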
\subsection{Edge pairs}
For each edge $(c_1, c_2, r)$ in the KG with relation $r$ (e.g.,
\emph{generalizes}, \emph{uses}):
\begin{equation}
\text{Pairs}_{\text{edge}}(c_1, c_2) = \{(\texttt{name}(c_1),\; d) :
d \in \text{chunks}(c_2)\} \cup \{(\texttt{name}(c_2),\; d) :
d \in \text{chunks}(c_1)\}
\end{equation}
These cross-concept pairs teach the model that mathematically related concepts
should embed nearby. For example, if ``Bailey's lemma'' \emph{generalizes}
``Rogers--Ramanujan identities,'' then chunks about Rogers--Ramanujan should
be somewhat relevant to queries about Bailey's lemma.
We cap at 5 chunks per edge direction to prevent edge pairs from dominating
the dataset.
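The corresponding sketch for edge pairs, with the 5-chunk cap applied per
direction (argument names are ours):
\begin{verbatim}
def edge_pairs(name1: str, name2: str, chunks1: list, chunks2: list,
               cap: int = 5) -> list[tuple[str, str]]:
    """Symmetric cross-concept positives for one typed edge."""
    return ([(name1, d) for d in chunks2[:cap]]   # name(c1) vs chunks(c2)
          + [(name2, d) for d in chunks1[:cap]])  # name(c2) vs chunks(c1)
\end{verbatim}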
\subsection{Dataset statistics}
\begin{table}[h]
\centering
\caption{Training dataset statistics.}
\label{tab:dataset}
\begin{tabular}{lr}
\toprule
Direct pairs (concept $\to$ chunk) & 21,544 \\
Edge pairs (cross-concept) & 4,855 \\
Total unique pairs & 25,121 \\
Training set (90\%) & 22,609 \\
Validation set (10\%) & 2,512 \\
Unique anchors & 1,114 \\
\bottomrule
\end{tabular}
\end{table}
\section{Fine-Tuning}
\label{sec:finetuning}
\subsection{Method}
We fine-tune the SPECTER2 base model (\texttt{allenai/specter2\_base},
768-dim, SciBERT backbone) using the sentence-transformers framework
\cite{sbert}. Despite SPECTER2's poor off-the-shelf performance on our
benchmark, its pre-training on 6 million scientific citation triplets provides
a strong initialization for mathematical text --- the model already understands
scientific language structure, and we teach it mathematical concept semantics
on top.
\paragraph{Loss function.}
We use \texttt{MultipleNegativesRankingLoss} (MNRL) wrapped in
\texttt{MatryoshkaLoss}. MNRL treats all other examples in a batch as
negatives, providing $B(B-1)$ negative comparisons per batch of size $B$
without explicit negative mining. MatryoshkaLoss computes the same contrastive
loss at multiple embedding truncation points (768, 512, 256, 128 dimensions),
training the model to frontload important information into the first
dimensions.
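In the sentence-transformers API this is a two-line wrapping; a minimal
sketch (loading SPECTER2 through \texttt{SentenceTransformer} attaches a
mean-pooling head by default):
\begin{verbatim}
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import (
    MatryoshkaLoss, MultipleNegativesRankingLoss)

model = SentenceTransformer("allenai/specter2_base")
model.max_seq_length = 256  # truncate longer chunks

# MNRL over in-batch negatives, evaluated at four truncation points.
inner = MultipleNegativesRankingLoss(model)
loss = MatryoshkaLoss(model, inner,
                      matryoshka_dims=[768, 512, 256, 128])
\end{verbatim}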
\paragraph{Training details.}
\begin{itemize}[nosep]
\item Micro-batch size: 8, with gradient accumulation over 4 steps
(effective batch size 32, yielding 56 in-batch negative comparisons
per micro-batch)
\item Max sequence length: 256 tokens (truncating longer chunks)
\item Learning rate: $2 \times 10^{-5}$ with 10\% linear warmup
\item Epochs: 3 (2,118 optimization steps)
\item Duplicate-free batch sampling to maximize negative diversity
\item Final model selected after epoch 3 (training loss converged
from $\sim$11 to $\sim$5)
\item Hardware: Apple M-series GPU (MPS backend), $\sim$4 hours wall time
\end{itemize}
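These settings map directly onto the sentence-transformers training
arguments. A sketch continuing the loss construction above, where
\texttt{pairs} is the anchor--positive list from
Section~\ref{sec:training-data}:
\begin{verbatim}
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformerTrainer, SentenceTransformerTrainingArguments)
from sentence_transformers.training_args import BatchSamplers

args = SentenceTransformerTrainingArguments(
    output_dir="math-embed",
    num_train_epochs=3,
    per_device_train_batch_size=8,   # micro-batch size
    gradient_accumulation_steps=4,   # effective batch size 32
    learning_rate=2e-5,
    warmup_ratio=0.1,                # 10% linear warmup
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # duplicate-free batches
)

train_ds = Dataset.from_dict({
    "anchor":   [a for a, _ in pairs],
    "positive": [d for _, d in pairs],
})
SentenceTransformerTrainer(model=model, args=args,
                           train_dataset=train_ds, loss=loss).train()
\end{verbatim}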
\subsection{Results}
\begin{table}[h]
\centering
\caption{Final comparison including fine-tuned model. All models evaluated
on 108 queries over 4,794 chunks. Best results in bold.}
\label{tab:final}
\begin{tabular}{lcccccc}
\toprule
Model & Dim & R@5 & R@10 & R@20 & MRR & NDCG@10 \\
\midrule
\texttt{openai-small} & 1536 & 0.010 & 0.019 & 0.037 & 0.461 & 0.324 \\
SPECTER2 (proximity) & 768 & 0.007 & 0.013 & 0.024 & 0.360 & 0.225 \\
SciNCL & 768 & 0.006 & 0.012 & 0.024 & 0.306 & 0.205 \\
\midrule
Math-Embed (ours) & 768 & \textbf{0.030} & \textbf{0.058} & \textbf{0.111} & \textbf{0.816} & \textbf{0.736} \\
\bottomrule
\end{tabular}
\end{table}
Our fine-tuned model outperforms all baselines by a wide margin.
MRR improves from 0.461 (OpenAI) to \textbf{0.816} --- a 77\% relative
improvement. Since the typical rank of the first relevant result is roughly
$1/\text{MRR}$, that first hit now appears near rank $1/0.816 \approx 1.2$
rather than $1/0.461 \approx 2.2$. NDCG@10 more than doubles from 0.324 to
0.736, and Recall@20 triples from 0.037 to 0.111.
Remarkably, the fine-tuned model uses half the embedding dimensions (768
vs.\ 1536) of the OpenAI model yet dramatically outperforms it. The same
base model (SPECTER2) that scored worst among baselines (MRR 0.360) becomes
the best performer after fine-tuning --- a 127\% improvement from the same
architecture with no additional parameters, demonstrating that the
knowledge-graph-derived training signal is highly effective.
\section{Discussion}
\subsection{Why general-purpose models fail at math}
The poor performance of SPECTER2 and SciNCL --- models explicitly trained on
scientific literature --- highlights that \emph{scientific} training is not
the same as \emph{mathematical} training. These models learn paper-level
similarity from citation patterns: ``paper A cites paper B, so they should
embed nearby.'' But mathematical retrieval requires a different kind of
similarity: understanding that the text ``$\sum_{n=0}^{\infty}
\frac{q^{n^2}}{(q;q)_n}$'' is about the Rogers--Ramanujan identities, even
though it contains no occurrence of that phrase.
Standard tokenizers (BERT WordPiece) fragment mathematical notation into
meaningless subwords. Fine-tuning cannot fix the tokenizer, but it can teach
the model that certain patterns of subword tokens, when they appear together,
carry specific mathematical meaning.
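The fragmentation is easy to observe directly; the token sequence shown in
the comment is illustrative of the behavior rather than a verbatim trace:
\begin{verbatim}
from transformers import AutoTokenizer

# SciBERT's WordPiece vocabulary, shared by SPECTER2 and SciNCL.
tok = AutoTokenizer.from_pretrained("allenai/specter2_base")
print(tok.tokenize(r"\sum_{n=0}^{\infty} \frac{q^{n^2}}{(q;q)_n}"))
# -> a long run of single-character pieces ('\', 'sum', '_', '{',
#    'n', '=', '0', ...) with no token for the q-series as a whole.
\end{verbatim}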
\subsection{Knowledge graphs as supervision}
Our approach requires a knowledge graph, which itself requires an LLM
extraction step (GPT-4o-mini in our case). This may seem circular --- we use
an LLM to generate training data for a different model. But the key insight is
that these are \emph{complementary capabilities}:
\begin{itemize}[nosep]
\item The LLM excels at \emph{reading individual passages} and extracting
structured information (concepts, relationships), but is too slow
and expensive for real-time retrieval over thousands of chunks.
\item The embedding model excels at \emph{fast similarity search} over
large corpora, but needs training data to learn domain-specific
semantics.
\end{itemize}
The KG is a one-time cost that distills the LLM's understanding into a
reusable supervision signal.
\subsection{Generalizability}
The pipeline is not specific to algebraic combinatorics. Given:
\begin{enumerate}[nosep]
\item A corpus of mathematical papers (any subfield)
\item A knowledge graph over their concepts (extractable by LLM)
\end{enumerate}
the same code produces a domain-adapted embedding model. The fine-tuned model
should generalize to new papers in the same mathematical area, since it learns
\emph{concept semantics} rather than memorizing specific passages.
\section{Conclusion}
We demonstrated that general-purpose and scientific embedding models perform
poorly on mathematical concept retrieval, and presented a pipeline that
automatically generates contrastive training data from a knowledge graph to
fine-tune a domain-specific embedding model. Our approach requires no manual
annotation --- the knowledge graph provides the supervision signal --- and
produces a portable model that can be deployed in any RAG system for
mathematical literature.
Future work includes: (1) scaling to larger mathematical corpora spanning
multiple subfields, (2) incorporating mathematical notation awareness into
the tokenizer, and (3) exploring whether the fine-tuned model's understanding
of mathematical relationships transfers across subfields.
\begin{thebibliography}{10}
\bibitem{specter}
A.~Cohan, S.~Feldman, I.~Beltagy, D.~Downey, and D.~S.~Weld,
``SPECTER: Document-level representation learning using citation-informed
transformers,'' in \emph{Proc.\ ACL}, 2020.
\bibitem{specter2}
A.~Singh, M.~D'Arcy, A.~Cohan, D.~Downey, and S.~Feldman,
``SciRepEval: A multi-format benchmark for scientific document
representations,'' in \emph{Proc.\ EMNLP}, 2023.
\bibitem{scincl}
M.~Ostendorff, N.~Rethmeier, I.~Augenstein, B.~Gipp, and G.~Rehm,
``Neighborhood contrastive learning for scientific document
representations with citation embeddings,'' in \emph{Proc.\ EMNLP}, 2022.
\bibitem{scibert}
I.~Beltagy, K.~Lo, and A.~Cohan,
``SciBERT: A pretrained language model for scientific text,'' in
\emph{Proc.\ EMNLP}, 2019.
\bibitem{mathbert}
S.~Peng, K.~Yuan, L.~Gao, and Z.~Tang,
``MathBERT: A pre-trained model for mathematical formula understanding,''
\emph{arXiv:2105.00377}, 2021.
\bibitem{sbert}
N.~Reimers and I.~Gurevych,
``Sentence-BERT: Sentence embeddings using Siamese BERT-networks,'' in
\emph{Proc.\ EMNLP}, 2019.
\bibitem{matryoshka}
A.~Kusupati, G.~Bhatt, A.~Rege, M.~Wallingford, A.~Sinha, V.~Ramanujan,
W.~Howard-Snyder, K.~Chen, S.~Kakade, P.~Jain, and A.~Farhadi,
``Matryoshka representation learning,'' in \emph{Proc.\ NeurIPS}, 2022.
\bibitem{kg-extraction}
Knowledge graph extraction via LLM-based concept and relationship
identification from scientific text, internal methodology.
\bibitem{rust-metrics}
Custom Rust implementation of batch kNN and IR metrics (Recall@$k$, MRR,
NDCG@$k$) with rayon parallelism and PyO3 Python bindings.
\end{thebibliography}
\end{document}