Spaces:
Sleeping
Sleeping
Commit
·
3ecd1bf
1
Parent(s):
23bdcc2
Add welcome screen, scroll to highlight, lennart version
Browse files
frontend/src/components/DocumentProcessor.jsx
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/src/components/DocumentViewer.jsx
CHANGED
|
@@ -46,18 +46,42 @@ const MyHighlightContainer = () => {
|
|
| 46 |
return component;
|
| 47 |
};
|
| 48 |
|
| 49 |
-
const DocumentViewer = ({ selectedFile, documentData, onPageChange, preloadedHighlights = null,
|
| 50 |
const [highlights, setHighlights] = useState([]);
|
| 51 |
const [pdfUrl, setPdfUrl] = useState(null);
|
| 52 |
|
| 53 |
/** Refs for PdfHighlighter utilities */
|
| 54 |
const highlighterUtilsRef = useRef();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
// Utility function to normalize highlight data
|
| 57 |
const normalizeHighlight = (highlightData) => {
|
| 58 |
// Ensure the highlight has the required structure
|
| 59 |
if (!highlightData.id || !highlightData.position || !highlightData.content) {
|
| 60 |
-
console.warn('Invalid highlight data:', highlightData);
|
| 61 |
return null;
|
| 62 |
}
|
| 63 |
|
|
@@ -83,19 +107,19 @@ const DocumentViewer = ({ selectedFile, documentData, onPageChange, preloadedHig
|
|
| 83 |
}
|
| 84 |
}, [selectedFile]);
|
| 85 |
|
| 86 |
-
// Load preloaded highlights when component mounts or when
|
| 87 |
useEffect(() => {
|
| 88 |
if (preloadedHighlights) {
|
| 89 |
let highlightsToLoad = [];
|
| 90 |
|
| 91 |
-
if (
|
| 92 |
-
// Load highlights for specific
|
| 93 |
-
highlightsToLoad = preloadedHighlights[
|
| 94 |
} else if (Array.isArray(preloadedHighlights)) {
|
| 95 |
// Load all highlights if it's an array
|
| 96 |
highlightsToLoad = preloadedHighlights;
|
| 97 |
} else if (typeof preloadedHighlights === 'object') {
|
| 98 |
-
// If it's an object without
|
| 99 |
highlightsToLoad = Object.values(preloadedHighlights).flat();
|
| 100 |
}
|
| 101 |
|
|
@@ -104,13 +128,24 @@ const DocumentViewer = ({ selectedFile, documentData, onPageChange, preloadedHig
|
|
| 104 |
.map(normalizeHighlight)
|
| 105 |
.filter(Boolean);
|
| 106 |
|
| 107 |
-
console.log(`🎨 Loading ${validHighlights.length} preloaded highlights${
|
| 108 |
setHighlights(validHighlights);
|
| 109 |
} else {
|
| 110 |
// Clear highlights if no preloaded data
|
| 111 |
setHighlights([]);
|
| 112 |
}
|
| 113 |
-
}, [preloadedHighlights,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
// Handle selection - log coordinates and add debugging
|
| 116 |
const handleSelection = (selection) => {
|
|
@@ -159,6 +194,7 @@ const DocumentViewer = ({ selectedFile, documentData, onPageChange, preloadedHig
|
|
| 159 |
pdfDocument={pdfDocument}
|
| 160 |
utilsRef={(_pdfHighlighterUtils) => {
|
| 161 |
highlighterUtilsRef.current = _pdfHighlighterUtils;
|
|
|
|
| 162 |
}}
|
| 163 |
highlights={highlights}
|
| 164 |
onSelection={handleSelection}
|
|
|
|
| 46 |
return component;
|
| 47 |
};
|
| 48 |
|
| 49 |
+
const DocumentViewer = ({ selectedFile, documentData, onPageChange, preloadedHighlights = null, currentChunkIndex = null, onDocumentReady = null }) => {
|
| 50 |
const [highlights, setHighlights] = useState([]);
|
| 51 |
const [pdfUrl, setPdfUrl] = useState(null);
|
| 52 |
|
| 53 |
/** Refs for PdfHighlighter utilities */
|
| 54 |
const highlighterUtilsRef = useRef();
|
| 55 |
+
const documentReadyCalledRef = useRef(false);
|
| 56 |
+
|
| 57 |
+
// Function to scroll to a specific chunk's highlight
|
| 58 |
+
const scrollToChunk = (chunkIndex) => {
|
| 59 |
+
if (highlighterUtilsRef.current && preloadedHighlights) {
|
| 60 |
+
const chunkHighlights = preloadedHighlights[chunkIndex];
|
| 61 |
+
if (chunkHighlights && chunkHighlights.length > 0) {
|
| 62 |
+
const firstHighlightInChunk = chunkHighlights[0];
|
| 63 |
+
highlighterUtilsRef.current.scrollToHighlight(firstHighlightInChunk);
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
};
|
| 67 |
+
|
| 68 |
+
// Function to scroll to the first highlight (for backwards compatibility)
|
| 69 |
+
const scrollToFirstChunk = () => {
|
| 70 |
+
scrollToChunk(0);
|
| 71 |
+
};
|
| 72 |
+
|
| 73 |
+
// Call onDocumentReady only once when utils become available
|
| 74 |
+
const callOnDocumentReady = () => {
|
| 75 |
+
if (onDocumentReady && !documentReadyCalledRef.current && highlighterUtilsRef.current) {
|
| 76 |
+
documentReadyCalledRef.current = true;
|
| 77 |
+
onDocumentReady({ scrollToFirstChunk });
|
| 78 |
+
}
|
| 79 |
+
};
|
| 80 |
|
| 81 |
// Utility function to normalize highlight data
|
| 82 |
const normalizeHighlight = (highlightData) => {
|
| 83 |
// Ensure the highlight has the required structure
|
| 84 |
if (!highlightData.id || !highlightData.position || !highlightData.content) {
|
|
|
|
| 85 |
return null;
|
| 86 |
}
|
| 87 |
|
|
|
|
| 107 |
}
|
| 108 |
}, [selectedFile]);
|
| 109 |
|
| 110 |
+
// Load preloaded highlights when component mounts or when currentChunkIndex changes
|
| 111 |
useEffect(() => {
|
| 112 |
if (preloadedHighlights) {
|
| 113 |
let highlightsToLoad = [];
|
| 114 |
|
| 115 |
+
if (currentChunkIndex !== null && currentChunkIndex !== undefined && preloadedHighlights[currentChunkIndex]) {
|
| 116 |
+
// Load highlights for specific chunk
|
| 117 |
+
highlightsToLoad = preloadedHighlights[currentChunkIndex];
|
| 118 |
} else if (Array.isArray(preloadedHighlights)) {
|
| 119 |
// Load all highlights if it's an array
|
| 120 |
highlightsToLoad = preloadedHighlights;
|
| 121 |
} else if (typeof preloadedHighlights === 'object') {
|
| 122 |
+
// If it's an object without chunkIndex, take all values
|
| 123 |
highlightsToLoad = Object.values(preloadedHighlights).flat();
|
| 124 |
}
|
| 125 |
|
|
|
|
| 128 |
.map(normalizeHighlight)
|
| 129 |
.filter(Boolean);
|
| 130 |
|
| 131 |
+
console.log(`🎨 Loading ${validHighlights.length} preloaded highlights${currentChunkIndex !== null ? ` for chunk ${currentChunkIndex}` : ''}`);
|
| 132 |
setHighlights(validHighlights);
|
| 133 |
} else {
|
| 134 |
// Clear highlights if no preloaded data
|
| 135 |
setHighlights([]);
|
| 136 |
}
|
| 137 |
+
}, [preloadedHighlights, currentChunkIndex]);
|
| 138 |
+
|
| 139 |
+
// Auto-scroll to current chunk when currentChunkIndex changes (only on navigation, not during streaming)
|
| 140 |
+
useEffect(() => {
|
| 141 |
+
// Only auto-scroll if we have highlighter utils and this is a valid chunk navigation
|
| 142 |
+
if (highlighterUtilsRef.current && currentChunkIndex !== null && currentChunkIndex !== undefined && currentChunkIndex >= 0) {
|
| 143 |
+
// Small delay to ensure highlights are loaded
|
| 144 |
+
setTimeout(() => {
|
| 145 |
+
scrollToChunk(currentChunkIndex);
|
| 146 |
+
}, 200);
|
| 147 |
+
}
|
| 148 |
+
}, [currentChunkIndex]); // Only depend on currentChunkIndex, not preloadedHighlights
|
| 149 |
|
| 150 |
// Handle selection - log coordinates and add debugging
|
| 151 |
const handleSelection = (selection) => {
|
|
|
|
| 194 |
pdfDocument={pdfDocument}
|
| 195 |
utilsRef={(_pdfHighlighterUtils) => {
|
| 196 |
highlighterUtilsRef.current = _pdfHighlighterUtils;
|
| 197 |
+
callOnDocumentReady();
|
| 198 |
}}
|
| 199 |
highlights={highlights}
|
| 200 |
onSelection={handleSelection}
|
frontend/src/components/WelcomeScreen.jsx
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from 'react';
|
| 2 |
+
|
| 3 |
+
const WelcomeScreen = ({ onGetStarted }) => {
|
| 4 |
+
return (
|
| 5 |
+
<div className="h-full flex flex-col items-center justify-center p-8 bg-gradient-to-br from-blue-50 to-indigo-100">
|
| 6 |
+
<div className="max-w-lg text-center space-y-6">
|
| 7 |
+
<div className="space-y-4">
|
| 8 |
+
<h1 className="text-4xl font-bold text-gray-900">
|
| 9 |
+
Welcome to SokratesAI
|
| 10 |
+
</h1>
|
| 11 |
+
<p className="text-lg text-gray-600 leading-relaxed">
|
| 12 |
+
Master complex documents without the overwhelm.
|
| 13 |
+
Your document becomes your tutor, questioning you to deepen understanding.
|
| 14 |
+
</p>
|
| 15 |
+
</div>
|
| 16 |
+
|
| 17 |
+
<div className="space-y-6">
|
| 18 |
+
<div className="text-sm text-gray-700">
|
| 19 |
+
<h3 className="font-semibold text-gray-900 mb-3">How it works:</h3>
|
| 20 |
+
<div className="space-y-2">
|
| 21 |
+
<div className="flex items-start space-x-3">
|
| 22 |
+
<div className="w-2 h-2 bg-blue-500 rounded-full mt-1.5"></div>
|
| 23 |
+
<span>Document appears highlighted in digestible sections</span>
|
| 24 |
+
</div>
|
| 25 |
+
<div className="flex items-start space-x-3">
|
| 26 |
+
<div className="w-2 h-2 bg-green-500 rounded-full mt-1.5"></div>
|
| 27 |
+
<span>AI tutor questions <em>you</em> about each chunk</span>
|
| 28 |
+
</div>
|
| 29 |
+
<div className="flex items-start space-x-3">
|
| 30 |
+
<div className="w-2 h-2 bg-purple-500 rounded-full mt-1.5"></div>
|
| 31 |
+
<span>Progress only when you truly understand</span>
|
| 32 |
+
</div>
|
| 33 |
+
</div>
|
| 34 |
+
</div>
|
| 35 |
+
|
| 36 |
+
<div className="text-sm text-gray-600 bg-gray-50 p-4 rounded-lg">
|
| 37 |
+
<div className="grid grid-cols-2 gap-4">
|
| 38 |
+
<div className="flex items-center space-x-2">
|
| 39 |
+
<div className="p-1.5 rounded-full bg-green-100">
|
| 40 |
+
<svg className="w-4 h-4 text-green-600" fill="currentColor" viewBox="0 0 20 20">
|
| 41 |
+
<path fillRule="evenodd" d="M16.707 5.293a1 1 0 010 1.414l-8 8a1 1 0 01-1.414 0l-4-4a1 1 0 011.414-1.414L8 12.586l7.293-7.293a1 1 0 011.414 0z" clipRule="evenodd" />
|
| 42 |
+
</svg>
|
| 43 |
+
</div>
|
| 44 |
+
<span>Master current topic</span>
|
| 45 |
+
</div>
|
| 46 |
+
<div className="flex items-center space-x-2">
|
| 47 |
+
<div className="p-1.5 rounded-full bg-gray-100">
|
| 48 |
+
<svg className="w-4 h-4 text-gray-600" fill="currentColor" viewBox="0 0 20 20">
|
| 49 |
+
<path fillRule="evenodd" d="M7.293 14.707a1 1 0 010-1.414L10.586 10 7.293 6.707a1 1 0 011.414-1.414l4 4a1 1 0 010 1.414l-4 4a1 1 0 01-1.414 0z" clipRule="evenodd" />
|
| 50 |
+
<path fillRule="evenodd" d="M12.293 14.707a1 1 0 010-1.414L15.586 10l-3.293-3.293a1 1 0 011.414-1.414l4 4a1 1 0 010 1.414l-4 4a1 1 0 01-1.414 0z" clipRule="evenodd" />
|
| 51 |
+
</svg>
|
| 52 |
+
</div>
|
| 53 |
+
<span>Focus elsewhere</span>
|
| 54 |
+
</div>
|
| 55 |
+
<div className="flex items-center space-x-2">
|
| 56 |
+
<div className="p-1.5 rounded-full bg-gray-100">
|
| 57 |
+
<svg className="w-4 h-4 text-gray-600" fill="currentColor" viewBox="0 0 20 20">
|
| 58 |
+
<path fillRule="evenodd" d="M12.707 5.293a1 1 0 010 1.414L9.414 10l3.293 3.293a1 1 0 01-1.414 1.414l-4-4a1 1 0 010-1.414l4-4a1 1 0 011.414 0z" clipRule="evenodd" />
|
| 59 |
+
</svg>
|
| 60 |
+
</div>
|
| 61 |
+
<span>Review previous sections</span>
|
| 62 |
+
</div>
|
| 63 |
+
<div className="flex items-center space-x-2">
|
| 64 |
+
<div className="w-6 h-2 bg-blue-200 rounded-full overflow-hidden">
|
| 65 |
+
<div className="w-2/3 h-full bg-blue-500 rounded-full"></div>
|
| 66 |
+
</div>
|
| 67 |
+
<span>Track your journey</span>
|
| 68 |
+
</div>
|
| 69 |
+
</div>
|
| 70 |
+
</div>
|
| 71 |
+
</div>
|
| 72 |
+
|
| 73 |
+
<button
|
| 74 |
+
onClick={onGetStarted}
|
| 75 |
+
className="bg-blue-600 hover:bg-blue-700 text-white font-semibold py-4 px-8 rounded-lg transition-all duration-200 transform hover:scale-105 shadow-lg hover:shadow-xl"
|
| 76 |
+
>
|
| 77 |
+
Let's Start
|
| 78 |
+
</button>
|
| 79 |
+
</div>
|
| 80 |
+
</div>
|
| 81 |
+
);
|
| 82 |
+
};
|
| 83 |
+
|
| 84 |
+
export default WelcomeScreen;
|
frontend/src/hooks/useDocumentProcessor.js
CHANGED
|
@@ -47,119 +47,44 @@ export const useDocumentProcessor = () => {
|
|
| 47 |
// Use hardcoded chunks for the document
|
| 48 |
const hardcodedChunks = [
|
| 49 |
{
|
| 50 |
-
"topic": "The
|
| 51 |
-
"text": "
|
| 52 |
-
"page": 2
|
| 53 |
},
|
| 54 |
{
|
| 55 |
-
"topic": "The
|
| 56 |
-
"text": "
|
| 57 |
-
"page": 2
|
| 58 |
},
|
| 59 |
{
|
| 60 |
-
"topic": "The
|
| 61 |
-
"text": "
|
| 62 |
-
"page": 2
|
| 63 |
},
|
| 64 |
{
|
| 65 |
-
"topic": "
|
| 66 |
-
"text": "
|
| 67 |
-
"page": 2
|
| 68 |
},
|
| 69 |
{
|
| 70 |
-
"topic": "
|
| 71 |
-
"text": "
|
| 72 |
-
"page": 2
|
| 73 |
},
|
| 74 |
{
|
| 75 |
-
"topic": "
|
| 76 |
-
"text": "
|
| 77 |
-
"page": 2
|
| 78 |
},
|
| 79 |
{
|
| 80 |
-
"topic": "
|
| 81 |
-
"text": "
|
| 82 |
-
"page": 3
|
| 83 |
},
|
| 84 |
{
|
| 85 |
-
"topic": "
|
| 86 |
-
"text": "### 3.
|
| 87 |
-
"page": 3
|
| 88 |
},
|
| 89 |
{
|
| 90 |
-
"topic": "
|
| 91 |
-
"text": "###
|
| 92 |
-
"page": 4
|
| 93 |
},
|
| 94 |
{
|
| 95 |
-
"topic": "Why
|
| 96 |
-
"text": "The
|
| 97 |
-
"page": 4
|
| 98 |
-
},
|
| 99 |
-
{
|
| 100 |
-
"topic": "Innovation: Multi-Head Attention",
|
| 101 |
-
"text": "### 3.2.2 Multi-Head Attention\n\nInstead of performing a single attention function with $d_{\\text{model}}$ -dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values $h$ times with different, learned linear projections to $d_k$ , $d_k$ and $d_v$ dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding $d_v$ -dimensional output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure [2.](#page-3-0)",
|
| 102 |
-
"page": 4
|
| 103 |
-
},
|
| 104 |
-
{
|
| 105 |
-
"topic": "The Power of Multi-Head Attention",
|
| 106 |
-
"text": "Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.\n\n$$\\begin{aligned} \\text{MultiHead}(Q, K, V) &= \\text{Concat}(\\text{head}_1, ..., \\text{head}_h)W^O \\\\ \\text{where } \\text{head}_i &= \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \\end{aligned}$$\n\nWhere the projections are parameter matrices W Q <sup>i</sup> <sup>∈</sup> <sup>R</sup> <sup>d</sup>model×d<sup>k</sup> , W <sup>K</sup> <sup>i</sup> ∈ R <sup>d</sup>model×d<sup>k</sup> , W<sup>V</sup> <sup>i</sup> ∈ R dmodel×d<sup>v</sup> and W<sup>O</sup> ∈ R hdv×dmodel .\n\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use d<sup>k</sup> = d<sup>v</sup> = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.",
|
| 107 |
-
"page": 5
|
| 108 |
-
},
|
| 109 |
-
{
|
| 110 |
-
"topic": "Three Uses of Attention in the Model",
|
| 111 |
-
"text": "### 3.2.3 Applications of Attention in our Model\n\nThe Transformer uses multi-head attention in three different ways:\n\n- In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [\\\[38,\\\](#page-11-1) [2,\\\](#page-9-0) [9\\\]](#page-10-9).\n- The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.\n- Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input of the softmax which correspond to illegal connections. See Figure [2.](#page-3-0)",
|
| 112 |
-
"page": 5
|
| 113 |
-
},
|
| 114 |
-
{
|
| 115 |
-
"topic": "The Role of Position-wise Feed-Forward Networks",
|
| 116 |
-
"text": "### 3.3 Position-wise Feed-Forward Networks\n\nIn addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.\n\n$$FFN(x) = \\max(0, xW_1 + b_1)W_2 + b_2 \\tag{2}$$\n\nWhile the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality df f = 2048.",
|
| 117 |
-
"page": 5
|
| 118 |
-
},
|
| 119 |
-
{
|
| 120 |
-
"topic": "Input/Output: Embeddings and Softmax",
|
| 121 |
-
"text": "### 3.4 Embeddings and Softmax\n\nSimilarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [\\\[30\\\]](#page-11-6). In the embedding layers, we multiply those weights by <sup>√</sup> dmodel.",
|
| 122 |
-
"page": 5
|
| 123 |
-
},
|
| 124 |
-
{
|
| 125 |
-
"topic": "Solving Sequence Order: Positional Encodings",
|
| 126 |
-
"text": "### 3.5 Positional Encoding\n\nSince our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension $d_{\\text{model}}$ as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [9].",
|
| 127 |
-
"page": 6
|
| 128 |
-
},
|
| 129 |
-
{
|
| 130 |
-
"topic": "The Sinusoidal Positional Encoding Function",
|
| 131 |
-
"text": "In this work, we use sine and cosine functions of different frequencies:\n\n$$PE_{(pos,2i)} = sin(pos/10000^{2i/d_{\\text{model}}})$$\n$$PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{\\text{model}}})$$\n\nwhere $pos$ is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from $2\\pi$ to $10000 \\cdot 2\\pi$ . We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset $k$ , $PE_{pos+k}$ can be represented as a linear function of $PE_{pos}$ .\n\nWe also experimented with using learned positional embeddings [9] instead, and found that the two versions produced nearly identical results (see Table 3 row $(\\bar{E})$ ). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.",
|
| 132 |
-
"page": 6
|
| 133 |
-
},
|
| 134 |
-
{
|
| 135 |
-
"topic": "Why Self-Attention? The Three Desiderata",
|
| 136 |
-
"text": "#### Why Self-Attention 4\n\nIn this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations $(x_1,...,x_n)$ to another sequence of equal length $(z_1,...,z_n)$ , with $x_i,z_i\\in\\mathbb{R}^d$ , such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.\n\nOne is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.\n\nThe third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [12]. Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types.",
|
| 137 |
-
"page": 6
|
| 138 |
-
},
|
| 139 |
-
{
|
| 140 |
-
"topic": "Comparing Layer Types by Key Metrics",
|
| 141 |
-
"text": "As noted in Table 1, a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires $O(n)$ sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence\n\n<span id=\"page-5-0\"></span><span id=\"page-5-0\"></span>Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations for different layer types. $n$ is the sequence length, $d$ is the representation dimension, $k$ is the kernel size of convolutions and $r$ the size of the neighborhood in restricted self-attention.\n\n| Layer Type | Complexity per Layer | Sequential<br>Operations | Maximum Path Length |\n|-----------------------------|--------------------------|--------------------------|---------------------|\n| Self-Attention | $O(n^2 \\cdot d)$ | O(1) | O(1) |\n| Recurrent | $O(n \\cdot d^2)$ | O(n) | O(n) |\n| Convolutional | $O(k \\cdot n \\cdot d^2)$ | O(1) | $O(log_k(n))$ |\n| Self-Attention (restricted) | $O(r \\cdot n \\cdot d)$ | $\\mathcal{O}(1)$ | O(n/r) |\n\nlength n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [\\\[38\\\]](#page-11-1) and byte-pair [\\\[31\\\]](#page-11-7) representations. To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work.",
|
| 142 |
-
"page": 6
|
| 143 |
-
},
|
| 144 |
-
{
|
| 145 |
-
"topic": "A Side Benefit: Interpretability",
|
| 146 |
-
"text": "As side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences.",
|
| 147 |
-
"page": 7
|
| 148 |
-
},
|
| 149 |
-
{
|
| 150 |
-
"topic": "Training Data, Batching, and Hardware",
|
| 151 |
-
"text": "### 5.1 Training Data and Batching\n\nWe trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [\\\[3\\\]](#page-9-3), which has a shared sourcetarget vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [\\\[38\\\]](#page-11-1). Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.\n\n### 5.2 Hardware and Schedule\n\nWe trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the bottom line of table [3\\)](#page-8-0), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days).",
|
| 152 |
-
"page": 7
|
| 153 |
-
},
|
| 154 |
-
{
|
| 155 |
-
"topic": "The Adam Optimizer and Learning Rate Schedule",
|
| 156 |
-
"text": "### 5.3 Optimizer\n\nWe used the Adam optimizer [\\\[20\\\]](#page-10-16) with β<sup>1</sup> = 0.9, β<sup>2</sup> = 0.98 and ϵ = 10<sup>−</sup><sup>9</sup> . We varied the learning rate over the course of training, according to the formula:\n\n$$lrate = d_{\\text{model}}^{-0.5} \\cdot \\min(\\text{step\\_num}^{-0.5}, \\text{step\\_num} \\cdot \\text{warmup\\_steps}^{-1.5})$$\n (3)\n\nThis corresponds to increasing the learning rate linearly for the first warmup\\_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup\\_steps = 4000.",
|
| 157 |
-
"page": 7
|
| 158 |
-
},
|
| 159 |
-
{
|
| 160 |
-
"topic": "Regularization Techniques",
|
| 161 |
-
"text": "### 5.4 Regularization\n\nWe employ three types of regularization during training:\n\n**Residual Dropout** We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of $P_{drop} = 0.1.$\n\n**Label Smoothing** During training, we employed label smoothing of value $\\epsilon_{ls} = 0.1$ [36]. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.",
|
| 162 |
-
"page": 7
|
| 163 |
}
|
| 164 |
];
|
| 165 |
|
|
|
|
| 47 |
// Use hardcoded chunks for the document
|
| 48 |
const hardcodedChunks = [
|
| 49 |
{
|
| 50 |
+
"topic": "The Foundation: Proximal Policy Optimization (PPO)",
|
| 51 |
+
"text": "### 4.1.1. From PPO to GRPO\n\nProximal Policy Optimization (PPO) (Schulman et al., 2017) is an actor-critic RL algorithm that is widely used in the RL fine-tuning stage of LLMs (Ouyang et al., 2022). In particular, it optimizes LLMs by maximizing the following surrogate objective:\n\n$$\\mathcal{J}_{PPO}(\\theta) = \\mathbb{E}\\left[q \\sim P(Q), o \\sim \\pi_{\\theta_{old}}(O|q)\\right] \\frac{1}{|o|} \\sum_{t=1}^{|o|} \\min\\left[\\frac{\\pi_{\\theta}(o_t|q, o_{\\leq t})}{\\pi_{\\theta_{old}}(o_t|q, o_{\\leq t})} A_t, \\text{clip}\\left(\\frac{\\pi_{\\theta}(o_t|q, o_{\\leq t})}{\\pi_{\\theta_{old}}(o_t|q, o_{\\leq t})}, 1 - \\varepsilon, 1 + \\varepsilon\\right) A_t\\right], \\tag{1}$$\n\nwhere $\\pi_{\\theta}$ and $\\pi_{\\theta_{old}}$ are the current and old policy models, and *q*, *o* are questions and outputs sampled from the question dataset and the old policy $\\pi_{\\theta_{old}}$ , respectively. $\\varepsilon$ is a clipping-related hyper-parameter introduced in PPO for stabilizing training. $A_t$ is the advantage, which is computed by applying Generalized Advantage Estimation (GAE) (Schulman et al., 2015), based on the rewards $\\{r_{\\geq t}\\}$ and a learned value function $V_{\\psi}$ . Thus, in PPO, a value function needs to be trained alongside the policy model and to mitigate over-optimization of the reward model, the standard approach is to add a per-token KL penalty from a reference model in the reward at each token (Ouyang et al., 2022), i.e.,\n\n$$r_t = r_{\\varphi}(q, o_{\\leq t}) - \\beta \\log \\frac{\\pi_{\\theta}(o_t|q, o_{\\leq t})}{\\pi_{ref}(o_t|q, o_{\\leq t})},\\tag{2}$$\n\nwhere $r_{\\varphi}$ is the reward model, $\\pi_{ref}$ is the reference model, which is usually the initial SFT model, and $\\beta$ is the coefficient of the KL penalty."
|
|
|
|
| 52 |
},
|
| 53 |
{
|
| 54 |
+
"topic": "The Problem with PPO: Why a New Approach is Needed",
|
| 55 |
+
"text": "As the value function employed in PPO is typically another model of comparable size as the policy model, it brings a substantial memory and computational burden. Additionally, during RL training, the value function is treated as a baseline in the calculation of the advantage for variance reduction. While in the LLM context, usually only the last token is assigned a reward score by the reward model, which may complicate the training of a value function that is accurate at each token."
|
|
|
|
| 56 |
},
|
| 57 |
{
|
| 58 |
+
"topic": "The Solution: Introducing Group Relative Policy Optimization (GRPO)",
|
| 59 |
+
"text": "To address this, as shown in Figure 4, we propose Group Relative Policy Optimization (GRPO), which obviates the need for additional value function approximation as in PPO, and instead uses the average reward of multiple sampled outputs, produced in response to the same question, as the baseline.\n\n\n\nFigure 4 | Demonstration of PPO and our GRPO. GRPO foregoes the value model, instead estimating the baseline from group scores, significantly reducing training resources."
|
|
|
|
| 60 |
},
|
| 61 |
{
|
| 62 |
+
"topic": "The GRPO Objective Function (Equation 3)",
|
| 63 |
+
"text": "More specifically, for each question $q$ , GRPO samples a group of outputs $\\{o_1, o_2, \\cdots, o_G\\}$ from the old policy $\\pi_{\\theta_{old}}$ and then optimizes the policy model by maximizing the following objective:\n\n$$\\mathcal{J}_{GRPO}(\\theta) = \\mathbb{E}[q \\sim P(Q), \\{o_{i}\\}_{i=1}^{G} \\sim \\pi_{\\theta_{old}}(O|q)] \\frac{1}{G} \\sum_{i=1}^{G} \\frac{1}{|o_{i}|} \\sum_{t=1}^{|o_{i}|} \\left\\{ \\min \\left[ \\frac{\\pi_{\\theta}(o_{i,t}|q, o_{i,< t})}{\\pi_{\\theta_{old}}(o_{i,t}|q, o_{i,< t})} \\hat{A}_{i,t}, \\operatorname{clip} \\left( \\frac{\\pi_{\\theta}(o_{i,t}|q, o_{i,< t})}{\\pi_{\\theta_{old}}(o_{i,t}|q, o_{i,< t})}, 1 - \\varepsilon, 1 + \\varepsilon \\right) \\hat{A}_{i,t} \\right] - \\beta \\mathbb{D}_{KL} \\left[ \\pi_{\\theta} || \\pi_{ref} \\right] \\right\\}, \\tag{3}$$\n\nwhere $\\varepsilon$ and $\\beta$ are hyper-parameters, and $\\hat{A}_{i,t}$ is the advantage calculated based on relative rewards of the outputs inside each group only, which will be detailed in the following subsections."
|
|
|
|
| 64 |
},
|
| 65 |
{
|
| 66 |
+
"topic": "Key Feature 1: Group Relative Advantage Calculation",
|
| 67 |
+
"text": "The group relative way that GRPO leverages to calculate the advantages, aligns well with the comparative nature of rewards models, as reward models are typically trained on datasets of comparisons between outputs on the same question. Also note that, instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss, avoiding complicating the calculation of $\\hat{A}_{i,t}$ ."
|
|
|
|
| 68 |
},
|
| 69 |
{
|
| 70 |
+
"topic": "Key Feature 2: KL Divergence as a Direct Penalty (Equation 4)",
|
| 71 |
+
"text": "And different from the KL penalty term used in (2), we estimate the KL divergence with the following unbiased estimator (Schulman, 2020):\n\n$$\\mathbb{D}_{KL}\\left[\\pi_{\\theta}||\\pi_{ref}\\right] = \\frac{\\pi_{ref}(o_{i,t}|q, o_{i,< t})}{\\pi_{\\theta}(o_{i,t}|q, o_{i,< t})} - \\log\\frac{\\pi_{ref}(o_{i,t}|q, o_{i,< t})}{\\pi_{\\theta}(o_{i,t}|q, o_{i,< t})} - 1,\\tag{4}$$\n\nwhich is guaranteed to be positive."
|
|
|
|
| 72 |
},
|
| 73 |
{
|
| 74 |
+
"topic": "Application 1: Outcome Supervision RL with GRPO",
|
| 75 |
+
"text": "#### 4.1.2. Outcome Supervision RL with GRPO\n\nFormally, for each question q, a group of outputs $\\{o_1, o_2, \\cdots, o_G\\}$ are sampled from the old policy model $\\pi_{\\theta_{old}}$ . A reward model is then used to score the outputs, yielding *G* rewards $\\mathbf{r} = \\{r_1, r_2, \\cdots, r_G\\}$ correspondingly. Subsequently, these rewards are normalized by subtracting the group average and dividing by the group standard deviation. Outcome supervision provides the normalized reward at the end of each output $o_i$ and sets the advantages $\\hat{A}_{i,t}$ of all tokens in the output as the normalized reward, i.e., $\\hat{A}_{i,t} = \\widetilde{r}_i = \\frac{r_i - \\text{mean}(\\mathbf{r})}{\\text{std}(\\mathbf{r})}$ , and then optimizes the policy by maximizing the objective defined in equation $(3)$ ."
|
|
|
|
| 76 |
},
|
| 77 |
{
|
| 78 |
+
"topic": "Application 2: Process Supervision RL with GRPO",
|
| 79 |
+
"text": "### 4.1.3. Process Supervision RL with GRPO\n\nOutcome supervision only provides a reward at the end of each output, which may not be sufficient and efficient to supervise the policy in complex mathematical tasks. Following Wang et al. (2023b), we also explore process supervision, which provides a reward at the end of each reasoning step. Formally, given the question q and G sampled outputs $\\{o_1, o_2, \\cdots, o_G\\}$ , a process reward model is used to score each step of the outputs, yielding corresponding rewards: $\\mathbf{R} = \\{\\{r_1^{index(1)}, \\cdots, r_1^{index(K_1)}\\}, \\cdots, \\{r_G^{index(1)}, \\cdots, r_G^{index(K_G)}\\}\\}, \\text{ where } index(j) \\text{ is the end token index}$ of the $j$ -th step, and $K_i$ is the total number of steps in the $i$ -th output. We also normalize these rewards with the average and the standard deviation, i.e., $\\widetilde{r}_{i}^{\\text{index}(j)} = \\frac{r_{i}^{\\text{index}(j)} - \\text{mean}(\\mathbf{R})}{\\text{std}(\\mathbf{R})}$ . Subsequently, the process supervision calculates the advantage of each token as the sum of the normalized rewards from the following steps, i.e., $\\hat{A}_{i,t} = \\sum_{index(j) \\ge t} \\tilde{r}_i^{index(j)}$ , and then optimizes the policy by maximizing the objective defined in equation $(3)$ ."
|
|
|
|
| 80 |
},
|
| 81 |
{
|
| 82 |
+
"topic": "The Full Training Loop: Iterative RL with GRPO",
|
| 83 |
+
"text": "### 4.1.4. Iterative RL with GRPO\n\nAs the reinforcement learning training process progresses, the old reward model may not be sufficient to supervise the current policy model. Therefore, we also explore the iterative RL with GRPO. As shown in Algorithm 1, in iterative GRPO, we generate new training sets for the reward model based on the sampling results from the policy model and continually train the old reward model using a replay mechanism that incorporates 10% of historical data. Then, we set the reference model as the policy model, and continually train the policy model with the new reward model.\n\n### **Algorithm 1** Iterative Group Relative Policy Optimization\n\n**Input** initial policy model $\\pi_{\\theta_{\\text{init}}}$ ; reward models $r_{\\varphi}$ ; task prompts $\\mathcal{D}$ ; hyperparameters $\\varepsilon$ , $\\beta$ , $\\mu$ \n\n- 1: policy model $\\pi_{\\theta} \\leftarrow \\pi_{\\theta_{\\text{init}}}$\n- 2: **for** iteration = $1, \\ldots, I$ **do**\n- 3: reference model $\\pi_{ref} \\leftarrow \\pi_{\\theta}$\n- 4: **for** step = $1, \\ldots, M$ **do**\n- 5: Sample a batch $\\mathcal{D}_b$ from $\\mathcal{D}$\n- 6: Update the old policy model $\\pi_{\\theta_{old}} \\leftarrow \\pi_{\\theta}$\n- 7: Sample *G* outputs $\\{o_i\\}_{i=1}^G \\sim \\pi_{\\theta_{old}}(\\cdot \\mid q)$ for each question $q \\in \\mathcal{D}_b$\n- 8: Compute rewards $\\{r_i\\}_{i=1}^G$ for each sampled output $o_i$ by running $r_{\\varphi}$\n- 9: Compute $\\hat{A}_{i,t}$ for the *t*-th token of $o_i$ through group relative advantage estimation.\n- 10: **for** GRPO iteration = $1, \\ldots, \\mu$ **do**\n- 11: Update the policy model $\\pi_{\\theta}$ by maximizing the GRPO objective (Equation 21)\n- 12: Update $r_{\\varphi}$ through continuous training using a replay mechanism.\n\n**Output** $\\pi_{\\theta}$ "
|
|
|
|
| 84 |
},
|
| 85 |
{
|
| 86 |
+
"topic": "Why GRPO Makes Sense: The Benefit of a Graded Reward Signal",
|
| 87 |
+
"text": "The algorithm processes the reward signal to the gradient coefficient to update the model parameter. We divide the reward function as 'Rule' and 'Model' in our experiments. Rule refers to judging the quality of a response based on the correctness of the answer, and Model denotes that we train a reward model to score each response. The training data of the reward model is based on the rule judgment. Equations 10 and 21 highlight a key difference between GRPO and Online RFT: GRPO uniquely adjusts its gradient coefficient based on the reward value provided by the reward model. This allows for differential reinforcement and penalization of responses according to their varying magnitudes. In contrast, Online RFT lacks this feature; it does not penalize incorrect responses and uniformly reinforces all responses with correct answers at the same level of intensity.\n\nAs demonstrated in Figure 5, GRPO surpasses online RFT, thereby highlighting the efficiency of altering positive and negative gradient coefficients. In addition, GRPO+PS shows superior performance compared to GRPO+OS, indicating the benefits of using fine-grained, step-aware gradient coefficients. Furthermore, we explore the iterative RL, in our experiments, we conduct two rounds of iteration. As shown in Figure 6, we notice that the iterative RL significantly improves the performance, especially at the first iteration.\n\n\n\nFigure 5 | Performance of the DeepSeekMath-Instruct 1.3B model, which was further trained using various methods, on two benchmarks."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
}
|
| 89 |
];
|
| 90 |
|