Spaces:
Sleeping
Sleeping
Add automatic Docling parsing display in indexing flow
Browse files
- Add /api/parse-docling endpoint for bulk file parsing
- Create DoclingOutput component showing complete parsing results:
  - Breakdown numbers (elements, chars, words, pages)
  - Element type counts
  - Complete element list with full text
  - Download JSON button
  - Continue to Indexing button
- Add parseWithDocling API function
- Add "parse" step to ProcessingStatus
- Update Sidebar to:
  - Parse files with Docling after Dropbox fetch
  - Display DoclingOutput for user review
  - Continue indexing on user confirmation
- Remove manual "Test Parsing" button (now automatic)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- frontend/src/api/client.js +13 -0
- frontend/src/components/DoclingOutput.jsx +227 -0
- frontend/src/components/ProcessingStatus.jsx +1 -0
- frontend/src/components/Sidebar.jsx +69 -41
- src/api/routes.py +107 -0
frontend/src/api/client.js
CHANGED
|
@@ -125,3 +125,16 @@ export async function getSupportedFormats() {
|
|
| 125 |
const res = await fetch(`${API_BASE}/eval/formats`);
|
| 126 |
return res.json();
|
| 127 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
const res = await fetch(`${API_BASE}/eval/formats`);
|
| 126 |
return res.json();
|
| 127 |
}
|
| 128 |
+
|
| 129 |
+
/**
 * Parse files with Docling and return the COMPLETE output.
 *
 * Sends the file list and Dropbox access token to `${API_BASE}/parse-docling`
 * as a JSON POST body of the form `{ files, access_token }`. The response
 * contains all elements (not just samples) for a full document preview.
 *
 * @param {Array<{path: string, name: string}>} files - Files to parse.
 * @param {string} accessToken - Dropbox access token forwarded to the server.
 * @returns {Promise<object>} The parsed JSON response body on success. On a
 *   non-2xx response, resolves to `{ error: string }` so callers that check
 *   `result.error` surface the failure instead of silently continuing.
 */
export async function parseWithDocling(files, accessToken) {
  const res = await fetch(`${API_BASE}/parse-docling`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ files, access_token: accessToken }),
  });

  if (!res.ok) {
    // Error bodies may be non-JSON (e.g. a proxy's HTML page) or use a
    // different key than `error`; normalize everything to `{ error }`.
    let detail = null;
    try {
      const body = await res.json();
      detail = body?.error ?? body?.detail ?? null;
    } catch {
      // Body was not JSON; fall through to the generic message below.
    }
    const message =
      typeof detail === 'string' && detail !== ''
        ? detail
        : `Docling parsing request failed (HTTP ${res.status})`;
    return { error: message };
  }

  return res.json();
}
|
frontend/src/components/DoclingOutput.jsx
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from 'react';
|
| 2 |
+
|
| 3 |
+
/**
|
| 4 |
+
* Display complete Docling parsing output with breakdown numbers
|
| 5 |
+
* Shows all elements, element type counts, and stats
|
| 6 |
+
*/
|
| 7 |
+
export default function DoclingOutput({ results, onContinue, onDownload }) {
|
| 8 |
+
const [expandedFiles, setExpandedFiles] = useState(
|
| 9 |
+
// Expand first file by default
|
| 10 |
+
results?.length > 0 ? { [results[0].filename]: true } : {}
|
| 11 |
+
);
|
| 12 |
+
|
| 13 |
+
const toggleFile = (filename) => {
|
| 14 |
+
setExpandedFiles(prev => ({
|
| 15 |
+
...prev,
|
| 16 |
+
[filename]: !prev[filename]
|
| 17 |
+
}));
|
| 18 |
+
};
|
| 19 |
+
|
| 20 |
+
const formatNumber = (num) => {
|
| 21 |
+
return num?.toLocaleString() || '0';
|
| 22 |
+
};
|
| 23 |
+
|
| 24 |
+
const handleDownload = () => {
|
| 25 |
+
const dataStr = JSON.stringify(results, null, 2);
|
| 26 |
+
const dataBlob = new Blob([dataStr], { type: 'application/json' });
|
| 27 |
+
const url = URL.createObjectURL(dataBlob);
|
| 28 |
+
const link = document.createElement('a');
|
| 29 |
+
link.href = url;
|
| 30 |
+
link.download = `docling-output-${new Date().toISOString().slice(0, 10)}.json`;
|
| 31 |
+
document.body.appendChild(link);
|
| 32 |
+
link.click();
|
| 33 |
+
document.body.removeChild(link);
|
| 34 |
+
URL.revokeObjectURL(url);
|
| 35 |
+
onDownload?.();
|
| 36 |
+
};
|
| 37 |
+
|
| 38 |
+
// Get element type badge color
|
| 39 |
+
const getTypeColor = (type) => {
|
| 40 |
+
const colors = {
|
| 41 |
+
heading: 'bg-blue-900/50 text-blue-300 border-blue-700',
|
| 42 |
+
paragraph: 'bg-slate-700 text-slate-300 border-slate-600',
|
| 43 |
+
table: 'bg-purple-900/50 text-purple-300 border-purple-700',
|
| 44 |
+
list_item: 'bg-orange-900/50 text-orange-300 border-orange-700',
|
| 45 |
+
list: 'bg-orange-900/50 text-orange-300 border-orange-700',
|
| 46 |
+
code: 'bg-green-900/50 text-green-300 border-green-700',
|
| 47 |
+
image: 'bg-pink-900/50 text-pink-300 border-pink-700',
|
| 48 |
+
caption: 'bg-yellow-900/50 text-yellow-300 border-yellow-700',
|
| 49 |
+
formula: 'bg-cyan-900/50 text-cyan-300 border-cyan-700',
|
| 50 |
+
footer: 'bg-gray-700 text-gray-300 border-gray-600',
|
| 51 |
+
header: 'bg-gray-700 text-gray-300 border-gray-600',
|
| 52 |
+
};
|
| 53 |
+
return colors[type] || 'bg-slate-700 text-slate-300 border-slate-600';
|
| 54 |
+
};
|
| 55 |
+
|
| 56 |
+
if (!results || results.length === 0) {
|
| 57 |
+
return (
|
| 58 |
+
<div className="text-center py-8 text-slate-400">
|
| 59 |
+
No parsing results available
|
| 60 |
+
</div>
|
| 61 |
+
);
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
return (
|
| 65 |
+
<div className="fixed inset-0 bg-black/80 flex items-center justify-center z-50 p-4">
|
| 66 |
+
<div className="bg-slate-800 border border-slate-700 rounded-xl w-full max-w-4xl max-h-[90vh] flex flex-col shadow-2xl">
|
| 67 |
+
{/* Header */}
|
| 68 |
+
<div className="p-4 border-b border-slate-700 flex items-center justify-between flex-shrink-0">
|
| 69 |
+
<div>
|
| 70 |
+
<h2 className="text-lg font-semibold text-slate-100">Docling Parsing Results</h2>
|
| 71 |
+
<p className="text-sm text-slate-400 mt-0.5">
|
| 72 |
+
{results.length} file{results.length !== 1 ? 's' : ''} parsed
|
| 73 |
+
</p>
|
| 74 |
+
</div>
|
| 75 |
+
<div className="flex items-center gap-2">
|
| 76 |
+
<button
|
| 77 |
+
type="button"
|
| 78 |
+
onClick={handleDownload}
|
| 79 |
+
className="flex items-center gap-2 px-3 py-2 text-sm font-medium text-slate-300 bg-slate-700 border border-slate-600 rounded-lg hover:bg-slate-600 transition-colors"
|
| 80 |
+
>
|
| 81 |
+
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 82 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-4l-4 4m0 0l-4-4m4 4V4" />
|
| 83 |
+
</svg>
|
| 84 |
+
Download JSON
|
| 85 |
+
</button>
|
| 86 |
+
<button
|
| 87 |
+
type="button"
|
| 88 |
+
onClick={onContinue}
|
| 89 |
+
className="flex items-center gap-2 px-4 py-2 text-sm font-medium text-white bg-blue-600 rounded-lg hover:bg-blue-700 transition-colors"
|
| 90 |
+
>
|
| 91 |
+
Continue to Indexing
|
| 92 |
+
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 93 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7l5 5m0 0l-5 5m5-5H6" />
|
| 94 |
+
</svg>
|
| 95 |
+
</button>
|
| 96 |
+
</div>
|
| 97 |
+
</div>
|
| 98 |
+
|
| 99 |
+
{/* Content - Scrollable */}
|
| 100 |
+
<div className="flex-1 overflow-auto p-4 space-y-4">
|
| 101 |
+
{results.map((doc, idx) => (
|
| 102 |
+
<div key={idx} className="bg-slate-900 border border-slate-700 rounded-lg overflow-hidden">
|
| 103 |
+
{/* File Header - Clickable */}
|
| 104 |
+
<button
|
| 105 |
+
type="button"
|
| 106 |
+
onClick={() => toggleFile(doc.filename)}
|
| 107 |
+
className="w-full p-4 flex items-center justify-between hover:bg-slate-800/50 transition-colors text-left"
|
| 108 |
+
>
|
| 109 |
+
<div className="flex items-center gap-3">
|
| 110 |
+
<svg className={`w-5 h-5 text-slate-400 transition-transform ${expandedFiles[doc.filename] ? 'rotate-90' : ''}`} fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 111 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
|
| 112 |
+
</svg>
|
| 113 |
+
<svg className="w-5 h-5 text-slate-500" fill="currentColor" viewBox="0 0 24 24">
|
| 114 |
+
<path d="M14 2H6c-1.1 0-2 .9-2 2v16c0 1.1.9 2 2 2h12c1.1 0 2-.9 2-2V8l-6-6zm-1 2l5 5h-5V4z"/>
|
| 115 |
+
</svg>
|
| 116 |
+
<span className="font-medium text-slate-200">{doc.filename}</span>
|
| 117 |
+
</div>
|
| 118 |
+
<div className="flex items-center gap-2">
|
| 119 |
+
{doc.status === 'OK' ? (
|
| 120 |
+
<span className="flex items-center gap-1 text-xs font-medium px-2 py-1 bg-green-900/40 border border-green-700 text-green-400 rounded">
|
| 121 |
+
<svg className="w-3 h-3" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 122 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 13l4 4L19 7" />
|
| 123 |
+
</svg>
|
| 124 |
+
OK
|
| 125 |
+
</span>
|
| 126 |
+
) : (
|
| 127 |
+
<span className="flex items-center gap-1 text-xs font-medium px-2 py-1 bg-red-900/40 border border-red-700 text-red-400 rounded">
|
| 128 |
+
<svg className="w-3 h-3" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 129 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
|
| 130 |
+
</svg>
|
| 131 |
+
{doc.status || 'ERROR'}
|
| 132 |
+
</span>
|
| 133 |
+
)}
|
| 134 |
+
<span className="text-xs text-slate-500 uppercase">{doc.format}</span>
|
| 135 |
+
</div>
|
| 136 |
+
</button>
|
| 137 |
+
|
| 138 |
+
{/* Expanded Content */}
|
| 139 |
+
{expandedFiles[doc.filename] && (
|
| 140 |
+
<div className="border-t border-slate-700">
|
| 141 |
+
{doc.error ? (
|
| 142 |
+
<div className="p-4 bg-red-900/20 text-red-400 text-sm">
|
| 143 |
+
Error: {doc.error}
|
| 144 |
+
</div>
|
| 145 |
+
) : (
|
| 146 |
+
<>
|
| 147 |
+
{/* Stats Grid */}
|
| 148 |
+
<div className="p-4 grid grid-cols-2 sm:grid-cols-4 gap-3">
|
| 149 |
+
<div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
|
| 150 |
+
<p className="text-2xl font-bold text-blue-400">{formatNumber(doc.total_elements)}</p>
|
| 151 |
+
<p className="text-xs text-slate-500 mt-1">Elements</p>
|
| 152 |
+
</div>
|
| 153 |
+
<div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
|
| 154 |
+
<p className="text-2xl font-bold text-green-400">{formatNumber(doc.total_chars)}</p>
|
| 155 |
+
<p className="text-xs text-slate-500 mt-1">Characters</p>
|
| 156 |
+
</div>
|
| 157 |
+
<div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
|
| 158 |
+
<p className="text-2xl font-bold text-purple-400">{formatNumber(doc.total_words)}</p>
|
| 159 |
+
<p className="text-xs text-slate-500 mt-1">Words</p>
|
| 160 |
+
</div>
|
| 161 |
+
<div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
|
| 162 |
+
<p className="text-2xl font-bold text-orange-400">{doc.page_count || '-'}</p>
|
| 163 |
+
<p className="text-xs text-slate-500 mt-1">Pages</p>
|
| 164 |
+
</div>
|
| 165 |
+
</div>
|
| 166 |
+
|
| 167 |
+
{/* Element Types */}
|
| 168 |
+
{doc.element_types && Object.keys(doc.element_types).length > 0 && (
|
| 169 |
+
<div className="px-4 pb-4">
|
| 170 |
+
<h4 className="text-sm font-medium text-slate-400 mb-2">Element Types</h4>
|
| 171 |
+
<div className="flex flex-wrap gap-2">
|
| 172 |
+
{Object.entries(doc.element_types)
|
| 173 |
+
.sort((a, b) => b[1] - a[1])
|
| 174 |
+
.map(([type, count]) => (
|
| 175 |
+
<span
|
| 176 |
+
key={type}
|
| 177 |
+
className={`text-xs font-medium px-2 py-1 rounded border ${getTypeColor(type)}`}
|
| 178 |
+
>
|
| 179 |
+
{type}: {count}
|
| 180 |
+
</span>
|
| 181 |
+
))}
|
| 182 |
+
</div>
|
| 183 |
+
</div>
|
| 184 |
+
)}
|
| 185 |
+
|
| 186 |
+
{/* Complete Output */}
|
| 187 |
+
{doc.elements && doc.elements.length > 0 && (
|
| 188 |
+
<div className="px-4 pb-4">
|
| 189 |
+
<h4 className="text-sm font-medium text-slate-400 mb-2">
|
| 190 |
+
Complete Output ({doc.elements.length} elements)
|
| 191 |
+
</h4>
|
| 192 |
+
<div className="bg-slate-950 border border-slate-700 rounded-lg max-h-96 overflow-auto">
|
| 193 |
+
{doc.elements.map((el, elIdx) => (
|
| 194 |
+
<div
|
| 195 |
+
key={elIdx}
|
| 196 |
+
className="p-3 border-b border-slate-800 last:border-b-0 hover:bg-slate-900/50"
|
| 197 |
+
>
|
| 198 |
+
<div className="flex items-center gap-2 mb-1">
|
| 199 |
+
<span className={`text-xs font-medium px-2 py-0.5 rounded border ${getTypeColor(el.type)}`}>
|
| 200 |
+
{el.type}
|
| 201 |
+
</span>
|
| 202 |
+
{el.level && (
|
| 203 |
+
<span className="text-xs text-slate-500">L{el.level}</span>
|
| 204 |
+
)}
|
| 205 |
+
{el.page && (
|
| 206 |
+
<span className="text-xs text-slate-600">p.{el.page}</span>
|
| 207 |
+
)}
|
| 208 |
+
</div>
|
| 209 |
+
<p className="text-sm text-slate-300 whitespace-pre-wrap break-words">
|
| 210 |
+
{el.text || '(empty)'}
|
| 211 |
+
</p>
|
| 212 |
+
</div>
|
| 213 |
+
))}
|
| 214 |
+
</div>
|
| 215 |
+
</div>
|
| 216 |
+
)}
|
| 217 |
+
</>
|
| 218 |
+
)}
|
| 219 |
+
</div>
|
| 220 |
+
)}
|
| 221 |
+
</div>
|
| 222 |
+
))}
|
| 223 |
+
</div>
|
| 224 |
+
</div>
|
| 225 |
+
</div>
|
| 226 |
+
);
|
| 227 |
+
}
|
frontend/src/components/ProcessingStatus.jsx
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
const steps = [
|
| 2 |
{ id: 'read', label: 'Reading from cloud storage' },
|
|
|
|
| 3 |
{ id: 'chunk', label: 'Chunking in your browser' },
|
| 4 |
{ id: 'clear', label: 'Clearing old index data' },
|
| 5 |
{ id: 'embed', label: 'Generating embeddings' },
|
|
|
|
| 1 |
const steps = [
|
| 2 |
{ id: 'read', label: 'Reading from cloud storage' },
|
| 3 |
+
{ id: 'parse', label: 'Parsing with Docling' },
|
| 4 |
{ id: 'chunk', label: 'Chunking in your browser' },
|
| 5 |
{ id: 'clear', label: 'Clearing old index data' },
|
| 6 |
{ id: 'embed', label: 'Generating embeddings' },
|
frontend/src/components/Sidebar.jsx
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
import { useState } from 'react';
|
| 2 |
-
import { embedChunks, clearIndex } from '../api/client';
|
| 3 |
import { processSelectedFiles } from '../api/dropbox';
|
| 4 |
import { chunkFiles } from '../api/chunker';
|
| 5 |
import ProcessingStatus from './ProcessingStatus';
|
| 6 |
import IndexSummary from './IndexSummary';
|
| 7 |
import CloudConnect from './CloudConnect';
|
| 8 |
-
import
|
| 9 |
|
| 10 |
export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
| 11 |
const [loading, setLoading] = useState(false);
|
|
@@ -17,8 +17,9 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
|
| 17 |
const [stagedFiles, setStagedFiles] = useState([]);
|
| 18 |
const [accessToken, setAccessToken] = useState(null);
|
| 19 |
|
| 20 |
-
// State for parsing
|
| 21 |
-
const [
|
|
|
|
| 22 |
|
| 23 |
// Handle files staged from CloudConnect (not processed yet)
|
| 24 |
const handleFilesStaged = (files) => {
|
|
@@ -46,7 +47,7 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
|
| 46 |
setStagedFiles([]);
|
| 47 |
};
|
| 48 |
|
| 49 |
-
// Start indexing the staged files
|
| 50 |
const handleIndexFiles = async () => {
|
| 51 |
if (stagedFiles.length === 0 || !accessToken) return;
|
| 52 |
|
|
@@ -62,7 +63,7 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
|
| 62 |
setProcessingState({
|
| 63 |
step: 'read',
|
| 64 |
fileName: progress.fileName,
|
| 65 |
-
progress: 10 + (progress.current / progress.total) *
|
| 66 |
});
|
| 67 |
});
|
| 68 |
|
|
@@ -73,26 +74,66 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
|
| 73 |
return;
|
| 74 |
}
|
| 75 |
|
| 76 |
-
// Step 2:
|
| 77 |
-
setProcessingState({ step: '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
await new Promise(r => setTimeout(r, 100));
|
| 79 |
|
| 80 |
const chunks = chunkFiles(fileContents);
|
| 81 |
|
| 82 |
-
// Step
|
| 83 |
setProcessingState({ step: 'clear', fileName: 'Clearing old data', progress: 50 });
|
| 84 |
await clearIndex();
|
| 85 |
|
| 86 |
-
// Step
|
| 87 |
setProcessingState({ step: 'embed', fileName: `${chunks.length} chunks`, progress: 65 });
|
| 88 |
|
| 89 |
const result = await embedChunks(chunks);
|
| 90 |
|
| 91 |
-
// Step
|
| 92 |
setProcessingState({ step: 'discard', fileName: '', progress: 85 });
|
| 93 |
await new Promise(r => setTimeout(r, 300));
|
| 94 |
|
| 95 |
-
// Step
|
| 96 |
setProcessingState({ step: 'save', fileName: '', progress: 100 });
|
| 97 |
await new Promise(r => setTimeout(r, 200));
|
| 98 |
|
|
@@ -185,29 +226,16 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
|
| 185 |
<p className="text-xs text-slate-500">{formatSize(file.size)}</p>
|
| 186 |
</div>
|
| 187 |
</div>
|
| 188 |
-
<
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
</svg>
|
| 199 |
-
</button>
|
| 200 |
-
<button
|
| 201 |
-
type="button"
|
| 202 |
-
onClick={() => removeFile(file.id)}
|
| 203 |
-
className="p-1 text-slate-500 hover:text-red-400"
|
| 204 |
-
aria-label={`Remove ${file.name}`}
|
| 205 |
-
>
|
| 206 |
-
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 207 |
-
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
|
| 208 |
-
</svg>
|
| 209 |
-
</button>
|
| 210 |
-
</div>
|
| 211 |
</div>
|
| 212 |
))}
|
| 213 |
</div>
|
|
@@ -250,12 +278,12 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
|
| 250 |
</div>
|
| 251 |
)}
|
| 252 |
|
| 253 |
-
{/*
|
| 254 |
-
{
|
| 255 |
-
<
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
/>
|
| 260 |
)}
|
| 261 |
</div>
|
|
|
|
| 1 |
import { useState } from 'react';
|
| 2 |
+
import { embedChunks, clearIndex, parseWithDocling } from '../api/client';
|
| 3 |
import { processSelectedFiles } from '../api/dropbox';
|
| 4 |
import { chunkFiles } from '../api/chunker';
|
| 5 |
import ProcessingStatus from './ProcessingStatus';
|
| 6 |
import IndexSummary from './IndexSummary';
|
| 7 |
import CloudConnect from './CloudConnect';
|
| 8 |
+
import DoclingOutput from './DoclingOutput';
|
| 9 |
|
| 10 |
export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
|
| 11 |
const [loading, setLoading] = useState(false);
|
|
|
|
| 17 |
const [stagedFiles, setStagedFiles] = useState([]);
|
| 18 |
const [accessToken, setAccessToken] = useState(null);
|
| 19 |
|
| 20 |
+
// State for Docling parsing output
|
| 21 |
+
const [parsedDocuments, setParsedDocuments] = useState(null);
|
| 22 |
+
const [pendingFileContents, setPendingFileContents] = useState(null);
|
| 23 |
|
| 24 |
// Handle files staged from CloudConnect (not processed yet)
|
| 25 |
const handleFilesStaged = (files) => {
|
|
|
|
| 47 |
setStagedFiles([]);
|
| 48 |
};
|
| 49 |
|
| 50 |
+
// Start indexing the staged files - Phase 1: Read and Parse
|
| 51 |
const handleIndexFiles = async () => {
|
| 52 |
if (stagedFiles.length === 0 || !accessToken) return;
|
| 53 |
|
|
|
|
| 63 |
setProcessingState({
|
| 64 |
step: 'read',
|
| 65 |
fileName: progress.fileName,
|
| 66 |
+
progress: 10 + (progress.current / progress.total) * 15,
|
| 67 |
});
|
| 68 |
});
|
| 69 |
|
|
|
|
| 74 |
return;
|
| 75 |
}
|
| 76 |
|
| 77 |
+
// Step 2: Parse with Docling
|
| 78 |
+
setProcessingState({ step: 'parse', fileName: `${fileContents.length} files`, progress: 28 });
|
| 79 |
+
|
| 80 |
+
const parseResult = await parseWithDocling(
|
| 81 |
+
stagedFiles.map(f => ({ path: f.path_lower, name: f.name })),
|
| 82 |
+
accessToken
|
| 83 |
+
);
|
| 84 |
+
|
| 85 |
+
if (parseResult.error) {
|
| 86 |
+
setMessage({ type: 'error', text: parseResult.error });
|
| 87 |
+
setLoading(false);
|
| 88 |
+
setProcessingState(null);
|
| 89 |
+
return;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
// Store results and pause for user review
|
| 93 |
+
setParsedDocuments(parseResult.results);
|
| 94 |
+
setPendingFileContents(fileContents);
|
| 95 |
+
setProcessingState(null);
|
| 96 |
+
setLoading(false);
|
| 97 |
+
// User will click "Continue to Indexing" in DoclingOutput
|
| 98 |
+
|
| 99 |
+
} catch (err) {
|
| 100 |
+
setMessage({ type: 'error', text: err.message });
|
| 101 |
+
setProcessingState(null);
|
| 102 |
+
setLoading(false);
|
| 103 |
+
}
|
| 104 |
+
};
|
| 105 |
+
|
| 106 |
+
// Continue indexing after user reviews Docling output
|
| 107 |
+
const handleContinueIndexing = async () => {
|
| 108 |
+
if (!pendingFileContents) return;
|
| 109 |
+
|
| 110 |
+
setLoading(true);
|
| 111 |
+
setParsedDocuments(null);
|
| 112 |
+
|
| 113 |
+
try {
|
| 114 |
+
const fileContents = pendingFileContents;
|
| 115 |
+
setPendingFileContents(null);
|
| 116 |
+
|
| 117 |
+
// Step 3: Chunk files (client-side)
|
| 118 |
+
setProcessingState({ step: 'chunk', fileName: `${fileContents.length} files`, progress: 40 });
|
| 119 |
await new Promise(r => setTimeout(r, 100));
|
| 120 |
|
| 121 |
const chunks = chunkFiles(fileContents);
|
| 122 |
|
| 123 |
+
// Step 4: Clear existing index
|
| 124 |
setProcessingState({ step: 'clear', fileName: 'Clearing old data', progress: 50 });
|
| 125 |
await clearIndex();
|
| 126 |
|
| 127 |
+
// Step 5: Send chunks to server for embedding
|
| 128 |
setProcessingState({ step: 'embed', fileName: `${chunks.length} chunks`, progress: 65 });
|
| 129 |
|
| 130 |
const result = await embedChunks(chunks);
|
| 131 |
|
| 132 |
+
// Step 6: Show discard step
|
| 133 |
setProcessingState({ step: 'discard', fileName: '', progress: 85 });
|
| 134 |
await new Promise(r => setTimeout(r, 300));
|
| 135 |
|
| 136 |
+
// Step 7: Complete
|
| 137 |
setProcessingState({ step: 'save', fileName: '', progress: 100 });
|
| 138 |
await new Promise(r => setTimeout(r, 200));
|
| 139 |
|
|
|
|
| 226 |
<p className="text-xs text-slate-500">{formatSize(file.size)}</p>
|
| 227 |
</div>
|
| 228 |
</div>
|
| 229 |
+
<button
|
| 230 |
+
type="button"
|
| 231 |
+
onClick={() => removeFile(file.id)}
|
| 232 |
+
className="p-1 text-slate-500 hover:text-red-400 opacity-0 group-hover:opacity-100 transition-all"
|
| 233 |
+
aria-label={`Remove ${file.name}`}
|
| 234 |
+
>
|
| 235 |
+
<svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 236 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
|
| 237 |
+
</svg>
|
| 238 |
+
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
</div>
|
| 240 |
))}
|
| 241 |
</div>
|
|
|
|
| 278 |
</div>
|
| 279 |
)}
|
| 280 |
|
| 281 |
+
{/* Docling Output Modal */}
|
| 282 |
+
{parsedDocuments && (
|
| 283 |
+
<DoclingOutput
|
| 284 |
+
results={parsedDocuments}
|
| 285 |
+
onContinue={handleContinueIndexing}
|
| 286 |
+
onDownload={() => {}}
|
| 287 |
/>
|
| 288 |
)}
|
| 289 |
</div>
|
src/api/routes.py
CHANGED
|
@@ -537,6 +537,113 @@ async def eval_formats():
|
|
| 537 |
return get_supported_formats()
|
| 538 |
|
| 539 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
@router.post("/dropbox/file")
|
| 541 |
async def dropbox_file(request: dict):
|
| 542 |
"""
|
|
|
|
| 537 |
return get_supported_formats()
|
| 538 |
|
| 539 |
|
| 540 |
+
@router.post("/parse-docling")
async def parse_docling(request: dict):
    """
    Parse files with Docling and return COMPLETE output.

    Each file is downloaded from Dropbox, written to a temporary file,
    parsed with ``load_document_with_docling`` and summarized. Failures are
    reported per file, so one bad file does not abort the batch.

    Request:
    - files: Array of {path, name} objects
    - access_token: Dropbox access token

    Returns:
        ``{"results": [...]}`` with one entry per requested file, containing
        ALL elements (not samples), or ``{"error": ...}`` when ``files`` or
        ``access_token`` is missing. A failed file's entry carries
        ``status: "ERROR"`` and an ``error`` message.
    """
    import json
    import tempfile
    import os
    from pathlib import Path
    from collections import Counter

    files = request.get("files", [])
    access_token = request.get("access_token")

    if not access_token or not files:
        return {"error": "Missing files or access_token"}

    results = []

    for file_info in files:
        file_path = file_info.get("path")
        # `or` rather than a .get() default so an explicit null/empty name
        # also falls back to the path's basename.
        file_name = file_info.get("name") or (
            Path(file_path).name if file_path else "unknown"
        )

        if not file_path:
            results.append({
                "filename": file_name,
                "status": "ERROR",
                "error": "Missing file path"
            })
            continue

        try:
            # Download file from Dropbox
            async with httpx.AsyncClient(timeout=180.0) as client:
                response = await client.post(
                    "https://content.dropboxapi.com/2/files/download",
                    headers={
                        "Authorization": f"Bearer {access_token}",
                        # json.dumps escapes quotes, backslashes and non-ASCII
                        # characters, which hand-built JSON would leave as an
                        # invalid header value.
                        "Dropbox-API-Arg": json.dumps({"path": file_path})
                    }
                )

            if response.status_code != 200:
                results.append({
                    "filename": file_name,
                    "status": "ERROR",
                    "error": f"Dropbox download failed: {response.text}"
                })
                continue

            # Save to temp file; keep the extension on the temp file name
            suffix = Path(file_name).suffix or Path(file_path).suffix
            tmp_path = None

            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                    # Record the path before writing so a failed write is
                    # still cleaned up by the finally block.
                    tmp_path = tmp.name
                    tmp.write(response.content)

                from src.ingestion.docling_loader import load_document_with_docling

                doc = load_document_with_docling(tmp_path)

                # Count element types
                type_counts = Counter(el.element_type for el in doc.elements)

                # Return ALL elements (not just samples)
                all_elements = [
                    {
                        "type": el.element_type,
                        "text": el.text,
                        "level": el.level,
                        "page": getattr(el, 'page', None),
                        "metadata": getattr(el, 'metadata', {})
                    }
                    for el in doc.elements
                ]

                results.append({
                    "filename": file_name,
                    "path": file_path,
                    "status": doc.status,
                    "format": doc.format,
                    "total_elements": len(doc.elements),
                    "total_chars": doc.chars,
                    "total_words": doc.words,
                    "page_count": doc.page_count,
                    "element_types": dict(type_counts),
                    "elements": all_elements,
                    "error": doc.error
                })

            finally:
                if tmp_path is not None and os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        except Exception as e:
            # Per-file failure: report it and continue with the next file.
            results.append({
                "filename": file_name,
                "status": "ERROR",
                "error": str(e)
            })

    return {"results": results}
|
| 645 |
+
|
| 646 |
+
|
| 647 |
@router.post("/dropbox/file")
|
| 648 |
async def dropbox_file(request: dict):
|
| 649 |
"""
|