vn6295337 Claude Opus 4.5 committed on
Commit
aa663e1
·
1 Parent(s): 2eda359

Add automatic Docling parsing display in indexing flow

Browse files

- Add /api/parse-docling endpoint for bulk file parsing
- Create DoclingOutput component showing complete parsing results:
  - Breakdown numbers (elements, chars, words, pages)
  - Element type counts
  - Complete element list with full text
  - Download JSON button
  - Continue to Indexing button
- Add parseWithDocling API function
- Add "parse" step to ProcessingStatus
- Update Sidebar to:
  - Parse files with Docling after Dropbox fetch
  - Display DoclingOutput for user review
  - Continue indexing on user confirmation
- Remove manual "Test Parsing" button (now automatic)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

frontend/src/api/client.js CHANGED
@@ -125,3 +125,16 @@ export async function getSupportedFormats() {
125
  const res = await fetch(`${API_BASE}/eval/formats`);
126
  return res.json();
127
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  const res = await fetch(`${API_BASE}/eval/formats`);
126
  return res.json();
127
  }
128
+
129
+ /**
130
+ * Parse files with Docling and return COMPLETE output
131
+ * Returns all elements (not just samples) for full document preview
132
+ */
133
+ export async function parseWithDocling(files, accessToken) {
134
+ const res = await fetch(`${API_BASE}/parse-docling`, {
135
+ method: 'POST',
136
+ headers: { 'Content-Type': 'application/json' },
137
+ body: JSON.stringify({ files, access_token: accessToken })
138
+ });
139
+ return res.json();
140
+ }
frontend/src/components/DoclingOutput.jsx ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from 'react';
2
+
3
+ /**
4
+ * Display complete Docling parsing output with breakdown numbers
5
+ * Shows all elements, element type counts, and stats
6
+ */
7
+ export default function DoclingOutput({ results, onContinue, onDownload }) {
8
+ const [expandedFiles, setExpandedFiles] = useState(
9
+ // Expand first file by default
10
+ results?.length > 0 ? { [results[0].filename]: true } : {}
11
+ );
12
+
13
+ const toggleFile = (filename) => {
14
+ setExpandedFiles(prev => ({
15
+ ...prev,
16
+ [filename]: !prev[filename]
17
+ }));
18
+ };
19
+
20
+ const formatNumber = (num) => {
21
+ return num?.toLocaleString() || '0';
22
+ };
23
+
24
+ const handleDownload = () => {
25
+ const dataStr = JSON.stringify(results, null, 2);
26
+ const dataBlob = new Blob([dataStr], { type: 'application/json' });
27
+ const url = URL.createObjectURL(dataBlob);
28
+ const link = document.createElement('a');
29
+ link.href = url;
30
+ link.download = `docling-output-${new Date().toISOString().slice(0, 10)}.json`;
31
+ document.body.appendChild(link);
32
+ link.click();
33
+ document.body.removeChild(link);
34
+ URL.revokeObjectURL(url);
35
+ onDownload?.();
36
+ };
37
+
38
+ // Get element type badge color
39
+ const getTypeColor = (type) => {
40
+ const colors = {
41
+ heading: 'bg-blue-900/50 text-blue-300 border-blue-700',
42
+ paragraph: 'bg-slate-700 text-slate-300 border-slate-600',
43
+ table: 'bg-purple-900/50 text-purple-300 border-purple-700',
44
+ list_item: 'bg-orange-900/50 text-orange-300 border-orange-700',
45
+ list: 'bg-orange-900/50 text-orange-300 border-orange-700',
46
+ code: 'bg-green-900/50 text-green-300 border-green-700',
47
+ image: 'bg-pink-900/50 text-pink-300 border-pink-700',
48
+ caption: 'bg-yellow-900/50 text-yellow-300 border-yellow-700',
49
+ formula: 'bg-cyan-900/50 text-cyan-300 border-cyan-700',
50
+ footer: 'bg-gray-700 text-gray-300 border-gray-600',
51
+ header: 'bg-gray-700 text-gray-300 border-gray-600',
52
+ };
53
+ return colors[type] || 'bg-slate-700 text-slate-300 border-slate-600';
54
+ };
55
+
56
+ if (!results || results.length === 0) {
57
+ return (
58
+ <div className="text-center py-8 text-slate-400">
59
+ No parsing results available
60
+ </div>
61
+ );
62
+ }
63
+
64
+ return (
65
+ <div className="fixed inset-0 bg-black/80 flex items-center justify-center z-50 p-4">
66
+ <div className="bg-slate-800 border border-slate-700 rounded-xl w-full max-w-4xl max-h-[90vh] flex flex-col shadow-2xl">
67
+ {/* Header */}
68
+ <div className="p-4 border-b border-slate-700 flex items-center justify-between flex-shrink-0">
69
+ <div>
70
+ <h2 className="text-lg font-semibold text-slate-100">Docling Parsing Results</h2>
71
+ <p className="text-sm text-slate-400 mt-0.5">
72
+ {results.length} file{results.length !== 1 ? 's' : ''} parsed
73
+ </p>
74
+ </div>
75
+ <div className="flex items-center gap-2">
76
+ <button
77
+ type="button"
78
+ onClick={handleDownload}
79
+ className="flex items-center gap-2 px-3 py-2 text-sm font-medium text-slate-300 bg-slate-700 border border-slate-600 rounded-lg hover:bg-slate-600 transition-colors"
80
+ >
81
+ <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
82
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-4l-4 4m0 0l-4-4m4 4V4" />
83
+ </svg>
84
+ Download JSON
85
+ </button>
86
+ <button
87
+ type="button"
88
+ onClick={onContinue}
89
+ className="flex items-center gap-2 px-4 py-2 text-sm font-medium text-white bg-blue-600 rounded-lg hover:bg-blue-700 transition-colors"
90
+ >
91
+ Continue to Indexing
92
+ <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
93
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M13 7l5 5m0 0l-5 5m5-5H6" />
94
+ </svg>
95
+ </button>
96
+ </div>
97
+ </div>
98
+
99
+ {/* Content - Scrollable */}
100
+ <div className="flex-1 overflow-auto p-4 space-y-4">
101
+ {results.map((doc, idx) => (
102
+ <div key={idx} className="bg-slate-900 border border-slate-700 rounded-lg overflow-hidden">
103
+ {/* File Header - Clickable */}
104
+ <button
105
+ type="button"
106
+ onClick={() => toggleFile(doc.filename)}
107
+ className="w-full p-4 flex items-center justify-between hover:bg-slate-800/50 transition-colors text-left"
108
+ >
109
+ <div className="flex items-center gap-3">
110
+ <svg className={`w-5 h-5 text-slate-400 transition-transform ${expandedFiles[doc.filename] ? 'rotate-90' : ''}`} fill="none" stroke="currentColor" viewBox="0 0 24 24">
111
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
112
+ </svg>
113
+ <svg className="w-5 h-5 text-slate-500" fill="currentColor" viewBox="0 0 24 24">
114
+ <path d="M14 2H6c-1.1 0-2 .9-2 2v16c0 1.1.9 2 2 2h12c1.1 0 2-.9 2-2V8l-6-6zm-1 2l5 5h-5V4z"/>
115
+ </svg>
116
+ <span className="font-medium text-slate-200">{doc.filename}</span>
117
+ </div>
118
+ <div className="flex items-center gap-2">
119
+ {doc.status === 'OK' ? (
120
+ <span className="flex items-center gap-1 text-xs font-medium px-2 py-1 bg-green-900/40 border border-green-700 text-green-400 rounded">
121
+ <svg className="w-3 h-3" fill="none" stroke="currentColor" viewBox="0 0 24 24">
122
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 13l4 4L19 7" />
123
+ </svg>
124
+ OK
125
+ </span>
126
+ ) : (
127
+ <span className="flex items-center gap-1 text-xs font-medium px-2 py-1 bg-red-900/40 border border-red-700 text-red-400 rounded">
128
+ <svg className="w-3 h-3" fill="none" stroke="currentColor" viewBox="0 0 24 24">
129
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
130
+ </svg>
131
+ {doc.status || 'ERROR'}
132
+ </span>
133
+ )}
134
+ <span className="text-xs text-slate-500 uppercase">{doc.format}</span>
135
+ </div>
136
+ </button>
137
+
138
+ {/* Expanded Content */}
139
+ {expandedFiles[doc.filename] && (
140
+ <div className="border-t border-slate-700">
141
+ {doc.error ? (
142
+ <div className="p-4 bg-red-900/20 text-red-400 text-sm">
143
+ Error: {doc.error}
144
+ </div>
145
+ ) : (
146
+ <>
147
+ {/* Stats Grid */}
148
+ <div className="p-4 grid grid-cols-2 sm:grid-cols-4 gap-3">
149
+ <div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
150
+ <p className="text-2xl font-bold text-blue-400">{formatNumber(doc.total_elements)}</p>
151
+ <p className="text-xs text-slate-500 mt-1">Elements</p>
152
+ </div>
153
+ <div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
154
+ <p className="text-2xl font-bold text-green-400">{formatNumber(doc.total_chars)}</p>
155
+ <p className="text-xs text-slate-500 mt-1">Characters</p>
156
+ </div>
157
+ <div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
158
+ <p className="text-2xl font-bold text-purple-400">{formatNumber(doc.total_words)}</p>
159
+ <p className="text-xs text-slate-500 mt-1">Words</p>
160
+ </div>
161
+ <div className="bg-slate-800 border border-slate-700 rounded-lg p-3 text-center">
162
+ <p className="text-2xl font-bold text-orange-400">{doc.page_count || '-'}</p>
163
+ <p className="text-xs text-slate-500 mt-1">Pages</p>
164
+ </div>
165
+ </div>
166
+
167
+ {/* Element Types */}
168
+ {doc.element_types && Object.keys(doc.element_types).length > 0 && (
169
+ <div className="px-4 pb-4">
170
+ <h4 className="text-sm font-medium text-slate-400 mb-2">Element Types</h4>
171
+ <div className="flex flex-wrap gap-2">
172
+ {Object.entries(doc.element_types)
173
+ .sort((a, b) => b[1] - a[1])
174
+ .map(([type, count]) => (
175
+ <span
176
+ key={type}
177
+ className={`text-xs font-medium px-2 py-1 rounded border ${getTypeColor(type)}`}
178
+ >
179
+ {type}: {count}
180
+ </span>
181
+ ))}
182
+ </div>
183
+ </div>
184
+ )}
185
+
186
+ {/* Complete Output */}
187
+ {doc.elements && doc.elements.length > 0 && (
188
+ <div className="px-4 pb-4">
189
+ <h4 className="text-sm font-medium text-slate-400 mb-2">
190
+ Complete Output ({doc.elements.length} elements)
191
+ </h4>
192
+ <div className="bg-slate-950 border border-slate-700 rounded-lg max-h-96 overflow-auto">
193
+ {doc.elements.map((el, elIdx) => (
194
+ <div
195
+ key={elIdx}
196
+ className="p-3 border-b border-slate-800 last:border-b-0 hover:bg-slate-900/50"
197
+ >
198
+ <div className="flex items-center gap-2 mb-1">
199
+ <span className={`text-xs font-medium px-2 py-0.5 rounded border ${getTypeColor(el.type)}`}>
200
+ {el.type}
201
+ </span>
202
+ {el.level && (
203
+ <span className="text-xs text-slate-500">L{el.level}</span>
204
+ )}
205
+ {el.page && (
206
+ <span className="text-xs text-slate-600">p.{el.page}</span>
207
+ )}
208
+ </div>
209
+ <p className="text-sm text-slate-300 whitespace-pre-wrap break-words">
210
+ {el.text || '(empty)'}
211
+ </p>
212
+ </div>
213
+ ))}
214
+ </div>
215
+ </div>
216
+ )}
217
+ </>
218
+ )}
219
+ </div>
220
+ )}
221
+ </div>
222
+ ))}
223
+ </div>
224
+ </div>
225
+ </div>
226
+ );
227
+ }
frontend/src/components/ProcessingStatus.jsx CHANGED
@@ -1,5 +1,6 @@
1
  const steps = [
2
  { id: 'read', label: 'Reading from cloud storage' },
 
3
  { id: 'chunk', label: 'Chunking in your browser' },
4
  { id: 'clear', label: 'Clearing old index data' },
5
  { id: 'embed', label: 'Generating embeddings' },
 
1
  const steps = [
2
  { id: 'read', label: 'Reading from cloud storage' },
3
+ { id: 'parse', label: 'Parsing with Docling' },
4
  { id: 'chunk', label: 'Chunking in your browser' },
5
  { id: 'clear', label: 'Clearing old index data' },
6
  { id: 'embed', label: 'Generating embeddings' },
frontend/src/components/Sidebar.jsx CHANGED
@@ -1,11 +1,11 @@
1
  import { useState } from 'react';
2
- import { embedChunks, clearIndex } from '../api/client';
3
  import { processSelectedFiles } from '../api/dropbox';
4
  import { chunkFiles } from '../api/chunker';
5
  import ProcessingStatus from './ProcessingStatus';
6
  import IndexSummary from './IndexSummary';
7
  import CloudConnect from './CloudConnect';
8
- import ParsingEval from './ParsingEval';
9
 
10
  export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
11
  const [loading, setLoading] = useState(false);
@@ -17,8 +17,9 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
17
  const [stagedFiles, setStagedFiles] = useState([]);
18
  const [accessToken, setAccessToken] = useState(null);
19
 
20
- // State for parsing evaluation
21
- const [evalFile, setEvalFile] = useState(null);
 
22
 
23
  // Handle files staged from CloudConnect (not processed yet)
24
  const handleFilesStaged = (files) => {
@@ -46,7 +47,7 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
46
  setStagedFiles([]);
47
  };
48
 
49
- // Start indexing the staged files
50
  const handleIndexFiles = async () => {
51
  if (stagedFiles.length === 0 || !accessToken) return;
52
 
@@ -62,7 +63,7 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
62
  setProcessingState({
63
  step: 'read',
64
  fileName: progress.fileName,
65
- progress: 10 + (progress.current / progress.total) * 20,
66
  });
67
  });
68
 
@@ -73,26 +74,66 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
73
  return;
74
  }
75
 
76
- // Step 2: Chunk files (client-side)
77
- setProcessingState({ step: 'chunk', fileName: `${fileContents.length} files`, progress: 35 });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  await new Promise(r => setTimeout(r, 100));
79
 
80
  const chunks = chunkFiles(fileContents);
81
 
82
- // Step 3: Clear existing index
83
  setProcessingState({ step: 'clear', fileName: 'Clearing old data', progress: 50 });
84
  await clearIndex();
85
 
86
- // Step 4: Send chunks to server for embedding
87
  setProcessingState({ step: 'embed', fileName: `${chunks.length} chunks`, progress: 65 });
88
 
89
  const result = await embedChunks(chunks);
90
 
91
- // Step 5: Show discard step
92
  setProcessingState({ step: 'discard', fileName: '', progress: 85 });
93
  await new Promise(r => setTimeout(r, 300));
94
 
95
- // Step 6: Complete
96
  setProcessingState({ step: 'save', fileName: '', progress: 100 });
97
  await new Promise(r => setTimeout(r, 200));
98
 
@@ -185,29 +226,16 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
185
  <p className="text-xs text-slate-500">{formatSize(file.size)}</p>
186
  </div>
187
  </div>
188
- <div className="flex items-center gap-1 opacity-0 group-hover:opacity-100 transition-all">
189
- <button
190
- type="button"
191
- onClick={() => setEvalFile(file)}
192
- className="p-1 text-slate-500 hover:text-blue-400"
193
- aria-label={`Test parsing ${file.name}`}
194
- title="Test Docling Parsing"
195
- >
196
- <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
197
- <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2m-6 9l2 2 4-4" />
198
- </svg>
199
- </button>
200
- <button
201
- type="button"
202
- onClick={() => removeFile(file.id)}
203
- className="p-1 text-slate-500 hover:text-red-400"
204
- aria-label={`Remove ${file.name}`}
205
- >
206
- <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
207
- <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
208
- </svg>
209
- </button>
210
- </div>
211
  </div>
212
  ))}
213
  </div>
@@ -250,12 +278,12 @@ export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
250
  </div>
251
  )}
252
 
253
- {/* Parsing Evaluation Modal */}
254
- {evalFile && (
255
- <ParsingEval
256
- file={evalFile}
257
- accessToken={accessToken}
258
- onClose={() => setEvalFile(null)}
259
  />
260
  )}
261
  </div>
 
1
  import { useState } from 'react';
2
+ import { embedChunks, clearIndex, parseWithDocling } from '../api/client';
3
  import { processSelectedFiles } from '../api/dropbox';
4
  import { chunkFiles } from '../api/chunker';
5
  import ProcessingStatus from './ProcessingStatus';
6
  import IndexSummary from './IndexSummary';
7
  import CloudConnect from './CloudConnect';
8
+ import DoclingOutput from './DoclingOutput';
9
 
10
  export default function Sidebar({ onStatusChange, onAccessTokenChange }) {
11
  const [loading, setLoading] = useState(false);
 
17
  const [stagedFiles, setStagedFiles] = useState([]);
18
  const [accessToken, setAccessToken] = useState(null);
19
 
20
+ // State for Docling parsing output
21
+ const [parsedDocuments, setParsedDocuments] = useState(null);
22
+ const [pendingFileContents, setPendingFileContents] = useState(null);
23
 
24
  // Handle files staged from CloudConnect (not processed yet)
25
  const handleFilesStaged = (files) => {
 
47
  setStagedFiles([]);
48
  };
49
 
50
+ // Start indexing the staged files - Phase 1: Read and Parse
51
  const handleIndexFiles = async () => {
52
  if (stagedFiles.length === 0 || !accessToken) return;
53
 
 
63
  setProcessingState({
64
  step: 'read',
65
  fileName: progress.fileName,
66
+ progress: 10 + (progress.current / progress.total) * 15,
67
  });
68
  });
69
 
 
74
  return;
75
  }
76
 
77
+ // Step 2: Parse with Docling
78
+ setProcessingState({ step: 'parse', fileName: `${fileContents.length} files`, progress: 28 });
79
+
80
+ const parseResult = await parseWithDocling(
81
+ stagedFiles.map(f => ({ path: f.path_lower, name: f.name })),
82
+ accessToken
83
+ );
84
+
85
+ if (parseResult.error) {
86
+ setMessage({ type: 'error', text: parseResult.error });
87
+ setLoading(false);
88
+ setProcessingState(null);
89
+ return;
90
+ }
91
+
92
+ // Store results and pause for user review
93
+ setParsedDocuments(parseResult.results);
94
+ setPendingFileContents(fileContents);
95
+ setProcessingState(null);
96
+ setLoading(false);
97
+ // User will click "Continue to Indexing" in DoclingOutput
98
+
99
+ } catch (err) {
100
+ setMessage({ type: 'error', text: err.message });
101
+ setProcessingState(null);
102
+ setLoading(false);
103
+ }
104
+ };
105
+
106
+ // Continue indexing after user reviews Docling output
107
+ const handleContinueIndexing = async () => {
108
+ if (!pendingFileContents) return;
109
+
110
+ setLoading(true);
111
+ setParsedDocuments(null);
112
+
113
+ try {
114
+ const fileContents = pendingFileContents;
115
+ setPendingFileContents(null);
116
+
117
+ // Step 3: Chunk files (client-side)
118
+ setProcessingState({ step: 'chunk', fileName: `${fileContents.length} files`, progress: 40 });
119
  await new Promise(r => setTimeout(r, 100));
120
 
121
  const chunks = chunkFiles(fileContents);
122
 
123
+ // Step 4: Clear existing index
124
  setProcessingState({ step: 'clear', fileName: 'Clearing old data', progress: 50 });
125
  await clearIndex();
126
 
127
+ // Step 5: Send chunks to server for embedding
128
  setProcessingState({ step: 'embed', fileName: `${chunks.length} chunks`, progress: 65 });
129
 
130
  const result = await embedChunks(chunks);
131
 
132
+ // Step 6: Show discard step
133
  setProcessingState({ step: 'discard', fileName: '', progress: 85 });
134
  await new Promise(r => setTimeout(r, 300));
135
 
136
+ // Step 7: Complete
137
  setProcessingState({ step: 'save', fileName: '', progress: 100 });
138
  await new Promise(r => setTimeout(r, 200));
139
 
 
226
  <p className="text-xs text-slate-500">{formatSize(file.size)}</p>
227
  </div>
228
  </div>
229
+ <button
230
+ type="button"
231
+ onClick={() => removeFile(file.id)}
232
+ className="p-1 text-slate-500 hover:text-red-400 opacity-0 group-hover:opacity-100 transition-all"
233
+ aria-label={`Remove ${file.name}`}
234
+ >
235
+ <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
236
+ <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M6 18L18 6M6 6l12 12" />
237
+ </svg>
238
+ </button>
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  </div>
240
  ))}
241
  </div>
 
278
  </div>
279
  )}
280
 
281
+ {/* Docling Output Modal */}
282
+ {parsedDocuments && (
283
+ <DoclingOutput
284
+ results={parsedDocuments}
285
+ onContinue={handleContinueIndexing}
286
+ onDownload={() => {}}
287
  />
288
  )}
289
  </div>
src/api/routes.py CHANGED
@@ -537,6 +537,113 @@ async def eval_formats():
537
  return get_supported_formats()
538
 
539
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
  @router.post("/dropbox/file")
541
  async def dropbox_file(request: dict):
542
  """
 
537
  return get_supported_formats()
538
 
539
 
@router.post("/parse-docling")
async def parse_docling(request: dict):
    """
    Parse files with Docling and return COMPLETE output.

    Request:
    - files: Array of {path, name} objects
    - access_token: Dropbox access token

    Returns {"results": [...]} with one entry per requested file, containing
    ALL elements (not samples). A file that cannot be downloaded or parsed
    yields an entry with status "ERROR" and an "error" message rather than
    failing the whole request. If files or access_token is missing, returns
    {"error": "Missing files or access_token"}.
    """
    import json
    import os
    import tempfile
    from collections import Counter
    from pathlib import Path

    files = request.get("files", [])
    access_token = request.get("access_token")

    if not access_token or not files:
        return {"error": "Missing files or access_token"}

    results = []

    # One client for the whole batch so connections to Dropbox can be reused.
    async with httpx.AsyncClient(timeout=180.0) as client:
        for file_info in files:
            file_path = file_info.get("path")
            # `or` (not a .get default) so an explicit null/empty name also
            # falls back to the path's basename.
            file_name = file_info.get("name") or (
                Path(file_path).name if file_path else "unknown"
            )

            if not file_path:
                results.append({
                    "filename": file_name,
                    "status": "ERROR",
                    "error": "Missing file path"
                })
                continue

            try:
                # Build the Dropbox-API-Arg header with a real JSON encoder so
                # quotes and backslashes in the path are escaped. json.dumps
                # escapes non-ASCII as \uXXXX by default; DEL (0x7F) is escaped
                # by hand to keep the value header-safe.
                api_arg = json.dumps({"path": file_path}).replace("\x7f", "\\u007f")

                # Download file from Dropbox
                response = await client.post(
                    "https://content.dropboxapi.com/2/files/download",
                    headers={
                        "Authorization": f"Bearer {access_token}",
                        "Dropbox-API-Arg": api_arg
                    }
                )

                if response.status_code != 200:
                    results.append({
                        "filename": file_name,
                        "status": "ERROR",
                        "error": f"Dropbox download failed: {response.text}"
                    })
                    continue

                # Save to temp file; the suffix is kept so the loader can see
                # the original file extension.
                suffix = Path(file_name).suffix or Path(file_path).suffix
                tmp_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                        # Record the path before writing so a failed write
                        # still gets cleaned up below.
                        tmp_path = tmp.name
                        tmp.write(response.content)

                    # Imported here so an import failure is reported per file
                    # through the except clause below.
                    from src.ingestion.docling_loader import load_document_with_docling

                    # NOTE(review): this is a synchronous call inside an async
                    # handler; confirm whether it should be moved off the
                    # event loop.
                    doc = load_document_with_docling(tmp_path)

                    # Count element types
                    type_counts = Counter(el.element_type for el in doc.elements)

                    # Return ALL elements (not just samples)
                    all_elements = [
                        {
                            "type": el.element_type,
                            "text": el.text,
                            "level": el.level,
                            "page": getattr(el, 'page', None),
                            "metadata": getattr(el, 'metadata', {})
                        }
                        for el in doc.elements
                    ]

                    file_result = {
                        "filename": file_name,
                        "path": file_path,
                        "status": doc.status,
                        "format": doc.format,
                        "total_elements": len(doc.elements),
                        "total_chars": doc.chars,
                        "total_words": doc.words,
                        "page_count": doc.page_count,
                        "element_types": dict(type_counts),
                        "elements": all_elements,
                        "error": doc.error
                    }
                finally:
                    if tmp_path is not None:
                        try:
                            os.unlink(tmp_path)
                        except OSError:
                            # Best-effort cleanup: a failed delete must not
                            # turn a successful parse into an error entry.
                            pass

                results.append(file_result)

            except Exception as e:
                results.append({
                    "filename": file_name,
                    "status": "ERROR",
                    "error": str(e)
                })

    return {"results": results}
645
+
646
+
647
  @router.post("/dropbox/file")
648
  async def dropbox_file(request: dict):
649
  """