Alleinzellgaenger committed on
Commit
02158b8
·
1 Parent(s): c075953

Implement chunking and loading animation stuff

Browse files
backend/app.py CHANGED
@@ -3,7 +3,9 @@ from fastapi.middleware.cors import CORSMiddleware
3
  from mistralai import Mistral
4
  import os
5
  import tempfile
 
6
  from dotenv import load_dotenv
 
7
 
8
  # Load environment variables
9
  load_dotenv()
@@ -144,6 +146,18 @@ async def process_ocr_content(file_id: str):
144
  }
145
  page_data["images"].append(image_data)
146
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  processed_pages.append(page_data)
148
 
149
  print(f"📝 Total processed pages: {len(processed_pages)}")
@@ -204,4 +218,197 @@ async def get_image_base64(file_id: str, image_id: str):
204
 
205
  except Exception as e:
206
  print(f"❌ Error getting image: {e}")
207
- raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from mistralai import Mistral
4
  import os
5
  import tempfile
6
+ import json
7
  from dotenv import load_dotenv
8
+ from difflib import SequenceMatcher
9
 
10
  # Load environment variables
11
  load_dotenv()
 
146
  }
147
  page_data["images"].append(image_data)
148
 
149
+ # Auto-chunk this page
150
+ try:
151
+ print(f"🧠 Auto-chunking page {page_idx + 1}...")
152
+ chunks = await auto_chunk_page(page.markdown, client)
153
+ page_data["chunks"] = chunks
154
+ print(f"📊 Page {page_idx + 1} chunks found: {len(chunks)}")
155
+ for i, chunk in enumerate(chunks):
156
+ print(f" {i+1}. {chunk.get('topic', 'Unknown')}: {chunk.get('start_phrase', '')[:50]}...")
157
+ except Exception as chunk_error:
158
+ print(f"⚠️ Chunking failed for page {page_idx + 1}: {chunk_error}")
159
+ page_data["chunks"] = []
160
+
161
  processed_pages.append(page_data)
162
 
163
  print(f"📝 Total processed pages: {len(processed_pages)}")
 
218
 
219
  except Exception as e:
220
  print(f"❌ Error getting image: {e}")
221
+ raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
222
+
223
+ def fuzzy_find(text, pattern, start_pos=0):
224
+ """Find the best fuzzy match for pattern in text starting from start_pos"""
225
+ best_match = None
226
+ best_ratio = 0
227
+ best_pos = -1
228
+
229
+ # Search in sliding windows
230
+ pattern_len = len(pattern)
231
+ for i in range(start_pos, len(text) - pattern_len + 1):
232
+ window = text[i:i + pattern_len]
233
+ ratio = SequenceMatcher(None, pattern.lower(), window.lower()).ratio()
234
+
235
+ if ratio > best_ratio and ratio > 0.8: # Minimum 60% similarity
236
+ best_ratio = ratio
237
+ best_pos = i
238
+ best_match = window
239
+
240
+ return best_pos if best_pos != -1 else None
241
+
242
async def auto_chunk_page(page_markdown, client):
    """Auto-chunk a page during OCR processing.

    Asks the Mistral chat model to propose lesson chunks for the page,
    parses the JSON reply, then anchors each chunk's start/end phrases in
    the original markdown via fuzzy_find. Returns a list of chunk dicts
    augmented with character positions. Chunking is best-effort: short
    pages, unparseable replies, and any runtime error all yield [] so the
    OCR pipeline is never broken by this step.
    """
    import re  # used by the JSON-extraction fallback below

    if not page_markdown or len(page_markdown.strip()) < 100:
        return []  # Skip very short pages

    # Create chunking prompt
    prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.

DOCUMENT PAGE:
{page_markdown}

For each chunk you identify, output ONLY a JSON array with this exact format:
[
  {{
    "topic": "Brief topic name",
    "start_phrase": "First few words of the chunk",
    "end_phrase": "Last few words of the chunk"
  }}
]

Rules:
1. Each chunk should contain 2-3 valuable lessons
2. start_phrase and end_phrase should be 5-15 words long
3. Focus on educational content (concepts, examples, key points)
4. Output ONLY the JSON array, no other text
5. More dense content should have more chunks, less dense content fewer chunks
6. Ensure the JSON is valid and well-formed
7. Do not include any explanations or additional text, just the JSON array

JSON:"""

    try:
        # Call Mistral for chunking (small model: fast and cheap for this task).
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3  # lower temperature for more consistent JSON
        )

        chunk_text = response.choices[0].message.content.strip()

        # Parse JSON response; fall back to extracting the first [...] span
        # in case the model wrapped the array in extra prose.
        try:
            chunks = json.loads(chunk_text)
        except json.JSONDecodeError:
            json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
            if json_match:
                chunks = json.loads(json_match.group())
            else:
                return []

        # The model must return a JSON *array* of chunk objects; anything
        # else (a bare object, a string, ...) is treated as no chunks.
        if not isinstance(chunks, list):
            return []

        # Find positions using fuzzy matching
        positioned_chunks = []
        for chunk in chunks:
            if not isinstance(chunk, dict):
                continue  # ignore malformed array entries
            start_pos = fuzzy_find(page_markdown, chunk.get("start_phrase", ""))
            end_pos = fuzzy_find(page_markdown, chunk.get("end_phrase", ""), start_pos or 0)

            # Chunks whose start phrase cannot be located are dropped.
            if start_pos is not None:
                positioned_chunks.append({
                    **chunk,
                    "start_position": start_pos,
                    "end_position": end_pos,
                    "found_start": True,
                    "found_end": end_pos is not None
                })

        return positioned_chunks

    except Exception as e:
        # Best-effort: log and degrade to "no chunks" rather than failing OCR.
        print(f"❌ Auto-chunking error: {e}")
        return []
315
+
316
@app.post("/chunk_page")
async def chunk_page(request: dict):
    """Analyze a page and suggest chunks for lessons.

    Request body: ``{"markdown": "<page markdown>"}``.
    Returns ``{"chunks": [...], "total_found": int, "total_suggested": int}``
    where each chunk carries fuzzy-matched character positions.
    Raises 400 when no markdown is supplied, 500 when the API key is
    missing or the LLM call / JSON parsing fails.
    """
    import re  # used by the JSON-extraction fallback below

    print(f"🧠 Chunking page...")

    page_markdown = request.get("markdown", "")
    if not page_markdown:
        raise HTTPException(status_code=400, detail="No markdown provided")

    # Get Mistral API key
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise HTTPException(status_code=500, detail="MISTRAL_API_KEY not set")

    try:
        # Initialize Mistral client
        client = Mistral(api_key=api_key)

        # Create chunking prompt.
        # NOTE: the "end_phrase" example line previously carried a trailing
        # comma, making the example JSON invalid and inconsistent with the
        # auto_chunk_page prompt — fixed here.
        prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.

DOCUMENT PAGE:
{page_markdown}

For each chunk you identify, output ONLY a JSON array with this exact format:
[
  {{
    "topic": "Brief topic name",
    "start_phrase": "First few words of the chunk",
    "end_phrase": "Last few words of the chunk"
  }}
]

Rules:
1. Each chunk should contain 2-3 valuable lessons.
2. start_phrase and end_phrase should be 5-15 words long
3. Focus on educational content (concepts, examples, key points)
4. Output ONLY the JSON array, no other text
5. More dense content should have more chunks, less dense content fewer chunks
6. Ensure the JSON is valid and well-formed
7. Do not include any explanations or additional text, just the JSON array

JSON:"""

        # Call Mistral for chunking
        print("🚀 Calling Mistral for chunking...")
        response = client.chat.complete(
            model="mistral-small-latest",  # Faster and cheaper for this task
            messages=[{
                "role": "user",
                "content": prompt
            }],
            temperature=0.3  # Lower temperature for more consistent output
        )

        chunk_text = response.choices[0].message.content.strip()
        print(f"📝 LLM Response: {chunk_text[:200]}...")

        # Parse JSON response; fall back to extracting the first [...] span
        # in case the model wrapped the array in extra prose.
        try:
            chunks = json.loads(chunk_text)
        except json.JSONDecodeError:
            json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
            if json_match:
                chunks = json.loads(json_match.group())
            else:
                raise ValueError("Could not parse JSON from LLM response")

        # The model must return a JSON *array* of chunk objects.
        if not isinstance(chunks, list):
            raise ValueError("LLM response was not a JSON array")

        # Find positions using fuzzy matching
        positioned_chunks = []
        for chunk in chunks:
            start_pos = fuzzy_find(page_markdown, chunk.get("start_phrase", ""))
            end_pos = fuzzy_find(page_markdown, chunk.get("end_phrase", ""), start_pos or 0)

            if start_pos is not None:
                positioned_chunks.append({
                    **chunk,
                    "start_position": start_pos,
                    "end_position": end_pos,
                    "found_start": True,
                    "found_end": end_pos is not None
                })
                print(f"✅ Found chunk: {chunk.get('topic')} at position {start_pos}")
            else:
                print(f"❌ Could not find chunk: {chunk.get('topic')}")

        print(f"📊 Successfully positioned {len(positioned_chunks)}/{len(chunks)} chunks")

        return {
            "chunks": positioned_chunks,
            "total_found": len(positioned_chunks),
            "total_suggested": len(chunks)
        }

    except Exception as e:
        print(f"❌ Error chunking page: {e}")
        raise HTTPException(status_code=500, detail=f"Error chunking page: {str(e)}")
frontend/src/components/DocumentProcessor.jsx CHANGED
@@ -1,13 +1,8 @@
1
- import { useState, useRef } from 'react';
2
  import ReactMarkdown from 'react-markdown';
3
- import { Document, Page, pdfjs } from 'react-pdf';
4
  import remarkMath from 'remark-math';
5
  import rehypeKatex from 'rehype-katex';
6
  import 'katex/dist/katex.min.css';
7
- import 'react-pdf/dist/Page/AnnotationLayer.css';
8
- import 'react-pdf/dist/Page/TextLayer.css';
9
-
10
- pdfjs.GlobalWorkerOptions.workerSrc = '/pdf.worker.min.js';
11
 
12
  function DocumentProcessor() {
13
  const fileInputRef = useRef(null);
@@ -16,14 +11,104 @@ function DocumentProcessor() {
16
  const [uploadProgress, setUploadProgress] = useState(0);
17
  const [ocrProgress, setOcrProgress] = useState(0);
18
  const [documentData, setDocumentData] = useState(null);
19
- const [showPdfViewer, setShowPdfViewer] = useState(false);
20
- const [numPages, setNumPages] = useState(null);
21
 
22
  const handleFileChange = (e) => {
23
  setSelectedFile(e.target.files[0]);
24
  setDocumentData(null);
25
  setUploadProgress(0);
26
  setOcrProgress(0);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  };
28
 
29
  const processDocument = async () => {
@@ -124,53 +209,12 @@ function DocumentProcessor() {
124
  </div>
125
 
126
  <p className="text-sm text-gray-500">
127
- Using Mistral AI to extract text and understand your document structure...
128
  </p>
129
  </div>
130
  </div>
131
  );
132
 
133
- const PdfViewer = () => (
134
- <div className={`fixed bottom-4 left-4 bg-white rounded-lg shadow-xl border transition-all duration-300 ${
135
- showPdfViewer ? 'w-80 h-96' : 'w-48 h-12'
136
- }`}>
137
- <div className="p-3 border-b flex justify-between items-center">
138
- <span className="text-sm font-medium text-gray-700">Original PDF</span>
139
- <button
140
- onClick={() => setShowPdfViewer(!showPdfViewer)}
141
- className="text-gray-500 hover:text-gray-700"
142
- >
143
- {showPdfViewer ? (
144
- <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
145
- <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
146
- </svg>
147
- ) : (
148
- <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
149
- <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 15l7-7 7 7" />
150
- </svg>
151
- )}
152
- </button>
153
- </div>
154
-
155
- {showPdfViewer && (
156
- <div className="h-80 overflow-auto">
157
- <Document
158
- file={selectedFile}
159
- onLoadSuccess={({ numPages }) => setNumPages(numPages)}
160
- >
161
- {numPages && Array.from(new Array(numPages), (_, index) => (
162
- <div key={index + 1} className="mb-2">
163
- <Page
164
- pageNumber={index + 1}
165
- width={280}
166
- />
167
- </div>
168
- ))}
169
- </Document>
170
- </div>
171
- )}
172
- </div>
173
- );
174
 
175
  if (!selectedFile) {
176
  return (
@@ -233,58 +277,41 @@ function DocumentProcessor() {
233
 
234
  return (
235
  <div className="min-h-screen bg-gray-50">
236
- {/* Header */}
237
- <div className="bg-white shadow-sm border-b">
238
- <div className="max-w-6xl mx-auto px-4 py-4 flex justify-between items-center">
239
- <div>
240
- <h1 className="text-xl font-bold text-gray-900">{documentData.filename}</h1>
241
- <p className="text-sm text-gray-600">{documentData.totalPages} pages processed</p>
242
- </div>
243
- <button
244
- onClick={() => setSelectedFile(null)}
245
- className="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-lg transition-colors"
246
- >
247
- Upload New Document
248
- </button>
249
- </div>
250
- </div>
251
-
252
  {/* Document Content */}
253
  <div className="max-w-4xl mx-auto px-4 py-8">
254
- <div className="bg-white rounded-lg shadow-sm border p-8">
255
- <ReactMarkdown
256
- remarkPlugins={[remarkMath]}
257
- rehypePlugins={[rehypeKatex]}
258
- className="prose prose-lg max-w-none"
259
- components={{
260
- h1: ({ children }) => <h1 className="text-3xl font-bold mb-6 text-gray-900">{children}</h1>,
261
- h2: ({ children }) => <h2 className="text-2xl font-bold mb-4 text-gray-900 mt-8">{children}</h2>,
262
- h3: ({ children }) => <h3 className="text-xl font-bold mb-3 text-gray-900 mt-6">{children}</h3>,
263
- p: ({ children }) => <p className="mb-4 text-gray-700 leading-relaxed">{children}</p>,
264
- hr: () => <hr className="my-8 border-gray-300" />,
265
- ul: ({ children }) => <ul className="mb-4 ml-6 list-disc">{children}</ul>,
266
- ol: ({ children }) => <ol className="mb-4 ml-6 list-decimal">{children}</ol>,
267
- li: ({ children }) => <li className="mb-1 text-gray-700">{children}</li>,
268
- blockquote: ({ children }) => (
269
- <blockquote className="border-l-4 border-blue-500 pl-4 italic my-4 text-gray-600">
270
- {children}
271
- </blockquote>
272
- ),
273
- code: ({ inline, children }) =>
274
- inline ?
275
- <code className="bg-gray-100 px-1 py-0.5 rounded text-sm font-mono">{children}</code> :
276
- <pre className="bg-gray-100 p-4 rounded-lg overflow-x-auto my-4">
277
- <code className="text-sm font-mono">{children}</code>
278
- </pre>
279
- }}
280
- >
281
- {documentData.markdown}
282
- </ReactMarkdown>
 
 
283
  </div>
284
  </div>
285
-
286
- {/* PDF Viewer */}
287
- <PdfViewer />
288
  </div>
289
  );
290
  }
 
1
+ import { useState, useRef, useEffect } from 'react';
2
  import ReactMarkdown from 'react-markdown';
 
3
  import remarkMath from 'remark-math';
4
  import rehypeKatex from 'rehype-katex';
5
  import 'katex/dist/katex.min.css';
 
 
 
 
6
 
7
  function DocumentProcessor() {
8
  const fileInputRef = useRef(null);
 
11
  const [uploadProgress, setUploadProgress] = useState(0);
12
  const [ocrProgress, setOcrProgress] = useState(0);
13
  const [documentData, setDocumentData] = useState(null);
14
+ const [imageCache, setImageCache] = useState({});
 
15
 
16
  const handleFileChange = (e) => {
17
  setSelectedFile(e.target.files[0]);
18
  setDocumentData(null);
19
  setUploadProgress(0);
20
  setOcrProgress(0);
21
+ setImageCache({});
22
+ };
23
+
24
+ const fetchImage = async (imageId, fileId) => {
25
+ if (imageCache[imageId]) {
26
+ return imageCache[imageId];
27
+ }
28
+
29
+ try {
30
+ const response = await fetch(`http://localhost:8000/get_image/${fileId}/${imageId}`);
31
+ if (response.ok) {
32
+ const data = await response.json();
33
+ const imageData = data.image_base64;
34
+
35
+ // Cache the image
36
+ setImageCache(prev => ({
37
+ ...prev,
38
+ [imageId]: imageData
39
+ }));
40
+
41
+ return imageData;
42
+ }
43
+ } catch (error) {
44
+ console.error('Error fetching image:', error);
45
+ }
46
+ return null;
47
+ };
48
+
49
+ const ImageComponent = ({ src, alt }) => {
50
+ const [imageSrc, setImageSrc] = useState(null);
51
+ const [loading, setLoading] = useState(true);
52
+
53
+ useEffect(() => {
54
+ if (documentData && src) {
55
+ // Extract image ID from src (assuming format like ![imageId](imageId))
56
+ fetchImage(src, documentData.fileId).then(imageData => {
57
+ if (imageData) {
58
+ setImageSrc(imageData);
59
+ }
60
+ setLoading(false);
61
+ });
62
+ }
63
+ }, [src, documentData]);
64
+
65
+ if (loading) {
66
+ return (
67
+ <div style={{
68
+ width: '100%',
69
+ height: '200px',
70
+ backgroundColor: '#f3f4f6',
71
+ display: 'flex',
72
+ alignItems: 'center',
73
+ justifyContent: 'center',
74
+ margin: '1rem 0',
75
+ borderRadius: '0.5rem'
76
+ }}>
77
+ <span style={{ color: '#6b7280' }}>Loading image...</span>
78
+ </div>
79
+ );
80
+ }
81
+
82
+ if (!imageSrc) {
83
+ return (
84
+ <div style={{
85
+ width: '100%',
86
+ height: '200px',
87
+ backgroundColor: '#fef2f2',
88
+ display: 'flex',
89
+ alignItems: 'center',
90
+ justifyContent: 'center',
91
+ margin: '1rem 0',
92
+ borderRadius: '0.5rem',
93
+ border: '1px solid #fecaca'
94
+ }}>
95
+ <span style={{ color: '#dc2626' }}>Image not found: {alt || src}</span>
96
+ </div>
97
+ );
98
+ }
99
+
100
+ return (
101
+ <div style={{ margin: '1.5rem 0', textAlign: 'center' }}>
102
+ <img
103
+ src={imageSrc}
104
+ alt={alt || 'Document image'}
105
+ style={{
106
+ maxWidth: '100%',
107
+ height: 'auto',
108
+ }}
109
+ />
110
+ </div>
111
+ );
112
  };
113
 
114
  const processDocument = async () => {
 
209
  </div>
210
 
211
  <p className="text-sm text-gray-500">
212
+ Using AI to extract text and understand your document structure...
213
  </p>
214
  </div>
215
  </div>
216
  );
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  if (!selectedFile) {
220
  return (
 
277
 
278
  return (
279
  <div className="min-h-screen bg-gray-50">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  {/* Document Content */}
281
  <div className="max-w-4xl mx-auto px-4 py-8">
282
+ <div className="bg-white rounded-lg shadow-sm p-8">
283
+ <div className="prose prose-lg max-w-none">
284
+ <ReactMarkdown
285
+ remarkPlugins={[remarkMath]}
286
+ rehypePlugins={[rehypeKatex]}
287
+ components={{
288
+ h1: ({ children }) => <h1 style={{ fontSize: '2rem', fontWeight: 'bold', marginBottom: '1.5rem', color: '#1a202c' }}>{children}</h1>,
289
+ h2: ({ children }) => <h2 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', marginTop: '2rem', color: '#1a202c' }}>{children}</h2>,
290
+ h3: ({ children }) => <h3 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h3>,
291
+ p: ({ children }) => <p style={{ marginBottom: '1rem', color: '#374151', lineHeight: '1.6' }}>{children}</p>,
292
+ hr: () => <hr style={{ margin: '2rem 0', borderColor: '#d1d5db' }} />,
293
+ ul: ({ children }) => <ul style={{ marginBottom: '1rem', marginLeft: '1.5rem', listStyleType: 'disc' }}>{children}</ul>,
294
+ ol: ({ children }) => <ol style={{ marginBottom: '1rem', marginLeft: '1.5rem', listStyleType: 'decimal' }}>{children}</ol>,
295
+ li: ({ children }) => <li style={{ marginBottom: '0.25rem', color: '#374151' }}>{children}</li>,
296
+ blockquote: ({ children }) => (
297
+ <blockquote style={{ borderLeft: '4px solid #3b82f6', paddingLeft: '1rem', fontStyle: 'italic', margin: '1rem 0', color: '#6b7280' }}>
298
+ {children}
299
+ </blockquote>
300
+ ),
301
+ code: ({ inline, children }) =>
302
+ inline ?
303
+ <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.875rem', fontFamily: 'monospace' }}>{children}</code> :
304
+ <pre style={{ backgroundColor: '#f3f4f6', padding: '1rem', borderRadius: '0.5rem', overflowX: 'auto', margin: '1rem 0' }}>
305
+ <code style={{ fontSize: '0.875rem', fontFamily: 'monospace' }}>{children}</code>
306
+ </pre>,
307
+ img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
308
+ }}
309
+ >
310
+ {documentData.markdown}
311
+ </ReactMarkdown>
312
+ </div>
313
  </div>
314
  </div>
 
 
 
315
  </div>
316
  );
317
  }