Alleinzellgaenger committed on
Commit
b1e57e4
·
1 Parent(s): 02158b8

Add chunking and markdown rendering

Browse files
Dockerfile CHANGED
@@ -8,7 +8,7 @@ COPY frontend/ ./
8
  RUN npm run build
9
 
10
  # Use Python runtime for backend
11
- FROM python:3.9
12
 
13
  WORKDIR /code
14
 
 
8
  RUN npm run build
9
 
10
  # Use Python runtime for backend
11
+ FROM python:3.10
12
 
13
  WORKDIR /code
14
 
backend/app.py CHANGED
@@ -1,11 +1,16 @@
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
 
 
3
  from mistralai import Mistral
4
  import os
5
  import tempfile
6
  import json
7
  from dotenv import load_dotenv
8
  from difflib import SequenceMatcher
 
 
 
9
 
10
  # Load environment variables
11
  load_dotenv()
@@ -20,9 +25,6 @@ app.add_middleware(
20
  allow_headers=["*"],
21
  )
22
 
23
- @app.get("/")
24
- def hello():
25
- return {"message": "Backend is running!"}
26
 
27
  @app.get("/api/test")
28
  def test():
@@ -220,6 +222,16 @@ async def get_image_base64(file_id: str, image_id: str):
220
  print(f"❌ Error getting image: {e}")
221
  raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
222
 
 
 
 
 
 
 
 
 
 
 
223
  def fuzzy_find(text, pattern, start_pos=0):
224
  """Find the best fuzzy match for pattern in text starting from start_pos"""
225
  best_match = None
@@ -239,68 +251,57 @@ def fuzzy_find(text, pattern, start_pos=0):
239
 
240
  return best_pos if best_pos != -1 else None
241
 
242
- async def auto_chunk_page(page_markdown, client):
243
- """Auto-chunk a page during OCR processing"""
244
  if not page_markdown or len(page_markdown.strip()) < 100:
245
  return [] # Skip very short pages
246
 
247
- # Create chunking prompt
248
- prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.
249
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  DOCUMENT PAGE:
251
  {page_markdown}
252
 
253
- For each chunk you identify, output ONLY a JSON array with this exact format:
254
- [
255
- {{
256
- "topic": "Brief topic name",
257
- "start_phrase": "First few words of the chunk",
258
- "end_phrase": "Last few words of the chunk"
259
- }}
260
- ]
261
-
262
  Rules:
263
  1. Each chunk should contain 2-3 valuable lessons
264
  2. start_phrase and end_phrase should be 5-15 words long
265
  3. Focus on educational content (concepts, examples, key points)
266
- 4. Output ONLY the JSON array, no other text
267
- 5. More dense content should have more chunks, less dense content fewer chunks
268
- 6. Ensure the JSON is valid and well-formed
269
- 7. Do not include any explanations or additional text, just the JSON array
270
 
271
- JSON:"""
272
 
273
- try:
274
- # Call Mistral for chunking
275
- response = client.chat.complete(
276
- model="mistral-small-latest",
277
- messages=[{"role": "user", "content": prompt}],
278
- temperature=0.3
279
- )
280
-
281
- chunk_text = response.choices[0].message.content.strip()
282
-
283
- # Parse JSON response
284
- try:
285
- chunks = json.loads(chunk_text)
286
- except json.JSONDecodeError:
287
- # Try to extract JSON from response if wrapped in text
288
- import re
289
- json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
290
- if json_match:
291
- chunks = json.loads(json_match.group())
292
- else:
293
- return []
294
 
295
  # Find positions using fuzzy matching
296
  positioned_chunks = []
297
  for chunk in chunks:
298
- start_pos = fuzzy_find(page_markdown, chunk.get("start_phrase", ""))
299
- end_pos = fuzzy_find(page_markdown, chunk.get("end_phrase", ""), start_pos or 0)
300
 
301
  if start_pos is not None:
302
  positioned_chunks.append({
303
- **chunk,
 
 
304
  "start_position": start_pos,
305
  "end_position": end_pos,
306
  "found_start": True,
@@ -310,26 +311,35 @@ JSON:"""
310
  return positioned_chunks
311
 
312
  except Exception as e:
 
313
  print(f"❌ Auto-chunking error: {e}")
 
314
  return []
315
 
316
  @app.post("/chunk_page")
317
  async def chunk_page(request: dict):
318
- """Analyze a page and suggest chunks for lessons"""
319
  print(f"🧠 Chunking page...")
320
 
321
  page_markdown = request.get("markdown", "")
322
  if not page_markdown:
323
  raise HTTPException(status_code=400, detail="No markdown provided")
324
 
325
- # Get Mistral API key
326
- api_key = os.environ.get("MISTRAL_API_KEY")
327
- if not api_key:
328
- raise HTTPException(status_code=500, detail="MISTRAL_API_KEY not set")
329
 
330
  try:
331
- # Initialize Mistral client
332
- client = Mistral(api_key=api_key)
 
 
 
 
 
 
 
333
 
334
  # Create chunking prompt
335
  prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.
@@ -337,69 +347,40 @@ async def chunk_page(request: dict):
337
  DOCUMENT PAGE:
338
  {page_markdown}
339
 
340
- For each chunk you identify, output ONLY a JSON array with this exact format:
341
- [
342
- {{
343
- "topic": "Brief topic name",
344
- "start_phrase": "First few words of the chunk",
345
- "end_phrase": "Last few words of the chunk",
346
- }}
347
- ]
348
-
349
  Rules:
350
- 1. Each chunk should contain 2-3 valuable lessons.
351
  2. start_phrase and end_phrase should be 5-15 words long
352
  3. Focus on educational content (concepts, examples, key points)
353
- 4. Output ONLY the JSON array, no other text
354
- 5. More dense content should have more chunks, less dense content fewer chunks
355
- 6. Ensure the JSON is valid and well-formed
356
- 7. Do not include any explanations or additional text, just the JSON array
357
 
358
- JSON:"""
359
 
360
- # Call Mistral for chunking
361
- print("🚀 Calling Mistral for chunking...")
362
- response = client.chat.complete(
363
- model="mistral-small-latest", # Faster and cheaper for this task
364
- messages=[{
365
- "role": "user",
366
- "content": prompt
367
- }],
368
- temperature=0.3 # Lower temperature for more consistent output
369
- )
370
-
371
- chunk_text = response.choices[0].message.content.strip()
372
- print(f"📝 LLM Response: {chunk_text[:200]}...")
373
-
374
- # Parse JSON response
375
- try:
376
- chunks = json.loads(chunk_text)
377
- except json.JSONDecodeError:
378
- # Try to extract JSON from response if wrapped in text
379
- import re
380
- json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
381
- if json_match:
382
- chunks = json.loads(json_match.group())
383
- else:
384
- raise ValueError("Could not parse JSON from LLM response")
385
 
386
  # Find positions using fuzzy matching
387
  positioned_chunks = []
388
  for chunk in chunks:
389
- start_pos = fuzzy_find(page_markdown, chunk.get("start_phrase", ""))
390
- end_pos = fuzzy_find(page_markdown, chunk.get("end_phrase", ""), start_pos or 0)
391
 
392
  if start_pos is not None:
393
  positioned_chunks.append({
394
- **chunk,
 
 
395
  "start_position": start_pos,
396
  "end_position": end_pos,
397
  "found_start": True,
398
  "found_end": end_pos is not None
399
  })
400
- print(f"✅ Found chunk: {chunk.get('topic')} at position {start_pos}")
401
  else:
402
- print(f"❌ Could not find chunk: {chunk.get('topic')}")
403
 
404
  print(f"📊 Successfully positioned {len(positioned_chunks)}/{len(chunks)} chunks")
405
 
@@ -410,5 +391,27 @@ JSON:"""
410
  }
411
 
412
  except Exception as e:
 
413
  print(f"❌ Error chunking page: {e}")
414
- raise HTTPException(status_code=500, detail=f"Error chunking page: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI, File, UploadFile, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.responses import FileResponse
5
  from mistralai import Mistral
6
  import os
7
  import tempfile
8
  import json
9
  from dotenv import load_dotenv
10
  from difflib import SequenceMatcher
11
+ from pydantic import BaseModel, Field
12
+ from typing import Optional, List
13
+ from langchain.chat_models import init_chat_model
14
 
15
  # Load environment variables
16
  load_dotenv()
 
25
  allow_headers=["*"],
26
  )
27
 
 
 
 
28
 
29
  @app.get("/api/test")
30
  def test():
 
222
  print(f"❌ Error getting image: {e}")
223
  raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
224
 
225
+ class ChunkSchema(BaseModel):  # one element of ChunkList.chunks
226
+ """Schema for document chunks suitable for creating interactive lessons."""
227
+ topic: str = Field(description="Brief topic name for the chunk")
228
+ start_phrase: str = Field(description="First few words of the chunk (5-15 words)")  # located in the page text with fuzzy_find
229
+ end_phrase: str = Field(description="Last few words of the chunk (5-15 words)")  # located with fuzzy_find, searching from the start match
230
+ # Wrapper model handed to llm.with_structured_output(...); callers read the result back through `.chunks`.
231
+ class ChunkList(BaseModel):
232
+ """Container for a list of document chunks."""
233
+ chunks: List[ChunkSchema] = Field(description="List of identified chunks for interactive lessons")
234
+
235
  def fuzzy_find(text, pattern, start_pos=0):
236
  """Find the best fuzzy match for pattern in text starting from start_pos"""
237
  best_match = None
 
251
 
252
  return best_pos if best_pos != -1 else None
253
 
254
+ async def auto_chunk_page(page_markdown, client=None):
255
+ """Auto-chunk a page during OCR processing using Fireworks AI with structured output"""
256
  if not page_markdown or len(page_markdown.strip()) < 100:
257
  return [] # Skip very short pages
258
 
259
+ # Get Fireworks API key
260
+ fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
261
+ if not fireworks_api_key:
262
+ print("⚠️ No Fireworks API key found, falling back to regular chunking")
263
+ return []
264
+
265
+ try:
266
+ # Initialize Fireworks LLM with structured output
267
+ llm = init_chat_model(
268
+ "accounts/fireworks/models/llama4-maverick-instruct-basic",
269
+ model_provider="fireworks",
270
+ api_key=fireworks_api_key
271
+ )
272
+
273
+ # Create structured LLM that returns ChunkList object
274
+ structured_llm = llm.with_structured_output(ChunkList)
275
+
276
+ # Create chunking prompt
277
+ prompt = f"""Imagine you are a teacher. You are given an individual page, and you have to decide how to dissect this page. Your task is to identify chunks of content by providing start and end phrases that can be used to create interactive lessons. Here's the page:
278
  DOCUMENT PAGE:
279
  {page_markdown}
280
 
 
 
 
 
 
 
 
 
 
281
  Rules:
282
  1. Each chunk should contain 2-3 valuable lessons
283
  2. start_phrase and end_phrase should be 5-15 words long
284
  3. Focus on educational content (concepts, examples, key points)
285
+ 4. More dense content should have more chunks, less dense content fewer chunks
286
+ 5. Identify chunks that would make good interactive lessons
 
 
287
 
288
+ Return a list of chunks with topic, start_phrase, and end_phrase for each."""
289
 
290
+ # Call Fireworks with structured output
291
+ chunk_response = structured_llm.invoke(prompt)
292
+ chunks = chunk_response.chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
  # Find positions using fuzzy matching
295
  positioned_chunks = []
296
  for chunk in chunks:
297
+ start_pos = fuzzy_find(page_markdown, chunk.start_phrase)
298
+ end_pos = fuzzy_find(page_markdown, chunk.end_phrase, start_pos or 0)
299
 
300
  if start_pos is not None:
301
  positioned_chunks.append({
302
+ "topic": chunk.topic,
303
+ "start_phrase": chunk.start_phrase,
304
+ "end_phrase": chunk.end_phrase,
305
  "start_position": start_pos,
306
  "end_position": end_pos,
307
  "found_start": True,
 
311
  return positioned_chunks
312
 
313
  except Exception as e:
314
+ import traceback
315
  print(f"❌ Auto-chunking error: {e}")
316
+ print(f"❌ Full traceback: {traceback.format_exc()}")
317
  return []
318
 
319
  @app.post("/chunk_page")
320
  async def chunk_page(request: dict):
321
+ """Analyze a page and suggest chunks for lessons using Fireworks AI with structured output"""
322
  print(f"🧠 Chunking page...")
323
 
324
  page_markdown = request.get("markdown", "")
325
  if not page_markdown:
326
  raise HTTPException(status_code=400, detail="No markdown provided")
327
 
328
+ # Get Fireworks API key
329
+ fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
330
+ if not fireworks_api_key:
331
+ raise HTTPException(status_code=500, detail="FIREWORKS_API_KEY not set")
332
 
333
  try:
334
+ # Initialize Fireworks LLM with structured output
335
+ llm = init_chat_model(
336
+ "accounts/fireworks/models/llama4-maverick-instruct-basic",
337
+ model_provider="fireworks",
338
+ api_key=fireworks_api_key
339
+ )
340
+
341
+ # Create structured LLM that returns ChunkList object
342
+ structured_llm = llm.with_structured_output(ChunkList)
343
 
344
  # Create chunking prompt
345
  prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.
 
347
  DOCUMENT PAGE:
348
  {page_markdown}
349
 
 
 
 
 
 
 
 
 
 
350
  Rules:
351
+ 1. Each chunk should contain 2-3 valuable lessons
352
  2. start_phrase and end_phrase should be 5-15 words long
353
  3. Focus on educational content (concepts, examples, key points)
354
+ 4. More dense content should have more chunks, less dense content fewer chunks
355
+ 5. Identify chunks that would make good interactive lessons
 
 
356
 
357
+ Return a list of chunks with topic, start_phrase, and end_phrase for each."""
358
 
359
+ # Call Fireworks with structured output
360
+ print("🚀 Calling Fireworks for chunking...")
361
+ chunk_response = structured_llm.invoke(prompt)
362
+ chunks = chunk_response.chunks
363
+ print(f"📝 Received {len(chunks)} chunks from Fireworks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
  # Find positions using fuzzy matching
366
  positioned_chunks = []
367
  for chunk in chunks:
368
+ start_pos = fuzzy_find(page_markdown, chunk.start_phrase)
369
+ end_pos = fuzzy_find(page_markdown, chunk.end_phrase, start_pos or 0)
370
 
371
  if start_pos is not None:
372
  positioned_chunks.append({
373
+ "topic": chunk.topic,
374
+ "start_phrase": chunk.start_phrase,
375
+ "end_phrase": chunk.end_phrase,
376
  "start_position": start_pos,
377
  "end_position": end_pos,
378
  "found_start": True,
379
  "found_end": end_pos is not None
380
  })
381
+ print(f"✅ Found chunk: {chunk.topic} at position {start_pos}")
382
  else:
383
+ print(f"❌ Could not find chunk: {chunk.topic}")
384
 
385
  print(f"📊 Successfully positioned {len(positioned_chunks)}/{len(chunks)} chunks")
386
 
 
391
  }
392
 
393
  except Exception as e:
394
+ import traceback
395
  print(f"❌ Error chunking page: {e}")
396
+ print(f" Full traceback: {traceback.format_exc()}")
397
+ raise HTTPException(status_code=500, detail=f"Error chunking page: {str(e)}")
398
+
399
+ # Serve the frontend from ../frontend (relative to this file) when that directory exists
400
+ frontend_path = os.path.join(os.path.dirname(__file__), "..", "frontend")
401
+ assets_path = os.path.join(frontend_path, "assets")
402
+
403
+ if os.path.exists(frontend_path):
404
+ # Only mount assets if the directory exists (production build)
405
+ if os.path.exists(assets_path):
406
+ app.mount("/assets", StaticFiles(directory=assets_path), name="assets")
407
+ # "/" returns index.html when it is present; otherwise a JSON status message
408
+ @app.get("/")
409
+ async def serve_frontend():
410
+ index_path = os.path.join(frontend_path, "index.html")
411
+ if os.path.exists(index_path):
412
+ return FileResponse(index_path)
413
+ return {"message": "Backend is running - frontend not found"}
414
+ else:  # no frontend directory at all: "/" only reports that the backend is up
415
+ @app.get("/")
416
+ def hello():
417
+ return {"message": "Backend is running!"}
backend/requirements.txt CHANGED
@@ -3,3 +3,7 @@ fastapi==0.115.7
3
  python-multipart>=0.0.5
4
  mistralai
5
  python-dotenv
 
 
 
 
 
3
  python-multipart>=0.0.5
4
  mistralai
5
  python-dotenv
6
+ fireworks-ai
7
+ langchain-core
8
+ langchain-fireworks
9
+ pydantic
10
+ langchain  # provides langchain.chat_models.init_chat_model, which backend/app.py imports
frontend/src/components/DocumentProcessor.jsx CHANGED
@@ -12,6 +12,7 @@ function DocumentProcessor() {
12
  const [ocrProgress, setOcrProgress] = useState(0);
13
  const [documentData, setDocumentData] = useState(null);
14
  const [imageCache, setImageCache] = useState({});
 
15
 
16
  const handleFileChange = (e) => {
17
  setSelectedFile(e.target.files[0]);
@@ -27,7 +28,7 @@ function DocumentProcessor() {
27
  }
28
 
29
  try {
30
- const response = await fetch(`http://localhost:8000/get_image/${fileId}/${imageId}`);
31
  if (response.ok) {
32
  const data = await response.json();
33
  const imageData = data.image_base64;
@@ -46,6 +47,56 @@ function DocumentProcessor() {
46
  return null;
47
  };
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  const ImageComponent = ({ src, alt }) => {
50
  const [imageSrc, setImageSrc] = useState(null);
51
  const [loading, setLoading] = useState(true);
@@ -64,50 +115,52 @@ function DocumentProcessor() {
64
 
65
  if (loading) {
66
  return (
67
- <div style={{
 
68
  width: '100%',
69
  height: '200px',
70
  backgroundColor: '#f3f4f6',
71
- display: 'flex',
72
- alignItems: 'center',
73
- justifyContent: 'center',
74
  margin: '1rem 0',
75
- borderRadius: '0.5rem'
 
76
  }}>
77
- <span style={{ color: '#6b7280' }}>Loading image...</span>
78
- </div>
79
  );
80
  }
81
 
82
  if (!imageSrc) {
83
  return (
84
- <div style={{
 
85
  width: '100%',
86
  height: '200px',
87
  backgroundColor: '#fef2f2',
88
- display: 'flex',
89
- alignItems: 'center',
90
- justifyContent: 'center',
91
  margin: '1rem 0',
92
  borderRadius: '0.5rem',
93
- border: '1px solid #fecaca'
 
94
  }}>
95
- <span style={{ color: '#dc2626' }}>Image not found: {alt || src}</span>
96
- </div>
97
  );
98
  }
99
 
100
  return (
101
- <div style={{ margin: '1.5rem 0', textAlign: 'center' }}>
102
- <img
103
- src={imageSrc}
104
- alt={alt || 'Document image'}
105
- style={{
106
- maxWidth: '100%',
107
- height: 'auto',
108
- }}
109
- />
110
- </div>
111
  );
112
  };
113
 
@@ -124,7 +177,7 @@ function DocumentProcessor() {
124
  formData.append('file', selectedFile);
125
 
126
  setUploadProgress(30);
127
- const uploadResponse = await fetch('http://localhost:8000/upload_pdf', {
128
  method: 'POST',
129
  body: formData,
130
  });
@@ -141,7 +194,7 @@ function DocumentProcessor() {
141
  await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for UX
142
 
143
  setOcrProgress(60);
144
- const ocrResponse = await fetch(`http://localhost:8000/process_ocr/${uploadData.file_id}`);
145
 
146
  if (!ocrResponse.ok) {
147
  throw new Error('Failed to process OCR');
@@ -155,12 +208,31 @@ function DocumentProcessor() {
155
  .map(page => page.markdown)
156
  .join('\n\n---\n\n');
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  setDocumentData({
159
  fileId: uploadData.file_id,
160
  filename: uploadData.filename,
161
  markdown: combinedMarkdown,
162
  pages: ocrData.pages,
163
- totalPages: ocrData.total_pages
 
164
  });
165
 
166
  } catch (error) {
@@ -276,33 +348,33 @@ function DocumentProcessor() {
276
  }
277
 
278
  return (
279
- <div className="min-h-screen bg-gray-50">
280
- {/* Document Content */}
281
- <div className="max-w-4xl mx-auto px-4 py-8">
282
- <div className="bg-white rounded-lg shadow-sm p-8">
283
- <div className="prose prose-lg max-w-none">
284
  <ReactMarkdown
285
  remarkPlugins={[remarkMath]}
286
  rehypePlugins={[rehypeKatex]}
287
  components={{
288
- h1: ({ children }) => <h1 style={{ fontSize: '2rem', fontWeight: 'bold', marginBottom: '1.5rem', color: '#1a202c' }}>{children}</h1>,
289
- h2: ({ children }) => <h2 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', marginTop: '2rem', color: '#1a202c' }}>{children}</h2>,
290
- h3: ({ children }) => <h3 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h3>,
291
- p: ({ children }) => <p style={{ marginBottom: '1rem', color: '#374151', lineHeight: '1.6' }}>{children}</p>,
292
- hr: () => <hr style={{ margin: '2rem 0', borderColor: '#d1d5db' }} />,
293
- ul: ({ children }) => <ul style={{ marginBottom: '1rem', marginLeft: '1.5rem', listStyleType: 'disc' }}>{children}</ul>,
294
- ol: ({ children }) => <ol style={{ marginBottom: '1rem', marginLeft: '1.5rem', listStyleType: 'decimal' }}>{children}</ol>,
295
- li: ({ children }) => <li style={{ marginBottom: '0.25rem', color: '#374151' }}>{children}</li>,
296
  blockquote: ({ children }) => (
297
- <blockquote style={{ borderLeft: '4px solid #3b82f6', paddingLeft: '1rem', fontStyle: 'italic', margin: '1rem 0', color: '#6b7280' }}>
298
  {children}
299
  </blockquote>
300
  ),
301
  code: ({ inline, children }) =>
302
  inline ?
303
- <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.875rem', fontFamily: 'monospace' }}>{children}</code> :
304
- <pre style={{ backgroundColor: '#f3f4f6', padding: '1rem', borderRadius: '0.5rem', overflowX: 'auto', margin: '1rem 0' }}>
305
- <code style={{ fontSize: '0.875rem', fontFamily: 'monospace' }}>{children}</code>
306
  </pre>,
307
  img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
308
  }}
@@ -312,6 +384,11 @@ function DocumentProcessor() {
312
  </div>
313
  </div>
314
  </div>
 
 
 
 
 
315
  </div>
316
  );
317
  }
 
12
  const [ocrProgress, setOcrProgress] = useState(0);
13
  const [documentData, setDocumentData] = useState(null);
14
  const [imageCache, setImageCache] = useState({});
15
+ const [selectedChunk, setSelectedChunk] = useState(0);
16
 
17
  const handleFileChange = (e) => {
18
  setSelectedFile(e.target.files[0]);
 
28
  }
29
 
30
  try {
31
+ const response = await fetch(`/get_image/${fileId}/${imageId}`);
32
  if (response.ok) {
33
  const data = await response.json();
34
  const imageData = data.image_base64;
 
47
  return null;
48
  };
49
 
50
+ // Component for the chunk panel
51
+ const ChunkPanel = ({ chunks }) => {
52
+ if (!chunks || chunks.length === 0) {
53
+ return (
54
+ <div className="p-6 text-center text-gray-500">
55
+ No interactive chunks found
56
+ </div>
57
+ );
58
+ }
59
+
60
+ return (
61
+ <div className="p-4 space-y-4 overflow-y-auto max-h-screen">
62
+ <h3 className="text-lg font-semibold text-gray-800 mb-4 sticky top-0 bg-white pb-2">
63
+ Interactive Chunks ({chunks.length})
64
+ </h3>
65
+
66
+ {chunks.map((chunk, index) => (
67
+ <div
68
+ key={index}
69
+ onClick={() => setSelectedChunk(index)}
70
+ className={`p-4 rounded-lg border-2 cursor-pointer transition-all duration-200 ${
71
+ selectedChunk === index
72
+ ? 'border-blue-500 bg-blue-50 shadow-md'
73
+ : 'border-gray-200 bg-white hover:border-gray-300 hover:shadow-sm'
74
+ }`}
75
+ >
76
+ <div className="flex items-start justify-between mb-2">
77
+ <h4 className="font-medium text-gray-900">
78
+ 📚 {chunk.topic}
79
+ </h4>
80
+ <span className="text-xs text-gray-500 bg-gray-100 px-2 py-1 rounded">
81
+ {index + 1}/{chunks.length}
82
+ </span>
83
+ </div>
84
+
85
+
86
+ {chunk.start_position !== null && chunk.start_position !== undefined &&
87
+ chunk.end_position !== null && chunk.end_position !== undefined && (
88
+ <div className="mt-3 p-3 bg-gray-50 rounded text-sm text-gray-700 italic">
89
+ {documentData.markdown.slice(chunk.start_position, chunk.end_position).substring(0, 150)}
90
+ {documentData.markdown.slice(chunk.start_position, chunk.end_position).length > 150 ? '...' : ''}
91
+ </div>
92
+ )}
93
+ </div>
94
+ ))}
95
+ </div>
96
+ );
97
+ };
98
+
99
+
100
  const ImageComponent = ({ src, alt }) => {
101
  const [imageSrc, setImageSrc] = useState(null);
102
  const [loading, setLoading] = useState(true);
 
115
 
116
  if (loading) {
117
  return (
118
+ <span style={{
119
+ display: 'inline-block',
120
  width: '100%',
121
  height: '200px',
122
  backgroundColor: '#f3f4f6',
123
+ textAlign: 'center',
124
+ lineHeight: '200px',
 
125
  margin: '1rem 0',
126
+ borderRadius: '0.5rem',
127
+ color: '#6b7280'
128
  }}>
129
+ Loading image...
130
+ </span>
131
  );
132
  }
133
 
134
  if (!imageSrc) {
135
  return (
136
+ <span style={{
137
+ display: 'inline-block',
138
  width: '100%',
139
  height: '200px',
140
  backgroundColor: '#fef2f2',
141
+ textAlign: 'center',
142
+ lineHeight: '200px',
 
143
  margin: '1rem 0',
144
  borderRadius: '0.5rem',
145
+ border: '1px solid #fecaca',
146
+ color: '#dc2626'
147
  }}>
148
+ Image not found: {alt || src}
149
+ </span>
150
  );
151
  }
152
 
153
  return (
154
+ <img
155
+ src={imageSrc}
156
+ alt={alt || 'Document image'}
157
+ style={{
158
+ display: 'block',
159
+ maxWidth: '100%',
160
+ height: 'auto',
161
+ margin: '1.5rem auto'
162
+ }}
163
+ />
164
  );
165
  };
166
 
 
177
  formData.append('file', selectedFile);
178
 
179
  setUploadProgress(30);
180
+ const uploadResponse = await fetch('/upload_pdf', {
181
  method: 'POST',
182
  body: formData,
183
  });
 
194
  await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for UX
195
 
196
  setOcrProgress(60);
197
+ const ocrResponse = await fetch(`/process_ocr/${uploadData.file_id}`);
198
 
199
  if (!ocrResponse.ok) {
200
  throw new Error('Failed to process OCR');
 
208
  .map(page => page.markdown)
209
  .join('\n\n---\n\n');
210
 
211
+ // Collect all chunks from all pages
212
+ const allChunks = [];
213
+ let markdownOffset = 0;
214
+
215
+ ocrData.pages.forEach((page, pageIndex) => {
216
+ if (page.chunks && page.chunks.length > 0) {
217
+ page.chunks.forEach(chunk => {
218
+ allChunks.push({
219
+ ...chunk,
220
+ start_position: chunk.start_position + markdownOffset,
221
+ end_position: chunk.end_position + markdownOffset,
222
+ pageIndex: pageIndex
223
+ });
224
+ });
225
+ }
226
+ markdownOffset += page.markdown.length + 6; // +6 for the separator "\n\n---\n\n"
227
+ });
228
+
229
  setDocumentData({
230
  fileId: uploadData.file_id,
231
  filename: uploadData.filename,
232
  markdown: combinedMarkdown,
233
  pages: ocrData.pages,
234
+ totalPages: ocrData.total_pages,
235
+ chunks: allChunks
236
  });
237
 
238
  } catch (error) {
 
348
  }
349
 
350
  return (
351
+ <div className="min-h-screen bg-gray-50 flex">
352
+ {/* Left Panel - Document (66%) */}
353
+ <div className="w-2/3 p-6">
354
+ <div className="bg-white rounded-lg shadow-sm p-6 h-full">
355
+ <div className="prose prose-sm max-w-none overflow-y-auto" style={{ maxHeight: 'calc(100vh - 60px)' }}>
356
  <ReactMarkdown
357
  remarkPlugins={[remarkMath]}
358
  rehypePlugins={[rehypeKatex]}
359
  components={{
360
+ h1: ({ children }) => <h1 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', color: '#1a202c' }}>{children}</h1>,
361
+ h2: ({ children }) => <h2 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h2>,
362
+ h3: ({ children }) => <h3 style={{ fontSize: '1.125rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '1rem', color: '#1a202c' }}>{children}</h3>,
363
+ p: ({ children }) => <p style={{ marginBottom: '0.75rem', color: '#374151', lineHeight: '1.5', fontSize: '0.875rem' }}>{children}</p>,
364
+ hr: () => <hr style={{ margin: '1.5rem 0', borderColor: '#d1d5db' }} />,
365
+ ul: ({ children }) => <ul style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'disc', fontSize: '0.875rem' }}>{children}</ul>,
366
+ ol: ({ children }) => <ol style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'decimal', fontSize: '0.875rem' }}>{children}</ol>,
367
+ li: ({ children }) => <li style={{ marginBottom: '0.125rem', color: '#374151' }}>{children}</li>,
368
  blockquote: ({ children }) => (
369
+ <blockquote style={{ borderLeft: '3px solid #3b82f6', paddingLeft: '0.75rem', fontStyle: 'italic', margin: '0.75rem 0', color: '#6b7280', fontSize: '0.875rem' }}>
370
  {children}
371
  </blockquote>
372
  ),
373
  code: ({ inline, children }) =>
374
  inline ?
375
+ <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code> :
376
+ <pre style={{ backgroundColor: '#f3f4f6', padding: '0.75rem', borderRadius: '0.375rem', overflowX: 'auto', margin: '0.75rem 0' }}>
377
+ <code style={{ fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code>
378
  </pre>,
379
  img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
380
  }}
 
384
  </div>
385
  </div>
386
  </div>
387
+
388
+ {/* Right Panel - Chunks (33%) */}
389
+ <div className="w-1/3 border-l border-gray-200 bg-white">
390
+ <ChunkPanel chunks={documentData.chunks} />
391
+ </div>
392
  </div>
393
  );
394
  }
frontend/vite.config.js CHANGED
@@ -4,4 +4,13 @@ import react from '@vitejs/plugin-react'
4
  // https://vite.dev/config/
5
  export default defineConfig({
6
  plugins: [react()],
 
 
 
 
 
 
 
 
 
7
  })
 
4
  // https://vite.dev/config/
5
  export default defineConfig({
6
  plugins: [react()],
7
+ server: {
8
+ proxy: {
9
+ '/upload_pdf': 'http://localhost:8000',
10
+ '/process_ocr': 'http://localhost:8000',
11
+ '/get_image': 'http://localhost:8000',
12
+ '/chunk_page': 'http://localhost:8000',
13
+ '/api': 'http://localhost:8000'
14
+ }
15
+ }
16
  })