Alleinzellgaenger committed on
Commit
98d2bd7
·
1 Parent(s): 1e6902f

Added custom chat component

Browse files
backend/app.py CHANGED
@@ -2,20 +2,13 @@ from fastapi import FastAPI, File, UploadFile, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.responses import FileResponse
5
- from mistralai import Mistral
6
  import os
7
  import tempfile
8
- import json
9
- import re
10
- import string
11
  from dotenv import load_dotenv
12
- from difflib import SequenceMatcher
13
- from pydantic import BaseModel, Field
14
  from typing import Optional, List
15
- from langchain.chat_models import init_chat_model
16
  import anthropic
17
- import google
18
- from google import genai
19
  # Load environment variables
20
  load_dotenv()
21
 
@@ -107,1120 +100,32 @@ Keep responses concise and educational. When relevant, use LaTeX math notation l
107
 
108
  @app.post("/upload_pdf")
109
  async def upload_pdf(file: UploadFile = File(...)):
110
- """Upload PDF to Mistral Document AI"""
111
- print(f"📄 Processing file: {file.filename}")
112
-
113
- # Get Mistral API key
114
- api_key = os.environ.get("MISTRAL_API_KEY")
115
- if not api_key:
116
- print("❌ No Mistral API key found")
117
- raise HTTPException(status_code=500, detail="MISTRAL_API_KEY not set in environment")
118
 
119
  try:
120
- # Initialize Mistral client
121
- client = Mistral(api_key=api_key)
122
- print("🔑 Mistral client initialized")
123
-
124
  # Read PDF bytes
125
  file_bytes = await file.read()
126
  print(f"📊 File size: {len(file_bytes)} bytes")
127
 
128
- # Create temporary file for Mistral upload
129
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
130
  temp_file.write(file_bytes)
131
  temp_file_path = temp_file.name
132
 
133
- try:
134
- print("🚀 Uploading to Mistral...")
135
-
136
- # Upload PDF to Mistral for OCR processing
137
- uploaded_pdf = client.files.upload(
138
- file={
139
- "file_name": file.filename or "document.pdf",
140
- "content": open(temp_file_path, "rb"),
141
- },
142
- purpose="ocr"
143
- )
144
-
145
- print(f"✅ Upload successful! File ID: {uploaded_pdf.id}")
146
-
147
- return {
148
- "message": "PDF uploaded to Mistral successfully!",
149
- "file_id": uploaded_pdf.id,
150
- "filename": file.filename,
151
- "status": "uploaded",
152
- "mistral_response": str(uploaded_pdf)
153
- }
154
-
155
- finally:
156
- # Clean up temporary file
157
- os.unlink(temp_file_path)
158
- print("🗑️ Temporary file cleaned up")
159
-
160
- except Exception as e:
161
- print(f"❌ Error with Mistral API: {e}")
162
- raise HTTPException(status_code=500, detail=f"Mistral API error: {str(e)}")
163
-
164
- @app.get("/process_ocr/{file_id}")
165
- async def process_ocr_content(file_id: str):
166
- """Process OCR content using proper Mistral OCR API"""
167
- print(f"🔍 Processing OCR for file ID: {file_id}")
168
-
169
- # Get Mistral API key
170
- api_key = os.environ.get("MISTRAL_API_KEY")
171
- if not api_key:
172
- raise HTTPException(status_code=500, detail="MISTRAL_API_KEY not set")
173
-
174
- try:
175
- # Initialize Mistral client
176
- client = Mistral(api_key=api_key)
177
-
178
- # Get signed URL for the file
179
- print("🔗 Getting signed URL...")
180
- signed_url = client.files.get_signed_url(file_id=file_id, expiry=1)
181
- print(f"✅ Signed URL obtained")
182
-
183
- # Process OCR using the proper API
184
- print("🚀 Processing OCR...")
185
- ocr_response = client.ocr.process(
186
- model="mistral-ocr-latest",
187
- document={
188
- "type": "document_url",
189
- "document_url": signed_url.url,
190
- },
191
- include_image_base64=True # Include images for full processing
192
- )
193
-
194
- print(f"✅ OCR processing complete! Found {len(ocr_response.pages)} pages")
195
-
196
- # Debug: Print raw OCR response structure
197
- print("\n" + "="*80)
198
- print("🔍 RAW MISTRAL OCR RESPONSE DEBUG:")
199
- print("="*80)
200
-
201
- for page_idx, page in enumerate(ocr_response.pages):
202
- print(f"\n📄 PAGE {page_idx + 1} RAW MARKDOWN:")
203
- print("-" * 50)
204
- print(repr(page.markdown)) # Using repr() to show escape characters
205
- print("-" * 50)
206
- print("RENDERED:")
207
- print(page.markdown[:500] + "..." if len(page.markdown) > 500 else page.markdown)
208
- print(f"TOTAL LENGTH: {len(page.markdown)} characters")
209
-
210
- print("="*80)
211
- print("END RAW OCR DEBUG")
212
- print("="*80 + "\n")
213
-
214
- # Process each page and extract structured data (without per-page chunking)
215
- processed_pages = []
216
- all_page_markdown = []
217
-
218
- for page_idx, page in enumerate(ocr_response.pages):
219
- print(f"📄 Page {page_idx + 1}: {len(page.markdown)} chars, {len(page.images)} images")
220
-
221
- page_data = {
222
- "index": page.index,
223
- "markdown": page.markdown,
224
- "images": [],
225
- "dimensions": {
226
- "dpi": page.dimensions.dpi,
227
- "height": page.dimensions.height,
228
- "width": page.dimensions.width
229
- }
230
- }
231
-
232
- # Process images with coordinates
233
- for img in page.images:
234
- image_data = {
235
- "id": img.id,
236
- "coordinates": {
237
- "top_left_x": img.top_left_x,
238
- "top_left_y": img.top_left_y,
239
- "bottom_right_x": img.bottom_right_x,
240
- "bottom_right_y": img.bottom_right_y
241
- },
242
- "has_base64": bool(img.image_base64) # Don't include actual base64 in response
243
- }
244
- page_data["images"].append(image_data)
245
-
246
- processed_pages.append(page_data)
247
- all_page_markdown.append(page.markdown)
248
-
249
- # Combine all markdown into single document
250
- combined_markdown = '\n\n---\n\n'.join(all_page_markdown)
251
- print(f"📋 Combined document: {len(combined_markdown)} chars total")
252
-
253
- # Auto-chunk the entire document once - try Gemini first, then fallback
254
- document_chunks = []
255
- original_markdown = combined_markdown
256
- try:
257
- print(f"🧠 Auto-chunking entire document with Gemini...")
258
- document_chunks, original_markdown = await gemini_chunk_document(combined_markdown)
259
-
260
- # If Gemini failed, try the old Fireworks method
261
- if not document_chunks:
262
- print(f"🔄 Gemini failed, falling back to Fireworks...")
263
- document_chunks, original_markdown = await auto_chunk_document(combined_markdown, client)
264
-
265
- print(f"📊 Document chunks found: {len(document_chunks)}")
266
- for i, chunk in enumerate(document_chunks):
267
- topic = chunk.get('topic', 'Unknown')
268
- preview = chunk.get('text', chunk.get('start_phrase', ''))[:50] + "..." if chunk.get('text', chunk.get('start_phrase', '')) else 'No content'
269
- print(f" {i+1}. {topic}: {preview}")
270
-
271
- except Exception as chunk_error:
272
- print(f"⚠️ Document chunking failed: {chunk_error}")
273
- document_chunks = []
274
- original_markdown = combined_markdown
275
-
276
- print(f"📝 Total processed pages: {len(processed_pages)}")
277
-
278
- return {
279
- "file_id": file_id,
280
- "pages": processed_pages,
281
- "total_pages": len(processed_pages),
282
- "combined_markdown": original_markdown, # Send original version for highlighting
283
- "chunks": document_chunks,
284
- "status": "processed"
285
- }
286
-
287
- except Exception as e:
288
- print(f"❌ Error processing OCR: {e}")
289
- raise HTTPException(status_code=500, detail=f"Error processing OCR: {str(e)}")
290
-
291
- @app.get("/get_image/{file_id}/{image_id}")
292
- async def get_image_base64(file_id: str, image_id: str):
293
- """Get base64 image data for a specific image"""
294
- print(f"🖼️ Getting image {image_id} from file {file_id}")
295
-
296
- # Get Mistral API key
297
- api_key = os.environ.get("MISTRAL_API_KEY")
298
- if not api_key:
299
- raise HTTPException(status_code=500, detail="MISTRAL_API_KEY not set")
300
-
301
- try:
302
- # Initialize Mistral client
303
- client = Mistral(api_key=api_key)
304
-
305
- # Get signed URL and process OCR again (we could cache this)
306
- signed_url = client.files.get_signed_url(file_id=file_id, expiry=1)
307
-
308
- ocr_response = client.ocr.process(
309
- model="mistral-ocr-latest",
310
- document={
311
- "type": "document_url",
312
- "document_url": signed_url.url,
313
- },
314
- include_image_base64=True
315
- )
316
-
317
- # Find the requested image
318
- for page in ocr_response.pages:
319
- for img in page.images:
320
- if img.id == image_id:
321
- return {
322
- "image_id": image_id,
323
- "image_base64": img.image_base64,
324
- "coordinates": {
325
- "top_left_x": img.top_left_x,
326
- "top_left_y": img.top_left_y,
327
- "bottom_right_x": img.bottom_right_x,
328
- "bottom_right_y": img.bottom_right_y
329
- }
330
- }
331
-
332
- raise HTTPException(status_code=404, detail=f"Image {image_id} not found")
333
-
334
- except Exception as e:
335
- print(f"❌ Error getting image: {e}")
336
- raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
337
-
338
- class ChunkSchema(BaseModel):
339
- """Schema for document chunks suitable for creating interactive lessons."""
340
- topic: str = Field(description="Brief descriptive name (2-6 words) for the educational content")
341
- text: str = Field(description="Complete chunk text with exact markdown/LaTeX formatting preserved, containing 2-3 related educational concepts")
342
-
343
- class ChunkList(BaseModel):
344
- """Container for a list of document chunks."""
345
- chunks: List[ChunkSchema] = Field(description="List of identified chunks for interactive lessons")
346
-
347
- def find_paragraph_end(text, start_pos):
348
- """Find the end of a paragraph starting from start_pos"""
349
- end_pos = start_pos
350
- while end_pos < len(text) and text[end_pos] not in ['\n', '\r']:
351
- end_pos += 1
352
-
353
- return end_pos
354
-
355
- def find_paragraph_end(text, start_pos):
356
- """Find the end of current paragraph (looks for \\n\\n or document end)"""
357
- pos = start_pos
358
- while pos < len(text):
359
- if pos < len(text) - 1 and text[pos:pos+2] == '\n\n':
360
- return pos # End at paragraph break
361
- elif text[pos] in '.!?':
362
- # Found sentence end, check if paragraph continues
363
- next_pos = pos + 1
364
- while next_pos < len(text) and text[next_pos] in ' \t':
365
- next_pos += 1
366
- if next_pos < len(text) - 1 and text[next_pos:next_pos+2] == '\n\n':
367
- return next_pos # Paragraph ends after this sentence
368
- pos = next_pos
369
- else:
370
- pos += 1
371
- return min(pos, len(text))
372
-
373
- def fuzzy_find(text, pattern, start_pos=0):
374
- """Find the best fuzzy match for pattern in text starting from start_pos"""
375
- best_ratio = 0
376
- best_pos = -1
377
-
378
- # Search in sliding windows
379
- pattern_len = len(pattern)
380
- for i in range(start_pos, len(text) - pattern_len + 1):
381
- window = text[i:i + pattern_len]
382
- ratio = SequenceMatcher(None, pattern.lower(), window.lower()).ratio()
383
-
384
- if ratio > best_ratio and ratio > 0.8: # Much stricter: 80% similarity
385
- best_ratio = ratio
386
- best_pos = i
387
-
388
- return best_pos if best_pos != -1 else None
389
-
390
- def clean_academic_content(text):
391
- """Remove common academic paper noise that breaks natural chunking"""
392
-
393
- # Patterns to remove/clean
394
- patterns_to_remove = [
395
- # Author contribution footnotes
396
- r'\[\^\d+\]:\s*[∗\*]+\s*Equal contribution[^.]*\.',
397
- r'\[\^\d+\]:\s*[†\*]+\s*Correspondence to[^.]*\.',
398
- r'\[\^\d+\]:\s*[†\*]+\s*Corresponding author[^.]*\.',
399
-
400
- # Copyright notices
401
- r'Copyright \(c\) \d{4}[^.]*\.',
402
- r'All rights reserved\.',
403
-
404
- # Common academic noise
405
- r'\[\^\d+\]:\s*Code available at[^.]*\.',
406
- r'\[\^\d+\]:\s*Data available at[^.]*\.',
407
- r'\[\^\d+\]:\s*This work was[^.]*\.',
408
-
409
- # Funding acknowledgments (often break paragraphs)
410
- r'This research was supported by[^.]*\.',
411
- r'Funded by[^.]*\.',
412
-
413
- # Page numbers and headers that shouldn't end paragraphs
414
- r'^\d+$', # Standalone page numbers
415
- r'^Page \d+',
416
-
417
- # DOI and URL patterns that break paragraphs
418
- r'DOI:\s*\S+',
419
- r'arXiv:\d{4}\.\d{4,5}',
420
- ]
421
-
422
- cleaned_text = text
423
- for pattern in patterns_to_remove:
424
- cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE | re.IGNORECASE)
425
-
426
- # Clean up multiple newlines created by removals
427
- cleaned_text = re.sub(r'\n\n\n+', '\n\n', cleaned_text)
428
-
429
- return cleaned_text.strip()
430
-
431
- def validate_paragraph_chunk(chunk_text):
432
- """Check if a chunk looks like valid content (not metadata/noise)"""
433
- # Skip very short chunks
434
- if len(chunk_text.strip()) < 50:
435
- return False
436
-
437
- # Skip chunks that are mostly footnote references
438
- footnote_refs = len(re.findall(r'\[\^\d+\]', chunk_text))
439
- if footnote_refs > len(chunk_text.split()) / 10: # More than 10% footnote refs
440
- return False
441
-
442
- # Skip chunks that are mostly citations
443
- citations = len(re.findall(r'\[\d+\]', chunk_text))
444
- if citations > len(chunk_text.split()) / 8: # More than 12.5% citations
445
- return False
446
-
447
- # Skip chunks that are mostly symbols/special chars
448
- normal_chars = sum(1 for c in chunk_text if c.isalnum() or c in string.whitespace)
449
- if normal_chars / len(chunk_text) < 0.7: # Less than 70% normal content
450
- return False
451
-
452
- return True
453
-
454
- def programmatic_chunk_document(document_markdown):
455
- """Chunk document by natural paragraph boundaries - much more reliable than LLM"""
456
- if not document_markdown or len(document_markdown.strip()) < 100:
457
- return []
458
-
459
- # Use original document without any cleaning to preserve integrity
460
- original_markdown = document_markdown
461
- print(f"📄 Using original document: {len(document_markdown)} chars")
462
-
463
- chunks = []
464
- start_pos = 0
465
- chunk_count = 0
466
-
467
- print(f"🧠 Using programmatic paragraph-based chunking...")
468
-
469
- # Find all proper paragraph endings: [.!?] followed by \n\n
470
- paragraph_ends = []
471
-
472
- # Pattern: sentence punctuation followed by \n\n
473
- pattern = r'([.!?])\n\n'
474
- matches = re.finditer(pattern, original_markdown)
475
-
476
- for match in matches:
477
- end_pos = match.end() - 3 # Position right after punctuation, before \n\n
478
- paragraph_ends.append(end_pos)
479
-
480
- print(f"📊 Found {len(paragraph_ends)} natural paragraph endings")
481
-
482
- # Create chunks from paragraph boundaries using original document
483
- for i, end_pos in enumerate(paragraph_ends):
484
- # Extract from original markdown
485
- chunk_text_clean = original_markdown[start_pos:end_pos + 1]
486
-
487
- # Validate chunk quality
488
- if not validate_paragraph_chunk(chunk_text_clean):
489
- print(f" ❌ Skipping low-quality chunk: {chunk_text_clean[:50]}...")
490
- start_pos = end_pos + 3 # Skip past .\n\n
491
- continue
492
-
493
- chunk_count += 1
494
-
495
- # Map positions back to original document for highlighting
496
- # For now, use cleaned positions (we could implement position mapping if needed)
497
- chunk_text = chunk_text_clean
498
-
499
- # Create a simple topic from first few words
500
- first_line = chunk_text.split('\n')[0].strip()
501
- topic = first_line[:50] + "..." if len(first_line) > 50 else first_line
502
-
503
- chunks.append({
504
- "topic": topic,
505
- "start_position": start_pos,
506
- "end_position": end_pos + 1,
507
- "start_phrase": chunk_text[:20] + "...", # First 20 chars
508
- "end_phrase": "..." + chunk_text[-20:], # Last 20 chars
509
- "found_start": True,
510
- "found_end": True
511
- })
512
-
513
- print(f" ✅ Chunk {chunk_count}: {start_pos}-{end_pos + 1} (length: {end_pos + 1 - start_pos})")
514
- print(f" Topic: {topic}")
515
- print(f" Preview: {chunk_text[:80]}...")
516
-
517
- # Next chunk starts after \n\n
518
- start_pos = end_pos + 3 # Skip past .\n\n
519
-
520
- # Handle any remaining text (document might not end with proper paragraph)
521
- if start_pos < len(original_markdown):
522
- remaining_text = original_markdown[start_pos:].strip()
523
- if remaining_text and validate_paragraph_chunk(remaining_text):
524
- chunk_count += 1
525
- first_line = remaining_text.split('\n')[0].strip()
526
- topic = first_line[:50] + "..." if len(first_line) > 50 else first_line
527
-
528
- chunks.append({
529
- "topic": topic,
530
- "start_position": start_pos,
531
- "end_position": len(original_markdown),
532
- "start_phrase": remaining_text[:20] + "...",
533
- "end_phrase": "..." + remaining_text[-20:],
534
- "found_start": True,
535
- "found_end": True
536
- })
537
-
538
- print(f" ✅ Final chunk {chunk_count}: {start_pos}-{len(original_markdown)} (remaining text)")
539
- else:
540
- print(f" ❌ Skipping low-quality remaining text")
541
-
542
- print(f"📊 Created {len(chunks)} high-quality paragraph-based chunks")
543
-
544
- # Note: We're returning chunks based on original document positions
545
- # The frontend will use the original document for highlighting
546
- return chunks, document_markdown
547
-
548
- def split_document_into_batches(document_markdown, max_chars=8000):
549
- """Split document into manageable batches for LLM processing"""
550
- if len(document_markdown) <= max_chars:
551
- return [document_markdown]
552
-
553
- batches = []
554
- current_pos = 0
555
-
556
- while current_pos < len(document_markdown):
557
- # Try to find a good breaking point (paragraph boundary)
558
- end_pos = min(current_pos + max_chars, len(document_markdown))
559
-
560
- # If we're not at the end, try to break at a paragraph boundary
561
- if end_pos < len(document_markdown):
562
- # Look for \n\n within the last 1000 characters of this batch
563
- search_start = max(end_pos - 1000, current_pos)
564
- last_paragraph = document_markdown.rfind('\n\n', search_start, end_pos)
565
-
566
- if last_paragraph != -1 and last_paragraph > current_pos:
567
- end_pos = last_paragraph + 2 # Include the \n\n
568
-
569
- batch = document_markdown[current_pos:end_pos]
570
- batches.append(batch)
571
- current_pos = end_pos
572
-
573
- print(f"📄 Created batch {len(batches)}: {len(batch)} chars (pos {current_pos-len(batch)}-{current_pos})")
574
-
575
- return batches
576
-
577
- async def gemini_chunk_document(document_markdown):
578
- """Auto-chunk a document using Google Gemini 2.5 Pro with reliable structured output"""
579
-
580
- # Get Gemini API key
581
- gemini_api_key = os.environ.get("GEMINI_API_KEY")
582
- if not gemini_api_key:
583
- print("⚠️ No Gemini API key found")
584
- return None, document_markdown
585
-
586
- print(f"📄 Document length: {len(document_markdown)} characters")
587
-
588
- try:
589
- # Initialize Gemini client
590
- client = genai.Client(api_key=gemini_api_key)
591
-
592
- # Split document into batches if it's too large (Gemini has token limits)
593
- batches = split_document_into_batches(document_markdown, max_chars=12000) # Gemini can handle larger batches
594
- print(f"📄 Split document into {len(batches)} batches for Gemini")
595
-
596
- all_chunks = []
597
-
598
- # Process each batch
599
- for batch_idx, batch in enumerate(batches):
600
- print(f"\n🔄 Processing batch {batch_idx + 1}/{len(batches)} ({len(batch)} chars) with Gemini")
601
-
602
- try:
603
- # Create the prompt for Gemini
604
- prompt = f"""You are an educational content analyzer. Analyze this document section and break it into logical learning chunks.
605
-
606
- Each chunk should:
607
- - Contain 2-3 related educational concepts that naturally belong together
608
- - Be 150-500 words (optimal for learning)
609
- - Have clear educational value
610
- - Preserve all markdown/LaTeX formatting exactly
611
- - Skip: abstracts, acknowledgments, references, author info, page numbers
612
-
613
- Return your response as a valid JSON object with this exact structure:
614
- {{
615
- "chunks": [
616
- {{
617
- "topic": "Brief descriptive name (2-6 words)",
618
- "text": "Complete chunk text with exact formatting preserved"
619
- }}
620
- ]
621
- }}
622
-
623
- Document section to analyze:
624
- {batch}
625
-
626
- Important: Return ONLY the JSON object, no other text."""
627
-
628
- # Call Gemini 2.5 Pro (thinking_budget=-1 requests dynamic thinking; it does not disable thinking)
629
- response = client.models.generate_content(
630
- model="gemini-2.5-pro",
631
- contents=prompt,
632
- config=genai.types.GenerateContentConfig(
633
- thinking_config=genai.types.ThinkingConfig(thinking_budget=-1)
634
- )
635
- )
636
-
637
- # Extract and parse response
638
- response_text = response.text.strip()
639
- print(f"📋 Gemini response preview: {response_text}...")
640
-
641
- # Clean up the response (remove code blocks if present)
642
- clean_response = response_text
643
- if clean_response.startswith('```json'):
644
- clean_response = clean_response[7:]
645
- if clean_response.endswith('```'):
646
- clean_response = clean_response[:-3]
647
- clean_response = clean_response.strip()
648
-
649
- # Parse JSON
650
- try:
651
- json_data = json.loads(clean_response)
652
-
653
- # Validate structure
654
- if not isinstance(json_data, dict) or 'chunks' not in json_data:
655
- print(f"❌ Invalid response structure from Gemini batch {batch_idx + 1}")
656
- continue
657
-
658
- chunks = json_data['chunks']
659
- if not isinstance(chunks, list):
660
- print(f"❌ 'chunks' is not a list in batch {batch_idx + 1}")
661
- continue
662
-
663
- # Process chunks
664
- batch_chunks = []
665
- for i, chunk in enumerate(chunks):
666
- if not isinstance(chunk, dict) or 'topic' not in chunk or 'text' not in chunk:
667
- print(f"❌ Invalid chunk structure in batch {batch_idx + 1}, chunk {i}")
668
- continue
669
-
670
- # Clean up text formatting
671
- chunk_text = chunk['text']
672
- # Replace literal \n with actual newlines
673
- chunk_text = chunk_text.replace('\\n', '\n')
674
-
675
- batch_chunks.append({
676
- "topic": chunk['topic'],
677
- "text": chunk_text,
678
- "chunk_index": len(all_chunks) + len(batch_chunks)
679
- })
680
-
681
- print(f"✅ Processed chunk: {chunk['topic']}")
682
-
683
- all_chunks.extend(batch_chunks)
684
- print(f"📊 Batch {batch_idx + 1} added {len(batch_chunks)} chunks (total: {len(all_chunks)})")
685
-
686
- except json.JSONDecodeError as e:
687
- print(f"❌ JSON parsing failed for batch {batch_idx + 1}: {e}")
688
- print(f"❌ Response was: {response_text}")
689
- continue
690
-
691
- except Exception as e:
692
- print(f"❌ Error processing batch {batch_idx + 1} with Gemini: {e}")
693
- continue
694
-
695
- # Return results
696
- if all_chunks:
697
- print(f"✅ Gemini successfully processed document with {len(all_chunks)} total chunks")
698
- return all_chunks, document_markdown
699
- else:
700
- print("❌ Gemini processing failed for all batches")
701
- return None, document_markdown
702
-
703
- except Exception as e:
704
- print(f"❌ Gemini chunking error: {e}")
705
- return None, document_markdown
706
-
707
- async def auto_chunk_document(document_markdown, client=None):
708
- """Auto-chunk a document using LLM with batch processing for large documents"""
709
-
710
- # Debug: Print document info
711
- print(f"📄 Document length: {len(document_markdown)} characters")
712
-
713
- # Get Fireworks API key
714
- fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
715
- if not fireworks_api_key:
716
- print("⚠️ No Fireworks API key found, falling back to programmatic chunking")
717
- chunks, original_markdown = programmatic_chunk_document(document_markdown)
718
- return chunks, original_markdown
719
-
720
- # Split document into batches if it's too large
721
- batches = split_document_into_batches(document_markdown, max_chars=8000)
722
- print(f"📄 Split document into {len(batches)} batches")
723
-
724
- all_chunks = []
725
-
726
- # Process each batch
727
- for batch_idx, batch in enumerate(batches):
728
- print(f"\n🔄 Processing batch {batch_idx + 1}/{len(batches)} ({len(batch)} chars)")
729
-
730
- # Try structured output with retry logic for this batch
731
- max_retries = 3
732
- batch_chunks = None
733
-
734
- for attempt in range(max_retries):
735
- try:
736
- print(f"🚀 Batch {batch_idx + 1} Attempt {attempt + 1}/{max_retries}: Calling Fireworks...")
737
-
738
- # Initialize LLM
739
- llm = init_chat_model(
740
- "accounts/fireworks/models/llama4-maverick-instruct-basic",
741
- model_provider="fireworks",
742
- api_key=fireworks_api_key
743
- )
744
-
745
- # Use regular LLM and manual JSON parsing
746
- prompt = f"""You are an educational content analyzer. Break this document section into logical learning chunks.
747
-
748
- IMPORTANT: Return your response as a valid JSON object with this exact structure:
749
- {{
750
- "chunks": [
751
- {{
752
- "topic": "Brief topic name",
753
- "text": "Complete chunk text with exact formatting"
754
- }}
755
- ]
756
- }}
757
-
758
- Rules for chunking:
759
- - Each chunk should contain 2-3 related educational concepts
760
- - Keep chunks concise: 100-300 words (avoid very long text blocks)
761
- - Preserve all markdown/LaTeX formatting exactly as written
762
- - Skip: abstracts, acknowledgements, references, author information, page numbers
763
- - Create separate chunks for figures/tables with their captions
764
- - Never split mathematical expressions or LaTeX formulas
765
- - Process ALL content in this section - don't skip any educational material
766
- - Ensure all JSON strings are properly formatted (no unescaped quotes)
767
-
768
- Document section to analyze:
769
- {batch}
770
-
771
- Return only the JSON object, no other text."""
772
-
773
- # Call regular LLM
774
- result = llm.invoke(prompt)
775
- print(f"📋 Raw LLM response type: {type(result)}")
776
-
777
- # Extract text content
778
- if hasattr(result, 'content'):
779
- response_text = result.content
780
- elif hasattr(result, 'text'):
781
- response_text = result.text
782
- else:
783
- response_text = str(result)
784
-
785
- print(f"📋 Response text preview: {response_text}...")
786
-
787
- # Try to parse JSON manually
788
-
789
- try:
790
- # Clean up the response - remove any markdown code blocks and fix common issues
791
- clean_response = response_text.strip()
792
- if clean_response.startswith('```json'):
793
- clean_response = clean_response[7:]
794
- if clean_response.endswith('```'):
795
- clean_response = clean_response[:-3]
796
- clean_response = clean_response.strip()
797
-
798
- # Fix common JSON truncation issues
799
- # If the response doesn't end properly, try to close it
800
- if not clean_response.endswith('}'):
801
- # Try to find the last complete chunk entry and close properly
802
- last_brace = clean_response.rfind('}')
803
- if last_brace != -1:
804
- # Find if we're inside a chunks array
805
- chunks_start = clean_response.find('"chunks": [')
806
- if chunks_start != -1 and last_brace > chunks_start:
807
- # Close the chunks array and main object
808
- clean_response = clean_response[:last_brace+1] + '\n ]\n}'
809
- else:
810
- clean_response = clean_response[:last_brace+1]
811
-
812
- print(f"📋 Cleaned response preview: {clean_response[:300]}...")
813
- print(f"📋 Cleaned response ends with: '{clean_response[-50:]}'")
814
-
815
- # Additional safety: ensure we have a complete JSON structure
816
- if not (clean_response.startswith('{') and clean_response.endswith('}')):
817
- print(f"❌ Response doesn't look like valid JSON structure")
818
- continue
819
-
820
- # Fix common JSON escape issues with LaTeX
821
- # Replace single backslashes with double backslashes in JSON strings
822
- # But be careful not to affect already-escaped sequences
823
- def fix_latex_escapes(text):
824
- # Find all JSON string values (between quotes)
825
- def escape_in_string(match):
826
- string_content = match.group(1)
827
- # Escape single backslashes in LaTeX commands
828
- # Handle \mathrm, \left, \%, etc. but preserve JSON escapes like \n, \t, \", \\
829
- # Pattern: backslash followed by letters OR specific LaTeX symbols like %
830
- fixed = re.sub(r'(?<!\\)\\(?=[a-zA-Z%])', r'\\\\', string_content)
831
- return f'"{fixed}"'
832
-
833
- # Apply to all JSON string values
834
- return re.sub(r'"([^"\\]*(\\.[^"\\]*)*)"', escape_in_string, text)
835
-
836
- clean_response = fix_latex_escapes(clean_response)
837
- print(f"📋 After escape fixing: {clean_response[:200]}...")
838
-
839
- # Parse JSON
840
- json_data = json.loads(clean_response)
841
- print(f"📋 Successfully parsed JSON: {type(json_data)}")
842
-
843
- # Validate with Pydantic
844
- chunk_response = ChunkList.model_validate(json_data)
845
- print(f"📋 Pydantic validation successful: {type(chunk_response)}")
846
-
847
- # Fix literal \n strings in chunk text (convert to actual newlines)
848
- for chunk in chunk_response.chunks:
849
- if hasattr(chunk, 'text') and chunk.text:
850
- # Replace literal \n with actual newlines for paragraph breaks
851
- # Be careful not to affect LaTeX commands that might contain 'n'
852
- chunk.text = chunk.text.replace('\\n', '\n')
853
-
854
- except json.JSONDecodeError as e:
855
- print(f"❌ Attempt {attempt + 1}: JSON parsing failed: {e}")
856
- print(f"❌ Response was: {response_text}")
857
- continue
858
- except Exception as e:
859
- print(f"❌ Attempt {attempt + 1}: Pydantic validation failed: {e}")
860
- continue
861
-
862
- chunks = chunk_response.chunks
863
- if not chunks or len(chunks) == 0:
864
- print(f"⚠️ Attempt {attempt + 1}: No chunks returned")
865
- continue
866
-
867
- # Success! Process chunks
868
- processed_chunks = []
869
- for i, chunk in enumerate(chunks):
870
- print(f"\n📝 Processing chunk {i+1}: {chunk.topic}")
871
-
872
- if not hasattr(chunk, 'text') or not chunk.text.strip():
873
- print(f"❌ Chunk missing or empty text: {chunk}")
874
- continue
875
-
876
- print(f" Text preview: '{chunk.text[:100]}...'")
877
-
878
- processed_chunks.append({
879
- "topic": chunk.topic,
880
- "text": chunk.text,
881
- "chunk_index": i
882
- })
883
-
884
- if processed_chunks:
885
- print(f"✅ Successfully processed {len(processed_chunks)} chunks for batch {batch_idx + 1}")
886
- batch_chunks = processed_chunks
887
- break
888
- else:
889
- print(f"❌ Batch {batch_idx + 1} Attempt {attempt + 1}: No valid chunks processed")
890
- continue
891
-
892
- except Exception as e:
893
- print(f"❌ Batch {batch_idx + 1} Attempt {attempt + 1} failed: {e}")
894
- if attempt == max_retries - 1:
895
- print(f"❌ All {max_retries} attempts failed for batch {batch_idx + 1}")
896
-
897
- # Add successful batch chunks to all_chunks
898
- if batch_chunks:
899
- all_chunks.extend(batch_chunks)
900
- print(f"📊 Total chunks so far: {len(all_chunks)}")
901
- else:
902
- print(f"⚠️ Batch {batch_idx + 1} failed completely, skipping...")
903
-
904
- # Final results
905
- if all_chunks:
906
- print(f"✅ Successfully processed document with {len(all_chunks)} total chunks from {len(batches)} batches")
907
- # Re-index all chunks sequentially
908
- for i, chunk in enumerate(all_chunks):
909
- chunk["chunk_index"] = i
910
- return all_chunks, document_markdown
911
- else:
912
- print("🔄 All batches failed, falling back to programmatic chunking...")
913
- chunks, original_markdown = programmatic_chunk_document(document_markdown)
914
- return chunks, original_markdown
915
-
916
- try:
917
- # Initialize Fireworks LLM with structured output
918
- llm = init_chat_model(
919
- "accounts/fireworks/models/llama4-maverick-instruct-basic",
920
- model_provider="fireworks",
921
- api_key=fireworks_api_key
922
- )
923
-
924
- # Create structured LLM that returns ChunkList object
925
- structured_llm = llm.with_structured_output(ChunkList)
926
-
927
- # Create improved chunking prompt that returns complete chunk text
928
- prompt = f"""## Task
929
- Analyze this academic document and create logical educational chunks. Each chunk should contain 2-3 related educational concepts or lessons that a student would naturally learn together.
930
-
931
- ## Step-by-Step Process
932
- 1. **Scan the document** to identify main topics and educational concepts
933
- 2. **Group related paragraphs** that teach connected ideas (even if separated by figures)
934
- 3. **Create separate chunks** for figures/tables with their captions
935
- 4. **Ensure each chunk** contains 2-3 educational lessons that build on each other
936
- 5. **Preserve all formatting** exactly as written
937
-
938
- ## Chunking Rules
939
-
940
- ### Content Rules
941
- - **Combine related content**: If a concept is split by a figure placement, reunite the related paragraphs in one chunk
942
- - **2-3 educational lessons per chunk**: Each chunk should teach 2-3 connected concepts that logically belong together
943
- - **Preserve complete thoughts**: Never split sentences, mathematical expressions, or LaTeX formulas
944
- - **Skip metadata sections**: Exclude abstracts, acknowledgments, references, author info, page numbers
945
-
946
- ### Formatting Rules
947
- - **Preserve exactly**: All markdown, LaTeX, mathematical notation, and formatting
948
- - **Include paragraph breaks**: Maintain original \\n\\n paragraph separations
949
- - **Remove artifacts**: Strip page numbers, headers, footers, and formatting metadata
950
-
951
- ### Special Elements
952
- - **Figures/Tables/Images**: Create separate chunks containing the full caption and any accompanying text
953
- - **Mathematical expressions**: Keep complete formulas together, never split LaTeX
954
- - **Code blocks**: Preserve in their entirety with proper formatting
955
-
956
- ## Output Format
957
- Return a JSON object with this exact schema:
958
-
959
- ```json
960
- {{
961
- "chunks": [
962
- {{
963
- "topic": "Brief descriptive name (2-6 words) for the educational content",
964
- "text": "Complete chunk text with exact markdown/LaTeX formatting preserved"
965
- }}
966
- ]
967
- }}
968
- ```
969
-
970
- ## Quality Criteria
971
- **Good chunks:**
972
- - Contain 2-3 related educational concepts
973
- - Are 150-500 words (optimal learning unit size)
974
- - Have clear educational value and logical flow
975
- - Preserve all original formatting perfectly
976
-
977
- **Avoid:**
978
- - Single-sentence chunks
979
- - Chunks with >5 unrelated concepts
980
- - Split mathematical expressions
981
- - Metadata or reference content
982
-
983
- ## Examples
984
-
985
- **Good chunk example:**
986
- ```json
987
- {{
988
- "chunks": [
989
- {{
990
- "topic": "Gradient Descent Fundamentals",
991
- "text": "## Gradient Descent Algorithm\\n\\nGradient descent is an optimization algorithm used to minimize functions...\\n\\n### Mathematical Formulation\\n\\nThe update rule is given by:\\n\\n$\\theta_{{t+1}} = \\theta_t - \\alpha \\nabla f(\\theta_t)$\\n\\nwhere $\\alpha$ is the learning rate..."
992
- }}
993
- ]
994
- }}
995
- ```
996
-
997
- **Bad chunk example:**
998
- ```json
999
- {{
1000
- "chunks": [
1001
- {{
1002
- "topic": "Introduction",
1003
- "text": "This paper presents..."
1004
- }}
1005
- ]
1006
- }}
1007
- ```
1008
- (Too brief, not educational content)
1009
-
1010
- ---
1011
-
1012
- ## Document to Process:
1013
- {document_markdown}
1014
-
1015
- Please analyze the document and return the JSON object with chunks following the above guidelines.
1016
- """
1017
-
1018
- # Call Fireworks with structured output
1019
- print("🚀 Calling Fireworks for document chunking...")
1020
- try:
1021
- chunk_response = structured_llm.invoke(prompt)
1022
- print(f"📋 Raw response type: {type(chunk_response)}")
1023
- print(f"📋 Raw response: {chunk_response}")
1024
- except Exception as invoke_error:
1025
- print(f"❌ Error during Fireworks invoke: {invoke_error}")
1026
- return [], document_markdown
1027
-
1028
- if chunk_response is None:
1029
- print("❌ Received None response from Fireworks")
1030
- return [], document_markdown
1031
-
1032
- if not hasattr(chunk_response, 'chunks'):
1033
- print(f"❌ Response missing 'chunks' attribute: {type(chunk_response)}")
1034
- print(f"Response content: {chunk_response}")
1035
- return [], document_markdown
1036
-
1037
- chunks = chunk_response.chunks
1038
- if not chunks:
1039
- print("⚠️ No chunks returned from Fireworks")
1040
- return [], document_markdown
1041
-
1042
- # Process chunks with direct text (no fuzzy matching needed)
1043
- processed_chunks = []
1044
- for i, chunk in enumerate(chunks):
1045
- print(f"\n📝 Processing chunk {i+1}: {chunk.topic}")
1046
-
1047
- # Check if chunk has the expected 'text' attribute
1048
- if not hasattr(chunk, 'text'):
1049
- print(f"❌ Chunk missing 'text' attribute: {chunk}")
1050
- continue
1051
-
1052
- print(f" Text preview: '{chunk.text[:100]}...'")
1053
-
1054
- processed_chunks.append({
1055
- "topic": chunk.topic,
1056
- "text": chunk.text,
1057
- "chunk_index": i
1058
- })
1059
-
1060
- print(f"📊 Processed {len(processed_chunks)} chunks with direct text")
1061
-
1062
- return processed_chunks, document_markdown
1063
-
1064
- except Exception as e:
1065
- import traceback
1066
- print(f"❌ Auto-chunking error: {e}")
1067
- print(f"❌ Full traceback: {traceback.format_exc()}")
1068
- return [], document_markdown
1069
-
1070
- @app.post("/chunk_page")
1071
- async def chunk_page(request: dict):
1072
- """Analyze a page and suggest chunks for lessons using Fireworks AI with structured output"""
1073
- print(f"🧠 Chunking page...")
1074
-
1075
- page_markdown = request.get("markdown", "")
1076
- if not page_markdown:
1077
- raise HTTPException(status_code=400, detail="No markdown provided")
1078
-
1079
- # Get Fireworks API key
1080
- fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
1081
- if not fireworks_api_key:
1082
- raise HTTPException(status_code=500, detail="FIREWORKS_API_KEY not set")
1083
-
1084
- try:
1085
- # Initialize Fireworks LLM with structured output
1086
- llm = init_chat_model(
1087
- "accounts/fireworks/models/llama4-maverick-instruct-basic",
1088
- model_provider="fireworks",
1089
- api_key=fireworks_api_key
1090
- )
1091
-
1092
- # Create structured LLM that returns ChunkList object
1093
- structured_llm = llm.with_structured_output(ChunkList)
1094
-
1095
- # Create chunking prompt
1096
- prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.
1097
-
1098
- DOCUMENT PAGE:
1099
- {page_markdown}
1100
-
1101
- Rules:
1102
- 1. Each chunk should contain 2-3 valuable lessons
1103
- 2. start_phrase and end_phrase should be 5-15 words long
1104
- 3. Focus on educational content (concepts, examples, key points)
1105
- 4. More dense content should have more chunks, less dense content fewer chunks
1106
- 5. Identify chunks that would make good interactive lessons
1107
- 6. SKIP chunks from abstract, references, author information, page numbers, etc.
1108
-
1109
- Return a list of chunks with topic, start_phrase, and end_phrase for each."""
1110
-
1111
- # Call Fireworks with structured output
1112
- print("🚀 Calling Fireworks for chunking...")
1113
- chunk_response = structured_llm.invoke(prompt)
1114
- chunks = chunk_response.chunks
1115
- print(f"📝 Received {len(chunks)} chunks from Fireworks")
1116
-
1117
- # Process chunks with direct text (no fuzzy matching needed)
1118
- processed_chunks = []
1119
- for i, chunk in enumerate(chunks):
1120
- processed_chunks.append({
1121
- "topic": chunk.topic,
1122
- "text": chunk.text,
1123
- "chunk_index": i
1124
- })
1125
- print(f"✅ Processed chunk: {chunk.topic}")
1126
-
1127
- print(f"📊 Successfully processed {len(processed_chunks)} chunks")
1128
-
1129
- return {
1130
- "chunks": processed_chunks,
1131
- "total_found": len(processed_chunks),
1132
- "total_suggested": len(chunks)
1133
- }
1134
-
1135
- except Exception as e:
1136
- import traceback
1137
- print(f"❌ Error chunking page: {e}")
1138
- print(f"❌ Full traceback: {traceback.format_exc()}")
1139
- raise HTTPException(status_code=500, detail=f"Error chunking page: {str(e)}")
1140
-
1141
- @app.post("/start_chunk_lesson/{file_id}/{chunk_index}")
1142
- async def start_chunk_lesson(file_id: str, chunk_index: int, request: dict):
1143
- """Start a Socratic teaching session for a specific chunk using Claude"""
1144
- print(f"🎓 Starting lesson for chunk {chunk_index} in file {file_id}")
1145
-
1146
- # Get Anthropic API key
1147
- anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
1148
- if not anthropic_api_key:
1149
- raise HTTPException(status_code=500, detail="ANTHROPIC_API_KEY not set")
1150
-
1151
- try:
1152
- # Extract data from request
1153
- chunk_data = request.get("chunk", {})
1154
- document_markdown = request.get("document_markdown", "")
1155
-
1156
- if not chunk_data or not document_markdown:
1157
- raise HTTPException(status_code=400, detail="Missing chunk data or document markdown")
1158
-
1159
- # Get the specific chunk text for focus
1160
- start_pos = chunk_data.get("start_position")
1161
- end_pos = chunk_data.get("end_position")
1162
-
1163
- if start_pos is not None and end_pos is not None:
1164
- chunk_text = document_markdown[start_pos:end_pos]
1165
- print(f"📍 Extracted chunk text: {chunk_text[:100]}...")
1166
- else:
1167
- chunk_text = f"Focus area: {chunk_data.get('topic', 'Selected content')}"
1168
- print("⚠️ No positions found, using topic as fallback")
1169
-
1170
- # Initialize Anthropic client
1171
- client = anthropic.Anthropic(api_key=anthropic_api_key)
1172
-
1173
- # Create the system prompt for Socratic teaching
1174
- system_prompt = """You are a Socratic teacher. Your goal is to help students deeply understand concepts through guided questioning, not by giving direct answers.
1175
-
1176
- Your teaching approach:
1177
- 1. Ask thoughtful questions that guide discovery
1178
- 2. Be encouraging and patient
1179
- 3. Help students think critically about the material
1180
- 4. Don't give direct answers - guide them to find answers themselves
1181
- 5. Start with 2-3 opening questions to check understanding and spark curiosity
1182
-
1183
- Focus on the specific chunk highlighted, but use the full document context to create meaningful questions."""
1184
-
1185
- # Create the user prompt with FULL document + focused chunk
1186
- user_prompt = f"""Here's a complete educational document, with a specific section that the student wants to focus on:
1187
-
1188
- FULL DOCUMENT:
1189
- {document_markdown}
1190
-
1191
- FOCUSED SECTION:
1192
- {chunk_text}
1193
-
1194
- The student has selected the "FOCUSED SECTION" to study. Please create 2-3 Socratic questions that will help them deeply understand this specific section, while drawing on the broader document context when helpful.
1195
-
1196
- Make the questions specific and thought-provoking to encourage critical thinking about the focused content."""
1197
-
1198
- # Call Claude
1199
- print("🤖 Calling Claude for Socratic questions...")
1200
- response = client.messages.create(
1201
- model="claude-sonnet-4-20250514",
1202
- max_tokens=1000,
1203
- system=system_prompt,
1204
- messages=[
1205
- {"role": "user", "content": user_prompt}
1206
- ]
1207
- )
1208
-
1209
- # Extract the response text
1210
- questions_text = response.content[0].text
1211
- print(f"✅ Received Socratic questions from Claude")
1212
 
1213
  return {
1214
- "chat_id": f"{file_id}_{chunk_index}_{hash(chunk_text)}",
1215
- "questions": questions_text,
1216
- "chunk_text": chunk_text
 
 
1217
  }
1218
 
1219
  except Exception as e:
1220
- import traceback
1221
- print(f" Error starting chunk lesson: {e}")
1222
- print(f"❌ Full traceback: {traceback.format_exc()}")
1223
- raise HTTPException(status_code=500, detail=f"Error starting lesson: {str(e)}")
1224
 
1225
  # Mount static files for production deployment
1226
  frontend_path = os.path.join(os.path.dirname(__file__), "..", "frontend")
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.responses import FileResponse
 
5
  import os
6
  import tempfile
 
 
 
7
  from dotenv import load_dotenv
8
+ from pydantic import BaseModel
 
9
  from typing import Optional, List
 
10
  import anthropic
11
+
 
12
  # Load environment variables
13
  load_dotenv()
14
 
 
100
 
101
  @app.post("/upload_pdf")
102
  async def upload_pdf(file: UploadFile = File(...)):
103
+ """Simple PDF upload endpoint that saves the file locally"""
104
+ print(f"📄 Uploading file: {file.filename}")
 
 
 
 
 
 
105
 
106
  try:
 
 
 
 
107
  # Read PDF bytes
108
  file_bytes = await file.read()
109
  print(f"📊 File size: {len(file_bytes)} bytes")
110
 
111
+ # Create temporary file to save PDF
112
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
113
  temp_file.write(file_bytes)
114
  temp_file_path = temp_file.name
115
 
116
+ print(f"✅ PDF saved to: {temp_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  return {
119
+ "message": "PDF uploaded successfully!",
120
+ "file_path": temp_file_path,
121
+ "filename": file.filename,
122
+ "status": "uploaded",
123
+ "size": len(file_bytes)
124
  }
125
 
126
  except Exception as e:
127
+ print(f"❌ Error uploading PDF: {e}")
128
+ raise HTTPException(status_code=500, detail=f"PDF upload error: {str(e)}")
 
 
129
 
130
  # Mount static files for production deployment
131
  frontend_path = os.path.join(os.path.dirname(__file__), "..", "frontend")
backend/requirements.txt CHANGED
@@ -1,13 +1,6 @@
1
  uvicorn[standard]
2
  fastapi==0.115.7
3
  python-multipart>=0.0.5
4
- mistralai
5
  python-dotenv
6
- fireworks-ai
7
- langchain[fireworks]
8
- langchain
9
- langchain-core
10
- langchain-fireworks
11
  pydantic
12
- anthropic
13
- google-genai
 
1
  uvicorn[standard]
2
  fastapi==0.115.7
3
  python-multipart>=0.0.5
 
4
  python-dotenv
 
 
 
 
 
5
  pydantic
6
+ anthropic
 
frontend/components.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://ui.shadcn.com/schema.json",
3
+ "style": "new-york",
4
+ "rsc": false,
5
+ "tsx": false,
6
+ "tailwind": {
7
+ "config": "tailwind.config.js",
8
+ "css": "src/index.css",
9
+ "baseColor": "neutral",
10
+ "cssVariables": true,
11
+ "prefix": ""
12
+ },
13
+ "aliases": {
14
+ "components": "@/components",
15
+ "utils": "@/lib/utils",
16
+ "ui": "@/components/ui",
17
+ "lib": "@/lib",
18
+ "hooks": "@/hooks"
19
+ },
20
+ "iconLibrary": "lucide"
21
+ }
frontend/jsconfig.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "baseUrl": ".",
4
+ "paths": {
5
+ "@/*": ["./src/*"]
6
+ }
7
+ },
8
+ "include": ["src"]
9
+ }
frontend/package-lock.json CHANGED
@@ -8,11 +8,16 @@
8
  "name": "frontend",
9
  "version": "0.0.0",
10
  "dependencies": {
 
11
  "@llamaindex/chat-ui": "^0.5.17",
12
  "@swc/helpers": "^0.5.17",
13
  "@tailwindcss/postcss": "^4.1.11",
 
14
  "autoprefixer": "^10.4.21",
 
 
15
  "katex": "^0.16.22",
 
16
  "postcss": "^8.5.6",
17
  "react": "^18.3.1",
18
  "react-dom": "^18.3.1",
@@ -23,6 +28,7 @@
23
  "rehype-katex": "^7.0.1",
24
  "rehype-raw": "^7.0.0",
25
  "remark-math": "^6.0.0",
 
26
  "tailwindcss": "^4.1.11"
27
  },
28
  "devDependencies": {
@@ -34,9 +40,80 @@
34
  "eslint-plugin-react-hooks": "^5.2.0",
35
  "eslint-plugin-react-refresh": "^0.4.20",
36
  "globals": "^16.3.0",
 
37
  "vite": "^7.0.4"
38
  }
39
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  "node_modules/@alloc/quick-lru": {
41
  "version": "5.2.0",
42
  "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz",
@@ -3186,6 +3263,15 @@
3186
  "integrity": "sha512-7NXolsK4CAS5+xvdj5OMMbI962hU/wvwoxk+LWR9Ek9bVtyuuYScDN6eS0rUm6TxApFpw7CX1o4uJzcd4AyD3Q==",
3187
  "license": "MIT"
3188
  },
 
 
 
 
 
 
 
 
 
3189
  "node_modules/@llamaindex/chat-ui/node_modules/make-cancellable-promise": {
3190
  "version": "1.3.2",
3191
  "resolved": "https://registry.npmjs.org/make-cancellable-promise/-/make-cancellable-promise-1.3.2.tgz",
@@ -4812,6 +4898,16 @@
4812
  "inline-style-parser": "0.1.1"
4813
  }
4814
  },
 
 
 
 
 
 
 
 
 
 
4815
  "node_modules/@llamaindex/chat-ui/node_modules/unified": {
4816
  "version": "10.1.2",
4817
  "resolved": "https://registry.npmjs.org/unified/-/unified-10.1.2.tgz",
@@ -5117,6 +5213,15 @@
5117
  "integrity": "sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==",
5118
  "license": "MIT"
5119
  },
 
 
 
 
 
 
 
 
 
5120
  "node_modules/@radix-ui/colors": {
5121
  "version": "3.0.0",
5122
  "resolved": "https://registry.npmjs.org/@radix-ui/colors/-/colors-3.0.0.tgz",
@@ -5850,6 +5955,12 @@
5850
  "win32"
5851
  ]
5852
  },
 
 
 
 
 
 
5853
  "node_modules/@stitches/core": {
5854
  "version": "1.2.8",
5855
  "resolved": "https://registry.npmjs.org/@stitches/core/-/core-1.2.8.tgz",
@@ -6384,6 +6495,24 @@
6384
  "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
6385
  }
6386
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6387
  "node_modules/ajv": {
6388
  "version": "6.12.6",
6389
  "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
@@ -7405,6 +7534,15 @@
7405
  "es5-ext": "~0.10.14"
7406
  }
7407
  },
 
 
 
 
 
 
 
 
 
7408
  "node_modules/expand-template": {
7409
  "version": "2.0.3",
7410
  "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
@@ -8151,6 +8289,12 @@
8151
  "dev": true,
8152
  "license": "MIT"
8153
  },
 
 
 
 
 
 
8154
  "node_modules/json-schema-traverse": {
8155
  "version": "0.4.1",
8156
  "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
@@ -8551,12 +8695,12 @@
8551
  }
8552
  },
8553
  "node_modules/lucide-react": {
8554
- "version": "0.453.0",
8555
- "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.453.0.tgz",
8556
- "integrity": "sha512-kL+RGZCcJi9BvJtzg2kshO192Ddy9hv3ij+cPrVPWSRzgCWCVazoQJxOjAwgK53NomL07HB7GPHW120FimjNhQ==",
8557
  "license": "ISC",
8558
  "peerDependencies": {
8559
- "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0-rc"
8560
  }
8561
  },
8562
  "node_modules/lz-string": {
@@ -13899,6 +14043,19 @@
13899
  "node": ">=8"
13900
  }
13901
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
13902
  "node_modules/tabbable": {
13903
  "version": "6.2.0",
13904
  "resolved": "https://registry.npmjs.org/tabbable/-/tabbable-6.2.0.tgz",
@@ -13906,9 +14063,9 @@
13906
  "license": "MIT"
13907
  },
13908
  "node_modules/tailwind-merge": {
13909
- "version": "2.6.0",
13910
- "resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-2.6.0.tgz",
13911
- "integrity": "sha512-P+Vu1qXfzediirmHOC3xKGAYeZtPcV9g76X+xg2FD4tYgR71ewMA35Y3sCz3zhiN/dwefRpJX0yBcgwi1fXNQA==",
13912
  "license": "MIT",
13913
  "funding": {
13914
  "type": "github",
@@ -13993,6 +14150,18 @@
13993
  "node": ">=18"
13994
  }
13995
  },
 
 
 
 
 
 
 
 
 
 
 
 
13996
  "node_modules/tiny-invariant": {
13997
  "version": "1.3.3",
13998
  "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz",
@@ -14127,6 +14296,16 @@
14127
  "node": "*"
14128
  }
14129
  },
 
 
 
 
 
 
 
 
 
 
14130
  "node_modules/type": {
14131
  "version": "2.7.3",
14132
  "resolved": "https://registry.npmjs.org/type/-/type-2.7.3.tgz",
@@ -14376,6 +14555,15 @@
14376
  }
14377
  }
14378
  },
 
 
 
 
 
 
 
 
 
14379
  "node_modules/util-deprecate": {
14380
  "version": "1.0.2",
14381
  "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
@@ -14627,6 +14815,25 @@
14627
  "url": "https://github.com/sponsors/sindresorhus"
14628
  }
14629
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14630
  "node_modules/zwitch": {
14631
  "version": "2.0.4",
14632
  "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz",
 
8
  "name": "frontend",
9
  "version": "0.0.0",
10
  "dependencies": {
11
+ "@ai-sdk/react": "^2.0.11",
12
  "@llamaindex/chat-ui": "^0.5.17",
13
  "@swc/helpers": "^0.5.17",
14
  "@tailwindcss/postcss": "^4.1.11",
15
+ "ai": "^5.0.11",
16
  "autoprefixer": "^10.4.21",
17
+ "class-variance-authority": "^0.7.1",
18
+ "clsx": "^2.1.1",
19
  "katex": "^0.16.22",
20
+ "lucide-react": "^0.539.0",
21
  "postcss": "^8.5.6",
22
  "react": "^18.3.1",
23
  "react-dom": "^18.3.1",
 
28
  "rehype-katex": "^7.0.1",
29
  "rehype-raw": "^7.0.0",
30
  "remark-math": "^6.0.0",
31
+ "tailwind-merge": "^3.3.1",
32
  "tailwindcss": "^4.1.11"
33
  },
34
  "devDependencies": {
 
40
  "eslint-plugin-react-hooks": "^5.2.0",
41
  "eslint-plugin-react-refresh": "^0.4.20",
42
  "globals": "^16.3.0",
43
+ "tw-animate-css": "^1.3.6",
44
  "vite": "^7.0.4"
45
  }
46
  },
47
+ "node_modules/@ai-sdk/gateway": {
48
+ "version": "1.0.5",
49
+ "resolved": "https://registry.npmjs.org/@ai-sdk/gateway/-/gateway-1.0.5.tgz",
50
+ "integrity": "sha512-GOhxiHm2nfuS618Ia13AWxEIhCsj5+tFaw6sjSO7pvMZT03QgFAJyX4xBYj+3i3mfIvw+yJOvyhVu1fI+pAHQA==",
51
+ "license": "Apache-2.0",
52
+ "dependencies": {
53
+ "@ai-sdk/provider": "2.0.0",
54
+ "@ai-sdk/provider-utils": "3.0.2"
55
+ },
56
+ "engines": {
57
+ "node": ">=18"
58
+ },
59
+ "peerDependencies": {
60
+ "zod": "^3.25.76 || ^4"
61
+ }
62
+ },
63
+ "node_modules/@ai-sdk/provider": {
64
+ "version": "2.0.0",
65
+ "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-2.0.0.tgz",
66
+ "integrity": "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==",
67
+ "license": "Apache-2.0",
68
+ "dependencies": {
69
+ "json-schema": "^0.4.0"
70
+ },
71
+ "engines": {
72
+ "node": ">=18"
73
+ }
74
+ },
75
+ "node_modules/@ai-sdk/provider-utils": {
76
+ "version": "3.0.2",
77
+ "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-3.0.2.tgz",
78
+ "integrity": "sha512-0a5a6VafkV6+0irdpqnub8WE6qzG2VMsDBpXb9NQIz8c4TG8fI+GSTFIL9sqrLEwXrHdiRj7fwJsrir4jClL0w==",
79
+ "license": "Apache-2.0",
80
+ "dependencies": {
81
+ "@ai-sdk/provider": "2.0.0",
82
+ "@standard-schema/spec": "^1.0.0",
83
+ "eventsource-parser": "^3.0.3",
84
+ "zod-to-json-schema": "^3.24.1"
85
+ },
86
+ "engines": {
87
+ "node": ">=18"
88
+ },
89
+ "peerDependencies": {
90
+ "zod": "^3.25.76 || ^4"
91
+ }
92
+ },
93
+ "node_modules/@ai-sdk/react": {
94
+ "version": "2.0.11",
95
+ "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-2.0.11.tgz",
96
+ "integrity": "sha512-XL73e7RSOQjYRCJQ96sDY6TxrMJK9YBgI518E6Jy306BjRwy5XyY94e/DN71TE6VpiwDzxixlymfDK90Ro95Jg==",
97
+ "license": "Apache-2.0",
98
+ "dependencies": {
99
+ "@ai-sdk/provider-utils": "3.0.2",
100
+ "ai": "5.0.11",
101
+ "swr": "^2.2.5",
102
+ "throttleit": "2.1.0"
103
+ },
104
+ "engines": {
105
+ "node": ">=18"
106
+ },
107
+ "peerDependencies": {
108
+ "react": "^18 || ^19 || ^19.0.0-rc",
109
+ "zod": "^3.25.76 || ^4"
110
+ },
111
+ "peerDependenciesMeta": {
112
+ "zod": {
113
+ "optional": true
114
+ }
115
+ }
116
+ },
117
  "node_modules/@alloc/quick-lru": {
118
  "version": "5.2.0",
119
  "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz",
 
3263
  "integrity": "sha512-7NXolsK4CAS5+xvdj5OMMbI962hU/wvwoxk+LWR9Ek9bVtyuuYScDN6eS0rUm6TxApFpw7CX1o4uJzcd4AyD3Q==",
3264
  "license": "MIT"
3265
  },
3266
+ "node_modules/@llamaindex/chat-ui/node_modules/lucide-react": {
3267
+ "version": "0.453.0",
3268
+ "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.453.0.tgz",
3269
+ "integrity": "sha512-kL+RGZCcJi9BvJtzg2kshO192Ddy9hv3ij+cPrVPWSRzgCWCVazoQJxOjAwgK53NomL07HB7GPHW120FimjNhQ==",
3270
+ "license": "ISC",
3271
+ "peerDependencies": {
3272
+ "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0-rc"
3273
+ }
3274
+ },
3275
  "node_modules/@llamaindex/chat-ui/node_modules/make-cancellable-promise": {
3276
  "version": "1.3.2",
3277
  "resolved": "https://registry.npmjs.org/make-cancellable-promise/-/make-cancellable-promise-1.3.2.tgz",
 
4898
  "inline-style-parser": "0.1.1"
4899
  }
4900
  },
4901
+ "node_modules/@llamaindex/chat-ui/node_modules/tailwind-merge": {
4902
+ "version": "2.6.0",
4903
+ "resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-2.6.0.tgz",
4904
+ "integrity": "sha512-P+Vu1qXfzediirmHOC3xKGAYeZtPcV9g76X+xg2FD4tYgR71ewMA35Y3sCz3zhiN/dwefRpJX0yBcgwi1fXNQA==",
4905
+ "license": "MIT",
4906
+ "funding": {
4907
+ "type": "github",
4908
+ "url": "https://github.com/sponsors/dcastil"
4909
+ }
4910
+ },
4911
  "node_modules/@llamaindex/chat-ui/node_modules/unified": {
4912
  "version": "10.1.2",
4913
  "resolved": "https://registry.npmjs.org/unified/-/unified-10.1.2.tgz",
 
5213
  "integrity": "sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==",
5214
  "license": "MIT"
5215
  },
5216
+ "node_modules/@opentelemetry/api": {
5217
+ "version": "1.9.0",
5218
+ "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
5219
+ "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
5220
+ "license": "Apache-2.0",
5221
+ "engines": {
5222
+ "node": ">=8.0.0"
5223
+ }
5224
+ },
5225
  "node_modules/@radix-ui/colors": {
5226
  "version": "3.0.0",
5227
  "resolved": "https://registry.npmjs.org/@radix-ui/colors/-/colors-3.0.0.tgz",
 
5955
  "win32"
5956
  ]
5957
  },
5958
+ "node_modules/@standard-schema/spec": {
5959
+ "version": "1.0.0",
5960
+ "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.0.0.tgz",
5961
+ "integrity": "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==",
5962
+ "license": "MIT"
5963
+ },
5964
  "node_modules/@stitches/core": {
5965
  "version": "1.2.8",
5966
  "resolved": "https://registry.npmjs.org/@stitches/core/-/core-1.2.8.tgz",
 
6495
  "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
6496
  }
6497
  },
6498
+ "node_modules/ai": {
6499
+ "version": "5.0.11",
6500
+ "resolved": "https://registry.npmjs.org/ai/-/ai-5.0.11.tgz",
6501
+ "integrity": "sha512-PtiQAnhlWuN3Y2z9PifM/9XIQ0HIoHjZqEu7zHffyGEXiqHLtrJpt4IiGVzUTAKxXM5JCtO9sD/hwGXDp7ZYsw==",
6502
+ "license": "Apache-2.0",
6503
+ "dependencies": {
6504
+ "@ai-sdk/gateway": "1.0.5",
6505
+ "@ai-sdk/provider": "2.0.0",
6506
+ "@ai-sdk/provider-utils": "3.0.2",
6507
+ "@opentelemetry/api": "1.9.0"
6508
+ },
6509
+ "engines": {
6510
+ "node": ">=18"
6511
+ },
6512
+ "peerDependencies": {
6513
+ "zod": "^3.25.76 || ^4"
6514
+ }
6515
+ },
6516
  "node_modules/ajv": {
6517
  "version": "6.12.6",
6518
  "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
 
7534
  "es5-ext": "~0.10.14"
7535
  }
7536
  },
7537
+ "node_modules/eventsource-parser": {
7538
+ "version": "3.0.3",
7539
+ "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.3.tgz",
7540
+ "integrity": "sha512-nVpZkTMM9rF6AQ9gPJpFsNAMt48wIzB5TQgiTLdHiuO8XEDhUgZEhqKlZWXbIzo9VmJ/HvysHqEaVeD5v9TPvA==",
7541
+ "license": "MIT",
7542
+ "engines": {
7543
+ "node": ">=20.0.0"
7544
+ }
7545
+ },
7546
  "node_modules/expand-template": {
7547
  "version": "2.0.3",
7548
  "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
 
8289
  "dev": true,
8290
  "license": "MIT"
8291
  },
8292
+ "node_modules/json-schema": {
8293
+ "version": "0.4.0",
8294
+ "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
8295
+ "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
8296
+ "license": "(AFL-2.1 OR BSD-3-Clause)"
8297
+ },
8298
  "node_modules/json-schema-traverse": {
8299
  "version": "0.4.1",
8300
  "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
 
8695
  }
8696
  },
8697
  "node_modules/lucide-react": {
8698
+ "version": "0.539.0",
8699
+ "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.539.0.tgz",
8700
+ "integrity": "sha512-VVISr+VF2krO91FeuCrm1rSOLACQUYVy7NQkzrOty52Y8TlTPcXcMdQFj9bYzBgXbWCiywlwSZ3Z8u6a+6bMlg==",
8701
  "license": "ISC",
8702
  "peerDependencies": {
8703
+ "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
8704
  }
8705
  },
8706
  "node_modules/lz-string": {
 
14043
  "node": ">=8"
14044
  }
14045
  },
14046
+ "node_modules/swr": {
14047
+ "version": "2.3.6",
14048
+ "resolved": "https://registry.npmjs.org/swr/-/swr-2.3.6.tgz",
14049
+ "integrity": "sha512-wfHRmHWk/isGNMwlLGlZX5Gzz/uTgo0o2IRuTMcf4CPuPFJZlq0rDaKUx+ozB5nBOReNV1kiOyzMfj+MBMikLw==",
14050
+ "license": "MIT",
14051
+ "dependencies": {
14052
+ "dequal": "^2.0.3",
14053
+ "use-sync-external-store": "^1.4.0"
14054
+ },
14055
+ "peerDependencies": {
14056
+ "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
14057
+ }
14058
+ },
14059
  "node_modules/tabbable": {
14060
  "version": "6.2.0",
14061
  "resolved": "https://registry.npmjs.org/tabbable/-/tabbable-6.2.0.tgz",
 
14063
  "license": "MIT"
14064
  },
14065
  "node_modules/tailwind-merge": {
14066
+ "version": "3.3.1",
14067
+ "resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-3.3.1.tgz",
14068
+ "integrity": "sha512-gBXpgUm/3rp1lMZZrM/w7D8GKqshif0zAymAhbCyIt8KMe+0v9DQ7cdYLR4FHH/cKpdTXb+A/tKKU3eolfsI+g==",
14069
  "license": "MIT",
14070
  "funding": {
14071
  "type": "github",
 
14150
  "node": ">=18"
14151
  }
14152
  },
14153
+ "node_modules/throttleit": {
14154
+ "version": "2.1.0",
14155
+ "resolved": "https://registry.npmjs.org/throttleit/-/throttleit-2.1.0.tgz",
14156
+ "integrity": "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==",
14157
+ "license": "MIT",
14158
+ "engines": {
14159
+ "node": ">=18"
14160
+ },
14161
+ "funding": {
14162
+ "url": "https://github.com/sponsors/sindresorhus"
14163
+ }
14164
+ },
14165
  "node_modules/tiny-invariant": {
14166
  "version": "1.3.3",
14167
  "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz",
 
14296
  "node": "*"
14297
  }
14298
  },
14299
+ "node_modules/tw-animate-css": {
14300
+ "version": "1.3.6",
14301
+ "resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.3.6.tgz",
14302
+ "integrity": "sha512-9dy0R9UsYEGmgf26L8UcHiLmSFTHa9+D7+dAt/G/sF5dCnPePZbfgDYinc7/UzAM7g/baVrmS6m9yEpU46d+LA==",
14303
+ "dev": true,
14304
+ "license": "MIT",
14305
+ "funding": {
14306
+ "url": "https://github.com/sponsors/Wombosvideo"
14307
+ }
14308
+ },
14309
  "node_modules/type": {
14310
  "version": "2.7.3",
14311
  "resolved": "https://registry.npmjs.org/type/-/type-2.7.3.tgz",
 
14555
  }
14556
  }
14557
  },
14558
+ "node_modules/use-sync-external-store": {
14559
+ "version": "1.5.0",
14560
+ "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.5.0.tgz",
14561
+ "integrity": "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A==",
14562
+ "license": "MIT",
14563
+ "peerDependencies": {
14564
+ "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
14565
+ }
14566
+ },
14567
  "node_modules/util-deprecate": {
14568
  "version": "1.0.2",
14569
  "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
 
14815
  "url": "https://github.com/sponsors/sindresorhus"
14816
  }
14817
  },
14818
+ "node_modules/zod": {
14819
+ "version": "3.25.76",
14820
+ "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
14821
+ "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
14822
+ "license": "MIT",
14823
+ "peer": true,
14824
+ "funding": {
14825
+ "url": "https://github.com/sponsors/colinhacks"
14826
+ }
14827
+ },
14828
+ "node_modules/zod-to-json-schema": {
14829
+ "version": "3.24.6",
14830
+ "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.24.6.tgz",
14831
+ "integrity": "sha512-h/z3PKvcTcTetyjl1fkj79MHNEjm+HpD6NXheWjzOekY7kV+lwDYnHw+ivHkijnCSMz1yJaWBD9vu/Fcmk+vEg==",
14832
+ "license": "ISC",
14833
+ "peerDependencies": {
14834
+ "zod": "^3.24.1"
14835
+ }
14836
+ },
14837
  "node_modules/zwitch": {
14838
  "version": "2.0.4",
14839
  "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz",
frontend/package.json CHANGED
@@ -10,11 +10,16 @@
10
  "preview": "vite preview"
11
  },
12
  "dependencies": {
 
13
  "@llamaindex/chat-ui": "^0.5.17",
14
  "@swc/helpers": "^0.5.17",
15
  "@tailwindcss/postcss": "^4.1.11",
 
16
  "autoprefixer": "^10.4.21",
 
 
17
  "katex": "^0.16.22",
 
18
  "postcss": "^8.5.6",
19
  "react": "^18.3.1",
20
  "react-dom": "^18.3.1",
@@ -25,6 +30,7 @@
25
  "rehype-katex": "^7.0.1",
26
  "rehype-raw": "^7.0.0",
27
  "remark-math": "^6.0.0",
 
28
  "tailwindcss": "^4.1.11"
29
  },
30
  "devDependencies": {
@@ -36,6 +42,7 @@
36
  "eslint-plugin-react-hooks": "^5.2.0",
37
  "eslint-plugin-react-refresh": "^0.4.20",
38
  "globals": "^16.3.0",
 
39
  "vite": "^7.0.4"
40
  }
41
  }
 
10
  "preview": "vite preview"
11
  },
12
  "dependencies": {
13
+ "@ai-sdk/react": "^2.0.11",
14
  "@llamaindex/chat-ui": "^0.5.17",
15
  "@swc/helpers": "^0.5.17",
16
  "@tailwindcss/postcss": "^4.1.11",
17
+ "ai": "^5.0.11",
18
  "autoprefixer": "^10.4.21",
19
+ "class-variance-authority": "^0.7.1",
20
+ "clsx": "^2.1.1",
21
  "katex": "^0.16.22",
22
+ "lucide-react": "^0.539.0",
23
  "postcss": "^8.5.6",
24
  "react": "^18.3.1",
25
  "react-dom": "^18.3.1",
 
30
  "rehype-katex": "^7.0.1",
31
  "rehype-raw": "^7.0.0",
32
  "remark-math": "^6.0.0",
33
+ "tailwind-merge": "^3.3.1",
34
  "tailwindcss": "^4.1.11"
35
  },
36
  "devDependencies": {
 
42
  "eslint-plugin-react-hooks": "^5.2.0",
43
  "eslint-plugin-react-refresh": "^0.4.20",
44
  "globals": "^16.3.0",
45
+ "tw-animate-css": "^1.3.6",
46
  "vite": "^7.0.4"
47
  }
48
  }
frontend/src/App.jsx CHANGED
@@ -1,15 +1,15 @@
1
  import { BrowserRouter as Router, Routes, Route } from 'react-router-dom';
2
  import Homepage from './components/Homepage';
3
- import UploadPage from './components/UploadPage';
4
  import DocumentProcessor from './components/DocumentProcessor';
 
5
 
6
  function App() {
7
  return (
8
  <Router>
9
  <Routes>
10
  <Route path="/" element={<Homepage />} />
11
- <Route path="/upload" element={<UploadPage />} />
12
  <Route path="/process" element={<DocumentProcessor />} />
 
13
  </Routes>
14
  </Router>
15
  );
 
1
  import { BrowserRouter as Router, Routes, Route } from 'react-router-dom';
2
  import Homepage from './components/Homepage';
 
3
  import DocumentProcessor from './components/DocumentProcessor';
4
+ import TestComponent from './components/TestComponent';
5
 
6
  function App() {
7
  return (
8
  <Router>
9
  <Routes>
10
  <Route path="/" element={<Homepage />} />
 
11
  <Route path="/process" element={<DocumentProcessor />} />
12
+ <Route path="/chat" element={<TestComponent />} />
13
  </Routes>
14
  </Router>
15
  );
frontend/src/components/ChunkPanel.jsx CHANGED
@@ -2,10 +2,9 @@ import ReactMarkdown from 'react-markdown';
2
  import remarkMath from 'remark-math';
3
  import rehypeKatex from 'rehype-katex';
4
  import rehypeRaw from 'rehype-raw';
5
- import { ChatSection, ChatMessages, ChatInput } from '@llamaindex/chat-ui';
6
- import '@llamaindex/chat-ui/styles/markdown.css';
7
  import { useState } from 'react';
8
- import { getChunkMarkdownComponents, getChatMarkdownComponents } from '../utils/markdownComponents.jsx';
 
9
 
10
  const ChunkPanel = ({
11
  documentData,
@@ -15,88 +14,10 @@ const ChunkPanel = ({
15
  chunkStates,
16
  skipChunk,
17
  markChunkUnderstood,
18
- startInteractiveLesson,
19
- fetchImage,
20
- imageCache,
21
- setImageCache
22
  }) => {
23
- const chunkMarkdownComponents = getChunkMarkdownComponents(documentData, fetchImage, imageCache, setImageCache);
24
  const chatMarkdownComponents = getChatMarkdownComponents();
25
-
26
- // Custom chat handler that mimics useChat API
27
- const [messages, setMessages] = useState([{
28
- id: 'welcome',
29
- role: 'assistant',
30
- content: `I'm here to help you understand this section: **${documentData?.chunks?.[currentChunkIndex]?.topic || 'Loading...'}**\n\nFeel free to ask me any questions about the content!`
31
- }]);
32
- const [input, setInput] = useState('');
33
- const [isLoading, setIsLoading] = useState(false);
34
-
35
- const handleInputChange = (e) => {
36
- setInput(e.target.value);
37
- };
38
-
39
- const handleSubmit = async (e) => {
40
- e.preventDefault();
41
- if (!input.trim() || isLoading) return;
42
-
43
- const userMessage = {
44
- id: Date.now().toString(),
45
- role: 'user',
46
- content: input
47
- };
48
-
49
- setMessages(prev => [...prev, userMessage]);
50
- setInput('');
51
- setIsLoading(true);
52
-
53
- try {
54
- // For now, using backend proxy to avoid exposing API key in frontend
55
- const response = await fetch('/api/anthropic-chat', {
56
- method: 'POST',
57
- headers: {
58
- 'Content-Type': 'application/json',
59
- },
60
- body: JSON.stringify({
61
- messages: [...messages, userMessage].map(msg => ({
62
- role: msg.role,
63
- content: msg.content
64
- })),
65
- context: {
66
- topic: documentData?.chunks?.[currentChunkIndex]?.topic,
67
- chunkText: documentData?.chunks?.[currentChunkIndex]?.text
68
- }
69
- })
70
- });
71
-
72
- const data = await response.json();
73
-
74
- const assistantMessage = {
75
- id: (Date.now() + 1).toString(),
76
- role: 'assistant',
77
- content: data.content || data.message
78
- };
79
-
80
- setMessages(prev => [...prev, assistantMessage]);
81
- } catch (error) {
82
- console.error('Chat error:', error);
83
- setMessages(prev => [...prev, {
84
- id: (Date.now() + 1).toString(),
85
- role: 'assistant',
86
- content: 'Sorry, I encountered an error. Please try again.'
87
- }]);
88
- } finally {
89
- setIsLoading(false);
90
- }
91
- };
92
-
93
- const chatHandler = {
94
- messages,
95
- input,
96
- handleInputChange,
97
- handleSubmit,
98
- isLoading
99
- };
100
 
101
  return (
102
  <>
@@ -124,49 +45,82 @@ const ChunkPanel = ({
124
  </span>
125
  </button>
126
 
127
- <button
128
- onClick={markChunkUnderstood}
129
- className="py-2 px-4 bg-gray-50 hover:bg-gray-100 text-gray-600 rounded-lg transition-all text-sm"
130
- >
131
-
132
- </button>
133
  </div>
134
 
135
  {/* Expandable Chunk Content */}
136
  {chunkExpanded && documentData?.chunks?.[currentChunkIndex] && (
137
- <div className="prose prose-sm max-w-none">
138
- <ReactMarkdown
139
- remarkPlugins={[remarkMath]}
140
- rehypePlugins={[rehypeRaw, rehypeKatex]}
141
- components={chunkMarkdownComponents}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  >
143
- {documentData.chunks[currentChunkIndex].text}
144
- </ReactMarkdown>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  </div>
146
  )}
147
  </div>
148
 
149
- {/* Chat Interface */}
150
- <div className="flex-1 flex flex-col min-h-0 bg-gray-50 rounded-lg m-2">
151
- <div className="flex-1 overflow-hidden">
152
- <ChatSection handler={chatHandler}>
153
- <ChatMessages
154
- className="p-4"
155
- showCopy={false}
156
- />
157
- <ChatInput>
158
- <ChatInput.Form className="bg-white rounded-lg mx-4 mb-4 border border-gray-200 relative">
159
- <ChatInput.Field
160
- type="textarea"
161
- className="resize-none border-0 focus:ring-0 pr-12"
162
- placeholder="Ask about this section..."
163
- />
164
- <ChatInput.Submit className="absolute right-2 bottom-2 w-8 h-8 rounded-full bg-gray-500 hover:bg-gray-600 text-white border-0 flex items-center justify-center" />
165
- </ChatInput.Form>
166
- </ChatInput>
167
- </ChatSection>
168
  </div>
169
- </div>
170
  </>
171
  );
172
  };
 
2
  import remarkMath from 'remark-math';
3
  import rehypeKatex from 'rehype-katex';
4
  import rehypeRaw from 'rehype-raw';
 
 
5
  import { useState } from 'react';
6
+ import { getChatMarkdownComponents } from '../utils/markdownComponents.jsx';
7
+ import SimpleChat from './SimpleChat.jsx';
8
 
9
  const ChunkPanel = ({
10
  documentData,
 
14
  chunkStates,
15
  skipChunk,
16
  markChunkUnderstood,
17
+ startInteractiveLesson
 
 
 
18
  }) => {
 
19
  const chatMarkdownComponents = getChatMarkdownComponents();
20
+ const [showChat, setShowChat] = useState(false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  return (
23
  <>
 
45
  </span>
46
  </button>
47
 
 
 
 
 
 
 
48
  </div>
49
 
50
  {/* Expandable Chunk Content */}
51
  {chunkExpanded && documentData?.chunks?.[currentChunkIndex] && (
52
+ <>
53
+ <div className="prose prose-sm max-w-none">
54
+ <ReactMarkdown
55
+ remarkPlugins={[remarkMath]}
56
+ rehypePlugins={[rehypeRaw, rehypeKatex]}
57
+ components={chatMarkdownComponents}
58
+ >
59
+ {documentData.chunks[currentChunkIndex].text}
60
+ </ReactMarkdown>
61
+ </div>
62
+
63
+ {/* Action Buttons */}
64
+ <div className="flex items-center justify-center gap-4 mt-4 pt-4 border-gray-200">
65
+ <button
66
+ onClick={skipChunk}
67
+ className="py-2 px-4 bg-white hover:bg-gray-50 border border-gray-300 rounded-lg transition-all text-sm"
68
+ >
69
+ Skip
70
+ </button>
71
+ <button
72
+ onClick={() => setShowChat(!showChat)}
73
+ className="py-2 px-4 bg-white hover:bg-gray-50 border border-gray-300 rounded-lg transition-all text-sm flex items-center gap-1"
74
+ >
75
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5">
76
+ <path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"/>
77
+ </svg>
78
+ Chat
79
+ </button>
80
+ <button
81
+ onClick={markChunkUnderstood}
82
+ className="py-2 px-4 bg-white hover:bg-gray-50 border border-gray-300 rounded-lg transition-all text-sm"
83
+ >
84
+ Understood
85
+ </button>
86
+ </div>
87
+ </>
88
+ )}
89
+
90
+ {/* Show buttons even when chunk is collapsed */}
91
+ {!chunkExpanded && (
92
+ <div className="flex items-center justify-center gap-4 mt-4 pt-4 border-t border-gray-200">
93
+ <button
94
+ onClick={skipChunk}
95
+ className="py-2 px-4 bg-white hover:bg-gray-50 border border-gray-300 rounded-lg transition-all text-sm"
96
  >
97
+ Skip
98
+ </button>
99
+ <button
100
+ onClick={() => setShowChat(!showChat)}
101
+ className="py-2 px-4 bg-white hover:bg-gray-50 border border-gray-300 rounded-lg transition-all text-sm flex items-center gap-1"
102
+ >
103
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5">
104
+ <path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z"/>
105
+ </svg>
106
+ Chat
107
+ </button>
108
+ <button
109
+ onClick={markChunkUnderstood}
110
+ className="py-2 px-4 bg-white hover:bg-gray-50 border border-gray-300 rounded-lg transition-all text-sm"
111
+ >
112
+ Understood
113
+ </button>
114
  </div>
115
  )}
116
  </div>
117
 
118
+ {/* Chat Interface - Only shown when showChat is true */}
119
+ {showChat && (
120
+ <div className="flex-1 flex flex-col min-h-0 bg-white rounded-lg m-2 shadow-lg">
121
+ <SimpleChat />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  </div>
123
+ )}
124
  </>
125
  );
126
  };
frontend/src/components/DocumentProcessor.jsx CHANGED
@@ -1,9 +1,7 @@
1
- // Removed useMemo import - no longer needed
2
  import 'katex/dist/katex.min.css';
3
 
4
  // Import custom hooks
5
  import { useDocumentProcessor } from '../hooks/useDocumentProcessor';
6
- // Removed useChat import - handled in ChunkPanel now
7
  import { useChunkNavigation } from '../hooks/useChunkNavigation';
8
  import { usePanelResize } from '../hooks/usePanelResize';
9
 
@@ -13,8 +11,6 @@ import DocumentViewer from './DocumentViewer';
13
  import ChunkNavigation from './ChunkNavigation';
14
  import ChunkPanel from './ChunkPanel';
15
 
16
- // Removed markdown utilities - using PDF viewer now
17
-
18
  function DocumentProcessor() {
19
  // Custom hooks
20
  const {
@@ -22,17 +18,12 @@ function DocumentProcessor() {
22
  selectedFile,
23
  processing,
24
  uploadProgress,
25
- ocrProgress,
26
  documentData,
27
- imageCache,
28
  handleFileChange,
29
- fetchImage,
30
  processDocument,
31
  setSelectedFile
32
  } = useDocumentProcessor();
33
 
34
- // Removed useChat hook - now handled in ChunkPanel
35
-
36
  const {
37
  chunkStates,
38
  currentChunkIndex,
@@ -43,7 +34,7 @@ function DocumentProcessor() {
43
  markChunkUnderstood,
44
  startInteractiveLesson,
45
  setChunkExpanded
46
- } = useChunkNavigation(documentData, null); // No clearTypingAnimation needed
47
 
48
  const {
49
  leftPanelWidth,
@@ -51,15 +42,12 @@ function DocumentProcessor() {
51
  containerRef,
52
  handleMouseDown
53
  } = usePanelResize(50);
54
- } = usePanelResize(50);
55
 
56
- // Simplified startInteractiveLesson - no chat hook needed
57
  const handleStartInteractiveLesson = () => {
58
  startInteractiveLesson();
59
  };
60
 
61
- // No longer need highlighted markdown - using PDF viewer instead
62
-
63
  // Early returns for different states
64
  if (!selectedFile) {
65
  return (
@@ -82,7 +70,7 @@ function DocumentProcessor() {
82
  }
83
 
84
  if (processing) {
85
- return <LoadingAnimation uploadProgress={uploadProgress} ocrProgress={ocrProgress} />;
86
  }
87
 
88
  if (!documentData) {
@@ -116,7 +104,6 @@ function DocumentProcessor() {
116
  {/* Left Panel - Document */}
117
  <div style={{ width: `${leftPanelWidth}%`, height: '100%' }}>
118
  <DocumentViewer
119
- selectedFile={selectedFile}
120
  selectedFile={selectedFile}
121
  documentData={documentData}
122
  />
@@ -162,9 +149,6 @@ function DocumentProcessor() {
162
  skipChunk={skipChunk}
163
  markChunkUnderstood={markChunkUnderstood}
164
  startInteractiveLesson={handleStartInteractiveLesson}
165
- fetchImage={fetchImage}
166
- imageCache={imageCache}
167
- setImageCache={() => {}} // Handled by useDocumentProcessor
168
  />
169
  </div>
170
  </div>
 
 
1
  import 'katex/dist/katex.min.css';
2
 
3
  // Import custom hooks
4
  import { useDocumentProcessor } from '../hooks/useDocumentProcessor';
 
5
  import { useChunkNavigation } from '../hooks/useChunkNavigation';
6
  import { usePanelResize } from '../hooks/usePanelResize';
7
 
 
11
  import ChunkNavigation from './ChunkNavigation';
12
  import ChunkPanel from './ChunkPanel';
13
 
 
 
14
  function DocumentProcessor() {
15
  // Custom hooks
16
  const {
 
18
  selectedFile,
19
  processing,
20
  uploadProgress,
 
21
  documentData,
 
22
  handleFileChange,
 
23
  processDocument,
24
  setSelectedFile
25
  } = useDocumentProcessor();
26
 
 
 
27
  const {
28
  chunkStates,
29
  currentChunkIndex,
 
34
  markChunkUnderstood,
35
  startInteractiveLesson,
36
  setChunkExpanded
37
+ } = useChunkNavigation(documentData, null);
38
 
39
  const {
40
  leftPanelWidth,
 
42
  containerRef,
43
  handleMouseDown
44
  } = usePanelResize(50);
 
45
 
46
+ // Simplified startInteractiveLesson
47
  const handleStartInteractiveLesson = () => {
48
  startInteractiveLesson();
49
  };
50
 
 
 
51
  // Early returns for different states
52
  if (!selectedFile) {
53
  return (
 
70
  }
71
 
72
  if (processing) {
73
+ return <LoadingAnimation uploadProgress={uploadProgress} />;
74
  }
75
 
76
  if (!documentData) {
 
104
  {/* Left Panel - Document */}
105
  <div style={{ width: `${leftPanelWidth}%`, height: '100%' }}>
106
  <DocumentViewer
 
107
  selectedFile={selectedFile}
108
  documentData={documentData}
109
  />
 
149
  skipChunk={skipChunk}
150
  markChunkUnderstood={markChunkUnderstood}
151
  startInteractiveLesson={handleStartInteractiveLesson}
 
 
 
152
  />
153
  </div>
154
  </div>
frontend/src/components/DocumentProcessor.jsx.backup DELETED
@@ -1,889 +0,0 @@
1
- import { useMemo } from 'react';
2
- import 'katex/dist/katex.min.css';
3
-
4
- // Import custom hooks
5
- import { useDocumentProcessor } from '../hooks/useDocumentProcessor';
6
- import { useChat } from '../hooks/useChat';
7
- import { useChunkNavigation } from '../hooks/useChunkNavigation';
8
- import { usePanelResize } from '../hooks/usePanelResize';
9
-
10
- // Import components
11
- import LoadingAnimation from './LoadingAnimation';
12
- import DocumentViewer from './DocumentViewer';
13
- import ChunkNavigation from './ChunkNavigation';
14
- import ChunkPanel from './ChunkPanel';
15
-
16
- // Import utilities
17
- import { highlightChunkInMarkdown } from '../utils/markdownUtils';
18
-
19
-
20
- function DocumentProcessor() {
21
- // Custom hooks
22
- const {
23
- fileInputRef,
24
- selectedFile,
25
- processing,
26
- uploadProgress,
27
- ocrProgress,
28
- documentData,
29
- imageCache,
30
- handleFileChange,
31
- fetchImage,
32
- processDocument,
33
- setSelectedFile
34
- } = useDocumentProcessor();
35
-
36
- const {
37
- chatLoading,
38
- chatMessages,
39
- userInput,
40
- typingMessage,
41
- startChunkLesson,
42
- clearTypingAnimation,
43
- setUserInput
44
- } = useChat();
45
-
46
- const {
47
- chunkStates,
48
- currentChunkIndex,
49
- chunkExpanded,
50
- goToNextChunk,
51
- goToPrevChunk,
52
- skipChunk,
53
- markChunkUnderstood,
54
- startInteractiveLesson,
55
- setChunkExpanded
56
- } = useChunkNavigation(documentData, clearTypingAnimation);
57
-
58
- const {
59
- leftPanelWidth,
60
- isDragging,
61
- containerRef,
62
- handleMouseDown
63
- } = usePanelResize(40);
64
-
65
- // Enhanced startInteractiveLesson that uses the chat hook
66
- const handleStartInteractiveLesson = () => {
67
- startInteractiveLesson(() => startChunkLesson(currentChunkIndex, documentData));
68
- };
69
-
70
- // Memoize the highlighted markdown to prevent unnecessary re-renders
71
- const highlightedMarkdown = useMemo(() => {
72
- if (!documentData || !documentData.markdown || !documentData.chunks) {
73
- return '';
74
- }
75
- return highlightChunkInMarkdown(documentData.markdown, documentData.chunks, currentChunkIndex);
76
- }, [documentData?.markdown, documentData?.chunks, currentChunkIndex]);
77
-
78
-
79
- // Handle panel resizing
80
- const handleMouseDown = (e) => {
81
- setIsDragging(true);
82
- e.preventDefault();
83
- };
84
-
85
- const handleMouseMove = (e) => {
86
- if (!isDragging || !containerRef.current) return;
87
-
88
- const containerRect = containerRef.current.getBoundingClientRect();
89
- const newLeftWidth = ((e.clientX - containerRect.left) / containerRect.width) * 100;
90
-
91
- // Constrain between 20% and 80%
92
- if (newLeftWidth >= 20 && newLeftWidth <= 80) {
93
- setLeftPanelWidth(newLeftWidth);
94
- }
95
- };
96
-
97
- const handleMouseUp = () => {
98
- setIsDragging(false);
99
- };
100
-
101
- useEffect(() => {
102
- if (isDragging) {
103
- document.addEventListener('mousemove', handleMouseMove);
104
- document.addEventListener('mouseup', handleMouseUp);
105
- return () => {
106
- document.removeEventListener('mousemove', handleMouseMove);
107
- document.removeEventListener('mouseup', handleMouseUp);
108
- };
109
- }
110
- }, [isDragging]);
111
-
112
- // Function to simulate typing animation
113
- const typeMessage = (text, callback) => {
114
- // Clear any existing typing animation
115
- if (typingInterval) {
116
- clearInterval(typingInterval);
117
- }
118
-
119
- setTypingMessage('');
120
- let currentIndex = 0;
121
- const typeSpeed = Math.max(1, Math.min(3, 200 / text.length)); // Much faster: max 800ms total
122
-
123
- const interval = setInterval(() => {
124
- if (currentIndex < text.length) {
125
- setTypingMessage(text.slice(0, currentIndex + 1));
126
- currentIndex++;
127
- } else {
128
- clearInterval(interval);
129
- setTypingInterval(null);
130
- setTypingMessage('');
131
- callback();
132
- }
133
- }, typeSpeed);
134
-
135
- setTypingInterval(interval);
136
- };
137
-
138
- // Function to start a chunk lesson
139
- const startChunkLesson = async (chunkIndex) => {
140
- if (!documentData || !documentData.chunks[chunkIndex]) return;
141
-
142
- setChatLoading(true);
143
-
144
- try {
145
- const chunk = documentData.chunks[chunkIndex];
146
- console.log('Starting lesson for chunk:', chunkIndex, chunk);
147
- console.log('Document data:', documentData.fileId, documentData.markdown?.length);
148
-
149
- const response = await fetch(`/start_chunk_lesson/${documentData.fileId}/${chunkIndex}`, {
150
- method: 'POST',
151
- headers: {
152
- 'Content-Type': 'application/json',
153
- },
154
- body: JSON.stringify({
155
- chunk: chunk,
156
- document_markdown: documentData.markdown
157
- })
158
- });
159
-
160
- if (!response.ok) {
161
- const errorData = await response.text();
162
- console.error('Backend error:', errorData);
163
- throw new Error(`Failed to start lesson: ${response.status} - ${errorData}`);
164
- }
165
-
166
- const lessonData = await response.json();
167
- setChatData(prev => ({
168
- ...prev,
169
- [chunkIndex]: {
170
- ...lessonData,
171
- chunkIndex: chunkIndex,
172
- chunk: chunk
173
- }
174
- }));
175
-
176
- setChatLoading(false);
177
-
178
- // Type out the message with animation
179
- typeMessage(lessonData.questions, () => {
180
- setChatMessages(prev => ({
181
- ...prev,
182
- [chunkIndex]: [
183
- { type: 'ai', text: lessonData.questions }
184
- ]
185
- }));
186
- });
187
-
188
- } catch (error) {
189
- console.error('Error starting lesson:', error);
190
- alert('Error starting lesson: ' + error.message);
191
- setChatLoading(false);
192
- }
193
- };
194
-
195
- // Navigation functions
196
- const goToNextChunk = () => {
197
- if (documentData && currentChunkIndex < documentData.chunks.length - 1) {
198
- // Clear any ongoing typing animation
199
- if (typingInterval) {
200
- clearInterval(typingInterval);
201
- setTypingInterval(null);
202
- }
203
- setTypingMessage('');
204
- setCurrentChunkIndex(currentChunkIndex + 1);
205
- }
206
- };
207
-
208
- const goToPrevChunk = () => {
209
- if (currentChunkIndex > 0) {
210
- // Clear any ongoing typing animation
211
- if (typingInterval) {
212
- clearInterval(typingInterval);
213
- setTypingInterval(null);
214
- }
215
- setTypingMessage('');
216
- setCurrentChunkIndex(currentChunkIndex - 1);
217
- }
218
- };
219
-
220
- // Chunk action functions
221
- const skipChunk = () => {
222
- setChunkStates(prev => ({
223
- ...prev,
224
- [currentChunkIndex]: 'skipped'
225
- }));
226
- };
227
-
228
- const markChunkUnderstood = () => {
229
- setChunkStates(prev => ({
230
- ...prev,
231
- [currentChunkIndex]: 'understood'
232
- }));
233
- };
234
-
235
- const startInteractiveLesson = () => {
236
- setChunkStates(prev => ({
237
- ...prev,
238
- [currentChunkIndex]: 'interactive'
239
- }));
240
- startChunkLesson(currentChunkIndex);
241
- };
242
-
243
- const fetchImage = useCallback(async (imageId, fileId) => {
244
- // Check if image is already cached using ref
245
- if (imageCacheRef.current[imageId]) {
246
- return imageCacheRef.current[imageId];
247
- }
248
-
249
- try {
250
- const response = await fetch(`/get_image/${fileId}/${imageId}`);
251
- if (response.ok) {
252
- const data = await response.json();
253
- const imageData = data.image_base64;
254
-
255
- // Cache the image in ref
256
- imageCacheRef.current = {
257
- ...imageCacheRef.current,
258
- [imageId]: imageData
259
- };
260
-
261
- // Also update state for other components that might need it
262
- setImageCache(prev => ({
263
- ...prev,
264
- [imageId]: imageData
265
- }));
266
-
267
- return imageData;
268
- }
269
- } catch (error) {
270
- console.error('Error fetching image:', error);
271
- }
272
- return null;
273
- }, []); // No dependencies - stable function
274
-
275
- const ImageComponent = memo(({ src, alt }) => {
276
- const [imageSrc, setImageSrc] = useState(null);
277
- const [loading, setLoading] = useState(true);
278
-
279
- useEffect(() => {
280
- if (documentData && src) {
281
- fetchImage(src, documentData.fileId).then(imageData => {
282
- if (imageData) {
283
- setImageSrc(imageData);
284
- }
285
- setLoading(false);
286
- });
287
- }
288
- }, [src, documentData?.fileId, fetchImage]);
289
-
290
- if (loading) {
291
- return (
292
- <span style={{
293
- display: 'inline-block',
294
- width: '100%',
295
- height: '200px',
296
- backgroundColor: '#f3f4f6',
297
- textAlign: 'center',
298
- lineHeight: '200px',
299
- margin: '1rem 0',
300
- borderRadius: '0.5rem',
301
- color: '#6b7280'
302
- }}>
303
- Loading image...
304
- </span>
305
- );
306
- }
307
-
308
- if (!imageSrc) {
309
- return (
310
- <span style={{
311
- display: 'inline-block',
312
- width: '100%',
313
- height: '200px',
314
- backgroundColor: '#fef2f2',
315
- textAlign: 'center',
316
- lineHeight: '200px',
317
- margin: '1rem 0',
318
- borderRadius: '0.5rem',
319
- border: '1px solid #fecaca',
320
- color: '#dc2626'
321
- }}>
322
- Image not found: {alt || src}
323
- </span>
324
- );
325
- }
326
-
327
- return (
328
- <img
329
- src={imageSrc}
330
- alt={alt || 'Document image'}
331
- style={{
332
- display: 'block',
333
- maxWidth: '100%',
334
- height: 'auto',
335
- margin: '1.5rem auto'
336
- }}
337
- />
338
- );
339
- });
340
-
341
-
342
-
343
- const processDocument = async () => {
344
- if (!selectedFile) return;
345
-
346
- setProcessing(true);
347
- setUploadProgress(0);
348
- setOcrProgress(0);
349
-
350
- try {
351
- // Step 1: Upload PDF
352
- const formData = new FormData();
353
- formData.append('file', selectedFile);
354
-
355
- setUploadProgress(30);
356
- const uploadResponse = await fetch('/upload_pdf', {
357
- method: 'POST',
358
- body: formData,
359
- });
360
-
361
- if (!uploadResponse.ok) {
362
- throw new Error('Failed to upload PDF');
363
- }
364
-
365
- const uploadData = await uploadResponse.json();
366
- setUploadProgress(100);
367
-
368
- // Step 2: Process OCR
369
- setOcrProgress(20);
370
- await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for UX
371
-
372
- setOcrProgress(60);
373
- const ocrResponse = await fetch(`/process_ocr/${uploadData.file_id}`);
374
-
375
- if (!ocrResponse.ok) {
376
- throw new Error('Failed to process OCR');
377
- }
378
-
379
- const ocrData = await ocrResponse.json();
380
- setOcrProgress(100);
381
-
382
- // Combine all markdown from pages
383
- const combinedMarkdown = ocrData.pages
384
- .map(page => page.markdown)
385
- .join('\n\n---\n\n');
386
-
387
- // Collect all chunks from all pages
388
- const allChunks = [];
389
- let markdownOffset = 0;
390
-
391
- ocrData.pages.forEach((page, pageIndex) => {
392
- if (page.chunks && page.chunks.length > 0) {
393
- page.chunks.forEach(chunk => {
394
- allChunks.push({
395
- ...chunk,
396
- start_position: chunk.start_position + markdownOffset,
397
- end_position: chunk.end_position + markdownOffset,
398
- pageIndex: pageIndex
399
- });
400
- });
401
- }
402
- markdownOffset += page.markdown.length + 6; // +6 for the separator "\n\n---\n\n"
403
- });
404
-
405
- setDocumentData({
406
- fileId: uploadData.file_id,
407
- filename: uploadData.filename,
408
- markdown: combinedMarkdown,
409
- pages: ocrData.pages,
410
- totalPages: ocrData.total_pages,
411
- chunks: allChunks
412
- });
413
-
414
- } catch (error) {
415
- console.error('Error processing document:', error);
416
- alert('Error processing document: ' + error.message);
417
- } finally {
418
- setProcessing(false);
419
- }
420
- };
421
-
422
- const LoadingAnimation = () => (
423
- <div className="flex flex-col items-center justify-center min-h-screen bg-gray-50">
424
- <div className="text-center max-w-md">
425
- <div className="mb-8">
426
- <div className="w-16 h-16 border-4 border-blue-500 border-t-transparent rounded-full animate-spin mx-auto mb-4"></div>
427
- <h2 className="text-2xl font-bold text-gray-900 mb-2">Processing Your Document</h2>
428
- <p className="text-gray-600">This may take a moment...</p>
429
- </div>
430
-
431
- {/* Upload Progress */}
432
- <div className="mb-6">
433
- <div className="flex justify-between text-sm text-gray-600 mb-1">
434
- <span>Uploading PDF</span>
435
- <span>{uploadProgress}%</span>
436
- </div>
437
- <div className="w-full bg-gray-200 rounded-full h-2">
438
- <div
439
- className="bg-blue-500 h-2 rounded-full transition-all duration-300"
440
- style={{ width: `${uploadProgress}%` }}
441
- ></div>
442
- </div>
443
- </div>
444
-
445
- {/* OCR Progress */}
446
- <div className="mb-6">
447
- <div className="flex justify-between text-sm text-gray-600 mb-1">
448
- <span>Processing with AI</span>
449
- <span>{ocrProgress}%</span>
450
- </div>
451
- <div className="w-full bg-gray-200 rounded-full h-2">
452
- <div
453
- className="bg-green-500 h-2 rounded-full transition-all duration-300"
454
- style={{ width: `${ocrProgress}%` }}
455
- ></div>
456
- </div>
457
- </div>
458
-
459
- <p className="text-sm text-gray-500">
460
- Using AI to extract text and understand your document structure...
461
- </p>
462
- </div>
463
- </div>
464
- );
465
-
466
-
467
- if (!selectedFile) {
468
- return (
469
- <div className="h-screen bg-gray-50 flex items-center justify-center">
470
- <input
471
- ref={fileInputRef}
472
- type="file"
473
- accept=".pdf"
474
- className="hidden"
475
- onChange={handleFileChange}
476
- />
477
- <button
478
- onClick={() => fileInputRef.current.click()}
479
- className="px-6 py-3 bg-white shadow-md hover:shadow-lg text-gray-700 font-medium rounded-lg transition-all"
480
- >
481
- Select PDF
482
- </button>
483
- </div>
484
- );
485
- }
486
-
487
- if (processing) {
488
- return <LoadingAnimation />;
489
- }
490
-
491
- if (!documentData) {
492
- return (
493
- <div className="h-screen bg-gray-50 flex items-center justify-center">
494
- <div className="flex gap-4">
495
- <button
496
- onClick={processDocument}
497
- className="px-6 py-3 bg-white shadow-md hover:shadow-lg text-gray-700 font-medium rounded-lg transition-all"
498
- >
499
- Process
500
- </button>
501
- <button
502
- onClick={() => setSelectedFile(null)}
503
- className="px-6 py-3 bg-white shadow-md hover:shadow-lg text-gray-700 font-medium rounded-lg transition-all"
504
- >
505
- ← Back
506
- </button>
507
- </div>
508
- </div>
509
- );
510
- }
511
-
512
- return (
513
- <div
514
- ref={containerRef}
515
- className="h-screen bg-gray-100 flex gap-2 p-6 overflow-hidden"
516
- style={{ cursor: isDragging ? 'col-resize' : 'default' }}
517
- >
518
- {/* Left Panel - Document */}
519
- <div
520
- className="bg-white rounded-lg shadow-sm flex flex-col"
521
- style={{ width: `${leftPanelWidth}%` }}
522
- >
523
- {/* Header */}
524
- <div className="sticky top-0 bg-white rounded-t-lg px-6 py-4 border-b border-gray-200 z-10">
525
- <h2 className="text-lg font-semibold text-left text-gray-800">Document</h2>
526
- </div>
527
-
528
- {/* Content */}
529
- <div className="flex-1 px-6 pt-6 pb-8 overflow-y-auto">
530
- <style>
531
- {`
532
- @keyframes fadeInHighlight {
533
- 0% {
534
- background-color: rgba(255, 214, 100, 0);
535
- border-left-color: rgba(156, 163, 175, 0);
536
- transform: translateX(-10px);
537
- opacity: 0;
538
- }
539
- 100% {
540
- background-color: rgba(255, 214, 100, 0.15);
541
- border-left-color: rgba(156, 163, 175, 0.5);
542
- transform: translateX(0);
543
- opacity: 1;
544
- }
545
- }
546
- `}
547
- </style>
548
- <div className="prose prose-sm max-w-none" style={{
549
- fontSize: '0.875rem',
550
- lineHeight: '1.5',
551
- color: 'rgb(55, 65, 81)'
552
- }}>
553
- <ReactMarkdown
554
- remarkPlugins={[remarkMath]}
555
- rehypePlugins={[rehypeRaw, rehypeKatex]}
556
- components={{
557
- h1: ({ children }) => <h1 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', color: '#1a202c' }}>{children}</h1>,
558
- h2: ({ children }) => <h2 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h2>,
559
- h3: ({ children }) => <h3 style={{ fontSize: '1.125rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '1rem', color: '#1a202c' }}>{children}</h3>,
560
- p: ({ children }) => <p style={{ marginBottom: '0.75rem', color: '#374151', lineHeight: '1.5', fontSize: '0.875rem' }}>{children}</p>,
561
- hr: () => <hr style={{ margin: '1.5rem 0', borderColor: '#d1d5db' }} />,
562
- ul: ({ children }) => <ul style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'disc', fontSize: '0.875rem' }}>{children}</ul>,
563
- ol: ({ children }) => <ol style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'decimal', fontSize: '0.875rem' }}>{children}</ol>,
564
- li: ({ children }) => <li style={{ marginBottom: '0.125rem', color: '#374151' }}>{children}</li>,
565
- blockquote: ({ children }) => (
566
- <blockquote style={{ borderLeft: '3px solid #3b82f6', paddingLeft: '0.75rem', fontStyle: 'italic', margin: '0.75rem 0', color: '#6b7280', fontSize: '0.875rem' }}>
567
- {children}
568
- </blockquote>
569
- ),
570
- code: ({ inline, children }) =>
571
- inline ?
572
- <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code> :
573
- <pre style={{ backgroundColor: '#f3f4f6', padding: '0.75rem', borderRadius: '0.375rem', overflowX: 'auto', margin: '0.75rem 0' }}>
574
- <code style={{ fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code>
575
- </pre>,
576
- div: ({ children, style }) => (
577
- <div style={style}>
578
- {children}
579
- </div>
580
- ),
581
- img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
582
- }}
583
- >
584
- {highlightedMarkdown}
585
- </ReactMarkdown>
586
- </div>
587
- </div>
588
- </div>
589
-
590
- {/* Resizable Divider */}
591
- <div
592
- className="flex items-center justify-center cursor-col-resize group transition-all duration-200"
593
- style={{ width: '8px' }}
594
- onMouseDown={handleMouseDown}
595
- >
596
- {/* Resizable Divider */}
597
- <div
598
- className="w-px h-full rounded-full transition-all
599
- duration-200 group-hover:shadow-lg"
600
- style={{
601
- backgroundColor: isDragging ? 'rgba(59, 130, 246, 0.8)' : 'transparent',
602
- boxShadow: isDragging ? '0 0 8px rgba(59, 130, 246, 0.8)' : 'none'
603
- }}
604
- ></div>
605
- </div>
606
-
607
- {/* Right Panel Container */}
608
- <div
609
- className="flex flex-col"
610
- style={{ width: `${100 - leftPanelWidth}%` }}
611
- >
612
- {/* Navigation Bar - Above chunk panel */}
613
- <div className="flex items-center justify-center gap-4 mb-4 px-4">
614
- <button
615
- onClick={goToPrevChunk}
616
- disabled={currentChunkIndex === 0}
617
- className="p-3 bg-white hover:bg-gray-50 disabled:opacity-30 disabled:cursor-not-allowed rounded-lg shadow-sm transition-all"
618
- >
619
- <svg className="w-5 h-5 text-gray-700" fill="none" stroke="currentColor" viewBox="0 0 24 24" strokeWidth={3}>
620
- <path strokeLinecap="round" strokeLinejoin="round" d="M15 19l-7-7 7-7" />
621
- </svg>
622
- </button>
623
-
624
- <div className="flex space-x-2">
625
- {documentData?.chunks?.map((_, index) => (
626
- <div
627
- key={index}
628
- className={`w-3 h-3 rounded-full ${
629
- chunkStates[index] === 'understood' ? 'bg-green-500' :
630
- chunkStates[index] === 'skipped' ? 'bg-red-500' :
631
- chunkStates[index] === 'interactive' ? 'bg-blue-500' :
632
- index === currentChunkIndex ? 'bg-gray-600' : 'bg-gray-300'
633
- }`}
634
- />
635
- ))}
636
- </div>
637
-
638
- <button
639
- onClick={goToNextChunk}
640
- disabled={!documentData?.chunks || currentChunkIndex === documentData.chunks.length - 1}
641
- className="p-3 bg-white hover:bg-gray-50 disabled:opacity-30 disabled:cursor-not-allowed rounded-lg shadow-sm transition-all"
642
- >
643
- <svg className="w-5 h-5 text-gray-700" fill="none" stroke="currentColor" viewBox="0 0 24 24" strokeWidth={3}>
644
- <path strokeLinecap="round" strokeLinejoin="round" d="M9 5l7 7-7 7" />
645
- </svg>
646
- </button>
647
- </div>
648
-
649
- {/* Chunk Panel */}
650
- {/* Chunk Header - Left aligned title only */}
651
- <div className="px-6 py-4 flex-shrink-0 bg-white rounded-t-lg border-b border-gray-200 z-10">
652
- <div className="flex items-center justify-between">
653
- <button
654
- onClick={() => setChunkExpanded(!chunkExpanded)}
655
- className="flex items-center hover:bg-gray-50 py-2 px-3 rounded-lg transition-all -ml-3"
656
- >
657
- <span className="font-semibold text-gray-900 text-left">
658
- {documentData?.chunks?.[currentChunkIndex]?.topic || "Loading..."}
659
- </span>
660
- <span className="text-gray-400 ml-3">
661
- {chunkExpanded ? '▲' : '▼'}
662
- </span>
663
- </button>
664
-
665
- <button
666
- onClick={markChunkUnderstood}
667
- className="py-2 px-4 bg-gray-50 hover:bg-gray-100 text-gray-600 rounded-lg transition-all text-sm"
668
- >
669
-
670
- </button>
671
- </div>
672
-
673
- {/* Expandable Chunk Content - in header area */}
674
- {chunkExpanded && documentData?.chunks?.[currentChunkIndex] && (
675
- <div className="prose prose-sm max-w-none">
676
- <ReactMarkdown
677
- remarkPlugins={[remarkMath]}
678
- rehypePlugins={[rehypeRaw, rehypeKatex]}
679
- components={{
680
- h1: ({ children }) => <h1 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', color: '#1a202c' }}>{children}</h1>,
681
- h2: ({ children }) => <h2 style={{ fontSize: '1.125rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '1rem', color: '#1a202c' }}>{children}</h2>,
682
- h3: ({ children }) => <h3 style={{ fontSize: '1rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '0.75rem', color: '#1a202c' }}>{children}</h3>,
683
- p: ({ children }) => <p style={{ marginBottom: '0.5rem', color: '#374151', lineHeight: '1.4', fontSize: '0.875rem' }}>{children}</p>,
684
- hr: () => <hr style={{ margin: '1rem 0', borderColor: '#d1d5db' }} />,
685
- ul: ({ children }) => <ul style={{ marginBottom: '0.5rem', marginLeft: '1rem', listStyleType: 'disc', fontSize: '0.875rem' }}>{children}</ul>,
686
- ol: ({ children }) => <ol style={{ marginBottom: '0.5rem', marginLeft: '1rem', listStyleType: 'decimal', fontSize: '0.875rem' }}>{children}</ol>,
687
- li: ({ children }) => <li style={{ marginBottom: '0.125rem', color: '#374151' }}>{children}</li>,
688
- blockquote: ({ children }) => (
689
- <blockquote style={{ borderLeft: '2px solid #9ca3af', paddingLeft: '0.5rem', fontStyle: 'italic', margin: '0.5rem 0', color: '#6b7280', fontSize: '0.875rem' }}>
690
- {children}
691
- </blockquote>
692
- ),
693
- code: ({ inline, children }) =>
694
- inline ?
695
- <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code> :
696
- <pre style={{ backgroundColor: '#f3f4f6', padding: '0.5rem', borderRadius: '0.25rem', overflowX: 'auto', margin: '0.5rem 0' }}>
697
- <code style={{ fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code>
698
- </pre>,
699
- img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
700
- }}
701
- >
702
- {documentData.markdown.slice(
703
- documentData.chunks[currentChunkIndex].start_position,
704
- documentData.chunks[currentChunkIndex].end_position
705
- )}
706
- </ReactMarkdown>
707
- </div>
708
- )}
709
-
710
-
711
- </div>
712
-
713
-
714
- {/* Content Area */}
715
- <div className="flex-1 flex flex-col min-h-0">
716
- {/* Action Buttons */}
717
- {chunkStates[currentChunkIndex] !== 'interactive' && (
718
- <div className="flex-shrink-0 p-6 border-b border-gray-200">
719
- <div className="flex gap-3">
720
- <button
721
- onClick={skipChunk}
722
- className="flex-1 py-3 bg-gray-50 hover:bg-gray-100 text-gray-600 rounded-lg transition-all"
723
- >
724
-
725
- </button>
726
-
727
- <button
728
- onClick={startInteractiveLesson}
729
- disabled={chatLoading}
730
- className="flex-1 py-3 bg-gray-50 hover:bg-gray-100 disabled:opacity-50 text-gray-600 rounded-lg transition-all"
731
- >
732
- {chatLoading ? '...' : 'Start'}
733
- </button>
734
-
735
- <button
736
- onClick={markChunkUnderstood}
737
- className="flex-1 py-3 bg-gray-50 hover:bg-gray-100 text-gray-600 rounded-lg transition-all"
738
- >
739
-
740
- </button>
741
- </div>
742
- </div>
743
- )}
744
-
745
- {/* Chat Area - sandwich layout when interactive */}
746
- {chunkStates[currentChunkIndex] === 'interactive' && (
747
- <div className="flex-1 flex flex-col min-h-0">
748
- {/* Chat Messages - scrollable middle layer */}
749
- <div className="bg-white flex-1 overflow-y-auto space-y-4 px-6 py-2">
750
- {(chatMessages[currentChunkIndex] || []).map((message, index) => (
751
- message.type === 'user' ? (
752
- <div
753
- key={index}
754
- className="w-full bg-gray-50 border border-gray-200 rounded-lg p-4 shadow-sm"
755
- >
756
- <div className="text-xs font-medium mb-2 text-gray-600">
757
- You
758
- </div>
759
- <div className="prose prose-sm max-w-none">
760
- <ReactMarkdown
761
- remarkPlugins={[remarkMath]}
762
- rehypePlugins={[rehypeRaw, rehypeKatex]}
763
- components={{
764
- p: ({ children }) => <p className="mb-2 text-gray-800 leading-relaxed">{children}</p>,
765
- ul: ({ children }) => <ul className="mb-2 ml-4 list-disc">{children}</ul>,
766
- ol: ({ children }) => <ol className="mb-2 ml-4 list-decimal">{children}</ol>,
767
- li: ({ children }) => <li className="mb-1 text-gray-800">{children}</li>,
768
- strong: ({ children }) => <strong className="font-semibold text-gray-900">{children}</strong>,
769
- em: ({ children }) => <em className="italic">{children}</em>,
770
- code: ({ inline, children }) =>
771
- inline ?
772
- <code className="bg-gray-100 px-1 py-0.5 rounded text-sm font-mono">{children}</code> :
773
- <pre className="bg-gray-100 p-2 rounded overflow-x-auto my-2">
774
- <code className="text-sm font-mono">{children}</code>
775
- </pre>,
776
- blockquote: ({ children }) => (
777
- <blockquote className="border-l-4 border-blue-200 pl-4 italic text-gray-700 my-2">
778
- {children}
779
- </blockquote>
780
- )
781
- }}
782
- >
783
- {message.text}
784
- </ReactMarkdown>
785
- </div>
786
- </div>
787
- ) : (
788
- <div key={index} className="w-full py-4">
789
- <div className="prose prose-sm max-w-none">
790
- <ReactMarkdown
791
- remarkPlugins={[remarkMath]}
792
- rehypePlugins={[rehypeRaw, rehypeKatex]}
793
- components={{
794
- p: ({ children }) => <p className="mb-2 text-gray-800 leading-relaxed">{children}</p>,
795
- ul: ({ children }) => <ul className="mb-2 ml-4 list-disc">{children}</ul>,
796
- ol: ({ children }) => <ol className="mb-2 ml-4 list-decimal">{children}</ol>,
797
- li: ({ children }) => <li className="mb-1 text-gray-800">{children}</li>,
798
- strong: ({ children }) => <strong className="font-semibold text-gray-900">{children}</strong>,
799
- em: ({ children }) => <em className="italic">{children}</em>,
800
- code: ({ inline, children }) =>
801
- inline ?
802
- <code className="bg-gray-100 px-1 py-0.5 rounded text-sm font-mono">{children}</code> :
803
- <pre className="bg-gray-100 p-2 rounded overflow-x-auto my-2">
804
- <code className="text-sm font-mono">{children}</code>
805
- </pre>,
806
- blockquote: ({ children }) => (
807
- <blockquote className="border-l-4 border-blue-200 pl-4 italic text-gray-700 my-2">
808
- {children}
809
- </blockquote>
810
- )
811
- }}
812
- >
813
- {message.text}
814
- </ReactMarkdown>
815
- </div>
816
- </div>
817
- )
818
- ))}
819
-
820
- {/* Typing animation message */}
821
- {typingMessage && (
822
- <div className="w-full py-4">
823
- <div className="prose prose-sm max-w-none">
824
- <ReactMarkdown
825
- remarkPlugins={[remarkMath]}
826
- rehypePlugins={[rehypeRaw, rehypeKatex]}
827
- components={{
828
- p: ({ children }) => <p className="mb-2 text-gray-800 leading-relaxed">{children}</p>,
829
- ul: ({ children }) => <ul className="mb-2 ml-4 list-disc">{children}</ul>,
830
- ol: ({ children }) => <ol className="mb-2 ml-4 list-decimal">{children}</ol>,
831
- li: ({ children }) => <li className="mb-1 text-gray-800">{children}</li>,
832
- strong: ({ children }) => <strong className="font-semibold text-gray-900">{children}</strong>,
833
- em: ({ children }) => <em className="italic">{children}</em>,
834
- code: ({ inline, children }) =>
835
- inline ?
836
- <code className="bg-gray-100 px-1 py-0.5 rounded text-sm font-mono">{children}</code> :
837
- <pre className="bg-gray-100 p-2 rounded overflow-x-auto my-2">
838
- <code className="text-sm font-mono">{children}</code>
839
- </pre>,
840
- blockquote: ({ children }) => (
841
- <blockquote className="border-l-4 border-blue-200 pl-4 italic text-gray-700 my-2">
842
- {children}
843
- </blockquote>
844
- )
845
- }}
846
- >
847
- {typingMessage}
848
- </ReactMarkdown>
849
- </div>
850
- </div>
851
- )}
852
-
853
- {/* Loading dots */}
854
- {chatLoading && (
855
- <div className="w-full py-4">
856
- <div className="flex space-x-1">
857
- <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce"></div>
858
- <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.1s'}}></div>
859
- <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.2s'}}></div>
860
- </div>
861
- </div>
862
- )}
863
- </div>
864
-
865
- {/* Chat Input - sticky at bottom */}
866
- <div className="flex-shrink-0 bg-white border-t border-gray-200 p-6">
867
- <div className="flex gap-2 mb-3">
868
- <input
869
- type="text"
870
- value={userInput}
871
- onChange={(e) => setUserInput(e.target.value)}
872
- placeholder="Type your response..."
873
- className="flex-1 px-3 py-2 border border-gray-200 rounded-lg text-sm focus:outline-none focus:ring-1 focus:ring-gray-300"
874
- />
875
- <button className="px-4 py-2 bg-gray-50 hover:bg-gray-100 text-gray-600 rounded-lg transition-all">
876
-
877
- </button>
878
- </div>
879
-
880
- </div>
881
- </div>
882
- )}
883
- </div>
884
- </div>
885
- </div>
886
- );
887
- }
888
-
889
- export default DocumentProcessor;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/src/components/ImageComponent.jsx DELETED
@@ -1,115 +0,0 @@
1
- import { useState, useEffect, memo } from 'react';
2
-
3
- /**
4
- * ImageComponent - Handles loading and displaying images from the backend
5
- *
6
- * Props:
7
- * - src: The image ID to fetch
8
- * - alt: Alt text for the image
9
- * - fileId: The document file ID (for fetching the image)
10
- * - imageCache: Object containing cached images
11
- * - onImageCached: Callback when image is successfully cached
12
- */
13
- const ImageComponent = memo(({ src, alt, fileId, imageCache, onImageCached }) => {
14
- // Local state for this specific image
15
- const [imageSrc, setImageSrc] = useState(null);
16
- const [loading, setLoading] = useState(true);
17
-
18
- useEffect(() => {
19
- // Only proceed if we have the required data
20
- if (!fileId || !src) {
21
- setLoading(false);
22
- return;
23
- }
24
-
25
- // Check if image is already cached
26
- if (imageCache && imageCache[src]) {
27
- setImageSrc(imageCache[src]);
28
- setLoading(false);
29
- return;
30
- }
31
-
32
- // Fetch the image from backend
33
- const fetchImage = async () => {
34
- try {
35
- const response = await fetch(`/get_image/${fileId}/${src}`);
36
- if (response.ok) {
37
- const data = await response.json();
38
- const imageData = data.image_base64;
39
-
40
- // Set the image for display
41
- setImageSrc(imageData);
42
-
43
- // Notify parent component to cache this image
44
- if (onImageCached) {
45
- onImageCached(src, imageData);
46
- }
47
- }
48
- } catch (error) {
49
- console.error('Error fetching image:', error);
50
- } finally {
51
- setLoading(false);
52
- }
53
- };
54
-
55
- fetchImage();
56
- }, [src, fileId, imageCache, onImageCached]);
57
-
58
- // Show loading state
59
- if (loading) {
60
- return (
61
- <span style={{
62
- display: 'inline-block',
63
- width: '100%',
64
- height: '200px',
65
- backgroundColor: '#f3f4f6',
66
- textAlign: 'center',
67
- lineHeight: '200px',
68
- margin: '1rem 0',
69
- borderRadius: '0.5rem',
70
- color: '#6b7280'
71
- }}>
72
- Loading image...
73
- </span>
74
- );
75
- }
76
-
77
- // Show error state if image couldn't be loaded
78
- if (!imageSrc) {
79
- return (
80
- <span style={{
81
- display: 'inline-block',
82
- width: '100%',
83
- height: '200px',
84
- backgroundColor: '#fef2f2',
85
- textAlign: 'center',
86
- lineHeight: '200px',
87
- margin: '1rem 0',
88
- borderRadius: '0.5rem',
89
- border: '1px solid #fecaca',
90
- color: '#dc2626'
91
- }}>
92
- Image not found: {alt || src}
93
- </span>
94
- );
95
- }
96
-
97
- // Render the actual image
98
- return (
99
- <img
100
- src={imageSrc}
101
- alt={alt || 'Document image'}
102
- style={{
103
- display: 'block',
104
- maxWidth: '100%',
105
- height: 'auto',
106
- margin: '1.5rem auto'
107
- }}
108
- />
109
- );
110
- });
111
-
112
- // Set display name for debugging
113
- ImageComponent.displayName = 'ImageComponent';
114
-
115
- export default ImageComponent;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/src/components/LoadingAnimation.jsx CHANGED
@@ -1,10 +1,10 @@
1
- const LoadingAnimation = ({ uploadProgress, ocrProgress }) => (
2
  <div className="flex flex-col items-center justify-center min-h-screen bg-gray-50">
3
  <div className="text-center max-w-md">
4
  <div className="mb-8">
5
  <div className="w-16 h-16 border-4 border-blue-500 border-t-transparent rounded-full animate-spin mx-auto mb-4"></div>
6
  <h2 className="text-2xl font-bold text-gray-900 mb-2">Processing Your Document</h2>
7
- <p className="text-gray-600">This may take a moment...</p>
8
  </div>
9
 
10
  {/* Upload Progress */}
@@ -21,22 +21,8 @@ const LoadingAnimation = ({ uploadProgress, ocrProgress }) => (
21
  </div>
22
  </div>
23
 
24
- {/* OCR Progress */}
25
- <div className="mb-6">
26
- <div className="flex justify-between text-sm text-gray-600 mb-1">
27
- <span>Processing with AI</span>
28
- <span>{ocrProgress}%</span>
29
- </div>
30
- <div className="w-full bg-gray-200 rounded-full h-2">
31
- <div
32
- className="bg-green-500 h-2 rounded-full transition-all duration-300"
33
- style={{ width: `${ocrProgress}%` }}
34
- ></div>
35
- </div>
36
- </div>
37
-
38
  <p className="text-sm text-gray-500">
39
- Using AI to extract text and understand your document structure...
40
  </p>
41
  </div>
42
  </div>
 
1
+ const LoadingAnimation = ({ uploadProgress }) => (
2
  <div className="flex flex-col items-center justify-center min-h-screen bg-gray-50">
3
  <div className="text-center max-w-md">
4
  <div className="mb-8">
5
  <div className="w-16 h-16 border-4 border-blue-500 border-t-transparent rounded-full animate-spin mx-auto mb-4"></div>
6
  <h2 className="text-2xl font-bold text-gray-900 mb-2">Processing Your Document</h2>
7
+ <p className="text-gray-600">Uploading your PDF...</p>
8
  </div>
9
 
10
  {/* Upload Progress */}
 
21
  </div>
22
  </div>
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  <p className="text-sm text-gray-500">
25
+ Preparing your document for viewing...
26
  </p>
27
  </div>
28
  </div>
frontend/src/components/SimpleChat.jsx ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from 'react';
2
+
3
+ const SimpleChat = () => {
4
+ const [messages, setMessages] = useState([
5
+ {
6
+ id: 1,
7
+ role: 'assistant',
8
+ content: 'Hi! Ask me anything about this section.'
9
+ }
10
+ ]);
11
+ const [input, setInput] = useState('');
12
+ const [isLoading, setIsLoading] = useState(false);
13
+
14
+ const sendMessage = async (e) => {
15
+ e.preventDefault();
16
+ if (!input.trim() || isLoading) return;
17
+
18
+ const userMessage = {
19
+ id: Date.now(),
20
+ role: 'user',
21
+ content: input.trim()
22
+ };
23
+
24
+ setMessages(prev => [...prev, userMessage]);
25
+ setInput('');
26
+ setIsLoading(true);
27
+
28
+ try {
29
+ const response = await fetch('/api/chat', {
30
+ method: 'POST',
31
+ headers: { 'Content-Type': 'application/json' },
32
+ body: JSON.stringify({
33
+ messages: [...messages, userMessage].map(msg => ({
34
+ role: msg.role,
35
+ content: msg.content
36
+ }))
37
+ })
38
+ });
39
+
40
+ const data = await response.json();
41
+
42
+ setMessages(prev => [...prev, {
43
+ id: Date.now() + 1,
44
+ role: 'assistant',
45
+ content: data.content || data.message || 'Sorry, no response received.'
46
+ }]);
47
+ } catch (error) {
48
+ console.error('Error:', error);
49
+ setMessages(prev => [...prev, {
50
+ id: Date.now() + 1,
51
+ role: 'assistant',
52
+ content: 'Sorry, something went wrong. Please try again.'
53
+ }]);
54
+ } finally {
55
+ setIsLoading(false);
56
+ }
57
+ };
58
+
59
+ return (
60
+ <div className="flex flex-col h-full">
61
+ {/* Messages */}
62
+ <div className="flex-1 overflow-y-auto p-4 space-y-3">
63
+ {messages.map(message => (
64
+ <div
65
+ key={message.id}
66
+ className={`flex ${message.role === 'user' ? 'justify-end' : 'justify-start'}`}
67
+ >
68
+ <div
69
+ className={`max-w-[70%] p-3 rounded-lg ${
70
+ message.role === 'user'
71
+ ? 'bg-blue-500 text-white'
72
+ : 'bg-gray-100 text-gray-900'
73
+ }`}
74
+ >
75
+ {message.content}
76
+ </div>
77
+ </div>
78
+ ))}
79
+ {isLoading && (
80
+ <div className="flex justify-start">
81
+ <div className="bg-gray-100 p-3 rounded-lg">
82
+ <div className="flex space-x-1">
83
+ <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce"></div>
84
+ <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.1s'}}></div>
85
+ <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.2s'}}></div>
86
+ </div>
87
+ </div>
88
+ </div>
89
+ )}
90
+ </div>
91
+
92
+ {/* Input */}
93
+ <form onSubmit={sendMessage} className="p-4 border-t">
94
+ <div className="flex space-x-2">
95
+ <input
96
+ type="text"
97
+ value={input}
98
+ onChange={(e) => setInput(e.target.value)}
99
+ placeholder="Type your message..."
100
+ disabled={isLoading}
101
+ className="flex-1 px-3 py-2 border rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100"
102
+ />
103
+ <button
104
+ type="submit"
105
+ disabled={!input.trim() || isLoading}
106
+ className="px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 disabled:bg-gray-300 disabled:cursor-not-allowed"
107
+ >
108
+ {isLoading ? '...' : 'Send'}
109
+ </button>
110
+ </div>
111
+ </form>
112
+ </div>
113
+ );
114
+ };
115
+
116
+ export default SimpleChat;
frontend/src/components/UploadPage.jsx DELETED
@@ -1,277 +0,0 @@
1
- import { useState, useRef } from 'react';
2
- import { Document, Page, pdfjs } from 'react-pdf';
3
- import 'react-pdf/dist/Page/AnnotationLayer.css';
4
- import 'react-pdf/dist/Page/TextLayer.css';
5
-
6
- pdfjs.GlobalWorkerOptions.workerSrc = '/pdf.worker.min.js';
7
-
8
- function UploadPage() {
9
- const fileInputRef = useRef(null);
10
- const pdfContainerRef = useRef(null);
11
- const [selectedFile, setSelectedFile] = useState(null);
12
- const [numPages, setNumPages] = useState(null);
13
- const [currentPage, setCurrentPage] = useState(1);
14
- const [zoomLevel, setZoomLevel] = useState(1);
15
- const [visiblePages, setVisiblePages] = useState(new Set([1]));
16
- const [chunks, setChunks] = useState([]);
17
- const [processing, setProcessing] = useState(false);
18
-
19
- const handleFileChange = (e) => {
20
- setSelectedFile(e.target.files[0]);
21
- setChunks([]); // Clear previous chunks
22
- };
23
-
24
- const processPdf = async () => {
25
- if (!selectedFile) return;
26
-
27
- setProcessing(true);
28
- const formData = new FormData();
29
- formData.append('file', selectedFile);
30
-
31
- try {
32
- const response = await fetch('http://localhost:8000/upload_pdf', {
33
- method: 'POST',
34
- body: formData,
35
- });
36
-
37
- if (response.ok) {
38
- const data = await response.json();
39
- // Handle the new response format - create a fake chunk array for now
40
- setChunks([{
41
- text: `File processed: ${data.filename}`,
42
- page_number: 1,
43
- chunk_type: "info",
44
- size: data.size,
45
- has_api_key: data.has_api_key
46
- }]);
47
- } else {
48
- console.error('Failed to process PDF');
49
- }
50
- } catch (error) {
51
- console.error('Error processing PDF:', error);
52
- } finally {
53
- setProcessing(false);
54
- }
55
- };
56
-
57
- // Handle scroll to update current page and track visible pages
58
- const handleScroll = () => {
59
- if (!pdfContainerRef.current || !numPages) return;
60
-
61
- const container = pdfContainerRef.current;
62
- const scrollTop = container.scrollTop;
63
- const containerHeight = container.clientHeight;
64
- const totalScrollHeight = container.scrollHeight - containerHeight;
65
-
66
- // Calculate which page we're viewing based on scroll position
67
- const scrollPercent = scrollTop / totalScrollHeight;
68
- const newPage = Math.min(Math.floor(scrollPercent * numPages) + 1, numPages);
69
-
70
- if (newPage !== currentPage) {
71
- setCurrentPage(newPage);
72
- }
73
-
74
- // Track visible pages based on zoom level (more pages visible when zoomed out)
75
- const newVisiblePages = new Set();
76
- const visibleRange = Math.max(1, Math.ceil(2 / zoomLevel)); // More pages when zoomed out
77
- for (let i = Math.max(1, newPage - visibleRange); i <= Math.min(numPages, newPage + visibleRange); i++) {
78
- newVisiblePages.add(i);
79
- }
80
-
81
- // Update visible pages if changed
82
- if (newVisiblePages.size !== visiblePages.size ||
83
- ![...newVisiblePages].every(page => visiblePages.has(page))) {
84
- setVisiblePages(newVisiblePages);
85
- }
86
- };
87
-
88
- // Jump to specific page
89
- const goToPage = (pageNumber) => {
90
- if (!pdfContainerRef.current || !numPages) return;
91
-
92
- // Update visible pages immediately for target page
93
- const newVisiblePages = new Set();
94
- const visibleRange = Math.max(1, Math.ceil(2 / zoomLevel)); // More pages when zoomed out
95
- for (let i = Math.max(1, pageNumber - visibleRange); i <= Math.min(numPages, pageNumber + visibleRange); i++) {
96
- newVisiblePages.add(i);
97
- }
98
- setVisiblePages(newVisiblePages);
99
-
100
- const container = pdfContainerRef.current;
101
- const totalScrollHeight = container.scrollHeight - container.clientHeight;
102
-
103
- // Calculate scroll position for the target page
104
- const targetScrollPercent = (pageNumber - 1) / numPages;
105
- const targetScrollTop = targetScrollPercent * totalScrollHeight;
106
-
107
- container.scrollTo({
108
- top: targetScrollTop,
109
- behavior: 'smooth'
110
- });
111
- };
112
-
113
- // Zoom controls
114
- const zoomIn = () => setZoomLevel(prev => Math.min(prev + 0.25, 3));
115
- const zoomOut = () => setZoomLevel(prev => Math.max(prev - 0.25, 0.5));
116
- const resetZoom = () => setZoomLevel(1);
117
-
118
- return (
119
- <div className="h-screen bg-gray-50 overflow-hidden">
120
- {!selectedFile ? (
121
- // Show upload UI
122
- <div className="flex items-center justify-center min-h-screen">
123
- <div className="text-center">
124
- <h1 className="text-3xl font-bold text-gray-900 mb-4">
125
- Upload Your PDF
126
- </h1>
127
- <p className="text-gray-600 mb-8">
128
- Click below to upload a PDF and start your deep dive.
129
- </p>
130
- <input
131
- ref={fileInputRef}
132
- type="file"
133
- accept=".pdf"
134
- className="hidden"
135
- onChange={handleFileChange}
136
- />
137
- <button
138
- onClick={() => fileInputRef.current.click()}
139
- className="bg-blue-500 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded"
140
- >
141
- Upload PDF
142
- </button>
143
- </div>
144
- </div>
145
- ) : (
146
- // Show PDF + chat layout
147
- <div className="flex h-screen">
148
- <div className="w-2/3 bg-white flex flex-col relative">
149
- {/* PDF container with scrolling */}
150
- <div
151
- ref={pdfContainerRef}
152
- className="flex-1 overflow-auto flex justify-center bg-gray-100"
153
- onScroll={handleScroll}
154
- >
155
- <div className="py-4">
156
- <Document
157
- file={selectedFile}
158
- onLoadSuccess={({ numPages }) => setNumPages(numPages)}
159
- >
160
- {/* Render all pages continuously */}
161
- {numPages && Array.from(new Array(numPages), (_, index) => {
162
- const pageNum = index + 1;
163
- const isVisible = visiblePages.has(pageNum);
164
- const currentZoom = isVisible ? zoomLevel : 1; // Only zoom visible pages
165
-
166
- return (
167
- <div key={pageNum} className="mb-4 flex justify-center">
168
- <Page
169
- pageNumber={pageNum}
170
- width={typeof window !== 'undefined' ? window.innerWidth * 0.66 * 0.9 * currentZoom : 600 * currentZoom}
171
- />
172
- </div>
173
- );
174
- })}
175
- </Document>
176
- </div>
177
- </div>
178
- {/* Pagination overlay - floating pill */}
179
- {numPages && (
180
- <div className="absolute bottom-4 left-1/2 transform -translate-x-1/2 z-10">
181
- <div className="flex items-center bg-gray-800/90 backdrop-blur-sm rounded-full shadow-lg px-3 py-2 space-x-3">
182
- <button
183
- onClick={() => goToPage(Math.max(currentPage - 1, 1))}
184
- disabled={currentPage <= 1}
185
- className="w-8 h-8 rounded-full bg-gray-600 hover:bg-gray-500 disabled:opacity-30 disabled:cursor-not-allowed flex items-center justify-center transition-colors text-white"
186
- >
187
- <svg width="16" height="16" viewBox="0 0 16 16" fill="currentColor">
188
- <path d="M10 12l-4-4 4-4v8z"/>
189
- </svg>
190
- </button>
191
-
192
- <span className="px-3 py-1 text-sm font-medium text-white min-w-[60px] text-center">
193
- {currentPage}/{numPages}
194
- </span>
195
-
196
- <button
197
- onClick={() => goToPage(Math.min(currentPage + 1, numPages))}
198
- disabled={currentPage >= numPages}
199
- className="w-8 h-8 rounded-full bg-gray-600 hover:bg-gray-500 disabled:opacity-30 disabled:cursor-not-allowed flex items-center justify-center transition-colors text-white"
200
- >
201
- <svg width="16" height="16" viewBox="0 0 16 16" fill="currentColor">
202
- <path d="M6 4l4 4-4 4V4z"/>
203
- </svg>
204
- </button>
205
- </div>
206
- </div>
207
- )}
208
-
209
- {/* Zoom controls overlay - bottom right */}
210
- {numPages && (
211
- <div className="absolute bottom-4 right-4 z-10 flex flex-col items-center space-y-2">
212
- {/* Main zoom pill - vertical */}
213
- <div className="flex flex-col items-center bg-gray-800/90 backdrop-blur-sm rounded-full shadow-lg px-2 py-2 space-y-1">
214
- <button
215
- onClick={zoomIn}
216
- disabled={zoomLevel >= 3}
217
- className="w-6 h-6 rounded-full bg-gray-600 hover:bg-gray-500 disabled:opacity-30 disabled:cursor-not-allowed flex items-center justify-center transition-colors text-white"
218
- >
219
- <svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor">
220
- <path d="M8 4v4H4v1h4v4h1V9h4V8H9V4z"/>
221
- </svg>
222
- </button>
223
-
224
- <button
225
- onClick={zoomOut}
226
- disabled={zoomLevel <= 0.5}
227
- className="w-6 h-6 rounded-full bg-gray-600 hover:bg-gray-500 disabled:opacity-30 disabled:cursor-not-allowed flex items-center justify-center transition-colors text-white"
228
- >
229
- <svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor">
230
- <path d="M4 8h8v1H4z"/>
231
- </svg>
232
- </button>
233
- </div>
234
-
235
- {/* Reset button below */}
236
- <button
237
- onClick={resetZoom}
238
- className="w-10 h-10 bg-gray-700 hover:bg-gray-500 backdrop-blur-sm rounded-full shadow-lg flex items-center justify-center text-white transition-colors"
239
- >
240
- <svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor" stroke="currentColor" strokeWidth="0.5">
241
- <path d="M8 3a5 5 0 1 0 4.546 2.914.5.5 0 0 1 .908-.417A6 6 0 1 1 8 2v1z" strokeWidth="1"/>
242
- <path d="M8 4.466V.534a.25.25 0 0 1 .41-.192l2.36 1.966c.12.1.12.284 0 .384L8.41 4.658A.25.25 0 0 1 8 4.466z"/>
243
- </svg>
244
- </button>
245
- </div>
246
- )}
247
-
248
- </div>
249
- {/* White separator bar */}
250
- <div className="w-4 bg-white"></div>
251
-
252
- <div className="flex-1 bg-gray-100 overflow-auto">
253
- <div className="p-4">
254
- <button
255
- onClick={processPdf}
256
- disabled={processing}
257
- className="bg-green-500 hover:bg-green-700 text-white font-bold py-2 px-4 rounded mb-4"
258
- >
259
- {processing ? 'Processing...' : 'Process PDF'}
260
- </button>
261
- <div>
262
- {chunks.map((chunk, index) => (
263
- <div key={index} className="bg-white p-4 rounded-lg shadow mb-4">
264
- <p className="text-sm text-gray-600">Page: {chunk.page_number}, Type: {chunk.chunk_type}</p>
265
- <p className="text-gray-800">{chunk.text}</p>
266
- </div>
267
- ))}
268
- </div>
269
- </div>
270
- </div>
271
- </div>
272
- )}
273
- </div>
274
- );
275
- }
276
-
277
- export default UploadPage;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/src/hooks/useChat.js DELETED
@@ -1,109 +0,0 @@
1
- import { useState, useRef } from 'react';
2
-
3
- export const useChat = () => {
4
- const [chatData, setChatData] = useState({});
5
- const [chatLoading, setChatLoading] = useState(false);
6
- const [chatMessages, setChatMessages] = useState({});
7
- const [userInput, setUserInput] = useState('');
8
- const [typingMessage, setTypingMessage] = useState('');
9
- const [typingInterval, setTypingInterval] = useState(null);
10
-
11
- const typeMessage = (text, callback) => {
12
- if (typingInterval) {
13
- clearInterval(typingInterval);
14
- }
15
-
16
- setTypingMessage('');
17
- let currentIndex = 0;
18
- const typeSpeed = Math.max(1, Math.min(3, 200 / text.length));
19
-
20
- const interval = setInterval(() => {
21
- if (currentIndex < text.length) {
22
- setTypingMessage(text.slice(0, currentIndex + 1));
23
- currentIndex++;
24
- } else {
25
- clearInterval(interval);
26
- setTypingInterval(null);
27
- setTypingMessage('');
28
- callback();
29
- }
30
- }, typeSpeed);
31
-
32
- setTypingInterval(interval);
33
- };
34
-
35
- const startChunkLesson = async (chunkIndex, documentData) => {
36
- if (!documentData || !documentData.chunks[chunkIndex]) return;
37
-
38
- setChatLoading(true);
39
-
40
- try {
41
- const chunk = documentData.chunks[chunkIndex];
42
- console.log('Starting lesson for chunk:', chunkIndex, chunk);
43
- console.log('Document data:', documentData.fileId, documentData.markdown?.length);
44
-
45
- const response = await fetch(`/start_chunk_lesson/${documentData.fileId}/${chunkIndex}`, {
46
- method: 'POST',
47
- headers: {
48
- 'Content-Type': 'application/json',
49
- },
50
- body: JSON.stringify({
51
- chunk: chunk,
52
- document_markdown: documentData.markdown
53
- })
54
- });
55
-
56
- if (!response.ok) {
57
- const errorData = await response.text();
58
- console.error('Backend error:', errorData);
59
- throw new Error(`Failed to start lesson: ${response.status} - ${errorData}`);
60
- }
61
-
62
- const lessonData = await response.json();
63
- setChatData(prev => ({
64
- ...prev,
65
- [chunkIndex]: {
66
- ...lessonData,
67
- chunkIndex: chunkIndex,
68
- chunk: chunk
69
- }
70
- }));
71
-
72
- setChatLoading(false);
73
-
74
- typeMessage(lessonData.questions, () => {
75
- setChatMessages(prev => ({
76
- ...prev,
77
- [chunkIndex]: [
78
- { type: 'ai', text: lessonData.questions }
79
- ]
80
- }));
81
- });
82
-
83
- } catch (error) {
84
- console.error('Error starting lesson:', error);
85
- alert('Error starting lesson: ' + error.message);
86
- setChatLoading(false);
87
- }
88
- };
89
-
90
- const clearTypingAnimation = () => {
91
- if (typingInterval) {
92
- clearInterval(typingInterval);
93
- setTypingInterval(null);
94
- }
95
- setTypingMessage('');
96
- };
97
-
98
- return {
99
- chatData,
100
- chatLoading,
101
- chatMessages,
102
- userInput,
103
- typingMessage,
104
- startChunkLesson,
105
- clearTypingAnimation,
106
- setUserInput,
107
- setChatMessages
108
- };
109
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/src/hooks/useChunkNavigation.js CHANGED
@@ -11,6 +11,7 @@ export const useChunkNavigation = (documentData, clearTypingAnimation) => {
11
  clearTypingAnimation();
12
  }
13
  setCurrentChunkIndex(currentChunkIndex + 1);
 
14
  }
15
  };
16
 
@@ -20,6 +21,7 @@ export const useChunkNavigation = (documentData, clearTypingAnimation) => {
20
  clearTypingAnimation();
21
  }
22
  setCurrentChunkIndex(currentChunkIndex - 1);
 
23
  }
24
  };
25
 
 
11
  clearTypingAnimation();
12
  }
13
  setCurrentChunkIndex(currentChunkIndex + 1);
14
+ setChunkExpanded(true);
15
  }
16
  };
17
 
 
21
  clearTypingAnimation();
22
  }
23
  setCurrentChunkIndex(currentChunkIndex - 1);
24
+ setChunkExpanded(true);
25
  }
26
  };
27
 
frontend/src/hooks/useDocumentProcessor.js CHANGED
@@ -1,59 +1,23 @@
1
- import { useState, useRef, useCallback } from 'react';
2
 
3
  export const useDocumentProcessor = () => {
4
  const fileInputRef = useRef(null);
5
  const [selectedFile, setSelectedFile] = useState(null);
6
  const [processing, setProcessing] = useState(false);
7
  const [uploadProgress, setUploadProgress] = useState(0);
8
- const [ocrProgress, setOcrProgress] = useState(0);
9
  const [documentData, setDocumentData] = useState(null);
10
- const [imageCache, setImageCache] = useState({});
11
- const imageCacheRef = useRef({});
12
 
13
  const handleFileChange = (e) => {
14
  setSelectedFile(e.target.files[0]);
15
  setDocumentData(null);
16
  setUploadProgress(0);
17
- setOcrProgress(0);
18
- setImageCache({});
19
- imageCacheRef.current = {};
20
  };
21
 
22
- const fetchImage = useCallback(async (imageId, fileId) => {
23
- if (imageCacheRef.current[imageId]) {
24
- return imageCacheRef.current[imageId];
25
- }
26
-
27
- try {
28
- const response = await fetch(`/get_image/${fileId}/${imageId}`);
29
- if (response.ok) {
30
- const data = await response.json();
31
- const imageData = data.image_base64;
32
-
33
- imageCacheRef.current = {
34
- ...imageCacheRef.current,
35
- [imageId]: imageData
36
- };
37
-
38
- setImageCache(prev => ({
39
- ...prev,
40
- [imageId]: imageData
41
- }));
42
-
43
- return imageData;
44
- }
45
- } catch (error) {
46
- console.error('Error fetching image:', error);
47
- }
48
- return null;
49
- }, []);
50
-
51
  const processDocument = async () => {
52
  if (!selectedFile) return;
53
 
54
  setProcessing(true);
55
  setUploadProgress(0);
56
- setOcrProgress(0);
57
 
58
  try {
59
  // Step 1: Upload PDF
@@ -67,89 +31,92 @@ export const useDocumentProcessor = () => {
67
  });
68
 
69
  if (!uploadResponse.ok) {
 
 
70
  throw new Error('Failed to upload PDF');
71
  }
72
 
73
- const uploadData = await uploadResponse.json();
 
 
74
  setUploadProgress(100);
75
 
76
- // Step 2: Process OCR
77
- setOcrProgress(20);
78
- await new Promise(resolve => setTimeout(resolve, 500));
79
-
80
- setOcrProgress(60);
81
- const ocrResponse = await fetch(`/process_ocr/${uploadData.file_id}`);
82
-
83
- if (!ocrResponse.ok) {
84
- throw new Error('Failed to process OCR');
85
- }
86
-
87
- const ocrData = await ocrResponse.json();
88
- setOcrProgress(100);
89
 
90
- // Use hardcoded chunks for MVP testing
91
  const hardcodedChunks = [
92
- {
93
  "topic": "Magnetfeldmessung und Hysterese-Analyse",
94
- "text": "Zu Beginn des Versuchs haben wir mit Hilfe des Teslameters die Magnetfeldstärke B an der Position der Cd-Lampe bei verschiedenen Spulenströmen gemessen. (siehe Messwerte in Tabelle 1 im Laborbuch). In Figure 1 sind die gemessenen Feldstärken als Funktion der Stromstärke aufgetragen.\nAnhand der Fehlerbalken und der praktisch identischen Überlagerung der beiden linearen Fitgeraden für auf- und absteigende Stromstärken, wird deutlich, dass keine Hystereseeffekte vorliegen. Der lineare Fit wurde hierbei nur auf die Stromstärken bis einschl. 10A angewandt, da für größere Stromstärken das Magnetfeld nicht in direktem proportionalen Zusammenhang ansteigt. Dies ist mit Sättigungseffekten der Magnetisierung des Eisenkerns der verwendeten Spule zu erklären."
95
- },
 
96
  {
97
  "topic": "Qualitative Beobachtung des Zeeman-Effekts",
98
- "text": "Mit Hilfe der CMOS Kamera wurde das Spektrum des emittierten Lichts der Cadmiumlampe unter Verwendung des Lummer Gehercke Interferometers beobachtet. Die Beobachtungen wurden in longitudinaler und transversaler Richtung zum Magnetfeld durchgeführt."
99
- },
 
100
  {
101
  "topic": "Zeeman-Effekt: Longitudinale Richtung mit Filtern",
102
- "text": "## ohne Filter:\n\nEs sind deutlich zwei Linien pro Ordnung zu erkennen. Dies sind die $\\sigma^{+}$und $\\sigma^{-}$Linien. Die $\\pi$ Linie ist in longitudinaler Richtung nicht zu beobachten\n\n## mit $\\lambda / 4$-Plättchen und Polarisationsfilter:\n\nVon der Cadmiumlampe aus betrachtet wird zuerst ein $\\lambda / 4$-Plättchen und danach ein Polarisationsfilter in den Strahlengang gebracht. Je nach Ausrichtung der Filter zueinander wird nun eine der beiden Linien ausgeblendet.\n\n$$\n-45^{\\circ} \\text { Winkel: }\n$$\n\nStehen $\\lambda / 4$-Plättchen und Polarisationsfilter zueinander im $-45^{\\circ}$ Winkel, wird das zirkular polarisierte Licht der $\\sigma^{-}$Linie um $45^{\\circ}$ verschoben linear polarisiert und somit vom Polarisationsfilter abgeschirmt. Folglich ist in dieser Konstellation nur die linke der beiden $\\sigma$ Linien zu beobachten.\n\n$$\n+45^{\\circ} \\text { Winkel: }\n$$\n\nStehen $\\lambda / 4$-Plättchen und Polarisationsfilter zueinander im $+45^{\\circ}$ Winkel, ist nach analogem Prinzip wie zuvor nur die rechte Linie auf dem Kamerabild zu beobachten."
103
- },
 
104
  {
105
  "topic": "Zeeman-Effekt: Transversale Richtung und Polarisation",
106
- "text": "## ohne Filter:\n\nEs sind deutlich drei Linien pro Ordnung zu erkennen. Dies sind die $\\sigma^{+}, \\pi$ und $\\sigma^{-}$Linien.\n\n## mit Polarisationsfilter horizontal (in B-Feld Richtung):\n\nDie beiden $\\sigma$-Linien sind vollständig ausgeblendet. Die $\\pi-$ Linie ist deutlich sichtbar.\nmit Polarisationsfilter vertikal $\\left(90^{\\circ}\\right.$ zu B-Feld Richtung):\nDie beiden $\\sigma$-Linien sind klar sichtbar. Die $\\pi$-Linie ist ausgeblendet.\n\nWie in Figure 3 gut zu erkennen ist, sind die ausgeblendeten Linien in beiden Konfigurationen weiterhin leicht sichtbar. Dies ist auf das nicht perfekt homogene Magnetfeld am Ort der Ca-Lampe zurückzuführen. Das Licht ist also nicht perfekt zirkular bzw. in B-Feld Richtung polarisiert, weshalb ein vollständiges Ausblenden im Experiment nicht zu beobachten ist."
107
- },
 
108
  {
109
  "topic": "Bestimmung des Zeemanshifts und Datenaufbereitung",
110
- "text": "Die Messdaten bei verschiedene Stromstärken wurden jeweils in einem Plot dargestellt. Um für den Fit möglichst saubere Messkurven des Spektrums zu verwenden, wurde die Messreihe bei $I=8 A$ nicht in die Datenauswertung einbezogen, da die Aufspaltung der Cadmiumlinie nur schwer zu beobachten war. Das gleich gilt für die 8. Interferenzodnung, die nicht berücksichtigt wurde. Für die Datenauswertung fließen also die Nullte bis 7. Ordnung jeweils bei 9 bis 13 Ampere ein.\nAls Funktion um die Messdaten zu fitten wurde ein Pseudo-Voigt-Profil verwendet. Die drei Kurven einer Ordnung wurden hierbei gemeinsam mit der Summe dreier Pseudo-Voigt-Profile gefittet. In Figure 4 sind exemplarisch anhand der Daten für $I=12 A$ die Messdaten und der abschnittsweise Fit zu erkennen."
111
- },
 
112
  {
113
  "topic": "Fehleranalyse der Fitparameter und Verzerrungseffekte",
114
- "text": "Anhand der Fitparameter wird die Position der $\\sigma$ und $\\pi$ Linien bestimmt. Die Fehler der Fitparameter sind extrem klein $(\\approx 0,1 p x)$ und eigenen sich nicht als realistische Fehler für unsere weitere Rechnung. Als minimalen Fehler nehmen wir daher die Auflösung der Kamera an ( $1 p x$ ) und skalieren alle Fehler so, dass der kleineste Fehler exakt $1 p x$ beträgt. Die anderen Fehler sind dann entsprechend linear skaliert größer. Dies berücksichtigt die unterschiedliche Qualität der Fits auf unterschiedliche Interferenz-Ordnungen, bringt die Fehler aber in einen experimentell realistischen Bereich.\nFür die Berechnung des Zeemanshifts müssen die Verzerrungseffekte der Lummer-Gehrcke-Platte beachtet werden. Hierfür wird die Position der $\\pi$-Linien gegen der Interferenzordnung $k$ der entsprechenden Linie aufgetragen. Der funktionelle Zusammenhang dieser beiden Größen wird durch eine quadratische Funktion $k=f(a)$ approximiert:\n\n$$\\nk=f(a)=b a^{2}+c a+d\n$$"
115
- },
 
116
  {
117
  "topic": "Berechnung der Wellenlängen- und Energieverschiebung",
118
- "text": "Die Differenz zur ganzzahligen Ordnung der zugehörigen $\\pi$-Linie ergibt $\\delta k$. Für eine (kleine) Wellenlängenverschiebung $\\delta \\lambda$ gilt:\n\n$$\\n\\delta \\lambda=\\frac{\\delta k}{\\Delta k} \\cdot \\frac{\\lambda^{2}}{2 d \\cdot \\sqrt{n^{2}-1}}\n$$\n\nFür den Abstand $\\Delta k$ zweier Ordnungen gilt $\\Delta k=1$. Für die Wellenlänge $\\lambda$ der betrachten Linie verwenden wir den in Part 2 bestimmten Wert von $\\lambda=$ $(643,842 \\pm 0,007) \\mathrm{nm}$.\nWir kennen nun die Wellenlänge des Zeemanshift für jede von uns betrachtete Linie. Mit dem Zusammenhang zwischen Wellenlänge und Energie $E=\\frac{h c}{\\lambda}$ lässt sich nun die Energieverschiebung der Linine bestimmen. Wir nehmen an, dass die Wellenlängenverschiebung $\\delta \\lambda$ klein gegenüber der absoluten Wellenlänge $\\lambda$ ist, und erhalten daher für die Energieverschiebung $\\delta E$ in guter Näherung:\n\n$$\\n\\delta E=\\frac{h c}{\\lambda^{2}} \\delta \\lambda\n$$"
119
- },
 
120
  {
121
  "topic": "Bestimmung des Bohrschen Magnetons aus experimentellen Daten",
122
- "text": "Abschließend nehmen wir den Durchschnitt aller Werte $\\delta E$ für eine Stromstärke $I$.\n\n### 3.2 Bestimmen des Bohrschen Magnetons $\\mu_{B}$ \n\nFür die Energieverschiebung beim Zeemaneffekt gilt:\n\n$$\n\\delta E=\\mu_{B} \\cdot m_{l} \\cdot B\n$$\n\nDa es sich bei der betrachteten Cadmiumlinie um einen ${ }^{1} D_{2} \\rightarrow{ }^{1} P_{1}$ Übergang handelt gilt hier $m_{l}= \\pm 1$. Somit folgt für das Bohrsche Magneton $\\mu_{B}$ als Funktion des Spulenstroms $I$ :\n\n$$\n\\mu_{B}(I)=\\frac{\\delta E(I)}{B(I)}\n$$\n\nDie Magnetfeldstärke $B(I)$ wurde hier anhand der Messwerte aus Teil 1 des Experiments bestimmt.\nWir erhalten für jeden Spulenstrom $I$ einen experimentell bestimmten Wert des Bohrschen Magnetons $\\mu_{B}$. Unsere Ergebnisse sind in Figure 6 graphisch dargestellt."
123
- },
 
124
  {
125
  "topic": "Vergleich des experimentellen Werts mit dem Literaturwert",
126
- "text": "Für den experimentellen Mittelwert erhalten wir:\n\n$$\n\\mu_{B, \\exp }=(10,1 \\pm 0.8) \\cdot 10^{-24} \\frac{J}{T}\n$$\n\nDer Literaturwert beträgt:\n\n$$\n\\mu_{B, l i t}=9,27400949 \\cdot 10^{-24} \\frac{J}{T}\n$$\n\nUnsere experimentell ermittelte Wert weicht also um 1,2 Sigma vom Literaturwert ab. Die Abweichung ist folglich nicht signifikant."
127
- },
 
128
  {
129
  "topic": "Kritische Betrachtung der Ergebnisse und Fehlerquellen",
130
- "text": "Erfreulicherweise scheint unsere experimentelle Methode keine signifikante Abweichung zwischen Literaturwert und experimentellem Wert des Bohrschen Magnetons zu ergeben. Wir befinden uns mit unserem Wert im niedirgen 2-SigmaIntervall. Dennoch ist kritisch anzumerken, dass wir einen vergleichsweise großen realtiven Fehler auf unser Messergebnis von $7,1 \\%$ erhalten. Das bedeutet, unsere Abweichung ist zwar nicht sigifikant, dennoch weicht unser experimenteller Wert um knapp $10 \\%$ vom Literaturwert ab. Der verwendete experimentelle Aufbau ist folglich nur bedingt für eine exakte Bestimmung des Bohrschen Magnetons geeigent.\n\nDie beiden dominierenden Fehlerquellen sind zum einen die Bestimmung des Magnetfeldes B am Ort der Cadmium Lampe (Inhomogenitäten, exakte Platzierung der Lampe) und zum anderen die Wahl der Fehler der Positionen der $\\pi$ - und $\\sigma$-Linien im Spektrum.\nZum Vergleich: Legt man den Fehler prinzipiell für alle Linien auf $1 p x$, also die maximale Auflösung der Kamera, fest und verzichtet auf eine Skalierung der Fehler, beträgt die Abweichung des exp. Werts zum Literaturwert schon 2,8 Sigma. Wählt man analog für den Fehler der Linien $2 p x$, da beispielsweise ein Maximum auch exakt zwischen zwei Pixelreihen liegen kann, liegt die Abweichung bei 1,4 Sigma."
131
- },
 
132
  {
133
  "topic": "Quantitative Spektrumsbetrachtung und Wellenlängenbestimmung der Cd-Linie",
134
- "text": "Zunächst wird der Untergrund von den Messdaten abgezogen, um Störungen durch Rauschen oder Sondereffekte wie kosmische Strahlung oder Umgebungsquellen zu eliminieren. Sollten sich in den Spektren negative Werte befinden, ist dies auf zufällige Unterschiede im Rauschen zurückzuführen. Anhand bekannter Linien des Neonspektrums werden den Pixeln nun Wellenlängen zugeordnet. Hierfür wurde der Bereich des Neonspektrums aufgenommen, in dem sich auch die rote Linie des Cadmiumspektrums befindet. In 7 sieht man das Neonspektrum und die Peaks, an die jeweils ein Voigt-Profil gelegt wurde. Jetzt kann man den identifizierten Linien ihre jeweilige Wellenlänge zuordnen und einen polynomiellen Zusammenhang finden. Wir haben uns für eine Gerade entschieden, die wie in Figure 8 zu sehen gut zu den Daten passt.\nSchließlich wird ein Voigt-Profil an die gemessene rote Cd-Linie gelegt, wie in Figure 9 gezeigt. Umrechnung anhand der Kalibrierung führt auf einen Wert von $\\lambda_{C d}=(643,842 \\pm 0,007) \\mathrm{nm}$. Dies befindet sich im $1 \\sigma$-Bereich des Literaturwertes von $\\lambda_{L i t}=643,84695 \\mathrm{~nm}$. Der Fehler ist Ergebnis der Gauß'schen Fehlerfortpflanzung."
135
- },
 
136
  {
137
  "topic": "Kritische Betrachtung der Genauigkeit und systematischer Fehler",
138
- "text": "Messwert und theoretische Vorhersage für die bestimmte Linie stimmen innerhalb statistischer Schwankungen überein. Dies ist umso interessanter, wenn man die Unsicherheit des Messergebnisses betrachtet, die kleiner als 0,002\\% ist. Der absolute Fehler ist, wenn man die Steigung der Kalibrationsgeraden betrachtet, kleiner als 1px. Er besteht ausschließlich aus Abweichungen der numerischen Fits. Berücksichtigt man Ungenauigkeiten des CMOS Sensors oder die Möglichkeit, dass je nach Lage des Messwerts auch eine Abweichung um weniger als 1px eine größere Messwertschwankung verursachen kann, da die Pixel nur diskrete Werte messen können, liegt eine nachträgliche Anpassung nahe. Skaliert man die Unsicherheit auf 1px, liegt der Fehler des Messwerts bei $0,012 \\mathrm{~nm}$. Damit ist der relative Fehler weiterhin kleiner $0,005 \\%$.\n\nZur hohen Genauigkeit trägt vor allem das gute Messverfahren bei. Spektrometer und Datenaufnahme per Computer lassen wenig Raum für Abweichungen. Wie die Daten zeigen, haben wir dabei eine Quelle für einen möglichen großen systematischen Fehler umgangen: Die Kamera wurde auf das Spektrometer nur locker aufgesteckt. Hätte sich deren Position zwischen Neon- und Cadmiummessung z.B. durch Erschütterung des Labortisches verändert, hätte die Energiekalibrierung nicht mehr zur Messung der Cadmiumlinie gepasst."
139
- },
 
140
  {
141
  "topic": "Unerwartetes Verhalten durch mögliche Restmagnetisierung",
142
- "text": "Abbildung 6 zeigt unerwartetes Verhalten. Obwohl der Magnet ausgeschaltet war, sind drei Maxima zu sehen, deren Flanken sehr steil abfallen. Vergleicht man mit den Messungen im Magnetfeld, ähneln sich die Strukturen. Möglich ist, dass die Eisenkernspule, in der sich die Lampe während der Messung befand eine Restmagnetisierung aufwies, die eine Aufspaltung herbeigeführt hat."
143
- }
 
144
  ];
145
 
146
  setDocumentData({
147
- fileId: uploadData.file_id,
148
- filename: uploadData.filename,
149
- markdown: ocrData.combined_markdown,
150
- pages: ocrData.pages,
151
- totalPages: ocrData.total_pages,
152
- chunks: hardcodedChunks // Use hardcoded chunks instead of OCR chunks
153
  });
154
 
155
  } catch (error) {
@@ -165,11 +132,8 @@ export const useDocumentProcessor = () => {
165
  selectedFile,
166
  processing,
167
  uploadProgress,
168
- ocrProgress,
169
  documentData,
170
- imageCache,
171
  handleFileChange,
172
- fetchImage,
173
  processDocument,
174
  setSelectedFile
175
  };
 
1
+ import { useState, useRef } from 'react';
2
 
3
  export const useDocumentProcessor = () => {
4
  const fileInputRef = useRef(null);
5
  const [selectedFile, setSelectedFile] = useState(null);
6
  const [processing, setProcessing] = useState(false);
7
  const [uploadProgress, setUploadProgress] = useState(0);
 
8
  const [documentData, setDocumentData] = useState(null);
 
 
9
 
10
  const handleFileChange = (e) => {
11
  setSelectedFile(e.target.files[0]);
12
  setDocumentData(null);
13
  setUploadProgress(0);
 
 
 
14
  };
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  const processDocument = async () => {
17
  if (!selectedFile) return;
18
 
19
  setProcessing(true);
20
  setUploadProgress(0);
 
21
 
22
  try {
23
  // Step 1: Upload PDF
 
31
  });
32
 
33
  if (!uploadResponse.ok) {
34
+ const errorText = await uploadResponse.text();
35
+ console.error('Upload failed:', uploadResponse.status, errorText);
36
  throw new Error('Failed to upload PDF');
37
  }
38
 
39
+ const responseText = await uploadResponse.text();
40
+ console.log('Raw response:', responseText);
41
+ const uploadData = JSON.parse(responseText);
42
  setUploadProgress(100);
43
 
44
+ // Brief pause to show completion
45
+ await new Promise(resolve => setTimeout(resolve, 200));
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ // Use hardcoded chunks for the document
48
  const hardcodedChunks = [
49
+ {
50
  "topic": "Magnetfeldmessung und Hysterese-Analyse",
51
+ "text": "Zu Beginn des Versuchs haben wir mit Hilfe des Teslameters die Magnetfeldstärke B an der Position der Cd-Lampe bei verschiedenen Spulenströmen gemessen. (siehe Messwerte in Tabelle 1 im Laborbuch). In Figure 1 sind die gemessenen Feldstärken als Funktion der Stromstärke aufgetragen.\nAnhand der Fehlerbalken und der praktisch identischen Überlagerung der beiden linearen Fitgeraden für auf- und absteigende Stromstärken, wird deutlich, dass keine Hystereseeffekte vorliegen. Der lineare Fit wurde hierbei nur auf die Stromstärken bis einschl. 10A angewandt, da für größere Stromstärken das Magnetfeld nicht in direktem proportionalen Zusammenhang ansteigt. Dies ist mit Sättigungseffekten der Magnetisierung des Eisenkerns der verwendeten Spule zu erklären.",
52
+ "page": 1
53
+ },
54
  {
55
  "topic": "Qualitative Beobachtung des Zeeman-Effekts",
56
+ "text": "Mit Hilfe der CMOS Kamera wurde das Spektrum des emittierten Lichts der Cadmiumlampe unter Verwendung des Lummer Gehercke Interferometers beobachtet. Die Beobachtungen wurden in longitudinaler und transversaler Richtung zum Magnetfeld durchgeführt.",
57
+ "page": 2
58
+ },
59
  {
60
  "topic": "Zeeman-Effekt: Longitudinale Richtung mit Filtern",
61
+ "text": "## ohne Filter:\n\nEs sind deutlich zwei Linien pro Ordnung zu erkennen. Dies sind die $\\sigma^{+}$und $\\sigma^{-}$Linien. Die $\\pi$ Linie ist in longitudinaler Richtung nicht zu beobachten\n\n## mit $\\lambda / 4$-Plättchen und Polarisationsfilter:\n\nVon der Cadmiumlampe aus betrachtet wird zuerst ein $\\lambda / 4$-Plättchen und danach ein Polarisationsfilter in den Strahlengang gebracht. Je nach Ausrichtung der Filter zueinander wird nun eine der beiden Linien ausgeblendet.\n\n$$\n-45^{\\circ} \\text { Winkel: }\n$$\n\nStehen $\\lambda / 4$-Plättchen und Polarisationsfilter zueinander im $-45^{\\circ}$ Winkel, wird das zirkular polarisierte Licht der $\\sigma^{-}$Linie um $45^{\\circ}$ verschoben linear polarisiert und somit vom Polarisationsfilter abgeschirmt. Folglich ist in dieser Konstellation nur die linke der beiden $\\sigma$ Linien zu beobachten.\n\n$$\n+45^{\\circ} \\text { Winkel: }\n$$\n\nStehen $\\lambda / 4$-Plättchen und Polarisationsfilter zueinander im $+45^{\\circ}$ Winkel, ist nach analogem Prinzip wie zuvor nur die rechte Linie auf dem Kamerabild zu beobachten.",
62
+ "page": 2
63
+ },
64
  {
65
  "topic": "Zeeman-Effekt: Transversale Richtung und Polarisation",
66
+ "text": "## ohne Filter:\n\nEs sind deutlich drei Linien pro Ordnung zu erkennen. Dies sind die $\\sigma^{+}, \\pi$ und $\\sigma^{-}$Linien.\n\n## mit Polarisationsfilter horizontal (in B-Feld Richtung):\n\nDie beiden $\\sigma$-Linien sind vollständig ausgeblendet. Die $\\pi-$ Linie ist deutlich sichtbar.\nmit Polarisationsfilter vertikal $\\left(90^{\\circ}\\right.$ zu B-Feld Richtung):\nDie beiden $\\sigma$-Linien sind klar sichtbar. Die $\\pi$-Linie ist ausgeblendet.\n\nWie in Figure 3 gut zu erkennen ist, sind die ausgeblendeten Linien in beiden Konfigurationen weiterhin leicht sichtbar. Dies ist auf das nicht perfekt homogene Magnetfeld am Ort der Ca-Lampe zurückzuführen. Das Licht ist also nicht perfekt zirkular bzw. in B-Feld Richtung polarisiert, weshalb ein vollständiges Ausblenden im Experiment nicht zu beobachten ist.",
67
+ "page": 3
68
+ },
69
  {
70
  "topic": "Bestimmung des Zeemanshifts und Datenaufbereitung",
71
+ "text": "Die Messdaten bei verschiedene Stromstärken wurden jeweils in einem Plot dargestellt. Um für den Fit möglichst saubere Messkurven des Spektrums zu verwenden, wurde die Messreihe bei $I=8 A$ nicht in die Datenauswertung einbezogen, da die Aufspaltung der Cadmiumlinie nur schwer zu beobachten war. Das gleich gilt für die 8. Interferenzodnung, die nicht berücksichtigt wurde. Für die Datenauswertung fließen also die Nullte bis 7. Ordnung jeweils bei 9 bis 13 Ampere ein.\nAls Funktion um die Messdaten zu fitten wurde ein Pseudo-Voigt-Profil verwendet. Die drei Kurven einer Ordnung wurden hierbei gemeinsam mit der Summe dreier Pseudo-Voigt-Profile gefittet. In Figure 4 sind exemplarisch anhand der Daten für $I=12 A$ die Messdaten und der abschnittsweise Fit zu erkennen.",
72
+ "page": 4
73
+ },
74
  {
75
  "topic": "Fehleranalyse der Fitparameter und Verzerrungseffekte",
76
+ "text": "Anhand der Fitparameter wird die Position der $\\sigma$ und $\\pi$ Linien bestimmt. Die Fehler der Fitparameter sind extrem klein $(\\approx 0,1 p x)$ und eigenen sich nicht als realistische Fehler für unsere weitere Rechnung. Als minimalen Fehler nehmen wir daher die Auflösung der Kamera an ( $1 p x$ ) und skalieren alle Fehler so, dass der kleineste Fehler exakt $1 p x$ beträgt. Die anderen Fehler sind dann entsprechend linear skaliert größer. Dies berücksichtigt die unterschiedliche Qualität der Fits auf unterschiedliche Interferenz-Ordnungen, bringt die Fehler aber in einen experimentell realistischen Bereich.\nFür die Berechnung des Zeemanshifts müssen die Verzerrungseffekte der Lummer-Gehrcke-Platte beachtet werden. Hierfür wird die Position der $\\pi$-Linien gegen der Interferenzordnung $k$ der entsprechenden Linie aufgetragen. Der funktionelle Zusammenhang dieser beiden Größen wird durch eine quadratische Funktion $k=f(a)$ approximiert: \n\n $k=f(a)=b a^{2}+c a+d$",
77
+ "page": 4
78
+ },
79
  {
80
  "topic": "Berechnung der Wellenlängen- und Energieverschiebung",
81
+ "text": "Die Differenz zur ganzzahligen Ordnung der zugehörigen $\\pi$-Linie ergibt $\\delta k$. Für eine (kleine) Wellenlängenverschiebung $\\delta \\lambda$ gilt:\n\n$$\\n\\delta \\lambda=\\frac{\\delta k}{\\Delta k} \\cdot \\frac{\\lambda^{2}}{2 d \\cdot \\sqrt{n^{2}-1}}\n$$\n\nFür den Abstand $\\Delta k$ zweier Ordnungen gilt $\\Delta k=1$. Für die Wellenlänge $\\lambda$ der betrachten Linie verwenden wir den in Part 2 bestimmten Wert von $\\lambda=$ $(643,842 \\pm 0,007) \\mathrm{nm}$.\nWir kennen nun die Wellenlänge des Zeemanshift für jede von uns betrachtete Linie. Mit dem Zusammenhang zwischen Wellenlänge und Energie $E=\\frac{h c}{\\lambda}$ lässt sich nun die Energieverschiebung der Linine bestimmen. Wir nehmen an, dass die Wellenlängenverschiebung $\\delta \\lambda$ klein gegenüber der absoluten Wellenlänge $\\lambda$ ist, und erhalten daher für die Energieverschiebung $\\delta E$ in guter Näherung:\n\n$$\\n\\delta E=\\frac{h c}{\\lambda^{2}} \\delta \\lambda\n$$",
82
+ "page": 5
83
+ },
84
  {
85
  "topic": "Bestimmung des Bohrschen Magnetons aus experimentellen Daten",
86
+ "text": "Abschließend nehmen wir den Durchschnitt aller Werte $\\delta E$ für eine Stromstärke $I$.\n\n### 3.2 Bestimmen des Bohrschen Magnetons $\\mu_{B}$ \n\nFür die Energieverschiebung beim Zeemaneffekt gilt:\n\n$$\n\\delta E=\\mu_{B} \\cdot m_{l} \\cdot B\n$$\n\nDa es sich bei der betrachteten Cadmiumlinie um einen ${ }^{1} D_{2} \\rightarrow{ }^{1} P_{1}$ Übergang handelt gilt hier $m_{l}= \\pm 1$. Somit folgt für das Bohrsche Magneton $\\mu_{B}$ als Funktion des Spulenstroms $I$ :\n\n$$\n\\mu_{B}(I)=\\frac{\\delta E(I)}{B(I)}\n$$\n\nDie Magnetfeldstärke $B(I)$ wurde hier anhand der Messwerte aus Teil 1 des Experiments bestimmt.\nWir erhalten für jeden Spulenstrom $I$ einen experimentell bestimmten Wert des Bohrschen Magnetons $\\mu_{B}$. Unsere Ergebnisse sind in Figure 6 graphisch dargestellt.",
87
+ "page": 6
88
+ },
89
  {
90
  "topic": "Vergleich des experimentellen Werts mit dem Literaturwert",
91
+ "text": "Für den experimentellen Mittelwert erhalten wir:\n\n$$\n\\mu_{B, \\exp }=(10,1 \\pm 0.8) \\cdot 10^{-24} \\frac{J}{T}\n$$\n\nDer Literaturwert beträgt:\n\n$$\n\\mu_{B, l i t}=9,27400949 \\cdot 10^{-24} \\frac{J}{T}\n$$\n\nUnsere experimentell ermittelte Wert weicht also um 1,2 Sigma vom Literaturwert ab. Die Abweichung ist folglich nicht signifikant.",
92
+ "page": 6
93
+ },
94
  {
95
  "topic": "Kritische Betrachtung der Ergebnisse und Fehlerquellen",
96
+ "text": "Erfreulicherweise scheint unsere experimentelle Methode keine signifikante Abweichung zwischen Literaturwert und experimentellem Wert des Bohrschen Magnetons zu ergeben. Wir befinden uns mit unserem Wert im niedirgen 2-SigmaIntervall. Dennoch ist kritisch anzumerken, dass wir einen vergleichsweise großen realtiven Fehler auf unser Messergebnis von $7,1 \\%$ erhalten. Das bedeutet, unsere Abweichung ist zwar nicht sigifikant, dennoch weicht unser experimenteller Wert um knapp $10 \\%$ vom Literaturwert ab. Der verwendete experimentelle Aufbau ist folglich nur bedingt für eine exakte Bestimmung des Bohrschen Magnetons geeigent.\n\nDie beiden dominierenden Fehlerquellen sind zum einen die Bestimmung des Magnetfeldes B am Ort der Cadmium Lampe (Inhomogenitäten, exakte Platzierung der Lampe) und zum anderen die Wahl der Fehler der Positionen der $\\pi$ - und $\\sigma$-Linien im Spektrum.\nZum Vergleich: Legt man den Fehler prinzipiell für alle Linien auf $1 p x$, also die maximale Auflösung der Kamera, fest und verzichtet auf eine Skalierung der Fehler, beträgt die Abweichung des exp. Werts zum Literaturwert schon 2,8 Sigma. Wählt man analog für den Fehler der Linien $2 p x$, da beispielsweise ein Maximum auch exakt zwischen zwei Pixelreihen liegen kann, liegt die Abweichung bei 1,4 Sigma.",
97
+ "page": 7
98
+ },
99
  {
100
  "topic": "Quantitative Spektrumsbetrachtung und Wellenlängenbestimmung der Cd-Linie",
101
+ "text": "Zunächst wird der Untergrund von den Messdaten abgezogen, um Störungen durch Rauschen oder Sondereffekte wie kosmische Strahlung oder Umgebungsquellen zu eliminieren. Sollten sich in den Spektren negative Werte befinden, ist dies auf zufällige Unterschiede im Rauschen zurückzuführen. Anhand bekannter Linien des Neonspektrums werden den Pixeln nun Wellenlängen zugeordnet. Hierfür wurde der Bereich des Neonspektrums aufgenommen, in dem sich auch die rote Linie des Cadmiumspektrums befindet. In 7 sieht man das Neonspektrum und die Peaks, an die jeweils ein Voigt-Profil gelegt wurde. Jetzt kann man den identifizierten Linien ihre jeweilige Wellenlänge zuordnen und einen polynomiellen Zusammenhang finden. Wir haben uns für eine Gerade entschieden, die wie in Figure 8 zu sehen gut zu den Daten passt.\nSchließlich wird ein Voigt-Profil an die gemessene rote Cd-Linie gelegt, wie in Figure 9 gezeigt. Umrechnung anhand der Kalibrierung führt auf einen Wert von $\\lambda_{C d}=(643,842 \\pm 0,007) \\mathrm{nm}$. Dies befindet sich im $1 \\sigma$-Bereich des Literaturwertes von $\\lambda_{L i t}=643,84695 \\mathrm{~nm}$. Der Fehler ist Ergebnis der Gauß'schen Fehlerfortpflanzung.",
102
+ "page": 8
103
+ },
104
  {
105
  "topic": "Kritische Betrachtung der Genauigkeit und systematischer Fehler",
106
+ "text": "Messwert und theoretische Vorhersage für die bestimmte Linie stimmen innerhalb statistischer Schwankungen überein. Dies ist umso interessanter, wenn man die Unsicherheit des Messergebnisses betrachtet, die kleiner als 0,002\\% ist. Der absolute Fehler ist, wenn man die Steigung der Kalibrationsgeraden betrachtet, kleiner als 1px. Er besteht ausschließlich aus Abweichungen der numerischen Fits. Berücksichtigt man Ungenauigkeiten des CMOS Sensors oder die Möglichkeit, dass je nach Lage des Messwerts auch eine Abweichung um weniger als 1px eine größere Messwertschwankung verursachen kann, da die Pixel nur diskrete Werte messen können, liegt eine nachträgliche Anpassung nahe. Skaliert man die Unsicherheit auf 1px, liegt der Fehler des Messwerts bei $0,012 \\mathrm{~nm}$. Damit ist der relative Fehler weiterhin kleiner $0,005 \\%$.\n\nZur hohen Genauigkeit trägt vor allem das gute Messverfahren bei. Spektrometer und Datenaufnahme per Computer lassen wenig Raum für Abweichungen. Wie die Daten zeigen, haben wir dabei eine Quelle für einen möglichen großen systematischen Fehler umgangen: Die Kamera wurde auf das Spektrometer nur locker aufgesteckt. Hätte sich deren Position zwischen Neon- und Cadmiummessung z.B. durch Erschütterung des Labortisches verändert, hätte die Energiekalibrierung nicht mehr zur Messung der Cadmiumlinie gepasst.",
107
+ "page": 9
108
+ },
109
  {
110
  "topic": "Unerwartetes Verhalten durch mögliche Restmagnetisierung",
111
+ "text": "Abbildung 6 zeigt unerwartetes Verhalten. Obwohl der Magnet ausgeschaltet war, sind drei Maxima zu sehen, deren Flanken sehr steil abfallen. Vergleicht man mit den Messungen im Magnetfeld, ähneln sich die Strukturen. Möglich ist, dass die Eisenkernspule, in der sich die Lampe während der Messung befand eine Restmagnetisierung aufwies, die eine Aufspaltung herbeigeführt hat.",
112
+ "page": 9
113
+ }
114
  ];
115
 
116
  setDocumentData({
117
+ filename: uploadData.filename || selectedFile.name,
118
+ filePath: uploadData.file_path,
119
+ chunks: hardcodedChunks
 
 
 
120
  });
121
 
122
  } catch (error) {
 
132
  selectedFile,
133
  processing,
134
  uploadProgress,
 
135
  documentData,
 
136
  handleFileChange,
 
137
  processDocument,
138
  setSelectedFile
139
  };
frontend/src/lib/utils.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import { clsx } from "clsx";
2
+ import { twMerge } from "tailwind-merge"
3
+
4
+ export function cn(...inputs) {
5
+ return twMerge(clsx(inputs));
6
+ }
frontend/src/utils/markdownComponents.jsx CHANGED
@@ -1,84 +1,8 @@
1
- import ImageComponent from '../components/ImageComponent';
2
-
3
- export const getDocumentMarkdownComponents = (documentData, fetchImage, imageCache, setImageCache) => ({
4
- h1: ({ children }) => <h1 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', color: '#1a202c' }}>{children}</h1>,
5
- h2: ({ children }) => <h2 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h2>,
6
- h3: ({ children }) => <h3 style={{ fontSize: '1.125rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '1rem', color: '#1a202c' }}>{children}</h3>,
7
- p: ({ children }) => <p style={{ marginBottom: '0.75rem', color: '#374151', lineHeight: '1.5', fontSize: '0.875rem' }}>{children}</p>,
8
- hr: () => <hr style={{ margin: '1.5rem 0', borderColor: '#d1d5db' }} />,
9
- ul: ({ children }) => <ul style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'disc', fontSize: '0.875rem' }}>{children}</ul>,
10
- ol: ({ children }) => <ol style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'decimal', fontSize: '0.875rem' }}>{children}</ol>,
11
- li: ({ children }) => <li style={{ marginBottom: '0.125rem', color: '#374151' }}>{children}</li>,
12
- blockquote: ({ children }) => (
13
- <blockquote style={{ borderLeft: '3px solid #3b82f6', paddingLeft: '0.75rem', fontStyle: 'italic', margin: '0.75rem 0', color: '#6b7280', fontSize: '0.875rem' }}>
14
- {children}
15
- </blockquote>
16
- ),
17
- code: ({ inline, children }) =>
18
- inline ?
19
- <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code> :
20
- <pre style={{ backgroundColor: '#f3f4f6', padding: '0.75rem', borderRadius: '0.375rem', overflowX: 'auto', margin: '0.75rem 0' }}>
21
- <code style={{ fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code>
22
- </pre>,
23
- div: ({ children, style }) => (
24
- <div style={style}>
25
- {children}
26
- </div>
27
- ),
28
- img: ({ src, alt }) => (
29
- <ImageComponent
30
- src={src}
31
- alt={alt}
32
- fileId={documentData?.fileId}
33
- imageCache={imageCache}
34
- onImageCached={(imageId, imageData) => {
35
- setImageCache(prev => ({
36
- ...prev,
37
- [imageId]: imageData
38
- }));
39
- }}
40
- />
41
- )
42
- });
43
-
44
- export const getChunkMarkdownComponents = (documentData, fetchImage, imageCache, setImageCache) => ({
45
- h1: ({ children }) => <h1 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', color: '#1a202c' }}>{children}</h1>,
46
- h2: ({ children }) => <h2 style={{ fontSize: '1.125rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '1rem', color: '#1a202c' }}>{children}</h2>,
47
- h3: ({ children }) => <h3 style={{ fontSize: '1rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '0.75rem', color: '#1a202c' }}>{children}</h3>,
48
- p: ({ children }) => <p style={{ marginBottom: '0.5rem', color: '#374151', lineHeight: '1.4', fontSize: '0.875rem' }}>{children}</p>,
49
- hr: () => <hr style={{ margin: '1rem 0', borderColor: '#d1d5db' }} />,
50
- ul: ({ children }) => <ul style={{ marginBottom: '0.5rem', marginLeft: '1rem', listStyleType: 'disc', fontSize: '0.875rem' }}>{children}</ul>,
51
- ol: ({ children }) => <ol style={{ marginBottom: '0.5rem', marginLeft: '1rem', listStyleType: 'decimal', fontSize: '0.875rem' }}>{children}</ol>,
52
- li: ({ children }) => <li style={{ marginBottom: '0.125rem', color: '#374151' }}>{children}</li>,
53
- blockquote: ({ children }) => (
54
- <blockquote style={{ borderLeft: '2px solid #9ca3af', paddingLeft: '0.5rem', fontStyle: 'italic', margin: '0.5rem 0', color: '#6b7280', fontSize: '0.875rem' }}>
55
- {children}
56
- </blockquote>
57
- ),
58
- code: ({ inline, children }) =>
59
- inline ?
60
- <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code> :
61
- <pre style={{ backgroundColor: '#f3f4f6', padding: '0.5rem', borderRadius: '0.25rem', overflowX: 'auto', margin: '0.5rem 0' }}>
62
- <code style={{ fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code>
63
- </pre>,
64
- img: ({ src, alt }) => (
65
- <ImageComponent
66
- src={src}
67
- alt={alt}
68
- fileId={documentData?.fileId}
69
- imageCache={imageCache}
70
- onImageCached={(imageId, imageData) => {
71
- setImageCache(prev => ({
72
- ...prev,
73
- [imageId]: imageData
74
- }));
75
- }}
76
- />
77
- )
78
- });
79
-
80
  export const getChatMarkdownComponents = () => ({
81
  p: ({ children }) => <p className="mb-2 text-gray-800 leading-relaxed">{children}</p>,
 
 
 
82
  ul: ({ children }) => <ul className="mb-2 ml-4 list-disc">{children}</ul>,
83
  ol: ({ children }) => <ol className="mb-2 ml-4 list-decimal">{children}</ol>,
84
  li: ({ children }) => <li className="mb-1 text-gray-800">{children}</li>,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  export const getChatMarkdownComponents = () => ({
2
  p: ({ children }) => <p className="mb-2 text-gray-800 leading-relaxed">{children}</p>,
3
+ h1: ({ children }) => <h1 className="text-xl font-bold mb-3 text-gray-900">{children}</h1>,
4
+ h2: ({ children }) => <h2 className="text-lg font-bold mb-2 text-gray-900">{children}</h2>,
5
+ h3: ({ children }) => <h3 className="text-base font-bold mb-2 text-gray-900">{children}</h3>,
6
  ul: ({ children }) => <ul className="mb-2 ml-4 list-disc">{children}</ul>,
7
  ol: ({ children }) => <ol className="mb-2 ml-4 list-decimal">{children}</ol>,
8
  li: ({ children }) => <li className="mb-1 text-gray-800">{children}</li>,
frontend/src/utils/markdownUtils.js DELETED
@@ -1,33 +0,0 @@
1
- export const highlightChunkInMarkdown = (markdown, chunks, currentChunkIndex) => {
2
- if (!chunks || !chunks[currentChunkIndex] || !markdown) {
3
- return markdown;
4
- }
5
-
6
- const chunk = chunks[currentChunkIndex];
7
- const chunkText = markdown.slice(chunk.start_position, chunk.end_position);
8
-
9
- console.log('Chunk debugging:', {
10
- chunkIndex: currentChunkIndex,
11
- startPos: chunk.start_position,
12
- endPos: chunk.end_position,
13
- chunkTextLength: chunkText.length,
14
- chunkTextPreview: chunkText.substring(0, 50) + '...',
15
- beforeText: markdown.slice(Math.max(0, chunk.start_position - 20), chunk.start_position),
16
- afterText: markdown.slice(chunk.end_position, chunk.end_position + 20)
17
- });
18
-
19
- // Use markdown blockquote which preserves structure while providing visual distinction
20
- const lines = chunkText.split('\n');
21
- const highlightedLines = lines.map(line => {
22
- if (line.trim() === '') return '>'; // Empty blockquote line
23
- return '> ' + line;
24
- });
25
-
26
- const highlightedChunk = '\n\n> **Current Learning Section**\n>\n' +
27
- highlightedLines.join('\n') +
28
- '\n\n';
29
-
30
- return markdown.slice(0, chunk.start_position) +
31
- highlightedChunk +
32
- markdown.slice(chunk.end_position);
33
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/vite.config.js CHANGED
@@ -7,11 +7,7 @@ export default defineConfig({
7
  server: {
8
  proxy: {
9
  '/upload_pdf': 'http://localhost:8000',
10
- '/process_ocr': 'http://localhost:8000',
11
- '/get_image': 'http://localhost:8000',
12
- '/chunk_page': 'http://localhost:8000',
13
- '/start_chunk_lesson': 'http://localhost:8000',
14
  '/api': 'http://localhost:8000'
15
  }
16
  }
17
- })
 
7
  server: {
8
  proxy: {
9
  '/upload_pdf': 'http://localhost:8000',
 
 
 
 
10
  '/api': 'http://localhost:8000'
11
  }
12
  }
13
+ })
test_fuzzy_find.py DELETED
@@ -1,194 +0,0 @@
1
- #%%
2
- import matplotlib.pyplot as plt
3
- from difflib import SequenceMatcher
4
- import numpy as np
5
-
6
- def fuzzy_find(text, pattern, start_pos=0):
7
- """Find the best fuzzy match for pattern in text starting from start_pos"""
8
- best_ratio = 0
9
- best_pos = -1
10
-
11
- # Search in sliding windows
12
- pattern_len = len(pattern)
13
- for i in range(start_pos, len(text) - pattern_len + 1):
14
- window = text[i:i + pattern_len]
15
- ratio = SequenceMatcher(None, pattern.lower(), window.lower()).ratio()
16
-
17
- if ratio > best_ratio and ratio > 0.8: # Much stricter: 80% similarity
18
- best_ratio = ratio
19
- best_pos = i
20
-
21
- return best_pos if best_pos != -1 else None
22
-
23
- def analyze_fuzzy_ratios(markdown_text, chunk_text):
24
- """
25
- Analyze fuzzy matching ratios across the entire markdown text using a rolling window.
26
- Returns positions and their corresponding similarity ratios.
27
- """
28
- chunk_len = len(chunk_text)
29
- positions = []
30
- ratios = []
31
-
32
- # Rolling window over the entire markdown text
33
- for i in range(len(markdown_text) - chunk_len + 1):
34
- window = markdown_text[i:i + chunk_len]
35
- ratio = SequenceMatcher(None, chunk_text.lower(), window.lower()).ratio()
36
- positions.append(i)
37
- ratios.append(ratio)
38
-
39
- return positions, ratios
40
-
41
- def plot_ratio_distribution(positions, ratios, chunk_text, markdown_file_path=None):
42
- """
43
- Create a plot showing the similarity ratio distribution across positions.
44
- """
45
- fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
46
-
47
- # Main plot: ratio vs position
48
- ax1.plot(positions, ratios, 'b-', alpha=0.7, linewidth=1)
49
- ax1.axhline(y=0.8, color='r', linestyle='--', label='Fuzzy find threshold (0.8)')
50
- ax1.set_xlabel('Position in Markdown Text')
51
- ax1.set_ylabel('Similarity Ratio')
52
- ax1.set_title(f'Fuzzy Match Similarity Ratios Across Text\n(Chunk length: {len(chunk_text)} chars)')
53
- ax1.grid(True, alpha=0.3)
54
- ax1.legend()
55
-
56
- # Highlight maximum ratio
57
- max_ratio = max(ratios)
58
- max_pos = positions[ratios.index(max_ratio)]
59
- ax1.plot(max_pos, max_ratio, 'ro', markersize=8, label=f'Max ratio: {max_ratio:.3f} at pos {max_pos}')
60
- ax1.legend()
61
-
62
- # Histogram of ratios
63
- ax2.hist(ratios, bins=50, alpha=0.7, edgecolor='black')
64
- ax2.axvline(x=0.8, color='r', linestyle='--', label='Fuzzy find threshold (0.8)')
65
- ax2.axvline(x=max_ratio, color='g', linestyle='--', label=f'Maximum ratio: {max_ratio:.3f}')
66
- ax2.set_xlabel('Similarity Ratio')
67
- ax2.set_ylabel('Frequency')
68
- ax2.set_title('Distribution of Similarity Ratios')
69
- ax2.legend()
70
- ax2.grid(True, alpha=0.3)
71
-
72
- plt.tight_layout()
73
- return fig, max_ratio, max_pos
74
-
75
- def compare_texts(original_chunk, found_text, max_pos):
76
- """
77
- Compare the original chunk text with the text found by fuzzy_find.
78
- Shows character-by-character differences and similarity analysis.
79
- """
80
- print("\n" + "="*80)
81
- print("TEXT COMPARISON: Original Chunk vs Fuzzy Find Result")
82
- print("="*80)
83
-
84
- print(f"\nOriginal chunk length: {len(original_chunk)} characters")
85
- print(f"Found text length: {len(found_text)} characters")
86
- print(f"Found at position: {max_pos}")
87
-
88
- # Calculate overall similarity
89
- similarity = SequenceMatcher(None, original_chunk.lower(), found_text.lower()).ratio()
90
- print(f"Overall similarity: {similarity:.4f} ({similarity*100:.2f}%)")
91
-
92
- # Show first 200 characters of each
93
- print(f"\nOriginal chunk (first 200 chars):")
94
- print(f"'{original_chunk}{'...' if len(original_chunk) > 200 else ''}'")
95
-
96
- print(f"\nFound text (first 200 chars):")
97
- print(f"'{found_text}{'...' if len(found_text) > 200 else ''}'")
98
-
99
- # Character-by-character analysis for first 100 characters
100
- print(f"\nCharacter-by-character comparison (first 100 chars):")
101
- print("Original: ", end="")
102
- for i, char in enumerate(original_chunk[:100]):
103
- if i < len(found_text) and char.lower() == found_text[i].lower():
104
- print(char, end="") # Same character
105
- else:
106
- print(f"[{char}]", end="") # Different character
107
- print()
108
-
109
- print("Found: ", end="")
110
- for i, char in enumerate(found_text[:100]):
111
- if i < len(original_chunk) and char.lower() == original_chunk[i].lower():
112
- print(char, end="") # Same character
113
- else:
114
- print(f"[{char}]", end="") # Different character
115
- print()
116
-
117
- # Analyze differences
118
- matcher = SequenceMatcher(None, original_chunk, found_text)
119
- differences = []
120
- for tag, i1, i2, j1, j2 in matcher.get_opcodes():
121
- if tag != 'equal':
122
- differences.append({
123
- 'type': tag,
124
- 'original_pos': (i1, i2),
125
- 'found_pos': (j1, j2),
126
- 'original_text': original_chunk[i1:i2],
127
- 'found_text': found_text[j1:j2]
128
- })
129
-
130
- print(f"\nFound {len(differences)} differences:")
131
- for i, diff in enumerate(differences[:10]): # Show first 10 differences
132
- print(f"{i+1}. {diff['type'].upper()} at original[{diff['original_pos'][0]}:{diff['original_pos'][1]}] -> found[{diff['found_pos'][0]}:{diff['found_pos'][1]}]")
133
- if diff['original_text']:
134
- print(f" Original: '{diff['original_text'][:50]}{'...' if len(diff['original_text']) > 50 else ''}'")
135
- if diff['found_text']:
136
- print(f" Found: '{diff['found_text'][:50]}{'...' if len(diff['found_text']) > 50 else ''}'")
137
-
138
- if len(differences) > 10:
139
- print(f" ... and {len(differences) - 10} more differences")
140
-
141
- return similarity, differences
142
-
143
- def run_fuzzy_analysis():
144
- """
145
- Main function to run the fuzzy find analysis.
146
- You can modify the markdown_text and chunk_text variables below.
147
- """
148
-
149
- # TODO: Replace these with your actual markdown content and chunk
150
- markdown_text = """# An improved method for mobile characterisation of $\\delta^{13} \\mathrm{CH}_{4}$ source signatures and its application in Germany \n\nAntje Hoheisel ${ }^{1}$, Christiane Yeman ${ }^{1, a}$, Florian Dinger ${ }^{1,2}$, Henrik Eckhardt ${ }^{1}$, and Martina Schmidt ${ }^{1}$<br>${ }^{1}$ Institute of Environmental Physics, Heidelberg University, Heidelberg, Germany<br>${ }^{2}$ Max Planck Institute for Chemistry, Mainz, Germany<br>${ }^{a}$ now at: Laboratory of Ion Beam Physics, ETH Zurich, Zurich, Switzerland\n\nCorrespondence: Antje Hoheisel (antje.hoheisel@iup.uni-heidelberg.de)\nReceived: 7 August 2018 - Discussion started: 1 October 2018\nRevised: 17 January 2019 - Accepted: 28 January 2019 - Published: 22 February 2019\n\n\n#### Abstract\n\nThe carbon isotopic signature $\\left(\\delta^{13} \\mathrm{CH}_{4}\\right)$ of several methane sources in Germany (around Heidelberg and in North Rhine-Westphalia) were characterised. Mobile measurements of the plume of $\\mathrm{CH}_{4}$ sources are carried out using an analyser based on cavity ring-down spectroscopy (CRDS). To achieve precise results a CRDS analyser, which measures methane $\\left(\\mathrm{CH}_{4}\\right)$, carbon dioxide $\\left(\\mathrm{CO}_{2}\\right)$ and their ${ }^{13} \\mathrm{C}$-to- ${ }^{12} \\mathrm{C}$ ratios, was characterised especially with regard to cross sensitivities of composition differences of the gas matrix in air samples or calibration tanks. The two most important gases which affect $\\delta^{13} \\mathrm{CH}_{4}$ are water vapour $\\left(\\mathrm{H}_{2} \\mathrm{O}\\right)$ and ethane $\\left(\\mathrm{C}_{2} \\mathrm{H}_{6}\\right)$. To avoid the cross sensitivity with $\\mathrm{H}_{2} \\mathrm{O}$, the air is dried with a Nafion dryer during mobile measurements. $\\mathrm{C}_{2} \\mathrm{H}_{6}$ is typically abundant in natural gases and thus in methane plumes or samples originating from natural gas. 
$\\mathrm{A}_{2} \\mathrm{H}_{6}$ correction and calibration are essential to obtain accurate $\\delta^{13} \\mathrm{CH}_{4}$ results, which can deviate by up to $3 \\%$ depending on whether a $\\mathrm{C}_{2} \\mathrm{H}_{6}$ correction is applied.\n\nThe isotopic signature is determined with the Miller-Tans approach and the York fitting method. During 21 field campaigns the mean $\\delta^{13} \\mathrm{CH}_{4}$ signatures of three dairy farms $\\left(-63.9 \\pm 0.9 \\%_{e}\\right)$, a biogas plant $\\left(-62.4 \\pm 1.2 \\%_{e}\\right)$, a landfill $\\left(-58.7 \\pm 3.3 \\%_{e}\\right)$, a wastewater treatment plant $(-52.5 \\pm$ $1.4 \\%$ ), an active deep coal mine ( $-56.0 \\pm 2.3 \\%$ ) and two natural gas storage and gas compressor stations ( $-46.1 \\pm$ $0.8 \\%$ ) were recorded.\n\nIn addition, between December 2016 and November 2018 gas samples from the Heidelberg natural gas distribution network were measured with a mean $\\delta^{13} \\mathrm{CH}_{4}$ value of $-43.3 \\pm$ $0.8 \\%$. Contrary to previous measurements between 1991\n\n\n#### Abstract\n\nand 1996 by Levin et al. (1999), no strong seasonal cycle is shown.\n\n\n## 1 Introduction\n\nMethane $\\left(\\mathrm{CH}_{4}\\right)$ is the second most important anthropogenic greenhouse gas. The atmospheric growth rate of $\\mathrm{CH}_{4}$ has changed significantly during the last decades, stabilising at zero growth from 1999 to 2006 before beginning to increase again after 2007 (Dlugokencky et al., 2009). Several studies have focused on the recent $\\mathrm{CH}_{4}$ growth caused by changes in sources and sinks (Rigby et al., 2017; Turner et al., 2017).\n\nRecent studies by Schaefer et al. (2016), Rice et al. (2016) and Nisbet et al. (2016) have shown how the $\\delta^{13} \\mathrm{CH}_{4}$ measurements can help to understand the changes in global $\\mathrm{CH}_{4}$ increase rates and to assign the related source types. 
The stable carbon isotope ratio $\\left({ }^{13} \\mathrm{C} /{ }^{12} \\mathrm{C}\\right)$ of $\\mathrm{CH}_{4}$ sources varies due to the initial source material and the fractionation during production and release to the atmosphere. The source categories can be classified as pyrogenic (e.g. biomass burning), biogenic (e.g. wetlands and livestock) or thermogenic (e.g. a subcategory of fossil fuel extraction), which show different but also overlapping isotope ratio ranges. Various studies have shown that the assignment of isotopic signatures from different $\\mathrm{CH}_{4}$ sources remains uncertain due to large temporal variabilities and also regional specificities (e.g. Sherwood et al., 2017). This missing knowledge may result in large uncertainties when the $\\mathrm{CH}_{4}$ budget is determined on global or regional scales using isotope-based estimates. In addition to global studies, the use of $\\delta^{13} \\mathrm{CH}_{4}$ was already successfully"""
151
-
152
- chunk_text = """## 1 Introduction\nMethane ($\mathrm{CH}_{4}$) is the second most important anthropogenic greenhouse gas. The atmospheric growth rate of $\mathrm{CH}_{4}$ has changed significantly during the last decades, stabilising at zero growth from 1999 to 2006 before beginning to increase again after 2007 (Dlugokencky et al., 2009). Several studies have focused on the recent $\mathrm{CH}_{4}$ growth caused by changes in sources and sinks (Rigby et al., 2017; Turner et al., 2017).\n\nRecent studies by Schaefer et al. (2016), Rice et al. (2016) and Nisbet et al. (2016) have shown how the $\delta^{13} \mathrm{CH}_{4}$ measurements can help to understand the changes in global $\mathrm{CH}_{4}$ increase rates and to assign the related source types. The stable carbon isotope ratio (${}^{13}\mathrm{C}$/${}^{12}\mathrm{C}$) of $\mathrm{CH}_{4}$ sources varies due to the initial source material and the fractionation during production and release to the atmosphere. The source categories can be classified as pyrogenic (e.g. biomass burning), biogenic (e.g. wetlands and livestock) or thermogenic (e.g. a subcategory of fossil fuel extraction), which show different but also overlapping isotope ratio ranges. Various studies have shown that the assignment of isotopic signatures from different $\mathrm{CH}_{4}$ sources remains uncertain due to large temporal variabilities and also regional specificities (e.g. Sherwood et al., 2017). This missing knowledge may result in large uncertainties when the $\mathrm{CH}_{4}$ budget is determined on global or regional scales using isotope-based estimates. In addition to global studies, the use of $\delta^{13}\mathrm{CH}_{4}$ was already successfully"""
153
-
154
- print("Analyzing fuzzy matching ratios...")
155
- print(f"Markdown text length: {len(markdown_text)} characters")
156
- print(f"Chunk text length: {len(chunk_text)} characters")
157
-
158
- # Run the analysis
159
- positions, ratios = analyze_fuzzy_ratios(markdown_text, chunk_text)
160
-
161
- # Create the plot
162
- fig, max_ratio, max_pos = plot_ratio_distribution(positions, ratios, chunk_text)
163
-
164
- # Print statistics
165
- print(f"\nStatistics:")
166
- print(f"Maximum similarity ratio: {max_ratio:.3f}")
167
- print(f"Maximum ratio position: {max_pos}")
168
- print(f"Number of positions above 0.8 threshold: {sum(1 for r in ratios if r > 0.8)}")
169
- print(f"Mean ratio: {np.mean(ratios):.3f}")
170
- print(f"Standard deviation: {np.std(ratios):.3f}")
171
-
172
- # Test the original fuzzy_find function
173
- result = fuzzy_find(markdown_text, chunk_text)
174
- print(f"\nOriginal fuzzy_find result: {result}")
175
- if result is not None:
176
- print(f"Found match at position {result}")
177
- else:
178
- print("No match found above 0.8 threshold")
179
-
180
- # Compare the found text with the original chunk
181
- if max_ratio > 0: # If we found any match
182
- found_text = markdown_text[max_pos:max_pos + len(chunk_text)]
183
- text_similarity, differences = compare_texts(chunk_text, found_text, max_pos)
184
- print(f"\nDetailed comparison similarity: {text_similarity:.4f}")
185
- print(f"Number of character differences: {len(differences)}")
186
-
187
- plt.show()
188
- return positions, ratios, max_ratio, max_pos
189
-
190
- if __name__ == "__main__":
191
- # Run the analysis
192
- positions, ratios, max_ratio, max_pos = run_fuzzy_analysis()
193
-
194
- #%%