Spaces:
Sleeping
Sleeping
Commit ·
b1e57e4
1
Parent(s): 02158b8
Add chunking and markdown rendering
Browse files- Dockerfile +1 -1
- backend/app.py +102 -99
- backend/requirements.txt +4 -0
- frontend/src/components/DocumentProcessor.jsx +122 -45
- frontend/vite.config.js +9 -0
Dockerfile
CHANGED
|
@@ -8,7 +8,7 @@ COPY frontend/ ./
|
|
| 8 |
RUN npm run build
|
| 9 |
|
| 10 |
# Use Python runtime for backend
|
| 11 |
-
FROM python:3.
|
| 12 |
|
| 13 |
WORKDIR /code
|
| 14 |
|
|
|
|
| 8 |
RUN npm run build
|
| 9 |
|
| 10 |
# Use Python runtime for backend
|
| 11 |
+
FROM python:3.10
|
| 12 |
|
| 13 |
WORKDIR /code
|
| 14 |
|
backend/app.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
| 1 |
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
|
|
|
| 3 |
from mistralai import Mistral
|
| 4 |
import os
|
| 5 |
import tempfile
|
| 6 |
import json
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
from difflib import SequenceMatcher
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Load environment variables
|
| 11 |
load_dotenv()
|
|
@@ -20,9 +25,6 @@ app.add_middleware(
|
|
| 20 |
allow_headers=["*"],
|
| 21 |
)
|
| 22 |
|
| 23 |
-
@app.get("/")
|
| 24 |
-
def hello():
|
| 25 |
-
return {"message": "Backend is running!"}
|
| 26 |
|
| 27 |
@app.get("/api/test")
|
| 28 |
def test():
|
|
@@ -220,6 +222,16 @@ async def get_image_base64(file_id: str, image_id: str):
|
|
| 220 |
print(f"❌ Error getting image: {e}")
|
| 221 |
raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
def fuzzy_find(text, pattern, start_pos=0):
|
| 224 |
"""Find the best fuzzy match for pattern in text starting from start_pos"""
|
| 225 |
best_match = None
|
|
@@ -239,68 +251,57 @@ def fuzzy_find(text, pattern, start_pos=0):
|
|
| 239 |
|
| 240 |
return best_pos if best_pos != -1 else None
|
| 241 |
|
| 242 |
-
async def auto_chunk_page(page_markdown, client):
|
| 243 |
-
"""Auto-chunk a page during OCR processing"""
|
| 244 |
if not page_markdown or len(page_markdown.strip()) < 100:
|
| 245 |
return [] # Skip very short pages
|
| 246 |
|
| 247 |
-
#
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
DOCUMENT PAGE:
|
| 251 |
{page_markdown}
|
| 252 |
|
| 253 |
-
For each chunk you identify, output ONLY a JSON array with this exact format:
|
| 254 |
-
[
|
| 255 |
-
{{
|
| 256 |
-
"topic": "Brief topic name",
|
| 257 |
-
"start_phrase": "First few words of the chunk",
|
| 258 |
-
"end_phrase": "Last few words of the chunk"
|
| 259 |
-
}}
|
| 260 |
-
]
|
| 261 |
-
|
| 262 |
Rules:
|
| 263 |
1. Each chunk should contain 2-3 valuable lessons
|
| 264 |
2. start_phrase and end_phrase should be 5-15 words long
|
| 265 |
3. Focus on educational content (concepts, examples, key points)
|
| 266 |
-
4.
|
| 267 |
-
5.
|
| 268 |
-
6. Ensure the JSON is valid and well-formed
|
| 269 |
-
7. Do not include any explanations or additional text, just the JSON array
|
| 270 |
|
| 271 |
-
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
model="mistral-small-latest",
|
| 277 |
-
messages=[{"role": "user", "content": prompt}],
|
| 278 |
-
temperature=0.3
|
| 279 |
-
)
|
| 280 |
-
|
| 281 |
-
chunk_text = response.choices[0].message.content.strip()
|
| 282 |
-
|
| 283 |
-
# Parse JSON response
|
| 284 |
-
try:
|
| 285 |
-
chunks = json.loads(chunk_text)
|
| 286 |
-
except json.JSONDecodeError:
|
| 287 |
-
# Try to extract JSON from response if wrapped in text
|
| 288 |
-
import re
|
| 289 |
-
json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
|
| 290 |
-
if json_match:
|
| 291 |
-
chunks = json.loads(json_match.group())
|
| 292 |
-
else:
|
| 293 |
-
return []
|
| 294 |
|
| 295 |
# Find positions using fuzzy matching
|
| 296 |
positioned_chunks = []
|
| 297 |
for chunk in chunks:
|
| 298 |
-
start_pos = fuzzy_find(page_markdown, chunk.
|
| 299 |
-
end_pos = fuzzy_find(page_markdown, chunk.
|
| 300 |
|
| 301 |
if start_pos is not None:
|
| 302 |
positioned_chunks.append({
|
| 303 |
-
|
|
|
|
|
|
|
| 304 |
"start_position": start_pos,
|
| 305 |
"end_position": end_pos,
|
| 306 |
"found_start": True,
|
|
@@ -310,26 +311,35 @@ JSON:"""
|
|
| 310 |
return positioned_chunks
|
| 311 |
|
| 312 |
except Exception as e:
|
|
|
|
| 313 |
print(f"❌ Auto-chunking error: {e}")
|
|
|
|
| 314 |
return []
|
| 315 |
|
| 316 |
@app.post("/chunk_page")
|
| 317 |
async def chunk_page(request: dict):
|
| 318 |
-
"""Analyze a page and suggest chunks for lessons"""
|
| 319 |
print(f"🧠 Chunking page...")
|
| 320 |
|
| 321 |
page_markdown = request.get("markdown", "")
|
| 322 |
if not page_markdown:
|
| 323 |
raise HTTPException(status_code=400, detail="No markdown provided")
|
| 324 |
|
| 325 |
-
# Get
|
| 326 |
-
|
| 327 |
-
if not
|
| 328 |
-
raise HTTPException(status_code=500, detail="
|
| 329 |
|
| 330 |
try:
|
| 331 |
-
# Initialize
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
# Create chunking prompt
|
| 335 |
prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.
|
|
@@ -337,69 +347,40 @@ async def chunk_page(request: dict):
|
|
| 337 |
DOCUMENT PAGE:
|
| 338 |
{page_markdown}
|
| 339 |
|
| 340 |
-
For each chunk you identify, output ONLY a JSON array with this exact format:
|
| 341 |
-
[
|
| 342 |
-
{{
|
| 343 |
-
"topic": "Brief topic name",
|
| 344 |
-
"start_phrase": "First few words of the chunk",
|
| 345 |
-
"end_phrase": "Last few words of the chunk",
|
| 346 |
-
}}
|
| 347 |
-
]
|
| 348 |
-
|
| 349 |
Rules:
|
| 350 |
-
1. Each chunk should contain 2-3 valuable lessons
|
| 351 |
2. start_phrase and end_phrase should be 5-15 words long
|
| 352 |
3. Focus on educational content (concepts, examples, key points)
|
| 353 |
-
4.
|
| 354 |
-
5.
|
| 355 |
-
6. Ensure the JSON is valid and well-formed
|
| 356 |
-
7. Do not include any explanations or additional text, just the JSON array
|
| 357 |
|
| 358 |
-
|
| 359 |
|
| 360 |
-
# Call
|
| 361 |
-
print("🚀 Calling
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
"role": "user",
|
| 366 |
-
"content": prompt
|
| 367 |
-
}],
|
| 368 |
-
temperature=0.3 # Lower temperature for more consistent output
|
| 369 |
-
)
|
| 370 |
-
|
| 371 |
-
chunk_text = response.choices[0].message.content.strip()
|
| 372 |
-
print(f"📝 LLM Response: {chunk_text[:200]}...")
|
| 373 |
-
|
| 374 |
-
# Parse JSON response
|
| 375 |
-
try:
|
| 376 |
-
chunks = json.loads(chunk_text)
|
| 377 |
-
except json.JSONDecodeError:
|
| 378 |
-
# Try to extract JSON from response if wrapped in text
|
| 379 |
-
import re
|
| 380 |
-
json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
|
| 381 |
-
if json_match:
|
| 382 |
-
chunks = json.loads(json_match.group())
|
| 383 |
-
else:
|
| 384 |
-
raise ValueError("Could not parse JSON from LLM response")
|
| 385 |
|
| 386 |
# Find positions using fuzzy matching
|
| 387 |
positioned_chunks = []
|
| 388 |
for chunk in chunks:
|
| 389 |
-
start_pos = fuzzy_find(page_markdown, chunk.
|
| 390 |
-
end_pos = fuzzy_find(page_markdown, chunk.
|
| 391 |
|
| 392 |
if start_pos is not None:
|
| 393 |
positioned_chunks.append({
|
| 394 |
-
|
|
|
|
|
|
|
| 395 |
"start_position": start_pos,
|
| 396 |
"end_position": end_pos,
|
| 397 |
"found_start": True,
|
| 398 |
"found_end": end_pos is not None
|
| 399 |
})
|
| 400 |
-
print(f"✅ Found chunk: {chunk.
|
| 401 |
else:
|
| 402 |
-
print(f"❌ Could not find chunk: {chunk.
|
| 403 |
|
| 404 |
print(f"📊 Successfully positioned {len(positioned_chunks)}/{len(chunks)} chunks")
|
| 405 |
|
|
@@ -410,5 +391,27 @@ JSON:"""
|
|
| 410 |
}
|
| 411 |
|
| 412 |
except Exception as e:
|
|
|
|
| 413 |
print(f"❌ Error chunking page: {e}")
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from fastapi.staticfiles import StaticFiles
|
| 4 |
+
from fastapi.responses import FileResponse
|
| 5 |
from mistralai import Mistral
|
| 6 |
import os
|
| 7 |
import tempfile
|
| 8 |
import json
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
from difflib import SequenceMatcher
|
| 11 |
+
from pydantic import BaseModel, Field
|
| 12 |
+
from typing import Optional, List
|
| 13 |
+
from langchain.chat_models import init_chat_model
|
| 14 |
|
| 15 |
# Load environment variables
|
| 16 |
load_dotenv()
|
|
|
|
| 25 |
allow_headers=["*"],
|
| 26 |
)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
@app.get("/api/test")
|
| 30 |
def test():
|
|
|
|
| 222 |
print(f"❌ Error getting image: {e}")
|
| 223 |
raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
|
| 224 |
|
| 225 |
+
class ChunkSchema(BaseModel):
|
| 226 |
+
"""Schema for document chunks suitable for creating interactive lessons."""
|
| 227 |
+
topic: str = Field(description="Brief topic name for the chunk")
|
| 228 |
+
start_phrase: str = Field(description="First few words of the chunk (5-15 words)")
|
| 229 |
+
end_phrase: str = Field(description="Last few words of the chunk (5-15 words)")
|
| 230 |
+
|
| 231 |
+
class ChunkList(BaseModel):
|
| 232 |
+
"""Container for a list of document chunks."""
|
| 233 |
+
chunks: List[ChunkSchema] = Field(description="List of identified chunks for interactive lessons")
|
| 234 |
+
|
| 235 |
def fuzzy_find(text, pattern, start_pos=0):
|
| 236 |
"""Find the best fuzzy match for pattern in text starting from start_pos"""
|
| 237 |
best_match = None
|
|
|
|
| 251 |
|
| 252 |
return best_pos if best_pos != -1 else None
|
| 253 |
|
| 254 |
+
async def auto_chunk_page(page_markdown, client=None):
|
| 255 |
+
"""Auto-chunk a page during OCR processing using Fireworks AI with structured output"""
|
| 256 |
if not page_markdown or len(page_markdown.strip()) < 100:
|
| 257 |
return [] # Skip very short pages
|
| 258 |
|
| 259 |
+
# Get Fireworks API key
|
| 260 |
+
fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
|
| 261 |
+
if not fireworks_api_key:
|
| 262 |
+
print("⚠️ No Fireworks API key found, falling back to regular chunking")
|
| 263 |
+
return []
|
| 264 |
+
|
| 265 |
+
try:
|
| 266 |
+
# Initialize Fireworks LLM with structured output
|
| 267 |
+
llm = init_chat_model(
|
| 268 |
+
"accounts/fireworks/models/llama4-maverick-instruct-basic",
|
| 269 |
+
model_provider="fireworks",
|
| 270 |
+
api_key=fireworks_api_key
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# Create structured LLM that returns ChunkList object
|
| 274 |
+
structured_llm = llm.with_structured_output(ChunkList)
|
| 275 |
+
|
| 276 |
+
# Create chunking prompt
|
| 277 |
+
prompt = f"""Imagine you are a teacher. You are given an individual page, and you have to decide how to dissect this page. Your task is to identify chunks of content by providing start and end phrases that can be used to create interactive lessons. Here's the page:
|
| 278 |
DOCUMENT PAGE:
|
| 279 |
{page_markdown}
|
| 280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
Rules:
|
| 282 |
1. Each chunk should contain 2-3 valuable lessons
|
| 283 |
2. start_phrase and end_phrase should be 5-15 words long
|
| 284 |
3. Focus on educational content (concepts, examples, key points)
|
| 285 |
+
4. More dense content should have more chunks, less dense content fewer chunks
|
| 286 |
+
5. Identify chunks that would make good interactive lessons
|
|
|
|
|
|
|
| 287 |
|
| 288 |
+
Return a list of chunks with topic, start_phrase, and end_phrase for each."""
|
| 289 |
|
| 290 |
+
# Call Fireworks with structured output
|
| 291 |
+
chunk_response = structured_llm.invoke(prompt)
|
| 292 |
+
chunks = chunk_response.chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
# Find positions using fuzzy matching
|
| 295 |
positioned_chunks = []
|
| 296 |
for chunk in chunks:
|
| 297 |
+
start_pos = fuzzy_find(page_markdown, chunk.start_phrase)
|
| 298 |
+
end_pos = fuzzy_find(page_markdown, chunk.end_phrase, start_pos or 0)
|
| 299 |
|
| 300 |
if start_pos is not None:
|
| 301 |
positioned_chunks.append({
|
| 302 |
+
"topic": chunk.topic,
|
| 303 |
+
"start_phrase": chunk.start_phrase,
|
| 304 |
+
"end_phrase": chunk.end_phrase,
|
| 305 |
"start_position": start_pos,
|
| 306 |
"end_position": end_pos,
|
| 307 |
"found_start": True,
|
|
|
|
| 311 |
return positioned_chunks
|
| 312 |
|
| 313 |
except Exception as e:
|
| 314 |
+
import traceback
|
| 315 |
print(f"❌ Auto-chunking error: {e}")
|
| 316 |
+
print(f"❌ Full traceback: {traceback.format_exc()}")
|
| 317 |
return []
|
| 318 |
|
| 319 |
@app.post("/chunk_page")
|
| 320 |
async def chunk_page(request: dict):
|
| 321 |
+
"""Analyze a page and suggest chunks for lessons using Fireworks AI with structured output"""
|
| 322 |
print(f"🧠 Chunking page...")
|
| 323 |
|
| 324 |
page_markdown = request.get("markdown", "")
|
| 325 |
if not page_markdown:
|
| 326 |
raise HTTPException(status_code=400, detail="No markdown provided")
|
| 327 |
|
| 328 |
+
# Get Fireworks API key
|
| 329 |
+
fireworks_api_key = os.environ.get("FIREWORKS_API_KEY")
|
| 330 |
+
if not fireworks_api_key:
|
| 331 |
+
raise HTTPException(status_code=500, detail="FIREWORKS_API_KEY not set")
|
| 332 |
|
| 333 |
try:
|
| 334 |
+
# Initialize Fireworks LLM with structured output
|
| 335 |
+
llm = init_chat_model(
|
| 336 |
+
"accounts/fireworks/models/llama4-maverick-instruct-basic",
|
| 337 |
+
model_provider="fireworks",
|
| 338 |
+
api_key=fireworks_api_key
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
# Create structured LLM that returns ChunkList object
|
| 342 |
+
structured_llm = llm.with_structured_output(ChunkList)
|
| 343 |
|
| 344 |
# Create chunking prompt
|
| 345 |
prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.
|
|
|
|
| 347 |
DOCUMENT PAGE:
|
| 348 |
{page_markdown}
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
Rules:
|
| 351 |
+
1. Each chunk should contain 2-3 valuable lessons
|
| 352 |
2. start_phrase and end_phrase should be 5-15 words long
|
| 353 |
3. Focus on educational content (concepts, examples, key points)
|
| 354 |
+
4. More dense content should have more chunks, less dense content fewer chunks
|
| 355 |
+
5. Identify chunks that would make good interactive lessons
|
|
|
|
|
|
|
| 356 |
|
| 357 |
+
Return a list of chunks with topic, start_phrase, and end_phrase for each."""
|
| 358 |
|
| 359 |
+
# Call Fireworks with structured output
|
| 360 |
+
print("🚀 Calling Fireworks for chunking...")
|
| 361 |
+
chunk_response = structured_llm.invoke(prompt)
|
| 362 |
+
chunks = chunk_response.chunks
|
| 363 |
+
print(f"📝 Received {len(chunks)} chunks from Fireworks")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
# Find positions using fuzzy matching
|
| 366 |
positioned_chunks = []
|
| 367 |
for chunk in chunks:
|
| 368 |
+
start_pos = fuzzy_find(page_markdown, chunk.start_phrase)
|
| 369 |
+
end_pos = fuzzy_find(page_markdown, chunk.end_phrase, start_pos or 0)
|
| 370 |
|
| 371 |
if start_pos is not None:
|
| 372 |
positioned_chunks.append({
|
| 373 |
+
"topic": chunk.topic,
|
| 374 |
+
"start_phrase": chunk.start_phrase,
|
| 375 |
+
"end_phrase": chunk.end_phrase,
|
| 376 |
"start_position": start_pos,
|
| 377 |
"end_position": end_pos,
|
| 378 |
"found_start": True,
|
| 379 |
"found_end": end_pos is not None
|
| 380 |
})
|
| 381 |
+
print(f"✅ Found chunk: {chunk.topic} at position {start_pos}")
|
| 382 |
else:
|
| 383 |
+
print(f"❌ Could not find chunk: {chunk.topic}")
|
| 384 |
|
| 385 |
print(f"📊 Successfully positioned {len(positioned_chunks)}/{len(chunks)} chunks")
|
| 386 |
|
|
|
|
| 391 |
}
|
| 392 |
|
| 393 |
except Exception as e:
|
| 394 |
+
import traceback
|
| 395 |
print(f"❌ Error chunking page: {e}")
|
| 396 |
+
print(f"❌ Full traceback: {traceback.format_exc()}")
|
| 397 |
+
raise HTTPException(status_code=500, detail=f"Error chunking page: {str(e)}")
|
| 398 |
+
|
| 399 |
+
# Mount static files for production deployment
|
| 400 |
+
frontend_path = os.path.join(os.path.dirname(__file__), "..", "frontend")
|
| 401 |
+
assets_path = os.path.join(frontend_path, "assets")
|
| 402 |
+
|
| 403 |
+
if os.path.exists(frontend_path):
|
| 404 |
+
# Only mount assets if the directory exists (production build)
|
| 405 |
+
if os.path.exists(assets_path):
|
| 406 |
+
app.mount("/assets", StaticFiles(directory=assets_path), name="assets")
|
| 407 |
+
|
| 408 |
+
@app.get("/")
|
| 409 |
+
async def serve_frontend():
|
| 410 |
+
index_path = os.path.join(frontend_path, "index.html")
|
| 411 |
+
if os.path.exists(index_path):
|
| 412 |
+
return FileResponse(index_path)
|
| 413 |
+
return {"message": "Backend is running - frontend not found"}
|
| 414 |
+
else:
|
| 415 |
+
@app.get("/")
|
| 416 |
+
def hello():
|
| 417 |
+
return {"message": "Backend is running!"}
|
backend/requirements.txt
CHANGED
|
@@ -3,3 +3,7 @@ fastapi==0.115.7
|
|
| 3 |
python-multipart>=0.0.5
|
| 4 |
mistralai
|
| 5 |
python-dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
python-multipart>=0.0.5
|
| 4 |
mistralai
|
| 5 |
python-dotenv
|
| 6 |
+
fireworks-ai
|
| 7 |
+
langchain-core
|
| 8 |
+
langchain-fireworks
|
| 9 |
+
pydantic
|
frontend/src/components/DocumentProcessor.jsx
CHANGED
|
@@ -12,6 +12,7 @@ function DocumentProcessor() {
|
|
| 12 |
const [ocrProgress, setOcrProgress] = useState(0);
|
| 13 |
const [documentData, setDocumentData] = useState(null);
|
| 14 |
const [imageCache, setImageCache] = useState({});
|
|
|
|
| 15 |
|
| 16 |
const handleFileChange = (e) => {
|
| 17 |
setSelectedFile(e.target.files[0]);
|
|
@@ -27,7 +28,7 @@ function DocumentProcessor() {
|
|
| 27 |
}
|
| 28 |
|
| 29 |
try {
|
| 30 |
-
const response = await fetch(`
|
| 31 |
if (response.ok) {
|
| 32 |
const data = await response.json();
|
| 33 |
const imageData = data.image_base64;
|
|
@@ -46,6 +47,56 @@ function DocumentProcessor() {
|
|
| 46 |
return null;
|
| 47 |
};
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
const ImageComponent = ({ src, alt }) => {
|
| 50 |
const [imageSrc, setImageSrc] = useState(null);
|
| 51 |
const [loading, setLoading] = useState(true);
|
|
@@ -64,50 +115,52 @@ function DocumentProcessor() {
|
|
| 64 |
|
| 65 |
if (loading) {
|
| 66 |
return (
|
| 67 |
-
<
|
|
|
|
| 68 |
width: '100%',
|
| 69 |
height: '200px',
|
| 70 |
backgroundColor: '#f3f4f6',
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
justifyContent: 'center',
|
| 74 |
margin: '1rem 0',
|
| 75 |
-
borderRadius: '0.5rem'
|
|
|
|
| 76 |
}}>
|
| 77 |
-
|
| 78 |
-
</
|
| 79 |
);
|
| 80 |
}
|
| 81 |
|
| 82 |
if (!imageSrc) {
|
| 83 |
return (
|
| 84 |
-
<
|
|
|
|
| 85 |
width: '100%',
|
| 86 |
height: '200px',
|
| 87 |
backgroundColor: '#fef2f2',
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
justifyContent: 'center',
|
| 91 |
margin: '1rem 0',
|
| 92 |
borderRadius: '0.5rem',
|
| 93 |
-
border: '1px solid #fecaca'
|
|
|
|
| 94 |
}}>
|
| 95 |
-
|
| 96 |
-
</
|
| 97 |
);
|
| 98 |
}
|
| 99 |
|
| 100 |
return (
|
| 101 |
-
<
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
);
|
| 112 |
};
|
| 113 |
|
|
@@ -124,7 +177,7 @@ function DocumentProcessor() {
|
|
| 124 |
formData.append('file', selectedFile);
|
| 125 |
|
| 126 |
setUploadProgress(30);
|
| 127 |
-
const uploadResponse = await fetch('
|
| 128 |
method: 'POST',
|
| 129 |
body: formData,
|
| 130 |
});
|
|
@@ -141,7 +194,7 @@ function DocumentProcessor() {
|
|
| 141 |
await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for UX
|
| 142 |
|
| 143 |
setOcrProgress(60);
|
| 144 |
-
const ocrResponse = await fetch(`
|
| 145 |
|
| 146 |
if (!ocrResponse.ok) {
|
| 147 |
throw new Error('Failed to process OCR');
|
|
@@ -155,12 +208,31 @@ function DocumentProcessor() {
|
|
| 155 |
.map(page => page.markdown)
|
| 156 |
.join('\n\n---\n\n');
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
setDocumentData({
|
| 159 |
fileId: uploadData.file_id,
|
| 160 |
filename: uploadData.filename,
|
| 161 |
markdown: combinedMarkdown,
|
| 162 |
pages: ocrData.pages,
|
| 163 |
-
totalPages: ocrData.total_pages
|
|
|
|
| 164 |
});
|
| 165 |
|
| 166 |
} catch (error) {
|
|
@@ -276,33 +348,33 @@ function DocumentProcessor() {
|
|
| 276 |
}
|
| 277 |
|
| 278 |
return (
|
| 279 |
-
<div className="min-h-screen bg-gray-50">
|
| 280 |
-
{/* Document
|
| 281 |
-
<div className="
|
| 282 |
-
<div className="bg-white rounded-lg shadow-sm p-
|
| 283 |
-
<div className="prose prose-
|
| 284 |
<ReactMarkdown
|
| 285 |
remarkPlugins={[remarkMath]}
|
| 286 |
rehypePlugins={[rehypeKatex]}
|
| 287 |
components={{
|
| 288 |
-
h1: ({ children }) => <h1 style={{ fontSize: '
|
| 289 |
-
h2: ({ children }) => <h2 style={{ fontSize: '1.
|
| 290 |
-
h3: ({ children }) => <h3 style={{ fontSize: '1.
|
| 291 |
-
p: ({ children }) => <p style={{ marginBottom: '
|
| 292 |
-
hr: () => <hr style={{ margin: '
|
| 293 |
-
ul: ({ children }) => <ul style={{ marginBottom: '
|
| 294 |
-
ol: ({ children }) => <ol style={{ marginBottom: '
|
| 295 |
-
li: ({ children }) => <li style={{ marginBottom: '0.
|
| 296 |
blockquote: ({ children }) => (
|
| 297 |
-
<blockquote style={{ borderLeft: '
|
| 298 |
{children}
|
| 299 |
</blockquote>
|
| 300 |
),
|
| 301 |
code: ({ inline, children }) =>
|
| 302 |
inline ?
|
| 303 |
-
<code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.
|
| 304 |
-
<pre style={{ backgroundColor: '#f3f4f6', padding: '
|
| 305 |
-
<code style={{ fontSize: '0.
|
| 306 |
</pre>,
|
| 307 |
img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
|
| 308 |
}}
|
|
@@ -312,6 +384,11 @@ function DocumentProcessor() {
|
|
| 312 |
</div>
|
| 313 |
</div>
|
| 314 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
</div>
|
| 316 |
);
|
| 317 |
}
|
|
|
|
| 12 |
const [ocrProgress, setOcrProgress] = useState(0);
|
| 13 |
const [documentData, setDocumentData] = useState(null);
|
| 14 |
const [imageCache, setImageCache] = useState({});
|
| 15 |
+
const [selectedChunk, setSelectedChunk] = useState(0);
|
| 16 |
|
| 17 |
const handleFileChange = (e) => {
|
| 18 |
setSelectedFile(e.target.files[0]);
|
|
|
|
| 28 |
}
|
| 29 |
|
| 30 |
try {
|
| 31 |
+
const response = await fetch(`/get_image/${fileId}/${imageId}`);
|
| 32 |
if (response.ok) {
|
| 33 |
const data = await response.json();
|
| 34 |
const imageData = data.image_base64;
|
|
|
|
| 47 |
return null;
|
| 48 |
};
|
| 49 |
|
| 50 |
+
// Component for the chunk panel
|
| 51 |
+
const ChunkPanel = ({ chunks }) => {
|
| 52 |
+
if (!chunks || chunks.length === 0) {
|
| 53 |
+
return (
|
| 54 |
+
<div className="p-6 text-center text-gray-500">
|
| 55 |
+
No interactive chunks found
|
| 56 |
+
</div>
|
| 57 |
+
);
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
return (
|
| 61 |
+
<div className="p-4 space-y-4 overflow-y-auto max-h-screen">
|
| 62 |
+
<h3 className="text-lg font-semibold text-gray-800 mb-4 sticky top-0 bg-white pb-2">
|
| 63 |
+
Interactive Chunks ({chunks.length})
|
| 64 |
+
</h3>
|
| 65 |
+
|
| 66 |
+
{chunks.map((chunk, index) => (
|
| 67 |
+
<div
|
| 68 |
+
key={index}
|
| 69 |
+
onClick={() => setSelectedChunk(index)}
|
| 70 |
+
className={`p-4 rounded-lg border-2 cursor-pointer transition-all duration-200 ${
|
| 71 |
+
selectedChunk === index
|
| 72 |
+
? 'border-blue-500 bg-blue-50 shadow-md'
|
| 73 |
+
: 'border-gray-200 bg-white hover:border-gray-300 hover:shadow-sm'
|
| 74 |
+
}`}
|
| 75 |
+
>
|
| 76 |
+
<div className="flex items-start justify-between mb-2">
|
| 77 |
+
<h4 className="font-medium text-gray-900">
|
| 78 |
+
📚 {chunk.topic}
|
| 79 |
+
</h4>
|
| 80 |
+
<span className="text-xs text-gray-500 bg-gray-100 px-2 py-1 rounded">
|
| 81 |
+
{index + 1}/{chunks.length}
|
| 82 |
+
</span>
|
| 83 |
+
</div>
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
{chunk.start_position !== null && chunk.start_position !== undefined &&
|
| 87 |
+
chunk.end_position !== null && chunk.end_position !== undefined && (
|
| 88 |
+
<div className="mt-3 p-3 bg-gray-50 rounded text-sm text-gray-700 italic">
|
| 89 |
+
{documentData.markdown.slice(chunk.start_position, chunk.end_position).substring(0, 150)}
|
| 90 |
+
{documentData.markdown.slice(chunk.start_position, chunk.end_position).length > 150 ? '...' : ''}
|
| 91 |
+
</div>
|
| 92 |
+
)}
|
| 93 |
+
</div>
|
| 94 |
+
))}
|
| 95 |
+
</div>
|
| 96 |
+
);
|
| 97 |
+
};
|
| 98 |
+
|
| 99 |
+
|
| 100 |
const ImageComponent = ({ src, alt }) => {
|
| 101 |
const [imageSrc, setImageSrc] = useState(null);
|
| 102 |
const [loading, setLoading] = useState(true);
|
|
|
|
| 115 |
|
| 116 |
if (loading) {
|
| 117 |
return (
|
| 118 |
+
<span style={{
|
| 119 |
+
display: 'inline-block',
|
| 120 |
width: '100%',
|
| 121 |
height: '200px',
|
| 122 |
backgroundColor: '#f3f4f6',
|
| 123 |
+
textAlign: 'center',
|
| 124 |
+
lineHeight: '200px',
|
|
|
|
| 125 |
margin: '1rem 0',
|
| 126 |
+
borderRadius: '0.5rem',
|
| 127 |
+
color: '#6b7280'
|
| 128 |
}}>
|
| 129 |
+
Loading image...
|
| 130 |
+
</span>
|
| 131 |
);
|
| 132 |
}
|
| 133 |
|
| 134 |
if (!imageSrc) {
|
| 135 |
return (
|
| 136 |
+
<span style={{
|
| 137 |
+
display: 'inline-block',
|
| 138 |
width: '100%',
|
| 139 |
height: '200px',
|
| 140 |
backgroundColor: '#fef2f2',
|
| 141 |
+
textAlign: 'center',
|
| 142 |
+
lineHeight: '200px',
|
|
|
|
| 143 |
margin: '1rem 0',
|
| 144 |
borderRadius: '0.5rem',
|
| 145 |
+
border: '1px solid #fecaca',
|
| 146 |
+
color: '#dc2626'
|
| 147 |
}}>
|
| 148 |
+
Image not found: {alt || src}
|
| 149 |
+
</span>
|
| 150 |
);
|
| 151 |
}
|
| 152 |
|
| 153 |
return (
|
| 154 |
+
<img
|
| 155 |
+
src={imageSrc}
|
| 156 |
+
alt={alt || 'Document image'}
|
| 157 |
+
style={{
|
| 158 |
+
display: 'block',
|
| 159 |
+
maxWidth: '100%',
|
| 160 |
+
height: 'auto',
|
| 161 |
+
margin: '1.5rem auto'
|
| 162 |
+
}}
|
| 163 |
+
/>
|
| 164 |
);
|
| 165 |
};
|
| 166 |
|
|
|
|
| 177 |
formData.append('file', selectedFile);
|
| 178 |
|
| 179 |
setUploadProgress(30);
|
| 180 |
+
const uploadResponse = await fetch('/upload_pdf', {
|
| 181 |
method: 'POST',
|
| 182 |
body: formData,
|
| 183 |
});
|
|
|
|
| 194 |
await new Promise(resolve => setTimeout(resolve, 500)); // Small delay for UX
|
| 195 |
|
| 196 |
setOcrProgress(60);
|
| 197 |
+
const ocrResponse = await fetch(`/process_ocr/${uploadData.file_id}`);
|
| 198 |
|
| 199 |
if (!ocrResponse.ok) {
|
| 200 |
throw new Error('Failed to process OCR');
|
|
|
|
| 208 |
.map(page => page.markdown)
|
| 209 |
.join('\n\n---\n\n');
|
| 210 |
|
| 211 |
+
// Collect all chunks from all pages
|
| 212 |
+
const allChunks = [];
|
| 213 |
+
let markdownOffset = 0;
|
| 214 |
+
|
| 215 |
+
ocrData.pages.forEach((page, pageIndex) => {
|
| 216 |
+
if (page.chunks && page.chunks.length > 0) {
|
| 217 |
+
page.chunks.forEach(chunk => {
|
| 218 |
+
allChunks.push({
|
| 219 |
+
...chunk,
|
| 220 |
+
start_position: chunk.start_position + markdownOffset,
|
| 221 |
+
end_position: chunk.end_position + markdownOffset,
|
| 222 |
+
pageIndex: pageIndex
|
| 223 |
+
});
|
| 224 |
+
});
|
| 225 |
+
}
|
| 226 |
+
markdownOffset += page.markdown.length + 6; // +6 for the separator "\n\n---\n\n"
|
| 227 |
+
});
|
| 228 |
+
|
| 229 |
setDocumentData({
|
| 230 |
fileId: uploadData.file_id,
|
| 231 |
filename: uploadData.filename,
|
| 232 |
markdown: combinedMarkdown,
|
| 233 |
pages: ocrData.pages,
|
| 234 |
+
totalPages: ocrData.total_pages,
|
| 235 |
+
chunks: allChunks
|
| 236 |
});
|
| 237 |
|
| 238 |
} catch (error) {
|
|
|
|
| 348 |
}
|
| 349 |
|
| 350 |
return (
|
| 351 |
+
<div className="min-h-screen bg-gray-50 flex">
|
| 352 |
+
{/* Left Panel - Document (66%) */}
|
| 353 |
+
<div className="w-2/3 p-6">
|
| 354 |
+
<div className="bg-white rounded-lg shadow-sm p-6 h-full">
|
| 355 |
+
<div className="prose prose-sm max-w-none overflow-y-auto" style={{ maxHeight: 'calc(100vh - 60px)' }}>
|
| 356 |
<ReactMarkdown
|
| 357 |
remarkPlugins={[remarkMath]}
|
| 358 |
rehypePlugins={[rehypeKatex]}
|
| 359 |
components={{
|
| 360 |
+
h1: ({ children }) => <h1 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', color: '#1a202c' }}>{children}</h1>,
|
| 361 |
+
h2: ({ children }) => <h2 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h2>,
|
| 362 |
+
h3: ({ children }) => <h3 style={{ fontSize: '1.125rem', fontWeight: 'bold', marginBottom: '0.5rem', marginTop: '1rem', color: '#1a202c' }}>{children}</h3>,
|
| 363 |
+
p: ({ children }) => <p style={{ marginBottom: '0.75rem', color: '#374151', lineHeight: '1.5', fontSize: '0.875rem' }}>{children}</p>,
|
| 364 |
+
hr: () => <hr style={{ margin: '1.5rem 0', borderColor: '#d1d5db' }} />,
|
| 365 |
+
ul: ({ children }) => <ul style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'disc', fontSize: '0.875rem' }}>{children}</ul>,
|
| 366 |
+
ol: ({ children }) => <ol style={{ marginBottom: '0.75rem', marginLeft: '1.25rem', listStyleType: 'decimal', fontSize: '0.875rem' }}>{children}</ol>,
|
| 367 |
+
li: ({ children }) => <li style={{ marginBottom: '0.125rem', color: '#374151' }}>{children}</li>,
|
| 368 |
blockquote: ({ children }) => (
|
| 369 |
+
<blockquote style={{ borderLeft: '3px solid #3b82f6', paddingLeft: '0.75rem', fontStyle: 'italic', margin: '0.75rem 0', color: '#6b7280', fontSize: '0.875rem' }}>
|
| 370 |
{children}
|
| 371 |
</blockquote>
|
| 372 |
),
|
| 373 |
code: ({ inline, children }) =>
|
| 374 |
inline ?
|
| 375 |
+
<code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code> :
|
| 376 |
+
<pre style={{ backgroundColor: '#f3f4f6', padding: '0.75rem', borderRadius: '0.375rem', overflowX: 'auto', margin: '0.75rem 0' }}>
|
| 377 |
+
<code style={{ fontSize: '0.75rem', fontFamily: 'monospace' }}>{children}</code>
|
| 378 |
</pre>,
|
| 379 |
img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
|
| 380 |
}}
|
|
|
|
| 384 |
</div>
|
| 385 |
</div>
|
| 386 |
</div>
|
| 387 |
+
|
| 388 |
+
{/* Right Panel - Chunks (33%) */}
|
| 389 |
+
<div className="w-1/3 border-l border-gray-200 bg-white">
|
| 390 |
+
<ChunkPanel chunks={documentData.chunks} />
|
| 391 |
+
</div>
|
| 392 |
</div>
|
| 393 |
);
|
| 394 |
}
|
frontend/vite.config.js
CHANGED
|
@@ -4,4 +4,13 @@ import react from '@vitejs/plugin-react'
|
|
| 4 |
// https://vite.dev/config/
|
| 5 |
export default defineConfig({
|
| 6 |
plugins: [react()],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
})
|
|
|
|
| 4 |
// https://vite.dev/config/
|
| 5 |
export default defineConfig({
|
| 6 |
plugins: [react()],
|
| 7 |
+
server: {
|
| 8 |
+
proxy: {
|
| 9 |
+
'/upload_pdf': 'http://localhost:8000',
|
| 10 |
+
'/process_ocr': 'http://localhost:8000',
|
| 11 |
+
'/get_image': 'http://localhost:8000',
|
| 12 |
+
'/chunk_page': 'http://localhost:8000',
|
| 13 |
+
'/api': 'http://localhost:8000'
|
| 14 |
+
}
|
| 15 |
+
}
|
| 16 |
})
|