Alleinzellgaenger committed on
Commit
02158b8
·
1 Parent(s): c075953

Implement chunking and loading animation stuff

Browse files
backend/app.py CHANGED
@@ -3,7 +3,9 @@ from fastapi.middleware.cors import CORSMiddleware
3
  from mistralai import Mistral
4
  import os
5
  import tempfile
 
6
  from dotenv import load_dotenv
 
7
 
8
  # Load environment variables
9
  load_dotenv()
@@ -144,6 +146,18 @@ async def process_ocr_content(file_id: str):
144
  }
145
  page_data["images"].append(image_data)
146
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  processed_pages.append(page_data)
148
 
149
  print(f"📝 Total processed pages: {len(processed_pages)}")
@@ -204,4 +218,197 @@ async def get_image_base64(file_id: str, image_id: str):
204
 
205
  except Exception as e:
206
  print(f"❌ Error getting image: {e}")
207
- raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from mistralai import Mistral
4
  import os
5
  import tempfile
6
+ import json
7
  from dotenv import load_dotenv
8
+ from difflib import SequenceMatcher
9
 
10
  # Load environment variables
11
  load_dotenv()
 
146
  }
147
  page_data["images"].append(image_data)
148
 
149
+ # Auto-chunk this page
150
+ try:
151
+ print(f"🧠 Auto-chunking page {page_idx + 1}...")
152
+ chunks = await auto_chunk_page(page.markdown, client)
153
+ page_data["chunks"] = chunks
154
+ print(f"📊 Page {page_idx + 1} chunks found: {len(chunks)}")
155
+ for i, chunk in enumerate(chunks):
156
+ print(f" {i+1}. {chunk.get('topic', 'Unknown')}: {chunk.get('start_phrase', '')[:50]}...")
157
+ except Exception as chunk_error:
158
+ print(f"⚠️ Chunking failed for page {page_idx + 1}: {chunk_error}")
159
+ page_data["chunks"] = []
160
+
161
  processed_pages.append(page_data)
162
 
163
  print(f"📝 Total processed pages: {len(processed_pages)}")
 
218
 
219
  except Exception as e:
220
  print(f"❌ Error getting image: {e}")
221
+ raise HTTPException(status_code=500, detail=f"Error getting image: {str(e)}")
222
+
223
+ def fuzzy_find(text, pattern, start_pos=0):
224
+ """Find the best fuzzy match for pattern in text starting from start_pos"""
225
+ best_match = None
226
+ best_ratio = 0
227
+ best_pos = -1
228
+
229
+ # Search in sliding windows
230
+ pattern_len = len(pattern)
231
+ for i in range(start_pos, len(text) - pattern_len + 1):
232
+ window = text[i:i + pattern_len]
233
+ ratio = SequenceMatcher(None, pattern.lower(), window.lower()).ratio()
234
+
235
+ if ratio > best_ratio and ratio > 0.8: # Minimum 60% similarity
236
+ best_ratio = ratio
237
+ best_pos = i
238
+ best_match = window
239
+
240
+ return best_pos if best_pos != -1 else None
241
+
242
async def auto_chunk_page(page_markdown, client):
    """Auto-chunk a page during OCR processing.

    Asks the Mistral chat model to propose lesson chunks for the page,
    parses the JSON reply, then anchors each chunk's start/end phrases in
    the original markdown via fuzzy_find. Returns a list of chunk dicts
    augmented with character positions. Chunking is best-effort: short
    pages, unparseable replies, and any runtime error all yield [] so the
    OCR pipeline is never broken by this step.
    """
    import re  # used by the JSON-extraction fallback below

    if not page_markdown or len(page_markdown.strip()) < 100:
        return []  # Skip very short pages

    # Create chunking prompt
    prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.

DOCUMENT PAGE:
{page_markdown}

For each chunk you identify, output ONLY a JSON array with this exact format:
[
  {{
    "topic": "Brief topic name",
    "start_phrase": "First few words of the chunk",
    "end_phrase": "Last few words of the chunk"
  }}
]

Rules:
1. Each chunk should contain 2-3 valuable lessons
2. start_phrase and end_phrase should be 5-15 words long
3. Focus on educational content (concepts, examples, key points)
4. Output ONLY the JSON array, no other text
5. More dense content should have more chunks, less dense content fewer chunks
6. Ensure the JSON is valid and well-formed
7. Do not include any explanations or additional text, just the JSON array

JSON:"""

    try:
        # Call Mistral for chunking (small model: fast and cheap for this task).
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3  # lower temperature for more consistent JSON
        )

        chunk_text = response.choices[0].message.content.strip()

        # Parse JSON response; fall back to extracting the first [...] span
        # in case the model wrapped the array in extra prose.
        try:
            chunks = json.loads(chunk_text)
        except json.JSONDecodeError:
            json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
            if json_match:
                chunks = json.loads(json_match.group())
            else:
                return []

        # The model must return a JSON *array* of chunk objects; anything
        # else (a bare object, a string, ...) is treated as no chunks.
        if not isinstance(chunks, list):
            return []

        # Find positions using fuzzy matching
        positioned_chunks = []
        for chunk in chunks:
            if not isinstance(chunk, dict):
                continue  # ignore malformed array entries
            start_pos = fuzzy_find(page_markdown, chunk.get("start_phrase", ""))
            end_pos = fuzzy_find(page_markdown, chunk.get("end_phrase", ""), start_pos or 0)

            # Chunks whose start phrase cannot be located are dropped.
            if start_pos is not None:
                positioned_chunks.append({
                    **chunk,
                    "start_position": start_pos,
                    "end_position": end_pos,
                    "found_start": True,
                    "found_end": end_pos is not None
                })

        return positioned_chunks

    except Exception as e:
        # Best-effort: log and degrade to "no chunks" rather than failing OCR.
        print(f"❌ Auto-chunking error: {e}")
        return []
315
+
316
@app.post("/chunk_page")
async def chunk_page(request: dict):
    """Analyze a page and suggest chunks for lessons.

    Request body: ``{"markdown": "<page markdown>"}``.
    Returns ``{"chunks": [...], "total_found": int, "total_suggested": int}``
    where each chunk carries fuzzy-matched character positions.
    Raises 400 when no markdown is supplied, 500 when the API key is
    missing or the LLM call / JSON parsing fails.
    """
    import re  # used by the JSON-extraction fallback below

    print(f"🧠 Chunking page...")

    page_markdown = request.get("markdown", "")
    if not page_markdown:
        raise HTTPException(status_code=400, detail="No markdown provided")

    # Get Mistral API key
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise HTTPException(status_code=500, detail="MISTRAL_API_KEY not set")

    try:
        # Initialize Mistral client
        client = Mistral(api_key=api_key)

        # Create chunking prompt.
        # NOTE: the "end_phrase" example line previously carried a trailing
        # comma, making the example JSON invalid and inconsistent with the
        # auto_chunk_page prompt — fixed here.
        prompt = f"""Analyze this academic document page and identify chunks suitable for creating interactive lessons.

DOCUMENT PAGE:
{page_markdown}

For each chunk you identify, output ONLY a JSON array with this exact format:
[
  {{
    "topic": "Brief topic name",
    "start_phrase": "First few words of the chunk",
    "end_phrase": "Last few words of the chunk"
  }}
]

Rules:
1. Each chunk should contain 2-3 valuable lessons.
2. start_phrase and end_phrase should be 5-15 words long
3. Focus on educational content (concepts, examples, key points)
4. Output ONLY the JSON array, no other text
5. More dense content should have more chunks, less dense content fewer chunks
6. Ensure the JSON is valid and well-formed
7. Do not include any explanations or additional text, just the JSON array

JSON:"""

        # Call Mistral for chunking
        print("🚀 Calling Mistral for chunking...")
        response = client.chat.complete(
            model="mistral-small-latest",  # Faster and cheaper for this task
            messages=[{
                "role": "user",
                "content": prompt
            }],
            temperature=0.3  # Lower temperature for more consistent output
        )

        chunk_text = response.choices[0].message.content.strip()
        print(f"📝 LLM Response: {chunk_text[:200]}...")

        # Parse JSON response; fall back to extracting the first [...] span
        # in case the model wrapped the array in extra prose.
        try:
            chunks = json.loads(chunk_text)
        except json.JSONDecodeError:
            json_match = re.search(r'\[.*\]', chunk_text, re.DOTALL)
            if json_match:
                chunks = json.loads(json_match.group())
            else:
                raise ValueError("Could not parse JSON from LLM response")

        # The model must return a JSON *array* of chunk objects.
        if not isinstance(chunks, list):
            raise ValueError("LLM response was not a JSON array")

        # Find positions using fuzzy matching
        positioned_chunks = []
        for chunk in chunks:
            start_pos = fuzzy_find(page_markdown, chunk.get("start_phrase", ""))
            end_pos = fuzzy_find(page_markdown, chunk.get("end_phrase", ""), start_pos or 0)

            if start_pos is not None:
                positioned_chunks.append({
                    **chunk,
                    "start_position": start_pos,
                    "end_position": end_pos,
                    "found_start": True,
                    "found_end": end_pos is not None
                })
                print(f"✅ Found chunk: {chunk.get('topic')} at position {start_pos}")
            else:
                print(f"❌ Could not find chunk: {chunk.get('topic')}")

        print(f"📊 Successfully positioned {len(positioned_chunks)}/{len(chunks)} chunks")

        return {
            "chunks": positioned_chunks,
            "total_found": len(positioned_chunks),
            "total_suggested": len(chunks)
        }

    except Exception as e:
        print(f"❌ Error chunking page: {e}")
        raise HTTPException(status_code=500, detail=f"Error chunking page: {str(e)}")
frontend/src/components/DocumentProcessor.jsx CHANGED
@@ -1,13 +1,8 @@
1
- import { useState, useRef } from 'react';
2
  import ReactMarkdown from 'react-markdown';
3
- import { Document, Page, pdfjs } from 'react-pdf';
4
  import remarkMath from 'remark-math';
5
  import rehypeKatex from 'rehype-katex';
6
  import 'katex/dist/katex.min.css';
7
- import 'react-pdf/dist/Page/AnnotationLayer.css';
8
- import 'react-pdf/dist/Page/TextLayer.css';
9
-
10
- pdfjs.GlobalWorkerOptions.workerSrc = '/pdf.worker.min.js';
11
 
12
  function DocumentProcessor() {
13
  const fileInputRef = useRef(null);
@@ -16,14 +11,104 @@ function DocumentProcessor() {
16
  const [uploadProgress, setUploadProgress] = useState(0);
17
  const [ocrProgress, setOcrProgress] = useState(0);
18
  const [documentData, setDocumentData] = useState(null);
19
- const [showPdfViewer, setShowPdfViewer] = useState(false);
20
- const [numPages, setNumPages] = useState(null);
21
 
22
  const handleFileChange = (e) => {
23
  setSelectedFile(e.target.files[0]);
24
  setDocumentData(null);
25
  setUploadProgress(0);
26
  setOcrProgress(0);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  };
28
 
29
  const processDocument = async () => {
@@ -124,53 +209,12 @@ function DocumentProcessor() {
124
  </div>
125
 
126
  <p className="text-sm text-gray-500">
127
- Using Mistral AI to extract text and understand your document structure...
128
  </p>
129
  </div>
130
  </div>
131
  );
132
 
133
- const PdfViewer = () => (
134
- <div className={`fixed bottom-4 left-4 bg-white rounded-lg shadow-xl border transition-all duration-300 ${
135
- showPdfViewer ? 'w-80 h-96' : 'w-48 h-12'
136
- }`}>
137
- <div className="p-3 border-b flex justify-between items-center">
138
- <span className="text-sm font-medium text-gray-700">Original PDF</span>
139
- <button
140
- onClick={() => setShowPdfViewer(!showPdfViewer)}
141
- className="text-gray-500 hover:text-gray-700"
142
- >
143
- {showPdfViewer ? (
144
- <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
145
- <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
146
- </svg>
147
- ) : (
148
- <svg className="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
149
- <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M5 15l7-7 7 7" />
150
- </svg>
151
- )}
152
- </button>
153
- </div>
154
-
155
- {showPdfViewer && (
156
- <div className="h-80 overflow-auto">
157
- <Document
158
- file={selectedFile}
159
- onLoadSuccess={({ numPages }) => setNumPages(numPages)}
160
- >
161
- {numPages && Array.from(new Array(numPages), (_, index) => (
162
- <div key={index + 1} className="mb-2">
163
- <Page
164
- pageNumber={index + 1}
165
- width={280}
166
- />
167
- </div>
168
- ))}
169
- </Document>
170
- </div>
171
- )}
172
- </div>
173
- );
174
 
175
  if (!selectedFile) {
176
  return (
@@ -233,58 +277,41 @@ function DocumentProcessor() {
233
 
234
  return (
235
  <div className="min-h-screen bg-gray-50">
236
- {/* Header */}
237
- <div className="bg-white shadow-sm border-b">
238
- <div className="max-w-6xl mx-auto px-4 py-4 flex justify-between items-center">
239
- <div>
240
- <h1 className="text-xl font-bold text-gray-900">{documentData.filename}</h1>
241
- <p className="text-sm text-gray-600">{documentData.totalPages} pages processed</p>
242
- </div>
243
- <button
244
- onClick={() => setSelectedFile(null)}
245
- className="bg-blue-600 hover:bg-blue-700 text-white px-4 py-2 rounded-lg transition-colors"
246
- >
247
- Upload New Document
248
- </button>
249
- </div>
250
- </div>
251
-
252
  {/* Document Content */}
253
  <div className="max-w-4xl mx-auto px-4 py-8">
254
- <div className="bg-white rounded-lg shadow-sm border p-8">
255
- <ReactMarkdown
256
- remarkPlugins={[remarkMath]}
257
- rehypePlugins={[rehypeKatex]}
258
- className="prose prose-lg max-w-none"
259
- components={{
260
- h1: ({ children }) => <h1 className="text-3xl font-bold mb-6 text-gray-900">{children}</h1>,
261
- h2: ({ children }) => <h2 className="text-2xl font-bold mb-4 text-gray-900 mt-8">{children}</h2>,
262
- h3: ({ children }) => <h3 className="text-xl font-bold mb-3 text-gray-900 mt-6">{children}</h3>,
263
- p: ({ children }) => <p className="mb-4 text-gray-700 leading-relaxed">{children}</p>,
264
- hr: () => <hr className="my-8 border-gray-300" />,
265
- ul: ({ children }) => <ul className="mb-4 ml-6 list-disc">{children}</ul>,
266
- ol: ({ children }) => <ol className="mb-4 ml-6 list-decimal">{children}</ol>,
267
- li: ({ children }) => <li className="mb-1 text-gray-700">{children}</li>,
268
- blockquote: ({ children }) => (
269
- <blockquote className="border-l-4 border-blue-500 pl-4 italic my-4 text-gray-600">
270
- {children}
271
- </blockquote>
272
- ),
273
- code: ({ inline, children }) =>
274
- inline ?
275
- <code className="bg-gray-100 px-1 py-0.5 rounded text-sm font-mono">{children}</code> :
276
- <pre className="bg-gray-100 p-4 rounded-lg overflow-x-auto my-4">
277
- <code className="text-sm font-mono">{children}</code>
278
- </pre>
279
- }}
280
- >
281
- {documentData.markdown}
282
- </ReactMarkdown>
 
 
283
  </div>
284
  </div>
285
-
286
- {/* PDF Viewer */}
287
- <PdfViewer />
288
  </div>
289
  );
290
  }
 
1
+ import { useState, useRef, useEffect } from 'react';
2
  import ReactMarkdown from 'react-markdown';
 
3
  import remarkMath from 'remark-math';
4
  import rehypeKatex from 'rehype-katex';
5
  import 'katex/dist/katex.min.css';
 
 
 
 
6
 
7
  function DocumentProcessor() {
8
  const fileInputRef = useRef(null);
 
11
  const [uploadProgress, setUploadProgress] = useState(0);
12
  const [ocrProgress, setOcrProgress] = useState(0);
13
  const [documentData, setDocumentData] = useState(null);
14
+ const [imageCache, setImageCache] = useState({});
 
15
 
16
  const handleFileChange = (e) => {
17
  setSelectedFile(e.target.files[0]);
18
  setDocumentData(null);
19
  setUploadProgress(0);
20
  setOcrProgress(0);
21
+ setImageCache({});
22
+ };
23
+
24
+ const fetchImage = async (imageId, fileId) => {
25
+ if (imageCache[imageId]) {
26
+ return imageCache[imageId];
27
+ }
28
+
29
+ try {
30
+ const response = await fetch(`http://localhost:8000/get_image/${fileId}/${imageId}`);
31
+ if (response.ok) {
32
+ const data = await response.json();
33
+ const imageData = data.image_base64;
34
+
35
+ // Cache the image
36
+ setImageCache(prev => ({
37
+ ...prev,
38
+ [imageId]: imageData
39
+ }));
40
+
41
+ return imageData;
42
+ }
43
+ } catch (error) {
44
+ console.error('Error fetching image:', error);
45
+ }
46
+ return null;
47
+ };
48
+
49
+ const ImageComponent = ({ src, alt }) => {
50
+ const [imageSrc, setImageSrc] = useState(null);
51
+ const [loading, setLoading] = useState(true);
52
+
53
+ useEffect(() => {
54
+ if (documentData && src) {
55
+ // Extract image ID from src (assuming format like ![imageId](imageId))
56
+ fetchImage(src, documentData.fileId).then(imageData => {
57
+ if (imageData) {
58
+ setImageSrc(imageData);
59
+ }
60
+ setLoading(false);
61
+ });
62
+ }
63
+ }, [src, documentData]);
64
+
65
+ if (loading) {
66
+ return (
67
+ <div style={{
68
+ width: '100%',
69
+ height: '200px',
70
+ backgroundColor: '#f3f4f6',
71
+ display: 'flex',
72
+ alignItems: 'center',
73
+ justifyContent: 'center',
74
+ margin: '1rem 0',
75
+ borderRadius: '0.5rem'
76
+ }}>
77
+ <span style={{ color: '#6b7280' }}>Loading image...</span>
78
+ </div>
79
+ );
80
+ }
81
+
82
+ if (!imageSrc) {
83
+ return (
84
+ <div style={{
85
+ width: '100%',
86
+ height: '200px',
87
+ backgroundColor: '#fef2f2',
88
+ display: 'flex',
89
+ alignItems: 'center',
90
+ justifyContent: 'center',
91
+ margin: '1rem 0',
92
+ borderRadius: '0.5rem',
93
+ border: '1px solid #fecaca'
94
+ }}>
95
+ <span style={{ color: '#dc2626' }}>Image not found: {alt || src}</span>
96
+ </div>
97
+ );
98
+ }
99
+
100
+ return (
101
+ <div style={{ margin: '1.5rem 0', textAlign: 'center' }}>
102
+ <img
103
+ src={imageSrc}
104
+ alt={alt || 'Document image'}
105
+ style={{
106
+ maxWidth: '100%',
107
+ height: 'auto',
108
+ }}
109
+ />
110
+ </div>
111
+ );
112
  };
113
 
114
  const processDocument = async () => {
 
209
  </div>
210
 
211
  <p className="text-sm text-gray-500">
212
+ Using AI to extract text and understand your document structure...
213
  </p>
214
  </div>
215
  </div>
216
  );
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  if (!selectedFile) {
220
  return (
 
277
 
278
  return (
279
  <div className="min-h-screen bg-gray-50">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  {/* Document Content */}
281
  <div className="max-w-4xl mx-auto px-4 py-8">
282
+ <div className="bg-white rounded-lg shadow-sm p-8">
283
+ <div className="prose prose-lg max-w-none">
284
+ <ReactMarkdown
285
+ remarkPlugins={[remarkMath]}
286
+ rehypePlugins={[rehypeKatex]}
287
+ components={{
288
+ h1: ({ children }) => <h1 style={{ fontSize: '2rem', fontWeight: 'bold', marginBottom: '1.5rem', color: '#1a202c' }}>{children}</h1>,
289
+ h2: ({ children }) => <h2 style={{ fontSize: '1.5rem', fontWeight: 'bold', marginBottom: '1rem', marginTop: '2rem', color: '#1a202c' }}>{children}</h2>,
290
+ h3: ({ children }) => <h3 style={{ fontSize: '1.25rem', fontWeight: 'bold', marginBottom: '0.75rem', marginTop: '1.5rem', color: '#1a202c' }}>{children}</h3>,
291
+ p: ({ children }) => <p style={{ marginBottom: '1rem', color: '#374151', lineHeight: '1.6' }}>{children}</p>,
292
+ hr: () => <hr style={{ margin: '2rem 0', borderColor: '#d1d5db' }} />,
293
+ ul: ({ children }) => <ul style={{ marginBottom: '1rem', marginLeft: '1.5rem', listStyleType: 'disc' }}>{children}</ul>,
294
+ ol: ({ children }) => <ol style={{ marginBottom: '1rem', marginLeft: '1.5rem', listStyleType: 'decimal' }}>{children}</ol>,
295
+ li: ({ children }) => <li style={{ marginBottom: '0.25rem', color: '#374151' }}>{children}</li>,
296
+ blockquote: ({ children }) => (
297
+ <blockquote style={{ borderLeft: '4px solid #3b82f6', paddingLeft: '1rem', fontStyle: 'italic', margin: '1rem 0', color: '#6b7280' }}>
298
+ {children}
299
+ </blockquote>
300
+ ),
301
+ code: ({ inline, children }) =>
302
+ inline ?
303
+ <code style={{ backgroundColor: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem', fontSize: '0.875rem', fontFamily: 'monospace' }}>{children}</code> :
304
+ <pre style={{ backgroundColor: '#f3f4f6', padding: '1rem', borderRadius: '0.5rem', overflowX: 'auto', margin: '1rem 0' }}>
305
+ <code style={{ fontSize: '0.875rem', fontFamily: 'monospace' }}>{children}</code>
306
+ </pre>,
307
+ img: ({ src, alt }) => <ImageComponent src={src} alt={alt} />
308
+ }}
309
+ >
310
+ {documentData.markdown}
311
+ </ReactMarkdown>
312
+ </div>
313
  </div>
314
  </div>
 
 
 
315
  </div>
316
  );
317
  }