Spaces:
Running
Running
| <!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>DocuSense Explorer - Document Processing System</title> | |
| <link rel="stylesheet" href="style.css"> | |
| <!-- Tailwind CDN build: provides the utility classes (including the dark: variants) used throughout the page. --> | |
| <script src="https://cdn.tailwindcss.com"></script> | |
| <!-- Feather icons, loaded exactly once. Deliberately NOT deferred: the inline script at the end of <body> calls feather.replace() synchronously while the page is being parsed. --> | |
| <script src="https://cdn.jsdelivr.net/npm/feather-icons/dist/feather.min.js"></script> | |
| <!-- Scripts backing the <custom-navbar> and <custom-footer> tags in <body>; presumably they register those custom elements (confirm in components/). --> | |
| <script src="components/navbar.js"></script> | |
| <script src="components/footer.js"></script> | |
| </head> | |
| <body class="bg-gray-50 dark:bg-gray-900 min-h-screen flex flex-col"> | |
| <custom-navbar></custom-navbar> | |
| <main class="flex-grow container mx-auto px-4 py-8"> | |
| <div class="max-w-5xl mx-auto"> | |
| <!-- Hero Section --> | |
| <section class="mb-16 text-center"> | |
| <!-- Hero banner: page title, tagline and two in-page jump links (#overview and #demo). --> | |
| <div class="bg-gradient-to-r from-blue-600 to-indigo-700 text-white p-8 rounded-xl shadow-2xl"> | |
| <h1 class="text-4xl md:text-5xl font-bold mb-4">Document Processing <br>with Context Recovery</h1> | |
| <p class="text-xl mb-8 max-w-3xl mx-auto">Smart section-based processing with full context retrieval</p> | |
| <div class="flex justify-center gap-4"> | |
| <!-- The sr-only spans extend the accessible name so each link still makes sense when read out of context (e.g. in a screen reader's link list); the visible label is unchanged. --> | |
| <a href="#overview" class="bg-white text-blue-700 px-6 py-3 rounded-lg font-medium hover:bg-blue-50 transition">Learn More<span class="sr-only"> about the system architecture</span></a> | |
| <a href="#demo" class="border-2 border-white text-white px-6 py-3 rounded-lg font-medium hover:bg-white hover:text-blue-700 transition">See Demo<span class="sr-only"> of the processing workflow</span></a> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- System Overview --> | |
| <section id="overview" class="mb-16"> | |
| <!-- Target of the hero "Learn More" link: architecture diagram followed by three feature cards. --> | |
| <h2 class="text-3xl font-bold mb-8 text-center dark:text-white">System Architecture</h2> | |
| <div class="relative"> | |
| <!-- Architecture SVG --> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow-lg"> | |
| <img src="architecture.svg" alt="System Architecture Diagram" class="w-full h-auto rounded-lg" decoding="async"> | |
| </div> | |
| <!-- Key Features --> | |
| <!-- The icons are decorative (each card has a text heading). feather.replace() copies the placeholder's attributes onto the generated <svg>, so aria-hidden set here hides the icon from assistive technology. --> | |
| <div class="grid md:grid-cols-2 lg:grid-cols-3 gap-6 mt-8"> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow hover:shadow-lg transition"> | |
| <div class="bg-blue-100 dark:bg-blue-900 w-12 h-12 rounded-full flex items-center justify-center mb-4"> | |
| <i data-feather="layers" class="text-blue-600 dark:text-blue-300" aria-hidden="true"></i> | |
| </div> | |
| <h3 class="text-xl font-semibold mb-2 dark:text-white">Section-Based Processing</h3> | |
| <p class="text-gray-600 dark:text-gray-300">Documents are intelligently split into meaningful sections while preserving hierarchy.</p> | |
| </div> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow hover:shadow-lg transition"> | |
| <div class="bg-purple-100 dark:bg-purple-900 w-12 h-12 rounded-full flex items-center justify-center mb-4"> | |
| <i data-feather="cpu" class="text-purple-600 dark:text-purple-300" aria-hidden="true"></i> | |
| </div> | |
| <h3 class="text-xl font-semibold mb-2 dark:text-white">Context Recovery</h3> | |
| <p class="text-gray-600 dark:text-gray-300">Retrieve the full original section content even when searching small chunks.</p> | |
| </div> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow hover:shadow-lg transition"> | |
| <div class="bg-green-100 dark:bg-green-900 w-12 h-12 rounded-full flex items-center justify-center mb-4"> | |
| <i data-feather="zap" class="text-green-600 dark:text-green-300" aria-hidden="true"></i> | |
| </div> | |
| <h3 class="text-xl font-semibold mb-2 dark:text-white">Hybrid Search</h3> | |
| <p class="text-gray-600 dark:text-gray-300">Combines vector similarity and BM25 for precise and relevant results.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Workflow Demo --> | |
| <section id="demo" class="mb-16"> | |
| <h2 class="text-3xl font-bold mb-8 text-center dark:text-white">Workflow Demonstration</h2> | |
| <div class="bg-white dark:bg-gray-800 rounded-xl shadow-lg overflow-hidden"> | |
| <div class="grid md:grid-cols-2 gap-0"> | |
| <!-- Step Navigation --> | |
| <div class="bg-gray-50 dark:bg-gray-700 p-6"> | |
| <div class="sticky top-6"> | |
| <h3 class="text-xl font-semibold mb-4 dark:text-white">Process Steps</h3> | |
| <!-- Each button's data-step value matches the data-step-content value of the panel it reveals; the inline script at the end of the page wires up the clicks. --> | |
| <!-- type="button" is explicit because a bare <button> defaults to type="submit" and would submit any form this markup is ever placed inside. --> | |
| <div class="space-y-2"> | |
| <button type="button" class="workflow-step active" data-step="1">1. Document Upload</button> | |
| <button type="button" class="workflow-step" data-step="2">2. Section Splitting</button> | |
| <button type="button" class="workflow-step" data-step="3">3. Chunk Generation</button> | |
| <button type="button" class="workflow-step" data-step="4">4. Storage</button> | |
| <button type="button" class="workflow-step" data-step="5">5. Search Process</button> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Step Content --> | |
| <div class="p-6"> | |
| <div class="workflow-content" data-step-content="1"> | |
| <!-- Step 1 panel: the only panel without the "hidden" class in the markup, so it is the one visible before any script runs. --> | |
| <h3 class="text-xl font-semibold mb-4 dark:text-white">Document Upload</h3> | |
| <p class="mb-4 text-gray-600 dark:text-gray-300">Users upload documents through the API endpoint:</p> | |
| <!-- The sample gets an explicit dark-mode text colour; without it the text keeps the default colour on the dark:bg-gray-900 background. --> | |
| <pre class="bg-gray-100 dark:bg-gray-900 rounded-lg p-4 mb-4 overflow-x-auto"> | |
| <code class="text-sm dark:text-gray-200"> | |
| POST /index-file/ | |
| { | |
| "list_path": [ | |
| { | |
| "id": "uuid-file-1", | |
| "path": "extracted/document.md" | |
| } | |
| ] | |
| } | |
| </code> | |
| </pre> | |
| <p class="text-gray-600 dark:text-gray-300">The system creates a unique job ID for processing and returns immediately.</p> | |
| </div> | |
| <div class="workflow-content hidden" data-step-content="2"> | |
| <!-- Step 2 panel: a mock document outline; the ml-4 / ml-8 indents and dot colours stand for heading depth. --> | |
| <h3 class="text-xl font-semibold mb-4 dark:text-white">Section Splitting</h3> | |
| <p class="mb-4 text-gray-600 dark:text-gray-300">Documents are split into hierarchical sections:</p> | |
| <!-- dark:text-gray-200 on the container is inherited by the labels below, which otherwise have no dark-mode colour on the dark:bg-gray-900 background. --> | |
| <div class="bg-gray-100 dark:bg-gray-900 dark:text-gray-200 rounded-lg p-4 mb-4"> | |
| <div class="flex items-center mb-2"> | |
| <div class="w-2 h-2 bg-blue-500 rounded-full mr-2"></div> | |
| <span class="font-mono text-sm"># Main Title</span> | |
| </div> | |
| <div class="flex items-center mb-2 ml-4"> | |
| <div class="w-2 h-2 bg-purple-500 rounded-full mr-2"></div> | |
| <span class="font-mono text-sm">## Section 1</span> | |
| </div> | |
| <div class="flex items-center mb-2 ml-8"> | |
| <div class="w-2 h-2 bg-green-500 rounded-full mr-2"></div> | |
| <span class="font-mono text-sm">### Subsection 1.1</span> | |
| </div> | |
| <div class="flex items-center ml-4"> | |
| <div class="w-2 h-2 bg-purple-500 rounded-full mr-2"></div> | |
| <span class="font-mono text-sm">## Section 2</span> | |
| </div> | |
| </div> | |
| <p class="text-gray-600 dark:text-gray-300">Each section maintains its page information and document context.</p> | |
| </div> | |
| <div class="workflow-content hidden" data-step-content="3"> | |
| <!-- Step 3 panel: one example section box with two example chunk boxes on the left, explanatory text on the right. --> | |
| <h3 class="text-xl font-semibold mb-4 dark:text-white">Chunk Generation</h3> | |
| <p class="mb-4 text-gray-600 dark:text-gray-300">Large sections are split into optimally-sized chunks:</p> | |
| <div class="flex items-start mb-4"> | |
| <div class="border-r-2 border-gray-300 dark:border-gray-600 pr-4 mr-4"> | |
| <!-- Each box carries a dark-mode text colour to go with its dark-mode background; the labels had none. --> | |
| <div class="bg-blue-100 dark:bg-blue-900 dark:text-blue-100 px-3 py-2 rounded-lg mb-2"> | |
| <p class="text-sm">Section ID: 123e4567</p> | |
| </div> | |
| <div class="space-y-2"> | |
| <div class="bg-gray-200 dark:bg-gray-700 dark:text-gray-200 px-3 py-2 rounded-lg"> | |
| <p class="text-sm">Chunk 1 (Tokens: 512)</p> | |
| </div> | |
| <div class="bg-gray-200 dark:bg-gray-700 dark:text-gray-200 px-3 py-2 rounded-lg"> | |
| <p class="text-sm">Chunk 2 (Tokens: 498)</p> | |
| </div> | |
| </div> | |
| </div> | |
| <div> | |
| <p class="text-gray-600 dark:text-gray-300 text-sm">Each chunk references its parent section while being small enough for efficient vector search.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="workflow-content hidden" data-step-content="4"> | |
| <!-- Step 4 panel: one card per storage backend named in the copy (PostgreSQL and Milvus). --> | |
| <h3 class="text-xl font-semibold mb-4 dark:text-white">Storage</h3> | |
| <div class="grid grid-cols-1 md:grid-cols-2 gap-4 mb-4"> | |
| <div class="bg-gray-100 dark:bg-gray-900 p-4 rounded-lg"> | |
| <div class="flex items-center mb-2"> | |
| <!-- Decorative icon: the heading next to it already names the store, so it is hidden from assistive technology. --> | |
| <i data-feather="database" class="mr-2 text-blue-500" aria-hidden="true"></i> | |
| <!-- dark:text-white matches the other headings; without it the name has no dark-mode colour on dark:bg-gray-900. --> | |
| <h4 class="font-medium dark:text-white">PostgreSQL</h4> | |
| </div> | |
| <p class="text-sm text-gray-600 dark:text-gray-300">Stores complete section content with metadata</p> | |
| </div> | |
| <div class="bg-gray-100 dark:bg-gray-900 p-4 rounded-lg"> | |
| <div class="flex items-center mb-2"> | |
| <i data-feather="box" class="mr-2 text-purple-500" aria-hidden="true"></i> | |
| <h4 class="font-medium dark:text-white">Milvus</h4> | |
| </div> | |
| <p class="text-sm text-gray-600 dark:text-gray-300">Stores vector embeddings of chunks with section references</p> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="workflow-content hidden" data-step-content="5"> | |
| <!-- Step 5 panel: starts with the "hidden" class; the inline script at the end of the page removes it when the button with data-step="5" is clicked. --> | |
| <h3 class="text-xl font-semibold mb-4 dark:text-white">Search Process</h3> | |
| <!-- Ordered list of the three search stages. The visible numbers are the hard-coded badge <div>s (1-3) inside each item, not generated list markers. --> | |
| <ol class="space-y-4"> | |
| <li class="flex items-start"> | |
| <!-- flex-shrink-0 keeps the fixed-size round badge from being squeezed by the text column beside it. --> | |
| <div class="bg-blue-500 text-white rounded-full w-6 h-6 flex items-center justify-center mr-3 mt-0.5 flex-shrink-0">1</div> | |
| <div> | |
| <p class="font-medium dark:text-white">Query Processing</p> | |
| <p class="text-sm text-gray-600 dark:text-gray-300">Convert search query to embeddings and search Milvus</p> | |
| </div> | |
| </li> | |
| <li class="flex items-start"> | |
| <div class="bg-blue-500 text-white rounded-full w-6 h-6 flex items-center justify-center mr-3 mt-0.5 flex-shrink-0">2</div> | |
| <div> | |
| <p class="font-medium dark:text-white">Result Aggregation</p> | |
| <p class="text-sm text-gray-600 dark:text-gray-300">Group chunks by their section_id and score</p> | |
| </div> | |
| </li> | |
| <li class="flex items-start"> | |
| <div class="bg-blue-500 text-white rounded-full w-6 h-6 flex items-center justify-center mr-3 mt-0.5 flex-shrink-0">3</div> | |
| <div> | |
| <p class="font-medium dark:text-white">Context Retrieval</p> | |
| <p class="text-sm text-gray-600 dark:text-gray-300">Fetch full section content from PostgreSQL</p> | |
| </div> | |
| </li> | |
| </ol> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Benefits Section --> | |
| <section class="mb-16"> | |
| <!-- Closing summary: three benefit cards. This section has no id, so unlike #overview and #demo nothing on this page links to it. --> | |
| <h2 class="text-3xl font-bold mb-8 text-center dark:text-white">System Benefits</h2> | |
| <!-- Single column by default, three columns from the md breakpoint up. Cards share the same styling and differ only in the colour of the top border (blue, purple, green). --> | |
| <div class="grid md:grid-cols-3 gap-6"> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow-lg border-t-4 border-blue-500"> | |
| <h3 class="text-xl font-semibold mb-3 dark:text-white">Full Context Recovery</h3> | |
| <p class="text-gray-600 dark:text-gray-300">Retrieve complete section content even when searching small chunks, ensuring no context is lost.</p> | |
| </div> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow-lg border-t-4 border-purple-500"> | |
| <h3 class="text-xl font-semibold mb-3 dark:text-white">Hybrid Search</h3> | |
| <p class="text-gray-600 dark:text-gray-300">Combine semantic vector search with keyword-based BM25 for more accurate results.</p> | |
| </div> | |
| <div class="bg-white dark:bg-gray-800 p-6 rounded-xl shadow-lg border-t-4 border-green-500"> | |
| <h3 class="text-xl font-semibold mb-3 dark:text-white">Optimal Performance</h3> | |
| <p class="text-gray-600 dark:text-gray-300">Fast vector search on small chunks with instant retrieval of full sections from PostgreSQL.</p> | |
| </div> | |
| </div> | |
| </section> | |
| </div> | |
| </main> | |
| <custom-footer></custom-footer> | |
| <script> | |
| // Replace every <i data-feather="..."> placeholder with its inline SVG icon. | |
| // Guarded: if the icon CDN failed to load, an unguarded call would throw a ReferenceError, | |
| // abort this whole script and leave the workflow buttons below without click handlers. | |
| if (typeof feather !== 'undefined') { | |
| feather.replace(); | |
| } | |
| // Workflow step navigation: clicking a .workflow-step button highlights it and shows the | |
| // .workflow-content panel whose data-step-content equals the button's data-step. | |
| document.addEventListener('DOMContentLoaded', function() { | |
| const steps = document.querySelectorAll('.workflow-step'); | |
| const contents = document.querySelectorAll('.workflow-content'); | |
| // Classes that mark the currently selected step button. | |
| const activeClasses = ['active', 'bg-blue-500', 'text-white']; | |
| steps.forEach(step => { | |
| step.addEventListener('click', function() { | |
| // Resolve the target panel first so a button without a matching panel is ignored | |
| // instead of hiding every panel and then throwing on a null lookup. | |
| const stepNum = this.getAttribute('data-step'); | |
| const panel = document.querySelector(`[data-step-content="${stepNum}"]`); | |
| if (!panel) { | |
| return; | |
| } | |
| // Clear the active state from all steps | |
| steps.forEach(s => { | |
| s.classList.remove(...activeClasses); | |
| s.removeAttribute('aria-current'); | |
| }); | |
| // Mark the clicked step as active, visually and for assistive technology | |
| this.classList.add(...activeClasses); | |
| this.setAttribute('aria-current', 'step'); | |
| // Hide all content, then show the matching panel | |
| contents.forEach(c => c.classList.add('hidden')); | |
| panel.classList.remove('hidden'); | |
| }); | |
| }); | |
| // Activate first step by default (skipped when the page has no step buttons) | |
| if (steps.length > 0) { | |
| steps[0].click(); | |
| } | |
| }); | |
| </script> | |
| <script src="https://huggingface.co/deepsite/deepsite-badge.js"></script> | |
| </body> | |
| </html> |