Spaces:
Running
Running
code
Browse files- README.md +39 -12
- index.html +32 -19
- main.js +123 -0
- offline_processing.py +122 -0
- transformers.min.js +0 -0
- worker.js +322 -0
README.md
CHANGED
|
@@ -1,12 +1,39 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Client-Side Semantic Quote Retrieval Engine
|
| 2 |
+
|
| 3 |
+
This project implements a "Zero-Infrastructure" client-side semantic quote retrieval engine. All computationally intensive tasks, including vectorization, indexing, and model quantization, are handled offline. The search and retrieval operations then occur entirely within the user's browser, ensuring data privacy and minimal operational overhead.
|
| 4 |
+
|
| 5 |
+
## Project Overview
|
| 6 |
+
|
| 7 |
+
The core idea is to pre-process a large dataset of quotes into a highly optimized, quantized vector index. This index, along with a small machine learning model, is then loaded by the client-side application. When a user enters a query, the application generates an embedding for the query and performs a fast Approximate Nearest Neighbor Search (ANNS) directly in the browser to find semantically similar quotes.
|
| 8 |
+
|
| 9 |
+
## Data Source
|
| 10 |
+
|
| 11 |
+
The quotes data used in this project was sourced from:
|
| 12 |
+
[https://archive.org/details/quotes_20230625](https://archive.org/details/quotes_20230625)
|
| 13 |
+
|
| 14 |
+
## Setup and Usage
|
| 15 |
+
|
| 16 |
+
1. **Offline Data Processing**:
|
| 17 |
+
* Ensure you have Python and the necessary libraries (e.g., `pandas`, `numpy`, `torch`, `sentence-transformers`, `tqdm`) installed.
|
| 18 |
+
* Run the `offline_processing.py` script to generate the `quotes_index.bin` file. This file contains the pre-computed embeddings and metadata.
|
| 19 |
+
```bash
|
| 20 |
+
python offline_processing.py
|
| 21 |
+
```
|
| 22 |
+
* *Note: This step can be time-consuming for large datasets, especially the first time as the embedding model needs to be downloaded.*
|
| 23 |
+
* **Important:** The script now includes validation to ensure that categories in the CSV do not contain uppercase letters. Rows with invalid categories will be ignored.
|
| 24 |
+
|
| 25 |
+
2. **Client-Side Application**:
|
| 26 |
+
* Open `index.html` in your web browser.
|
| 27 |
+
* The search input and button are immediately available.
|
| 28 |
+
* The application will first check for a cached index. If not found, it will display a message indicating a significant one-time download (for the model and index) which will occur on your first search.
|
| 29 |
+
* The `quotes_index.bin` is loaded (from cache or downloaded) and the necessary machine learning model (via `transformers.js`) is downloaded on demand during your first search.
|
| 30 |
+
* A progress bar with detailed status will be shown during downloads and processing, disappearing once complete.
|
| 31 |
+
* A "Delete Cached Data" button will appear when data is cached, allowing you to clear local storage.
|
| 32 |
+
* Enter your search queries to retrieve semantically similar quotes.
|
| 33 |
+
|
| 34 |
+
## Technologies Used
|
| 35 |
+
|
| 36 |
+
* **Frontend**: HTML, CSS (Tailwind CSS), JavaScript
|
| 37 |
+
* **Offline Processing**: Python (pandas, numpy, torch, sentence-transformers)
|
| 38 |
+
* **Embedding Model**: nomic-ai/nomic-embed-text-v1.5
|
| 39 |
+
* **Client-Side ML**: transformers.js, Web Workers
|
index.html
CHANGED
|
@@ -1,19 +1,32 @@
|
|
| 1 |
-
<!
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Client-Side Quote Search</title>
|
| 7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 8 |
+
</head>
|
| 9 |
+
<body class="bg-gray-100 flex items-center justify-center min-h-screen">
|
| 10 |
+
<div class="container mx-auto p-8 bg-white rounded-lg shadow-lg w-full max-w-2xl relative">
|
| 11 |
+
<div id="delete-button-container" class="flex justify-end mb-4 hidden">
|
| 12 |
+
<button id="delete-data-button" class="bg-red-500 text-white px-3 py-1 rounded-lg text-sm hover:bg-red-600 focus:outline-none focus:ring-2 focus:ring-red-500">Delete Cached Data</button>
|
| 13 |
+
</div>
|
| 14 |
+
<h1 class="text-3xl font-bold text-center text-gray-800 mb-4">Client-Side Quote Search</h1>
|
| 15 |
+
|
| 16 |
+
<div id="data-info" class="text-center text-gray-600 mb-2"></div>
|
| 17 |
+
<div id="status" class="text-center text-gray-600 mb-4"></div>
|
| 18 |
+
|
| 19 |
+
<div id="progress-container" class="w-full bg-gray-200 rounded-full h-4 mb-4 hidden">
|
| 20 |
+
<div id="progress-bar" class="bg-blue-500 h-4 rounded-full" style="width: 0%"></div>
|
| 21 |
+
</div>
|
| 22 |
+
|
| 23 |
+
<div class="flex gap-2 mb-4">
|
| 24 |
+
<textarea id="search-input" class="w-full p-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:outline-none" placeholder="Enter your query..."></textarea>
|
| 25 |
+
<button id="search-button" class="bg-blue-500 text-white px-4 py-2 rounded-lg hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-blue-500">Search</button>
|
| 26 |
+
</div>
|
| 27 |
+
|
| 28 |
+
<div id="results"></div>
|
| 29 |
+
</div>
|
| 30 |
+
<script src="main.js"></script>
|
| 31 |
+
</body>
|
| 32 |
+
</html>
|
main.js
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Cache the UI elements the app manipulates, and spin up the worker.
const $ = (id) => document.getElementById(id);

const statusDiv = $('status');
const searchInput = $('search-input');
const resultsDiv = $('results');
const searchButton = $('search-button');
const progressContainer = $('progress-container');
const progressBar = $('progress-bar');
const dataInfoDiv = $('data-info');
const deleteDataButton = $('delete-data-button');
const deleteButtonContainer = $('delete-button-container');

// All heavy lifting (model, index, search) happens off the main thread.
const worker = new Worker('worker.js', { type: 'module' });

// Initial state: inputs stay enabled; the delete button only appears once
// the worker reports that something is cached.
deleteButtonContainer.classList.add('hidden');
|
| 17 |
+
|
| 18 |
+
/**
 * Format a byte count as megabytes with two decimal places.
 * @param {number} bytes - Raw byte count.
 * @returns {string} e.g. "5.50" for 5.5 MiB.
 */
function formatBytesToMB(bytes) {
  const BYTES_PER_MB = 1024 * 1024;
  return (bytes / BYTES_PER_MB).toFixed(2);
}
|
| 22 |
+
|
| 23 |
+
// Route every worker message to the matching UI update.
worker.onmessage = (event) => {
  const { type, payload } = event.data;

  switch (type) {
    case 'loading': {
      statusDiv.textContent = payload;
      progressContainer.classList.remove('hidden');
      break;
    }
    case 'progress': {
      const pct = Math.round(payload.progress);
      progressBar.style.width = `${pct || 0}%`;
      statusDiv.textContent = payload.detail;
      progressContainer.classList.remove('hidden');
      break;
    }
    case 'ready': {
      // Index is usable: clear transient UI and unlock the controls.
      statusDiv.textContent = '';
      progressContainer.classList.add('hidden');
      searchInput.disabled = false;
      searchButton.disabled = false;
      deleteButtonContainer.classList.remove('hidden');
      break;
    }
    case 'results': {
      // The rendered results themselves convey completion.
      progressContainer.classList.add('hidden');
      renderResults(payload);
      statusDiv.textContent = '';
      deleteButtonContainer.classList.remove('hidden');
      break;
    }
    case 'indexSize': {
      // Reflect cache state: show the delete button if anything is cached.
      if (payload.indexCached || payload.modelCached) {
        deleteButtonContainer.classList.remove('hidden');
      }

      if (payload.indexCached) {
        dataInfoDiv.textContent = '';
        searchInput.disabled = false;
        searchButton.disabled = false;
        // Whether or not the model is cached, no status message is needed;
        // the model loads on demand during the first search.
        statusDiv.textContent = '';
      } else {
        dataInfoDiv.textContent = '';
        deleteButtonContainer.classList.add('hidden');
        searchInput.disabled = false;
        searchButton.disabled = false;
        statusDiv.textContent = `To enable search, a one-time download of approximately ${formatBytesToMB(payload.size)} MB is required. This will happen when you perform your first search.`;
      }
      break;
    }
    case 'dataDeleted': {
      statusDiv.textContent = payload;
      dataInfoDiv.textContent = '';
      deleteButtonContainer.classList.add('hidden');
      progressContainer.classList.add('hidden');
      // Refresh the size/cache information now that the cache is empty.
      worker.postMessage({ type: 'getIndexSize' });
      break;
    }
    case 'error': {
      statusDiv.textContent = `Error: ${payload}`;
      progressContainer.classList.add('hidden');
      break;
    }
  }
};
|
| 81 |
+
|
| 82 |
+
// Ask the worker for cache/size information as soon as the page loads.
worker.postMessage({ type: 'getIndexSize' });

searchButton.addEventListener('click', () => {
  const query = searchInput.value;
  if (!query) {
    resultsDiv.innerHTML = '<p class="text-gray-500">Please enter a query.</p>';
    return;
  }
  worker.postMessage({ type: 'search', payload: query });
  statusDiv.textContent = 'Searching...';
  resultsDiv.innerHTML = ''; // drop previous results while searching
});

deleteDataButton.addEventListener('click', () => {
  const confirmed = confirm('Are you sure you want to delete the cached data?');
  if (confirmed) {
    worker.postMessage({ type: 'deleteData' });
  }
});
|
| 101 |
+
|
| 102 |
+
/**
 * Render search results as cards inside #results.
 *
 * The quote/author/category fields come from index data and are inserted via
 * textContent (not innerHTML interpolation) so any markup inside a quote is
 * shown literally instead of being parsed as HTML — XSS hardening for data
 * the page did not author.
 *
 * @param {Array<{quote: string, author: ?string, category: ?string}>} results
 */
function renderResults(results) {
  resultsDiv.innerHTML = '';
  if (results.length === 0) {
    resultsDiv.innerHTML = '<p class="text-gray-500">No results found.</p>';
    return;
  }

  results.forEach(result => {
    const card = document.createElement('div');
    card.className = 'bg-gray-50 p-4 rounded-lg shadow mb-4';

    // Handle null author and category gracefully.
    const authorText = result.author ? `- ${result.author}` : '- Unknown Author';
    const categoryText = result.category ? `Categories: ${result.category}` : 'Categories: N/A';

    const quoteEl = document.createElement('p');
    quoteEl.className = 'text-lg font-medium text-gray-800';
    quoteEl.textContent = `"${result.quote}"`;

    const authorEl = document.createElement('p');
    authorEl.className = 'text-right text-gray-600 italic mt-2';
    authorEl.textContent = authorText;

    const categoryEl = document.createElement('p');
    categoryEl.className = 'text-sm text-gray-500 mt-2';
    categoryEl.textContent = categoryText;

    card.append(quoteEl, authorEl, categoryEl);
    resultsDiv.appendChild(card);
  });
}
|
offline_processing.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import json
|
| 6 |
+
import struct
|
| 7 |
+
|
| 8 |
+
# 1. Constants
|
| 9 |
+
MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5'
|
| 10 |
+
EMBEDDING_DIM = 256
|
| 11 |
+
INPUT_CSV_PATH = 'data/quotes.csv'
|
| 12 |
+
OUTPUT_BINARY_PATH = 'data/quotes_index.bin'
|
| 13 |
+
BATCH_SIZE = 64 # Optimized batch size for GPU processing
|
| 14 |
+
|
| 15 |
+
# 2. Load Data
def load_quotes(file_path):
    """Loads quotes from a CSV file using pandas, skipping malformed lines.

    Args:
        file_path: Path to a headerless CSV with columns (quote, author, category).

    Returns:
        List of row dicts with keys 'quote', 'author', 'category' and
        'quote_for_embedding' (the quote with the nomic document prefix).
    """
    column_names = ['quote', 'author', 'category']

    # header=None because we provide names explicitly; lines that do not
    # parse into 3 columns are skipped rather than aborting the run.
    df = pd.read_csv(
        file_path,
        sep=',',
        quotechar='"',
        header=None,
        names=column_names,
        on_bad_lines='skip'
    )

    # Validation: drop rows whose category contains uppercase letters
    # (also drops non-string categories, e.g. NaN).
    initial_rows = len(df)
    df = df[df['category'].apply(lambda x: isinstance(x, str) and not any(c.isupper() for c in x))]
    filtered_rows = len(df)
    if initial_rows - filtered_rows > 0:
        print(f"Ignored {initial_rows - filtered_rows} rows due to uppercase letters in category.")

    # nomic-embed-text-v1.5 requires task prefixes: documents being indexed
    # must use "search_document: "; "search_query: " is reserved for the
    # query side (used in worker.js). The previous "search_query: " prefix
    # here mismatched the model's documented retrieval usage.
    df['quote_for_embedding'] = "search_document: " + df['quote']
    return df.to_dict('records')
|
| 42 |
+
|
| 43 |
+
# 3. Generate Embeddings
def generate_embeddings(quotes, model_name, embedding_dim):
    """Generates and truncates embeddings for a list of quotes."""
    texts = [q['quote_for_embedding'] for q in quotes]
    model = SentenceTransformer(model_name, trust_remote_code=True)
    # encode() picks up the GPU automatically when one is available.
    embeddings = model.encode(
        texts,
        convert_to_tensor=True,
        batch_size=BATCH_SIZE,
        show_progress_bar=True,  # useful feedback for long runs
    )
    # Keep only the first `embedding_dim` components of each vector.
    return embeddings[:, :embedding_dim].cpu().numpy()
|
| 57 |
+
|
| 58 |
+
# 4. Quantize Embeddings
def quantize_embeddings(embeddings):
    """Quantizes float32 embeddings to int8.

    Symmetric scaling: the largest absolute value maps to 127, so that
    original ~= quantized / scale on the client side.
    """
    peak = np.abs(embeddings).max()
    scale = 127.0 / peak if peak != 0 else 0
    scaled = np.clip(embeddings * scale, -127, 127)
    return scaled.astype(np.int8), scale
|
| 69 |
+
|
| 70 |
+
def main():
    """Run the offline pipeline: load, embed, quantize, package."""
    print("Starting offline processing...")

    quotes = load_quotes(INPUT_CSV_PATH)
    print(f"Loaded {len(quotes)} quotes.")

    print("Generating embeddings...")
    float_embeddings = generate_embeddings(quotes, MODEL_NAME, EMBEDDING_DIM)
    print(f"Generated float embeddings with shape: {float_embeddings.shape}")

    print("Quantizing embeddings...")
    quantized_embeddings, scale = quantize_embeddings(float_embeddings)
    print(f"Quantized embeddings with shape: {quantized_embeddings.shape}")
    print(f"Quantization scale factor: {scale}")

    # Metadata rows shipped alongside the vectors. NaN values (from empty
    # CSV cells) become None so json.dumps emits null.
    metadata = []
    for q in quotes:
        row = {"quote": q["quote"], "author": q["author"], "category": q["category"]}
        for key, value in row.items():
            if isinstance(value, float) and np.isnan(value):
                row[key] = None
        metadata.append(row)

    metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode('utf-8')

    # Binary layout (little-endian), mirrored by the parser in worker.js:
    #   uint32 quote count, uint16 embedding dim, float32 scale,
    #   uint32 metadata byte length, metadata JSON, int8 embeddings.
    print("Packaging data into binary file...")
    with open(OUTPUT_BINARY_PATH, 'wb') as f:
        f.write(struct.pack('<I', len(quotes)))
        f.write(struct.pack('<H', EMBEDDING_DIM))
        f.write(struct.pack('<f', scale))
        f.write(struct.pack('<I', len(metadata_bytes)))
        f.write(metadata_bytes)
        f.write(quantized_embeddings.tobytes())

    print(f"Offline processing complete. Index file saved to {OUTPUT_BINARY_PATH}")

if __name__ == "__main__":
    main()
|
transformers.min.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
worker.js
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
// 1. Import transformers.js
import { pipeline, env } from './transformers.min.js';

// 2. Constants
env.allowLocalModels = true;

const MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5';
const EMBEDDING_DIM = 256;
const INDEX_PATH = 'data/quotes_index.bin';
// Observed download size of the model, used only for the size estimate
// reported to the page.
const APPROX_MODEL_SIZE_MB = 180;

const DB_NAME = 'QuoteSearchDB';
const DB_VERSION = 1;
const STORE_NAME = 'quoteIndex';

// Lazily-initialized worker state.
let model;
let indexData;
let isReady = false;
|
| 19 |
+
|
| 20 |
+
// IndexedDB Helper Functions

/**
 * Open (creating on first use) the IndexedDB database.
 * @returns {Promise<IDBDatabase>}
 */
function openDB() {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);

    request.onupgradeneeded = (event) => {
      const db = event.target.result;
      db.createObjectStore(STORE_NAME, { keyPath: 'id' });
    };

    request.onsuccess = (event) => {
      resolve(event.target.result);
    };

    request.onerror = (event) => {
      // IDBRequest exposes the failure as `error` (a DOMException);
      // `errorCode` does not exist on modern IDBRequest, so the previous
      // message always rendered "undefined". Reject with an Error so
      // callers that read `error.message` get useful text.
      reject(new Error('IndexedDB error: ' + event.target.error));
    };
  });
}
|
| 39 |
+
|
| 40 |
+
/**
 * Read a value by key from the object store.
 * @param {string} key
 * @returns {Promise<*>} the stored value, or null when the key is absent.
 */
async function getFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readonly');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.get(key);

    request.onsuccess = () => {
      resolve(request.result ? request.result.value : null);
    };

    request.onerror = () => {
      // Reject with an Error, not a bare string: callers (loadIndex,
      // onmessage) read `error.message`, which is undefined on strings.
      reject(new Error('Error getting data from DB'));
    };
  });
}
|
| 56 |
+
|
| 57 |
+
/**
 * Write (insert or overwrite) a value under the given key.
 * @param {string} key
 * @param {*} value - Must be structured-cloneable.
 * @returns {Promise<void>}
 */
async function putInDB(key, value) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readwrite');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.put({ id: key, value: value });

    request.onsuccess = () => {
      resolve();
    };

    request.onerror = () => {
      // Reject with an Error, not a bare string, so `error.message` works
      // in the callers' catch blocks.
      reject(new Error('Error putting data in DB'));
    };
  });
}
|
| 73 |
+
|
| 74 |
+
/**
 * Delete the record stored under the given key (no-op if absent).
 * @param {string} key
 * @returns {Promise<void>}
 */
async function deleteFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readwrite');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.delete(key);

    request.onsuccess = () => {
      resolve();
    };

    request.onerror = () => {
      // Reject with an Error, not a bare string, so `error.message` works
      // in the callers' catch blocks.
      reject(new Error('Error deleting data from DB'));
    };
  });
}
|
| 90 |
+
|
| 91 |
+
// 3. Load Model and Index

/** Download/initialize the feature-extraction pipeline, reporting progress. */
async function loadModel() {
  try {
    self.postMessage({ type: 'loading', payload: 'Loading model...' });

    const onProgress = (progress) => {
      const detail = `Model ${progress.status}: ${progress.file || ''} ${Math.floor(progress.progress || 0)}%`;
      self.postMessage({ type: 'progress', payload: { ...progress, detail } });
    };

    model = await pipeline('feature-extraction', MODEL_NAME, { progress_callback: onProgress });
  } catch (error) {
    console.error('Error loading model:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error; // stop callers from continuing without a model
  }
}
|
| 109 |
+
|
| 110 |
+
/**
 * Load the quote index: from IndexedDB when cached, otherwise by downloading
 * and parsing the packed binary file (and caching the parsed result).
 *
 * Binary layout written by offline_processing.py (little-endian):
 *   uint32 quote count, uint16 embedding dim, float32 scale,
 *   uint32 metadata byte length, metadata JSON, int8 embeddings.
 */
async function loadIndex() {
  try {
    self.postMessage({ type: 'loading', payload: 'Checking cached index...' });
    const cachedIndex = await getFromDB('quoteIndexData');

    if (cachedIndex) {
      indexData = cachedIndex;
      self.postMessage({ type: 'loading', payload: 'Index loaded from cache.' });
      return;
    }

    self.postMessage({ type: 'loading', payload: 'Loading index...' });

    // Stream the download so we can report progress.
    const response = await fetch(INDEX_PATH);
    const total = parseInt(response.headers.get('Content-Length'), 10);
    let loaded = 0;

    const reader = response.body.getReader();
    const chunks = [];
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }
      chunks.push(value);
      loaded += value.length;
      self.postMessage({
        type: 'progress',
        payload: {
          status: 'Downloading Index',
          progress: (loaded / total) * 100,
          file: INDEX_PATH,
          detail: `Downloading index: ${Math.floor((loaded / total) * 100)}% (${(loaded / (1024 * 1024)).toFixed(2)}MB / ${(total / (1024 * 1024)).toFixed(2)}MB)`
        }
      });
    }

    const buffer = await new Response(new Blob(chunks)).arrayBuffer();

    // Parse the header with an explicitly little-endian DataView. The file
    // is written with struct.pack('<...'); raw typed-array views use the
    // host's byte order and would silently mis-parse on big-endian hosts.
    const view = new DataView(buffer);
    let offset = 0;
    const numQuotes = view.getUint32(offset, true);
    offset += 4;
    const embeddingDim = view.getUint16(offset, true);
    offset += 2;
    const scale = view.getFloat32(offset, true);
    offset += 4;
    const metadataSize = view.getUint32(offset, true);
    offset += 4;

    const metadataBytes = buffer.slice(offset, offset + metadataSize);
    offset += metadataSize;
    const metadata = JSON.parse(new TextDecoder().decode(metadataBytes));

    const quantizedEmbeddings = new Int8Array(buffer.slice(offset));

    // De-quantize int8 -> float32, posting a progress update roughly every 1%.
    const embeddings = new Float32Array(quantizedEmbeddings.length);
    const totalEmbeddings = quantizedEmbeddings.length;
    const updateInterval = Math.floor(totalEmbeddings / 100);

    for (let i = 0; i < totalEmbeddings; i++) {
      embeddings[i] = quantizedEmbeddings[i] / scale;
      if (updateInterval > 0 && i % updateInterval === 0) {
        self.postMessage({
          type: 'progress',
          payload: {
            status: 'De-quantizing',
            progress: (i / totalEmbeddings) * 100,
            file: INDEX_PATH,
            detail: `De-quantizing embeddings: ${Math.floor((i / totalEmbeddings) * 100)}%`
          }
        });
      }
    }

    indexData = {
      metadata,
      embeddings: reshape(embeddings, [numQuotes, embeddingDim]),
      // Kept so getIndexSize can report the cached footprint.
      embeddingsByteLength: quantizedEmbeddings.byteLength
    };
    await putInDB('quoteIndexData', indexData); // cache the parsed index
  } catch (error) {
    console.error('Error loading index:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error;
  }
}
|
| 196 |
+
|
| 197 |
+
/** One-shot initialization: load the index, then signal readiness to the page. */
async function load() {
  try {
    await loadIndex();
    isReady = true;
    self.postMessage({ type: 'ready' });
  } catch (error) {
    console.error('Error in worker load function:', error);
    self.postMessage({ type: 'error', payload: error.message });
  }
}
|
| 207 |
+
|
| 208 |
+
// 4. Listen for Messages
self.onmessage = async (event) => {
  const { type, payload } = event.data;

  switch (type) {
    case 'search': {
      // Lazily pull in whatever is still missing before running the query.
      if (!indexData) {
        self.postMessage({ type: 'loading', payload: 'Loading index before search...' });
        await loadIndex();
      }
      if (!model) {
        self.postMessage({ type: 'loading', payload: 'Loading model before search...' });
        await loadModel();
      }
      self.postMessage({ type: 'loading', payload: 'Searching...' });
      const results = await search(payload);
      self.postMessage({ type: 'results', payload: results });
      break;
    }
    case 'deleteData': {
      self.postMessage({ type: 'loading', payload: 'Deleting cached data...' });
      try {
        await deleteFromDB('quoteIndexData');

        // Best-effort: also drop any Cache Storage entries that look like
        // the transformers.js model cache.
        const cacheNames = await caches.keys();
        for (const cacheName of cacheNames) {
          const looksLikeModelCache =
            cacheName.startsWith('transformers-cache') ||
            cacheName.includes(MODEL_NAME.replace(/\//g, '-'));
          if (looksLikeModelCache) {
            await caches.delete(cacheName);
          }
        }

        indexData = null; // clear in-memory data
        isReady = false;  // not ready until reloaded
        self.postMessage({ type: 'dataDeleted', payload: 'Cached data deleted successfully.' });
      } catch (error) {
        self.postMessage({ type: 'error', payload: 'Failed to delete cached data.' });
      }
      break;
    }
    case 'getIndexSize': {
      try {
        let totalSize = 0;
        let indexCached = false;

        // Cached index: report its actual footprint; otherwise ask the
        // server for the download size via a HEAD request.
        const cachedIndex = await getFromDB('quoteIndexData');
        if (cachedIndex) {
          totalSize += JSON.stringify(cachedIndex.metadata).length + cachedIndex.embeddingsByteLength;
          indexCached = true;
        } else {
          const response = await fetch(INDEX_PATH, { method: 'HEAD' });
          totalSize += parseInt(response.headers.get('Content-Length'), 10);
        }

        // The model size is an estimate; heuristically treat the model as
        // cached once the pipeline object exists in this worker.
        totalSize += APPROX_MODEL_SIZE_MB * 1024 * 1024;
        const modelCached = Boolean(model);

        self.postMessage({ type: 'indexSize', payload: { size: totalSize, indexCached: indexCached, modelCached: modelCached } });
      } catch (error) {
        self.postMessage({ type: 'error', payload: 'Failed to get index size: ' + error.message });
      }
      break;
    }
  }
};
|
| 272 |
+
|
| 273 |
+
// 5. Search Function

/** Embed the query and return the 20 nearest quotes by cosine similarity. */
async function search(query) {
  if (!model || !indexData) {
    return [];
  }

  // Embed with the nomic query-side prefix, then truncate to the index's
  // dimensionality.
  const queryEmbedding = await model("search_query: " + query, { pooling: 'mean', normalize: true });
  const queryVec = queryEmbedding.data.slice(0, EMBEDDING_DIM);

  // Score every stored embedding against the query.
  const scored = indexData.embeddings.map((row, index) => ({
    index,
    similarity: cosineSimilarity(queryVec, row),
  }));

  // Highest similarity first; return the metadata of the best 20.
  scored.sort((a, b) => b.similarity - a.similarity);
  return scored.slice(0, 20).map((item) => indexData.metadata[item.index]);
}
|
| 300 |
+
|
| 301 |
+
// 6. Helper Functions

/** Cosine similarity of two equal-length numeric vectors. */
function cosineSimilarity(vecA, vecB) {
  let dot = 0;
  let magA = 0;
  let magB = 0;
  const n = vecA.length;
  for (let i = 0; i < n; i++) {
    const a = vecA[i];
    const b = vecB[i];
    dot += a * b;
    magA += a * a;
    magB += b * b;
  }
  return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}
|
| 313 |
+
|
| 314 |
+
/**
 * Split a flat array into shape[0] consecutive rows of length shape[1].
 * Works on plain arrays and typed arrays (slice returns the same kind).
 */
function reshape(array, shape) {
  const [rows, cols] = shape;
  const out = [];
  for (let r = 0; r < rows; r++) {
    out.push(array.slice(r * cols, (r + 1) * cols));
  }
  return out;
}
|