ruidiao commited on
Commit
1e32f6a
·
1 Parent(s): 785042b
Files changed (6) hide show
  1. README.md +39 -12
  2. index.html +32 -19
  3. main.js +123 -0
  4. offline_processing.py +122 -0
  5. transformers.min.js +0 -0
  6. worker.js +322 -0
README.md CHANGED
@@ -1,12 +1,39 @@
1
- ---
2
- title: ClientSideQuoteSearch
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: static
7
- pinned: false
8
- license: mit
9
- short_description: 'Client-side AI quote search: fast, private, no servers.'
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Client-Side Semantic Quote Retrieval Engine
2
+
3
+ This project implements a "Zero-Infrastructure" client-side semantic quote retrieval engine. All computationally intensive tasks, including vectorization, indexing, and model quantization, are handled offline. The search and retrieval operations then occur entirely within the user's browser, ensuring data privacy and minimal operational overhead.
4
+
5
+ ## Project Overview
6
+
7
+ The core idea is to pre-process a large dataset of quotes into a highly optimized, quantized vector index. This index, along with a small machine learning model, is then loaded by the client-side application. When a user enters a query, the application generates an embedding for the query and performs a fast Approximate Nearest Neighbor Search (ANNS) directly in the browser to find semantically similar quotes.
8
+
9
+ ## Data Source
10
+
11
+ The quotes data used in this project was sourced from:
12
+ [https://archive.org/details/quotes_20230625](https://archive.org/details/quotes_20230625)
13
+
14
+ ## Setup and Usage
15
+
16
+ 1. **Offline Data Processing**:
17
+ * Ensure you have Python and the necessary libraries (e.g., `pandas`, `numpy`, `torch`, `sentence-transformers`, `tqdm`) installed.
18
+ * Run the `offline_processing.py` script to generate the `quotes_index.bin` file. This file contains the pre-computed embeddings and metadata.
19
+ ```bash
20
+ python offline_processing.py
21
+ ```
22
+ * *Note: This step can be time-consuming for large datasets, especially the first time as the embedding model needs to be downloaded.*
23
+ * **Important:** The script now includes validation to ensure that categories in the CSV do not contain uppercase letters. Rows with invalid categories will be ignored.
24
+
25
+ 2. **Client-Side Application**:
26
+ * Open `index.html` in your web browser.
27
+ * The search input and button are immediately available.
28
+ * The application will first check for a cached index. If not found, it will display a message indicating a significant one-time download (for the model and index) which will occur on your first search.
29
+ * The `quotes_index.bin` is loaded (from cache or downloaded) and the necessary machine learning model (via `transformers.js`) is downloaded on demand during your first search.
30
+ * A progress bar with detailed status will be shown during downloads and processing, disappearing once complete.
31
+ * A "Delete Cached Data" button will appear when data is cached, allowing you to clear local storage.
32
+ * Enter your search queries to retrieve semantically similar quotes.
33
+
34
+ ## Technologies Used
35
+
36
+ * **Frontend**: HTML, CSS (Tailwind CSS), JavaScript
37
+ * **Offline Processing**: Python (pandas, numpy, torch, sentence-transformers)
38
+ * **Embedding Model**: nomic-ai/nomic-embed-text-v1.5
39
+ * **Client-Side ML**: transformers.js, Web Workers
index.html CHANGED
@@ -1,19 +1,32 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Client-Side Quote Search</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ </head>
9
+ <body class="bg-gray-100 flex items-center justify-center min-h-screen">
10
+ <div class="container mx-auto p-8 bg-white rounded-lg shadow-lg w-full max-w-2xl relative">
11
+ <div id="delete-button-container" class="flex justify-end mb-4 hidden">
12
+ <button id="delete-data-button" class="bg-red-500 text-white px-3 py-1 rounded-lg text-sm hover:bg-red-600 focus:outline-none focus:ring-2 focus:ring-red-500">Delete Cached Data</button>
13
+ </div>
14
+ <h1 class="text-3xl font-bold text-center text-gray-800 mb-4">Client-Side Quote Search</h1>
15
+
16
+ <div id="data-info" class="text-center text-gray-600 mb-2"></div>
17
+ <div id="status" class="text-center text-gray-600 mb-4"></div>
18
+
19
+ <div id="progress-container" class="w-full bg-gray-200 rounded-full h-4 mb-4 hidden">
20
+ <div id="progress-bar" class="bg-blue-500 h-4 rounded-full" style="width: 0%"></div>
21
+ </div>
22
+
23
+ <div class="flex gap-2 mb-4">
24
+ <textarea id="search-input" class="w-full p-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:outline-none" placeholder="Enter your query..."></textarea>
25
+ <button id="search-button" class="bg-blue-500 text-white px-4 py-2 rounded-lg hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-blue-500">Search</button>
26
+ </div>
27
+
28
+ <div id="results"></div>
29
+ </div>
30
+ <script src="main.js"></script>
31
+ </body>
32
+ </html>
main.js ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// --- Cached references to the UI elements this script drives ---
const statusDiv = document.getElementById('status');
const searchInput = document.getElementById('search-input');
const resultsDiv = document.getElementById('results');
const searchButton = document.getElementById('search-button');
const progressContainer = document.getElementById('progress-container');
const progressBar = document.getElementById('progress-bar');
const dataInfoDiv = document.getElementById('data-info');
const deleteDataButton = document.getElementById('delete-data-button');
const deleteButtonContainer = document.getElementById('delete-button-container');

// Background worker that owns the embedding model, the index, and all search work.
const worker = new Worker('worker.js', { type: 'module' });

// Initial state: the search input/button stay enabled so the user can type
// right away; the delete button stays hidden until we know data is cached.
deleteButtonContainer.classList.add('hidden');
17
+
18
// Format a byte count as megabytes with two decimal places (returns a string).
function formatBytesToMB(bytes) {
  const BYTES_PER_MB = 1024 * 1024;
  return (bytes / BYTES_PER_MB).toFixed(2);
}
22
+
23
// Route every message from the worker to the appropriate UI update.
worker.onmessage = (event) => {
  const { type, payload } = event.data;

  switch (type) {
    case 'loading':
      statusDiv.textContent = payload;
      progressContainer.classList.remove('hidden');
      break;

    case 'progress': {
      const pct = Math.round(payload.progress);
      progressBar.style.width = `${pct || 0}%`; // NaN/0 both render as 0%
      statusDiv.textContent = payload.detail;
      progressContainer.classList.remove('hidden');
      break;
    }

    case 'ready':
      // Index is loaded: clear transient UI and enable searching.
      statusDiv.textContent = '';
      progressContainer.classList.add('hidden');
      searchInput.disabled = false;
      searchButton.disabled = false;
      deleteButtonContainer.classList.remove('hidden');
      break;

    case 'results':
      progressContainer.classList.add('hidden');
      renderResults(payload);
      statusDiv.textContent = '';
      deleteButtonContainer.classList.remove('hidden');
      break;

    case 'indexSize': {
      const { size, indexCached, modelCached } = payload;
      if (indexCached || modelCached) {
        deleteButtonContainer.classList.remove('hidden');
      }
      if (indexCached) {
        dataInfoDiv.textContent = '';
        searchInput.disabled = false;
        searchButton.disabled = false;
        statusDiv.textContent = ''; // model loads lazily on first search if needed
      } else {
        dataInfoDiv.textContent = '';
        deleteButtonContainer.classList.add('hidden');
        searchInput.disabled = false;
        searchButton.disabled = false;
        statusDiv.textContent = `To enable search, a one-time download of approximately ${formatBytesToMB(size)} MB is required. This will happen when you perform your first search.`;
      }
      break;
    }

    case 'dataDeleted':
      statusDiv.textContent = payload;
      dataInfoDiv.textContent = '';
      deleteButtonContainer.classList.add('hidden');
      progressContainer.classList.add('hidden');
      // Re-probe cache state so the UI reflects the post-deletion situation.
      worker.postMessage({ type: 'getIndexSize' });
      break;

    case 'error':
      statusDiv.textContent = `Error: ${payload}`;
      progressContainer.classList.add('hidden');
      break;
  }
};
81
+
82
// Probe cache/index size as soon as the page loads.
worker.postMessage({ type: 'getIndexSize' });

// Dispatch a search to the worker; guard against an empty query.
searchButton.addEventListener('click', () => {
  const query = searchInput.value;
  if (!query) {
    resultsDiv.innerHTML = '<p class="text-gray-500">Please enter a query.</p>';
    return;
  }
  worker.postMessage({ type: 'search', payload: query });
  statusDiv.textContent = 'Searching...';
  resultsDiv.innerHTML = ''; // drop stale results while the new search runs
});

// Ask the worker to wipe IndexedDB + model caches after user confirmation.
deleteDataButton.addEventListener('click', () => {
  const confirmed = confirm('Are you sure you want to delete the cached data?');
  if (confirmed) {
    worker.postMessage({ type: 'deleteData' });
  }
});
101
+
102
// Render search results as cards.
//
// SECURITY FIX: the previous version interpolated quote/author/category into
// innerHTML, so any HTML/script markup inside a quote would execute (XSS).
// All untrusted text is now inserted via textContent.
function renderResults(results) {
  resultsDiv.innerHTML = '';
  if (results.length === 0) {
    resultsDiv.innerHTML = '<p class="text-gray-500">No results found.</p>';
    return;
  }

  results.forEach(result => {
    const card = document.createElement('div');
    card.className = 'bg-gray-50 p-4 rounded-lg shadow mb-4';

    const quoteP = document.createElement('p');
    quoteP.className = 'text-lg font-medium text-gray-800';
    quoteP.textContent = `"${result.quote}"`;

    // Handle null author and category gracefully.
    const authorP = document.createElement('p');
    authorP.className = 'text-right text-gray-600 italic mt-2';
    authorP.textContent = result.author ? `- ${result.author}` : '- Unknown Author';

    const categoryP = document.createElement('p');
    categoryP.className = 'text-sm text-gray-500 mt-2';
    categoryP.textContent = result.category ? `Categories: ${result.category}` : 'Categories: N/A';

    card.append(quoteP, authorP, categoryP);
    resultsDiv.appendChild(card);
  });
}
offline_processing.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ from sentence_transformers import SentenceTransformer
5
+ import json
6
+ import struct
7
+
8
# 1. Constants
MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5'  # sentence-transformers model id
EMBEDDING_DIM = 256  # embeddings are truncated to this many leading dimensions
INPUT_CSV_PATH = 'data/quotes.csv'  # raw quote,author,category rows
OUTPUT_BINARY_PATH = 'data/quotes_index.bin'  # packed index consumed by worker.js
BATCH_SIZE = 64  # Optimized batch size for GPU processing
14
+
15
# 2. Load Data
def load_quotes(file_path):
    """Load quotes from a CSV file, skipping malformed or invalid rows.

    Reads ``quote,author,category`` rows, skipping lines pandas cannot
    parse, dropping rows with a missing quote, and dropping rows whose
    category contains uppercase letters (treated as invalid per project
    policy).

    Returns a list of record dicts, each with an extra
    ``quote_for_embedding`` field carrying the task prefix the embedding
    model expects for corpus documents.
    """
    # Define column names explicitly; header=None because we provide them.
    column_names = ['quote', 'author', 'category']

    df = pd.read_csv(
        file_path,
        sep=',',
        quotechar='"',
        header=None,
        names=column_names,
        on_bad_lines='skip'  # Skip lines that pandas cannot parse into 3 columns
    )

    # A missing quote would make the string concatenation below raise
    # (str + NaN) and is useless for retrieval anyway.
    df = df.dropna(subset=['quote'])

    # Filter out rows where the category contains uppercase letters.
    initial_rows = len(df)
    df = df[df['category'].apply(lambda x: isinstance(x, str) and not any(c.isupper() for c in x))]
    filtered_rows = len(df)
    if initial_rows - filtered_rows > 0:
        print(f"Ignored {initial_rows - filtered_rows} rows due to uppercase letters in category.")

    # nomic-embed uses asymmetric prefixes: corpus documents must use
    # "search_document: ", while queries (added client-side in worker.js)
    # use "search_query: ". The previous "search_query: " prefix here was
    # the wrong side of that pair and degrades retrieval quality.
    df['quote_for_embedding'] = "search_document: " + df['quote']
    return df.to_dict('records')
42
+
43
# 3. Generate Embeddings
def generate_embeddings(quotes, model_name, embedding_dim):
    """Embed every quote and truncate each vector to ``embedding_dim``.

    ``quotes`` is a list of record dicts carrying a pre-prefixed
    ``quote_for_embedding`` field. Returns a numpy array of shape
    ``(len(quotes), embedding_dim)``.
    """
    model = SentenceTransformer(model_name, trust_remote_code=True)
    texts = [record['quote_for_embedding'] for record in quotes]
    # encode() uses the GPU automatically when one is available.
    embeddings = model.encode(
        texts,
        convert_to_tensor=True,
        batch_size=BATCH_SIZE,
        show_progress_bar=True,  # progress bar for long runs
    )
    # Keep only the leading `embedding_dim` components of each vector.
    return embeddings[:, :embedding_dim].cpu().numpy()
57
+
58
# 4. Quantize Embeddings
def quantize_embeddings(embeddings):
    """Symmetrically quantize float embeddings to int8.

    Returns ``(quantized, scale)`` where ``original ~= quantized / scale``.

    The scale maps the largest absolute value to 127. For degenerate
    all-zero input the scale now falls back to 1.0 instead of 0.0: the
    client (worker.js) de-quantizes with ``q / scale``, and a zero scale
    there would divide by zero and poison every embedding with NaN.
    """
    abs_max = np.abs(embeddings).max()
    scale = 127.0 / abs_max if abs_max != 0 else 1.0

    # Quantize and clip to the symmetric int8 range.
    quantized_embeddings = np.clip(embeddings * scale, -127, 127).astype(np.int8)

    return quantized_embeddings, scale
69
+
70
def main():
    """Run the offline pipeline: load -> embed -> quantize -> pack to binary."""
    print("Starting offline processing...")

    quotes = load_quotes(INPUT_CSV_PATH)
    print(f"Loaded {len(quotes)} quotes.")

    print("Generating embeddings...")
    float_embeddings = generate_embeddings(quotes, MODEL_NAME, EMBEDDING_DIM)
    print(f"Generated float embeddings with shape: {float_embeddings.shape}")

    print("Quantizing embeddings...")
    quantized_embeddings, scale = quantize_embeddings(float_embeddings)
    print(f"Quantized embeddings with shape: {quantized_embeddings.shape}")
    print(f"Quantization scale factor: {scale}")

    # Keep only the fields the client renders; map NaN (missing) values to
    # None so they serialize as JSON null.
    metadata = []
    for q in quotes:
        record = {}
        for key in ("quote", "author", "category"):
            value = q[key]
            if isinstance(value, float) and np.isnan(value):
                value = None
            record[key] = value
        metadata.append(record)

    metadata_bytes = json.dumps(metadata, separators=(",", ":")).encode('utf-8')

    # Binary layout (little-endian), mirrored by the parser in worker.js:
    #   uint32  number of quotes
    #   uint16  embedding dimension
    #   float32 quantization scale factor
    #   uint32  metadata byte length
    #   ...     metadata JSON (UTF-8)
    #   ...     int8 embeddings
    print("Packaging data into binary file...")
    with open(OUTPUT_BINARY_PATH, 'wb') as f:
        f.write(struct.pack('<I', len(quotes)))
        f.write(struct.pack('<H', EMBEDDING_DIM))
        f.write(struct.pack('<f', scale))
        f.write(struct.pack('<I', len(metadata_bytes)))
        f.write(metadata_bytes)
        f.write(quantized_embeddings.tobytes())

    print(f"Offline processing complete. Index file saved to {OUTPUT_BINARY_PATH}")


if __name__ == "__main__":
    main()
transformers.min.js ADDED
The diff for this file is too large to render. See raw diff
 
worker.js ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ // 1. Import transformers.js
3
+ import { pipeline, env } from './transformers.min.js';
4
+
5
// 2. Configuration
env.allowLocalModels = true;
const MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5';
const EMBEDDING_DIM = 256; // must match EMBEDDING_DIM in offline_processing.py
const INDEX_PATH = 'data/quotes_index.bin';
const APPROX_MODEL_SIZE_MB = 180; // Actual observed size of nomic-embed-text-v1.5 model

// IndexedDB identifiers for the cached index.
const DB_NAME = 'QuoteSearchDB';
const DB_VERSION = 1;
const STORE_NAME = 'quoteIndex';

// Lazily-initialized worker state.
let model;     // transformers.js feature-extraction pipeline
let indexData; // { metadata, embeddings, embeddingsByteLength }
let isReady = false;
19
+
20
// IndexedDB Helper Functions

// Open (creating on first use) the database that caches the quote index.
function openDB() {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);

    request.onupgradeneeded = (evt) => {
      evt.target.result.createObjectStore(STORE_NAME, { keyPath: 'id' });
    };
    request.onsuccess = (evt) => resolve(evt.target.result);
    request.onerror = (evt) => reject('IndexedDB error: ' + evt.target.errorCode);
  });
}
39
+
40
// Read a value by key from the cache store; resolves to null when absent.
async function getFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const store = db.transaction([STORE_NAME], 'readonly').objectStore(STORE_NAME);
    const request = store.get(key);
    request.onsuccess = () => resolve(request.result ? request.result.value : null);
    request.onerror = () => reject('Error getting data from DB');
  });
}
56
+
57
// Insert or overwrite a key/value pair in the cache store.
async function putInDB(key, value) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const store = db.transaction([STORE_NAME], 'readwrite').objectStore(STORE_NAME);
    const request = store.put({ id: key, value });
    request.onsuccess = () => resolve();
    request.onerror = () => reject('Error putting data in DB');
  });
}
73
+
74
// Remove a key from the cache store (resolves even if the key is absent).
async function deleteFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const store = db.transaction([STORE_NAME], 'readwrite').objectStore(STORE_NAME);
    const request = store.delete(key);
    request.onsuccess = () => resolve();
    request.onerror = () => reject('Error deleting data from DB');
  });
}
90
+
91
// 3. Load Model and Index

// Download/initialize the transformers.js embedding pipeline, relaying
// download progress to the UI thread via 'progress' messages.
async function loadModel() {
  try {
    self.postMessage({ type: 'loading', payload: 'Loading model...' });

    const onProgress = (progress) => {
      const detail = `Model ${progress.status}: ${progress.file || ''} ${Math.floor(progress.progress || 0)}%`;
      self.postMessage({ type: 'progress', payload: { ...progress, detail } });
    };
    model = await pipeline('feature-extraction', MODEL_NAME, {
      progress_callback: onProgress,
    });
  } catch (error) {
    console.error('Error loading model:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error; // callers must not proceed without a model
  }
}
109
+
110
// Load the packed quote index: from IndexedDB when cached, otherwise by
// downloading quotes_index.bin, parsing its binary layout, de-quantizing the
// int8 embeddings, and caching the result.
//
// Fixes: previously a missing Content-Length header made `total` NaN, so the
// progress payload reported "NaN%"; and a failed fetch (404 etc.) was parsed
// as if it were index data. Both are now guarded.
async function loadIndex() {
  try {
    self.postMessage({ type: 'loading', payload: 'Checking cached index...' });
    const cachedIndex = await getFromDB('quoteIndexData');

    if (cachedIndex) {
      indexData = cachedIndex;
      self.postMessage({ type: 'loading', payload: 'Index loaded from cache.' });
    } else {
      self.postMessage({ type: 'loading', payload: 'Loading index...' });

      const response = await fetch(INDEX_PATH);
      if (!response.ok) {
        throw new Error(`Failed to fetch index: ${response.status} ${response.statusText}`);
      }
      // Content-Length may be absent (e.g. chunked encoding); fall back to 0
      // and report indeterminate progress instead of NaN.
      const total = parseInt(response.headers.get('Content-Length'), 10) || 0;
      let loaded = 0;

      const reader = response.body.getReader();
      const chunks = [];

      while (true) {
        const { done, value } = await reader.read();
        if (done) {
          break;
        }
        chunks.push(value);
        loaded += value.length;

        const pct = total > 0 ? (loaded / total) * 100 : 0;
        const loadedMB = (loaded / (1024 * 1024)).toFixed(2);
        const totalMB = total > 0 ? (total / (1024 * 1024)).toFixed(2) : '?';
        self.postMessage({
          type: 'progress',
          payload: {
            status: 'Downloading Index',
            progress: pct,
            file: INDEX_PATH,
            detail: `Downloading index: ${Math.floor(pct)}% (${loadedMB}MB / ${totalMB}MB)`
          }
        });
      }

      const buffer = await new Response(new Blob(chunks)).arrayBuffer();

      // Binary layout (little-endian), written by offline_processing.py:
      //   uint32 numQuotes | uint16 embeddingDim | float32 scale |
      //   uint32 metadataSize | metadata JSON | int8 embeddings
      let offset = 0;
      const numQuotes = new Uint32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      const embeddingDim = new Uint16Array(buffer.slice(offset, offset + 2))[0];
      offset += 2;
      const scale = new Float32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      const metadataSize = new Uint32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;

      const metadataBytes = buffer.slice(offset, offset + metadataSize);
      offset += metadataSize;
      const metadata = JSON.parse(new TextDecoder().decode(metadataBytes));

      const quantizedEmbeddings = new Int8Array(buffer.slice(offset));

      // De-quantize int8 -> float32 (value = q / scale), reporting progress
      // roughly every 1% so the UI stays responsive.
      const embeddings = new Float32Array(quantizedEmbeddings.length);
      const totalEmbeddings = quantizedEmbeddings.length;
      const updateInterval = Math.floor(totalEmbeddings / 100);

      for (let i = 0; i < totalEmbeddings; i++) {
        embeddings[i] = quantizedEmbeddings[i] / scale;
        if (updateInterval > 0 && i % updateInterval === 0) {
          self.postMessage({
            type: 'progress',
            payload: {
              status: 'De-quantizing',
              progress: (i / totalEmbeddings) * 100,
              file: INDEX_PATH,
              detail: `De-quantizing embeddings: ${Math.floor((i / totalEmbeddings) * 100)}%`
            }
          });
        }
      }
      indexData = {
        metadata,
        embeddings: reshape(embeddings, [numQuotes, embeddingDim]),
        embeddingsByteLength: quantizedEmbeddings.byteLength // used by getIndexSize
      };
      await putInDB('quoteIndexData', indexData); // cache for the next visit
    }
  } catch (error) {
    console.error('Error loading index:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error;
  }
}
196
+
197
// Eagerly load the index and notify the UI thread when the worker is ready.
async function load() {
  try {
    await loadIndex();
  } catch (error) {
    console.error('Error in worker load function:', error);
    self.postMessage({ type: 'error', payload: error.message });
    return;
  }
  isReady = true;
  self.postMessage({ type: 'ready' });
}
207
+
208
// 4. Listen for Messages
//
// Message protocol with main.js:
//   in : { type: 'search', payload: query } | { type: 'deleteData' } | { type: 'getIndexSize' }
//   out: 'loading' | 'progress' | 'ready' | 'results' | 'indexSize' | 'dataDeleted' | 'error'
//
// Fix: the 'search' branch awaited loadIndex()/loadModel(), both of which
// rethrow on failure; the async handler then rejected with no catch,
// producing an unhandled promise rejection in the worker. It is now wrapped
// in try/catch.
self.onmessage = async (event) => {
  const { type, payload } = event.data;

  if (type === 'search') {
    try {
      // Heavy assets load lazily on the first search.
      if (!indexData) {
        self.postMessage({ type: 'loading', payload: 'Loading index before search...' });
        await loadIndex();
      }
      if (!model) {
        self.postMessage({ type: 'loading', payload: 'Loading model before search...' });
        await loadModel();
      }
      self.postMessage({ type: 'loading', payload: 'Searching...' });
      const results = await search(payload);
      self.postMessage({ type: 'results', payload: results });
    } catch (error) {
      console.error('Search failed:', error);
      self.postMessage({ type: 'error', payload: error.message });
    }
  } else if (type === 'deleteData') {
    self.postMessage({ type: 'loading', payload: 'Deleting cached data...' });
    try {
      await deleteFromDB('quoteIndexData');

      // Attempt to clear transformers.js model cache
      const cacheNames = await caches.keys();
      for (const cacheName of cacheNames) {
        if (cacheName.startsWith('transformers-cache') || cacheName.includes(MODEL_NAME.replace(/\//g, '-'))) {
          await caches.delete(cacheName);
        }
      }
      indexData = null; // Clear in-memory data
      isReady = false; // Mark as not ready until reloaded
      self.postMessage({ type: 'dataDeleted', payload: 'Cached data deleted successfully.' });
    } catch (error) {
      self.postMessage({ type: 'error', payload: 'Failed to delete cached data.' });
    }
  } else if (type === 'getIndexSize') {
    try {
      let totalSize = 0;
      let indexCached = false;
      let modelCached = false;

      // Check if index is cached; otherwise HEAD-request its size.
      const cachedIndex = await getFromDB('quoteIndexData');
      if (cachedIndex) {
        totalSize += JSON.stringify(cachedIndex.metadata).length + cachedIndex.embeddingsByteLength;
        indexCached = true;
      } else {
        const response = await fetch(INDEX_PATH, { method: 'HEAD' });
        const contentLength = response.headers.get('Content-Length');
        totalSize += parseInt(contentLength, 10);
      }

      // Add approximate model size
      totalSize += APPROX_MODEL_SIZE_MB * 1024 * 1024; // Convert MB to bytes
      // Heuristic: assume model is cached if `model` object is initialized
      if (model) {
        modelCached = true;
      }

      self.postMessage({ type: 'indexSize', payload: { size: totalSize, indexCached: indexCached, modelCached: modelCached } });
    } catch (error) {
      self.postMessage({ type: 'error', payload: 'Failed to get index size: ' + error.message });
    }
  }
};
272
+
273
// 5. Search Function
// Embed the query, score it against every quote embedding with cosine
// similarity, and return metadata for the 20 best matches.
async function search(query) {
  if (!model || !indexData) {
    return [];
  }

  // nomic-embed expects the "search_query: " prefix on query text.
  const output = await model("search_query: " + query, { pooling: 'mean', normalize: true });
  const queryVec = output.data.slice(0, EMBEDDING_DIM);

  // Score every stored embedding against the query.
  const scored = indexData.embeddings.map((embedding, index) => ({
    index,
    similarity: cosineSimilarity(queryVec, embedding),
  }));

  // Highest similarity first.
  scored.sort((a, b) => b.similarity - a.similarity);

  return scored.slice(0, 20).map(({ index }) => indexData.metadata[index]);
}
300
+
301
// 6. Helper Functions

/**
 * Cosine similarity between two equal-length numeric vectors.
 * Fix: returns 0 (instead of NaN from 0/0) when either vector has zero
 * magnitude, so degenerate embeddings can never poison the result sort.
 */
function cosineSimilarity(vecA, vecB) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dotProduct / denom;
}
313
+
314
// Split a flat (typed or plain) array into shape[0] rows of shape[1] columns.
function reshape(array, shape) {
  const [rows, cols] = shape;
  return Array.from({ length: rows }, (_, r) =>
    array.slice(r * cols, r * cols + cols)
  );
}