duckdb-wasm-cors-test / index.html
davanstrien's picture
davanstrien HF Staff
Upload folder using huggingface_hub
bde3673 verified
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>DuckDB WASM - HF Parquet CORS Test</title>
<style>
/* Page shell: a single centered column, capped at 800px. */
body { font-family: system-ui, sans-serif; max-width: 800px; margin: 2rem auto; padding: 0 1rem; }
h1 { font-size: 1.5rem; }
/* Result card: the script creates one .test element per URL tested. */
.test { border: 1px solid #ddd; border-radius: 8px; padding: 1rem; margin: 1rem 0; }
/* Card heading; break-all keeps long unbroken text from overflowing the card. */
.test h3 { margin-top: 0; font-size: 0.95rem; word-break: break-all; }
/* Card state modifiers: the script sets exactly one of these next to .test
   (pending while running, then success or failure). */
.pending { border-left: 4px solid #888; }
.success { border-left: 4px solid #22c55e; background: #f0fdf4; }
.failure { border-left: 4px solid #ef4444; background: #fef2f2; }
/* Outcome line inside a card ("Running...", "SUCCESS ...", "FAILED ..."). */
.status { font-weight: bold; }
/* URLs and error messages are shown in <pre>; scroll horizontally instead of wrapping. */
pre { background: #f1f5f9; padding: 0.5rem; border-radius: 4px; overflow-x: auto; font-size: 0.85rem; }
/* DuckDB initialization status line. */
#init-status { padding: 0.5rem; margin: 1rem 0; }
button { padding: 0.5rem 1rem; border-radius: 4px; border: 1px solid #ddd; cursor: pointer; margin: 0.25rem; }
button:hover { background: #f1f5f9; }
/* Blue info box at the top of the page (page origin and DuckDB configuration). */
.info { background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; padding: 1rem; margin: 1rem 0; font-size: 0.9rem; }
</style>
</head>
<body>
<h1>DuckDB WASM - HF Parquet CORS Test</h1>
<div class="info">
<strong>Page origin:</strong> <span id="page-origin"></span><br>
<strong>DuckDB WASM version:</strong> 1.30.0 (same as embedding-atlas)<br>
<strong>forceFullHTTPReads:</strong> true (same as embedding-atlas)
</div>
<!-- role="status" makes the asynchronously updated init message a polite live region. -->
<div id="init-status" role="status">Initializing DuckDB WASM...</div>
<div>
<!-- Explicit type="button": these trigger in-page actions and must never act as submit buttons. -->
<button type="button" onclick="runAllTests()">Run All Tests</button>
<button type="button" onclick="runCustomTest()">Test Custom URL</button>
<!-- A placeholder is not a label; give the field a real, visible one. -->
<label for="custom-url">Custom parquet URL:</label>
<input type="url" id="custom-url" placeholder="Enter parquet URL..." style="width: 400px; padding: 0.5rem; border: 1px solid #ddd; border-radius: 4px;">
</div>
<div id="tests"></div>
<script type="module">
import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.30.0/+esm';
// Display the page's own origin in the info box.
document.getElementById('page-origin').textContent = window.location.origin;
// Module-level DuckDB handles, assigned by initDuckDB(). `connection` doubles as the
// "already initialized" flag checked by runAllTests() and runCustomTest().
let db, connection;
// Test URLs - different patterns for HF parquet access.
// Each entry is rendered as one result card: `label` is the card heading and
// `url` is passed to both the fetch HEAD probe and DuckDB's read_parquet().
const TEST_URLS = [
  {
    label: "HF resolve URL (small dataset, no ?download=true)",
    url: "https://huggingface.co/datasets/nyu-mll/glue/resolve/refs%2Fconvert%2Fparquet/cola/test/0000.parquet"
  },
  {
    label: "HF resolve URL (small dataset, with ?download=true)",
    url: "https://huggingface.co/datasets/nyu-mll/glue/resolve/refs%2Fconvert%2Fparquet/cola/test/0000.parquet?download=true"
  },
  {
    // The datasets-server /rows endpoint is a JSON API, not a parquet file, so the
    // label must not call it a parquet URL. It is kept as a probe of that host;
    // read_parquet() is expected to reject the response body.
    label: "HF datasets-server /rows URL (JSON API, not a parquet file - read_parquet is expected to fail)",
    url: "https://datasets-server.huggingface.co/rows?dataset=nyu-mll/glue&config=cola&split=test&offset=0&length=10"
  },
];
// Promise for the in-flight (or successfully completed) initialization. Sharing it
// keeps the auto-init on page load and a button click that arrives before it has
// finished from each spinning up their own worker and database.
let duckdbInitPromise = null;

/**
 * Initialize DuckDB WASM and open the shared `connection`.
 *
 * Safe to call repeatedly: concurrent callers receive the same promise. After a
 * failed attempt the cached promise is cleared, so a later call retries.
 *
 * @returns {Promise<boolean>} true when `connection` is ready, false when setup
 *   failed (the failure is shown in #init-status and logged to the console).
 */
function initDuckDB() {
  if (duckdbInitPromise === null) {
    duckdbInitPromise = bootDuckDB().then((ok) => {
      if (!ok) duckdbInitPromise = null;
      return ok;
    });
  }
  return duckdbInitPromise;
}

/**
 * Perform one initialization attempt: pick a jsDelivr bundle, start the worker,
 * instantiate and open the database, and connect. Never rejects.
 *
 * @returns {Promise<boolean>} true on success, false on any failure.
 */
async function bootDuckDB() {
  const statusEl = document.getElementById('init-status');
  let workerUrl = null;
  let worker = null;
  try {
    const JSDELIVR_BUNDLES = duckdb.getJsDelivrBundles();
    const bundle = await duckdb.selectBundle(JSDELIVR_BUNDLES);
    // The worker script lives on another origin, so it is loaded through a
    // same-origin blob that importScripts() it.
    workerUrl = URL.createObjectURL(
      new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
    );
    worker = new Worker(workerUrl);
    const logger = new duckdb.ConsoleLogger();
    db = new duckdb.AsyncDuckDB(logger, worker);
    await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
    await db.open({
      filesystem: {
        forceFullHTTPReads: true, // Match embedding-atlas config
      },
    });
    connection = await db.connect();
    statusEl.innerHTML = '<span style="color: green; font-weight: bold;">DuckDB WASM initialized successfully</span>';
    return true;
  } catch (e) {
    // Do not leave a half-initialized worker running after a failed attempt.
    if (worker) worker.terminate();
    const reason = e && e.message ? e.message : String(e);
    // Build the message with textContent so error text is never parsed as HTML.
    const span = document.createElement('span');
    span.style.color = 'red';
    span.style.fontWeight = 'bold';
    span.textContent = `DuckDB init failed: ${reason}`;
    statusEl.textContent = '';
    statusEl.appendChild(span);
    console.error('DuckDB init error:', e);
    return false;
  } finally {
    // The blob URL is only needed while the worker starts; release it so it
    // does not stay allocated for the lifetime of the page.
    if (workerUrl) URL.revokeObjectURL(workerUrl);
  }
}
/** Escape text so it can be interpolated into an innerHTML template as inert content. */
function escapeHtml(value) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Send a CORS-mode HEAD request to `url` using the browser's fetch API. Never throws.
 *
 * @param {string} url - URL to probe.
 * @returns {Promise<{failed: boolean, detail: string}>} `failed` is true when fetch
 *   rejected; `detail` is then the error message, otherwise a status summary.
 */
async function probeHead(url) {
  try {
    const resp = await fetch(url, { method: 'HEAD', mode: 'cors' });
    // Access-Control-Allow-Origin is not a CORS-safelisted response header, so for
    // cross-origin responses it reads as null unless the server lists it in
    // Access-Control-Expose-Headers. A null here therefore does not mean the
    // header is missing; say so instead of reporting "not set".
    const corsHeader = resp.headers.get('access-control-allow-origin');
    return {
      failed: false,
      detail: `fetch HEAD: ${resp.status} ${resp.statusText}, CORS: ${corsHeader || 'not exposed to script (fetch itself succeeded)'}`,
    };
  } catch (fetchErr) {
    return { failed: true, detail: fetchErr.message };
  }
}

/**
 * Run one test against `url` and render the outcome as a card appended to #tests.
 *
 * The card starts as "pending", then becomes "success" (with the row count from
 * DuckDB's read_parquet) or "failure" (with the error message). Both outcomes also
 * show the result of a plain fetch HEAD probe for comparison.
 *
 * @param {string} url - URL to test; may be user-supplied via the custom URL field.
 * @param {string} label - Heading shown on the result card.
 * @returns {Promise<void>} Resolves once the card shows the final result; does not
 *   reject on test failure.
 */
async function testParquetLoad(url, label) {
  // `url` can come straight from the text input and error messages come from remote
  // responses, so everything interpolated into innerHTML below is escaped first.
  const safeLabel = escapeHtml(label);
  const safeUrl = escapeHtml(url);

  const testDiv = document.createElement('div');
  testDiv.className = 'test pending';
  testDiv.innerHTML = `<h3>${safeLabel}</h3><pre>${safeUrl}</pre><div class="status">Running...</div>`;
  document.getElementById('tests').appendChild(testDiv);

  const startTime = performance.now();
  const elapsedSeconds = () => ((performance.now() - startTime) / 1000).toFixed(2);

  // First, test basic fetch CORS (what the browser's fetch API sees). The probe is
  // done once and reused for both outcomes instead of being repeated on failure.
  const probe = await probeHead(url);

  try {
    // Now test DuckDB WASM read_parquet. Double any single quote so the URL cannot
    // terminate the SQL string literal it is embedded in.
    const sqlUrl = String(url).replace(/'/g, "''");
    const result = await connection.query(`SELECT COUNT(*) as cnt FROM read_parquet('${sqlUrl}')`);
    const count = result.get(0).cnt;
    const elapsed = elapsedSeconds();
    const fetchResult = probe.failed ? `fetch HEAD failed: ${probe.detail}` : probe.detail;
    testDiv.className = 'test success';
    testDiv.innerHTML = `
<h3>${safeLabel}</h3>
<pre>${safeUrl}</pre>
<div class="status" style="color: green;">SUCCESS - ${escapeHtml(count)} rows loaded in ${elapsed}s</div>
<div style="font-size: 0.85rem; color: #666; margin-top: 0.5rem;">${escapeHtml(fetchResult)}</div>
`;
  } catch (e) {
    const elapsed = elapsedSeconds();
    const fetchResult = probe.failed ? `fetch HEAD also failed: ${probe.detail}` : probe.detail;
    const reason = (e && e.message) || String(e);
    testDiv.className = 'test failure';
    testDiv.innerHTML = `
<h3>${safeLabel}</h3>
<pre>${safeUrl}</pre>
<div class="status" style="color: red;">FAILED after ${elapsed}s</div>
<pre style="color: red;">${escapeHtml(reason)}</pre>
<div style="font-size: 0.85rem; color: #666; margin-top: 0.5rem;">${escapeHtml(fetchResult)}</div>
`;
    console.error(`Test failed for ${url}:`, e);
  }
}
// "Run All Tests" button handler: wipe earlier result cards, make sure DuckDB is
// ready, then run every entry of TEST_URLS one after another (sequentially, so the
// cards appear in list order). Exposed on window because the button uses an inline
// onclick attribute and this script is a module.
window.runAllTests = async function() {
  const resultsContainer = document.getElementById('tests');
  resultsContainer.innerHTML = '';

  // Only initialize when no connection exists yet; give up if that fails.
  const ready = Boolean(connection) || await initDuckDB();
  if (!ready) return;

  for (const { url, label } of TEST_URLS) {
    await testParquetLoad(url, label);
  }
};
// "Test Custom URL" button handler: read the URL typed into #custom-url, make sure
// DuckDB is ready, and run a single test for it. Earlier result cards are kept.
window.runCustomTest = async function() {
  const customUrl = document.getElementById('custom-url').value.trim();
  if (customUrl === '') {
    alert('Enter a URL first');
    return;
  }

  // Only initialize when no connection exists yet; give up if that fails.
  const ready = Boolean(connection) || await initDuckDB();
  if (!ready) return;

  await testParquetLoad(customUrl, 'Custom URL');
};
// Auto-init: start DuckDB setup as soon as this module runs. The returned promise
// is not awaited; initDuckDB() catches its own errors and reports them in
// #init-status.
initDuckDB();
</script>
</body>
</html>