W / scripts /model-identity-test.js
Ac66's picture
Upload folder using huggingface_hub
2b64d42 verified
#!/usr/bin/env node
/**
* Model identity & quality test β€” verifies every model responds with
* correct identity (not "Cascade") and passes basic knowledge checks.
* Waits on rate limits automatically.
*
* Usage: node scripts/model-identity-test.js [--base-url http://...] [--api-key sk-...]
*/
import http from 'http';
import https from 'https';
import { writeFileSync, mkdirSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
const LOG_DIR = join(__dirname, '..', 'logs');
mkdirSync(LOG_DIR, { recursive: true });
const args = process.argv.slice(2);
function getArg(name, fb) { const i = args.indexOf(`--${name}`); return i !== -1 && args[i+1] ? args[i+1] : fb; }
const BASE = getArg('base-url', 'http://localhost:8996');
const KEY = getArg('api-key', 'sk-yebainb666sblzhsqjcnmb----12312312');
const MODELS = [
'gemini-2.5-flash', 'gemini-3.0-flash', 'gpt-4o', 'gpt-5',
'claude-4.5-sonnet', 'claude-sonnet-4.6', 'claude-opus-4.6',
'glm-5', 'grok-3', 'kimi-k2.5', 'swe-1.5',
];
const TESTS = [
{ name: 'identity', prompt: 'What model are you? Who developed you? Answer in exactly one sentence.', check: (r, model) => {
const low = r.toLowerCase();
const bad = low.includes('cascade') || low.includes('codeium') || low.includes('windsurf');
const hasModel = low.includes(model.split('-')[0]);
return { pass: !bad && hasModel, bad: bad ? 'says Cascade/Codeium/Windsurf' : (!hasModel ? 'missing model name' : null) };
}},
{ name: 'knowledge', prompt: 'What is the capital of France? Answer in one word.', check: (r) => {
return { pass: r.toLowerCase().includes('paris'), bad: r.toLowerCase().includes('paris') ? null : 'wrong answer' };
}},
{ name: 'math', prompt: 'What is 17 * 23? Answer with just the number.', check: (r) => {
return { pass: r.includes('391'), bad: r.includes('391') ? null : 'wrong math' };
}},
{ name: 'coding', prompt: 'Write a Python function that returns the sum of a list. Output ONLY the function, no explanation.', check: (r) => {
return { pass: r.includes('def ') && r.includes('sum'), bad: null };
}},
];
function chat(model, prompt) {
return new Promise((resolve, reject) => {
const url = new URL('/v1/chat/completions', BASE);
const mod = url.protocol === 'https:' ? https : http;
const body = JSON.stringify({ model, messages: [{ role: 'user', content: prompt }], stream: false });
const req = mod.request(url, {
method: 'POST',
headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${KEY}` },
}, (res) => {
const chunks = [];
res.on('data', c => chunks.push(c));
res.on('end', () => {
try {
const d = JSON.parse(Buffer.concat(chunks).toString());
const content = d.choices?.[0]?.message?.content || '';
const error = d.error?.message || '';
const retryAfter = d.error?.retry_after_ms || 0;
resolve({ status: res.statusCode, content, error, retryAfter, retryHeader: res.headers['retry-after'] });
} catch (e) { reject(e); }
});
});
req.on('error', reject);
setTimeout(() => { req.destroy(); reject(new Error('timeout')); }, 60000);
req.write(body);
req.end();
});
}
function sleep(ms) { return new Promise(r => setTimeout(r, ms)); }
async function testModel(model) {
const results = [];
for (const test of TESTS) {
let attempt = 0;
while (attempt < 5) {
attempt++;
try {
const r = await chat(model, test.prompt);
if (r.status === 429 || r.error.includes('ι™εˆΆ') || r.error.includes('rate limit')) {
const waitSec = parseInt(r.retryHeader || '0') || Math.ceil((r.retryAfter || 60000) / 1000);
console.log(` ⏳ ${model}/${test.name}: rate limited, waiting ${waitSec}s...`);
await sleep(waitSec * 1000 + 2000);
continue;
}
if (r.status !== 200 || !r.content) {
results.push({ test: test.name, pass: false, reason: r.error || `status=${r.status} empty`, content: '' });
break;
}
const check = test.check(r.content, model);
results.push({ test: test.name, pass: check.pass, reason: check.bad, content: r.content.slice(0, 150) });
break;
} catch (e) {
if (attempt >= 5) results.push({ test: test.name, pass: false, reason: e.message, content: '' });
else { console.log(` ⚠ ${model}/${test.name}: ${e.message}, retry ${attempt}/5`); await sleep(3000); }
}
}
}
return results;
}
async function main() {
console.log(`\n Model Identity & Quality Test`);
console.log(` Base: ${BASE} Models: ${MODELS.length}\n`);
const report = [];
for (const model of MODELS) {
console.log(` β–Έ ${model}`);
const results = await testModel(model);
const passed = results.filter(r => r.pass).length;
const total = results.length;
const icon = passed === total ? 'βœ“' : passed > 0 ? 'β–³' : 'βœ—';
console.log(` ${icon} ${passed}/${total} passed`);
for (const r of results) {
if (!r.pass) console.log(` βœ— ${r.test}: ${r.reason || 'failed'}`);
}
report.push({ model, passed, total, results });
}
console.log(`\n ── Summary ──`);
let totalPass = 0, totalTests = 0;
for (const r of report) {
const icon = r.passed === r.total ? 'βœ“' : 'βœ—';
console.log(` ${icon} ${r.model.padEnd(22)} ${r.passed}/${r.total}`);
totalPass += r.passed;
totalTests += r.total;
}
console.log(`\n Total: ${totalPass}/${totalTests} (${Math.round(totalPass/totalTests*100)}%)\n`);
const logFile = join(LOG_DIR, `identity-test-${new Date().toISOString().replace(/[:.]/g, '-')}.json`);
writeFileSync(logFile, JSON.stringify(report, null, 2));
console.log(` Report: ${logFile}\n`);
}
main().catch(e => { console.error('Fatal:', e.message); process.exit(1); });