File size: 5,977 Bytes
2b64d42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | #!/usr/bin/env node
/**
* Model identity & quality test — verifies every model responds with
* correct identity (not "Cascade") and passes basic knowledge checks.
* Waits on rate limits automatically.
*
* Usage: node scripts/model-identity-test.js [--base-url http://...] [--api-key sk-...]
*/
import http from 'http';
import https from 'https';
import { writeFileSync, mkdirSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Reports are written to a `logs` directory one level above this script's folder.
const LOG_DIR = join(__dirname, '..', 'logs');
// Create the directory at load time; `recursive: true` makes this a no-op when it already exists.
mkdirSync(LOG_DIR, { recursive: true });
// Command-line tokens after `node <script>`.
const args = process.argv.slice(2);

/**
 * Look up a command-line option given as `--name value` or `--name=value`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {string} fb - Fallback returned when the option is absent or has no value.
 * @param {string[]} [argv=args] - Tokens to search; defaults to this process's arguments.
 * @returns {string} The option value, or `fb`.
 */
function getArg(name, fb, argv = args) {
  const flag = `--${name}`;
  const inlinePrefix = `${flag}=`;
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token.startsWith(inlinePrefix)) {
      // `--name=` with nothing after the equals sign counts as "no value".
      return token.slice(inlinePrefix.length) || fb;
    }
    if (token === flag) {
      const next = argv[i + 1];
      // A following `--other` token is the next option, not this option's value.
      return next && !next.startsWith('--') ? next : fb;
    }
  }
  return fb;
}

// Base URL that request paths are resolved against; override with --base-url.
const BASE = getArg('base-url', 'http://localhost:8996');
// Bearer token sent in the Authorization header; override with --api-key.
// NOTE(review): the fallback is a key-shaped literal committed to source — confirm it is a non-secret local default.
const KEY = getArg('api-key', 'sk-yebainb666sblzhsqjcnmb----12312312');
// Model ids exercised by the suite, in the order they are run.
const MODELS = [
  'gemini-2.5-flash',
  'gemini-3.0-flash',
  'gpt-4o',
  'gpt-5',
  'claude-4.5-sonnet',
  'claude-sonnet-4.6',
  'claude-opus-4.6',
  'glm-5',
  'grok-3',
  'kimi-k2.5',
  'swe-1.5',
];
/**
 * Prompt/validator pairs.
 *
 * Each entry has a `name`, the `prompt` text, and a `check(r, model)` function
 * that receives the reply text (and, for the identity test, the model id) and
 * returns `{ pass, bad }`, where `bad` is a short failure reason or `null`
 * when the check passed.
 */
const TESTS = [
  {
    name: 'identity',
    prompt: 'What model are you? Who developed you? Answer in exactly one sentence.',
    check: (r, model) => {
      const low = r.toLowerCase();
      const bad = low.includes('cascade') || low.includes('codeium') || low.includes('windsurf');
      // Only the family prefix (the text before the first '-') has to appear in the reply.
      const family = model.split('-')[0].toLowerCase();
      const hasModel = low.includes(family);
      let reason = null;
      if (bad) {
        reason = 'says Cascade/Codeium/Windsurf';
      } else if (!hasModel) {
        reason = 'missing model name';
      }
      return { pass: !bad && hasModel, bad: reason };
    },
  },
  {
    name: 'knowledge',
    prompt: 'What is the capital of France? Answer in one word.',
    check: (r) => {
      const pass = r.toLowerCase().includes('paris');
      return { pass, bad: pass ? null : 'wrong answer' };
    },
  },
  {
    name: 'math',
    prompt: 'What is 17 * 23? Answer with just the number.',
    check: (r) => {
      // Require 391 as a standalone number so that e.g. "1391" or "3910" does not pass.
      const pass = /(?<!\d)391(?!\d)/.test(r);
      return { pass, bad: pass ? null : 'wrong math' };
    },
  },
  {
    name: 'coding',
    prompt: 'Write a Python function that returns the sum of a list. Output ONLY the function, no explanation.',
    check: (r) => {
      const pass = r.includes('def ') && r.includes('sum');
      // Give a reason on failure, like the other checks do.
      return { pass, bad: pass ? null : 'no Python sum function found' };
    },
  },
];
/**
 * POST a single-turn, non-streaming request to `/v1/chat/completions` under
 * BASE and condense the JSON reply.
 *
 * @param {string} model - Model id placed in the request body.
 * @param {string} prompt - Text sent as the only user message.
 * @returns {Promise<{status: number, content: string, error: string,
 *   retryAfter: number, retryHeader: (string|undefined)}>} Resolves for any
 *   HTTP status as long as the body is valid JSON. Rejects on a socket or
 *   response-stream error, on a body that is not JSON, or with
 *   Error('timeout') after 60 seconds.
 */
function chat(model, prompt) {
  return new Promise((resolve, reject) => {
    const url = new URL('/v1/chat/completions', BASE);
    const mod = url.protocol === 'https:' ? https : http;
    const body = JSON.stringify({ model, messages: [{ role: 'user', content: prompt }], stream: false });

    let timer = null;
    // Cancel the watchdog whenever the request settles; a timer left pending
    // would keep the process alive for up to 60 s after the last request.
    const finish = (settle, value) => {
      clearTimeout(timer);
      settle(value);
    };

    const req = mod.request(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${KEY}` },
    }, (res) => {
      const chunks = [];
      res.on('data', (c) => chunks.push(c));
      res.on('error', (err) => finish(reject, err));
      res.on('end', () => {
        let d;
        try {
          d = JSON.parse(Buffer.concat(chunks).toString());
        } catch (e) {
          // Include the status so a non-JSON reply (e.g. an HTML error page) is diagnosable.
          finish(reject, new Error(`invalid JSON response (status ${res.statusCode})`, { cause: e }));
          return;
        }
        const content = d?.choices?.[0]?.message?.content || '';
        // Accept both `{ error: { message } }` and a bare string `error` field.
        const error = (typeof d?.error === 'string' ? d.error : d?.error?.message) || '';
        const retryAfter = d?.error?.retry_after_ms || 0;
        finish(resolve, {
          status: res.statusCode,
          content,
          error,
          retryAfter,
          retryHeader: res.headers['retry-after'],
        });
      });
    });
    req.on('error', (err) => finish(reject, err));
    timer = setTimeout(() => {
      req.destroy();
      reject(new Error('timeout'));
    }, 60000);
    req.write(body);
    req.end();
  });
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Run every entry of TESTS against one model, retrying on rate limits and on
 * request errors.
 *
 * Each test gets up to five attempts; a rate-limited reply counts as an
 * attempt. Exactly one result is recorded per test, so the returned array
 * always has TESTS.length entries.
 *
 * @param {string} model - Model id passed to chat() and to each check.
 * @returns {Promise<Array<{test: string, pass: boolean, reason: (string|null), content: string}>>}
 *   One result per test; `content` holds at most the first 150 characters of the reply.
 */
async function testModel(model) {
  const MAX_ATTEMPTS = 5;
  const results = [];
  for (const test of TESTS) {
    const failure = (reason) => ({ test: test.name, pass: false, reason, content: '' });
    let outcome = null;
    for (let attempt = 1; outcome === null && attempt <= MAX_ATTEMPTS; attempt++) {
      const isLast = attempt === MAX_ATTEMPTS;
      try {
        const r = await chat(model, test.prompt);
        const errText = String(r.error ?? '');
        // '限制' ("limit") replaces a mis-encoded literal that could never match.
        const rateLimited = r.status === 429 || errText.includes('限制') || /rate limit/i.test(errText);
        if (rateLimited) {
          if (isLast) {
            // Record the exhaustion instead of silently dropping the test from the totals.
            outcome = failure(`rate limited after ${MAX_ATTEMPTS} attempts`);
          } else {
            // Prefer the Retry-After header (seconds); otherwise use the body's
            // retry_after_ms, and default to 60 s when neither is usable.
            const waitSec = Number.parseInt(r.retryHeader || '0', 10) || Math.ceil((r.retryAfter || 60000) / 1000);
            console.log(` β³ ${model}/${test.name}: rate limited, waiting ${waitSec}s...`);
            await sleep(waitSec * 1000 + 2000);
          }
        } else if (r.status !== 200 || !r.content) {
          outcome = failure(errText || `status=${r.status} empty`);
        } else {
          const check = test.check(r.content, model);
          outcome = { test: test.name, pass: check.pass, reason: check.bad, content: r.content.slice(0, 150) };
        }
      } catch (e) {
        if (isLast) {
          outcome = failure(e.message);
        } else {
          console.log(` β ${model}/${test.name}: ${e.message}, retry ${attempt}/${MAX_ATTEMPTS}`);
          await sleep(3000);
        }
      }
    }
    results.push(outcome);
  }
  return results;
}
/**
 * Run the suite over every entry of MODELS, print per-model and overall
 * results to the console, and write the full report as JSON into LOG_DIR.
 *
 * NOTE(review): the status glyphs inside the log strings look mis-encoded
 * (pass and fail render identically) — confirm against the original file.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`\n Model Identity & Quality Test`);
  console.log(` Base: ${BASE} Models: ${MODELS.length}\n`);
  const report = [];
  for (const model of MODELS) {
    console.log(` βΈ ${model}`);
    const results = await testModel(model);
    const passed = results.filter((r) => r.pass).length;
    const total = results.length;
    // A model with no recorded results must not be reported as fully passing.
    const icon = total > 0 && passed === total ? 'β' : passed > 0 ? 'β³' : 'β';
    console.log(` ${icon} ${passed}/${total} passed`);
    for (const r of results) {
      if (!r.pass) console.log(` β ${r.test}: ${r.reason || 'failed'}`);
    }
    report.push({ model, passed, total, results });
  }
  console.log(`\n ββ Summary ββ`);
  let totalPass = 0;
  let totalTests = 0;
  for (const r of report) {
    const icon = r.total > 0 && r.passed === r.total ? 'β' : 'β';
    console.log(` ${icon} ${r.model.padEnd(22)} ${r.passed}/${r.total}`);
    totalPass += r.passed;
    totalTests += r.total;
  }
  // Avoid printing "NaN%" when no results were recorded at all.
  const percent = totalTests > 0 ? Math.round(totalPass / totalTests * 100) : 0;
  console.log(`\n Total: ${totalPass}/${totalTests} (${percent}%)\n`);
  // ':' and '.' in the ISO timestamp are replaced with '-' in the file name.
  const logFile = join(LOG_DIR, `identity-test-${new Date().toISOString().replace(/[:.]/g, '-')}.json`);
  writeFileSync(logFile, JSON.stringify(report, null, 2));
  console.log(` Report: ${logFile}\n`);
}

// Any unhandled failure is reported and turns into a non-zero exit status.
main().catch(e => { console.error('Fatal:', e.message); process.exit(1); });
|