Spaces:
Running
Running
Update app.js
Browse files
app.js
CHANGED
|
@@ -1735,7 +1735,10 @@ SCOPE axl_meta [level: 0]
|
|
| 1735 |
END SCOPE
|
| 1736 |
`;
|
| 1737 |
|
| 1738 |
-
function buildPrompt(n, recentDomains,
|
|
|
|
|
|
|
|
|
|
| 1739 |
const recentStr = recentDomains.slice(-10).join(", ") || "none yet";
|
| 1740 |
return `${AXL_SPEC}
|
| 1741 |
|
|
@@ -1881,83 +1884,80 @@ app.get("/domains", (_, res) => res.json(tracker.stats()));
|
|
| 1881 |
app.post("/generate", async (req, res) => {
|
| 1882 |
const n = parseInt(req.query.n ?? req.body.n ?? "50");
|
| 1883 |
const rps = Math.min(parseFloat(req.query.rps ?? req.body.rps ?? DEFAULT_RPS.toString()), MAX_RPS);
|
| 1884 |
-
const
|
| 1885 |
const maxVars = parseInt(req.query.max_vars ?? req.body.max_vars ?? "20");
|
| 1886 |
|
| 1887 |
-
log(`Generation request: n=${n} rps=${rps} synthetic=${(
|
| 1888 |
|
| 1889 |
-
|
| 1890 |
-
let
|
| 1891 |
|
| 1892 |
-
// -- SYNTHETIC BATCH --
|
| 1893 |
const nSynthetic = Math.floor(n * syntheticFraction);
|
| 1894 |
if (nSynthetic > 0) {
|
| 1895 |
-
const gens
|
| 1896 |
-
const perGen
|
| 1897 |
-
|
| 1898 |
-
|
| 1899 |
-
for (const genFn of gens) {
|
| 1900 |
-
tempSynth.push(...genFn(perGen));
|
| 1901 |
-
}
|
| 1902 |
-
|
| 1903 |
-
// Shuffle slightly for mix, then slice to EXACTLY nSynthetic
|
| 1904 |
-
tempSynth.sort(() => 0.5 - Math.random());
|
| 1905 |
tempSynth = tempSynth.slice(0, nSynthetic);
|
| 1906 |
|
| 1907 |
for (const raw of tempSynth) {
|
| 1908 |
-
if (!validateSample(raw)) { errors++; continue; }
|
| 1909 |
const norm = normalizeSample(raw);
|
| 1910 |
-
if (
|
| 1911 |
-
|
| 1912 |
-
|
|
|
|
|
|
|
| 1913 |
}
|
| 1914 |
-
|
|
|
|
| 1915 |
}
|
| 1916 |
|
| 1917 |
-
// -- AI BATCH --
|
| 1918 |
-
const nAI
|
| 1919 |
const callsNeeded = Math.ceil(nAI / SAMPLES_PER_CALL);
|
| 1920 |
const minInterval = 1000 / rps;
|
| 1921 |
-
log(`AI generation: ${nAI} samples via ${callsNeeded} Bedrock calls`);
|
| 1922 |
|
| 1923 |
-
|
| 1924 |
-
|
| 1925 |
-
|
| 1926 |
-
|
| 1927 |
-
|
| 1928 |
-
|
| 1929 |
-
const
|
| 1930 |
-
|
| 1931 |
-
|
| 1932 |
-
|
| 1933 |
-
const
|
| 1934 |
-
|
| 1935 |
-
|
| 1936 |
-
|
| 1937 |
-
|
| 1938 |
-
|
| 1939 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1940 |
}
|
| 1941 |
-
|
| 1942 |
-
|
| 1943 |
-
|
| 1944 |
-
errors++;
|
| 1945 |
}
|
| 1946 |
-
|
| 1947 |
-
|
| 1948 |
-
if (elapsed < minInterval) await new Promise(r=>setTimeout(r, minInterval - elapsed));
|
| 1949 |
-
}
|
| 1950 |
-
|
| 1951 |
-
dataset.push(...generated);
|
| 1952 |
-
saveDataset();
|
| 1953 |
-
|
| 1954 |
-
log(`Done: ${generated.length} new | total=${dataset.length} | errors=${errors}`, "OK");
|
| 1955 |
-
res.json({
|
| 1956 |
-
generated: generated.length,
|
| 1957 |
-
errors,
|
| 1958 |
-
total_dataset: dataset.length,
|
| 1959 |
-
samples: generated,
|
| 1960 |
-
});
|
| 1961 |
});
|
| 1962 |
|
| 1963 |
app.get("/dataset/download", (req, res) => {
|
|
|
|
| 1735 |
END SCOPE
|
| 1736 |
`;
|
| 1737 |
|
| 1738 |
+
function buildPrompt(n, recentDomains,
|
| 1739 |
+
maxVars=8
|
| 1740 |
+
// maxVars=20
|
| 1741 |
+
) {
|
| 1742 |
const recentStr = recentDomains.slice(-10).join(", ") || "none yet";
|
| 1743 |
return `${AXL_SPEC}
|
| 1744 |
|
|
|
|
| 1884 |
/**
 * POST /generate — start a generation run made of a synthetic batch followed
 * by an AI (Bedrock) batch.
 *
 * Parameters (query string wins over JSON body):
 *   n                   total samples requested (default 50)
 *   rps                 Bedrock calls per second, capped at MAX_RPS (default DEFAULT_RPS)
 *   synthetic_fraction  share of `n` produced by SYNTHETIC_GENERATORS (default 0.4)
 *   max_vars            forwarded to buildPrompt (default 20)
 *
 * Response: `{ message, total_requested }` is sent as soon as the synthetic
 * batch has been saved; the AI batch keeps running afterwards and calls
 * saveDataset() after every Bedrock call that produced at least one valid
 * sample. Invalid `n` yields HTTP 400; a failure in the synthetic phase
 * yields HTTP 500.
 */
app.post("/generate", async (req, res) => {
  // `req.body` may be undefined when no body parser handled the request,
  // so it is read through optional chaining.
  const param = (name, fallback) => req.query[name] ?? req.body?.[name] ?? fallback;

  const n = Number.parseInt(param("n", "50"), 10);
  if (Number.isNaN(n) || n < 0) {
    res.status(400).json({ error: "n must be a non-negative integer" });
    return;
  }

  // A NaN, zero or negative rate would make `minInterval` NaN/Infinity/negative
  // and silently break throttling, so fall back to the default rate instead.
  const requestedRps = Number.parseFloat(param("rps", DEFAULT_RPS.toString()));
  const rps = Math.min(requestedRps > 0 ? requestedRps : DEFAULT_RPS, MAX_RPS);

  // Clamp to [0, 1] so the synthetic share can never exceed `n`.
  const requestedFraction = Number.parseFloat(param("synthetic_fraction", "0.4"));
  const synthFraction = Number.isFinite(requestedFraction)
    ? Math.min(Math.max(requestedFraction, 0), 1)
    : 0.4;

  const requestedMaxVars = Number.parseInt(param("max_vars", "20"), 10);
  const maxVars = Number.isNaN(requestedMaxVars) ? 20 : requestedMaxVars;

  log(`Generation request: n=${n} rps=${rps} synthetic=${(synthFraction * 100).toFixed(0)}%`, "HEAD");

  let batchErrors = 0;
  let newSamplesCount = 0;

  // -- 1. SYNTHETIC BATCH (saved before the response is sent) --
  // Was `n * syntheticFraction`: that identifier is not declared in this
  // handler, so the request threw a ReferenceError before responding.
  const nSynthetic = Math.floor(n * synthFraction);
  if (nSynthetic > 0) {
    try {
      const gens = Object.values(SYNTHETIC_GENERATORS);
      const perGen = Math.max(1, Math.ceil(nSynthetic / gens.length));
      const produced = [];
      for (const genFn of gens) {
        produced.push(...genFn(perGen));
      }

      // Generators are asked for a rounded-up share each; trim to the exact quota.
      for (const raw of produced.slice(0, nSynthetic)) {
        const norm = normalizeSample(raw);
        if (norm && validateSample(norm)) {
          dataset.push(norm);
          tracker.record(norm.domain);
          newSamplesCount++;
        } else {
          batchErrors++;
        }
      }
      saveDataset();
      log(`Synthetic batch committed to disk: ${newSamplesCount} samples`, "OK");
    } catch (e) {
      // Without this, a throw here would reject the async handler and (in
      // Express 4, which does not forward async rejections) leave the client
      // waiting with no response.
      log(`Synthetic batch failed: ${e.message}`, "ERROR");
      res.status(500).json({ error: "Synthetic generation failed" });
      return;
    }
  }

  // -- 2. AI BATCH (incremental save per call) --
  const nAI = n - nSynthetic;
  const callsNeeded = Math.ceil(nAI / SAMPLES_PER_CALL);
  const minInterval = 1000 / rps;

  // Respond now so the HTTP connection does not time out; the loop below
  // continues in the background and keeps saving as it goes.
  res.json({ message: "Generation started. Use /dataset/download to track progress.", total_requested: n });

  const runAiBatch = async () => {
    for (let ci = 0; ci < callsNeeded; ci++) {
      const t0 = Date.now();
      try {
        const prompt = buildPrompt(SAMPLES_PER_CALL, tracker.recent, maxVars);
        const text = await callBedrock(prompt);
        const parsed = parseAIResponse(text);

        let callValid = 0;
        for (const raw of parsed) {
          const norm = normalizeSample(raw);
          if (norm && validateSample(norm)) {
            norm.metadata = norm.metadata || {};
            norm.metadata.source = "ai";
            dataset.push(norm);
            tracker.record(norm.domain);
            callValid++;
            newSamplesCount++;
          } else {
            batchErrors++;
          }
        }

        // Save after every productive call so a later failure cannot lose
        // samples that were already generated.
        if (callValid > 0) {
          saveDataset();
          log(`Incremental Save: Call ${ci + 1}/${callsNeeded} committed (${callValid} samples). Total dataset: ${dataset.length}`, "OK");
        }
      } catch (e) {
        // One failed call is counted and logged; the remaining calls still run.
        log(`Bedrock call ${ci + 1} failed: ${e.message}`, "ERROR");
        batchErrors++;
      }

      // Throttle to at most `rps` calls per second.
      const elapsed = Date.now() - t0;
      if (elapsed < minInterval) {
        await new Promise((resolve) => setTimeout(resolve, minInterval - elapsed));
      }
    }
    log(`Batch Process Finished. New: ${newSamplesCount} | Errors: ${batchErrors}`, "HEAD");
  };

  // Intentionally not awaited (the response has already been sent); attach a
  // handler so an unexpected failure is logged rather than becoming an
  // unhandled promise rejection.
  runAiBatch().catch((e) => log(`AI batch aborted: ${e.message}`, "ERROR"));
});
|
| 1962 |
|
| 1963 |
app.get("/dataset/download", (req, res) => {
|