Pepguy committed on
Commit
304ff12
·
verified ·
1 Parent(s): ce35ee7

Update app.js

Browse files
Files changed (1) hide show
  1. app.js +60 -60
app.js CHANGED
@@ -1735,7 +1735,10 @@ SCOPE axl_meta [level: 0]
1735
  END SCOPE
1736
  `;
1737
 
1738
- function buildPrompt(n, recentDomains, maxVars=20) {
 
 
 
1739
  const recentStr = recentDomains.slice(-10).join(", ") || "none yet";
1740
  return `${AXL_SPEC}
1741
 
@@ -1881,83 +1884,80 @@ app.get("/domains", (_, res) => res.json(tracker.stats()));
1881
  app.post("/generate", async (req, res) => {
1882
  const n = parseInt(req.query.n ?? req.body.n ?? "50");
1883
  const rps = Math.min(parseFloat(req.query.rps ?? req.body.rps ?? DEFAULT_RPS.toString()), MAX_RPS);
1884
- const syntheticFraction = parseFloat(req.query.synthetic_fraction ?? req.body.synthetic_fraction ?? "0.4");
1885
  const maxVars = parseInt(req.query.max_vars ?? req.body.max_vars ?? "20");
1886
 
1887
- log(`Generation request: n=${n} rps=${rps} synthetic=${(syntheticFraction*100).toFixed(0)}%`, "HEAD");
1888
 
1889
- const generated =[];
1890
- let errors = 0;
1891
 
1892
- // ── SYNTHETIC BATCH ──
1893
  const nSynthetic = Math.floor(n * syntheticFraction);
1894
  if (nSynthetic > 0) {
1895
- const gens = Object.values(SYNTHETIC_GENERATORS);
1896
- const perGen = Math.max(1, Math.ceil(nSynthetic / gens.length));
1897
-
1898
- let tempSynth =[];
1899
- for (const genFn of gens) {
1900
- tempSynth.push(...genFn(perGen));
1901
- }
1902
-
1903
- // Shuffle slightly for mix, then slice to EXACTLY nSynthetic
1904
- tempSynth.sort(() => 0.5 - Math.random());
1905
  tempSynth = tempSynth.slice(0, nSynthetic);
1906
 
1907
  for (const raw of tempSynth) {
1908
- if (!validateSample(raw)) { errors++; continue; }
1909
  const norm = normalizeSample(raw);
1910
- if (!norm) { errors++; continue; }
1911
- generated.push(norm);
1912
- tracker.record(norm.domain);
 
 
1913
  }
1914
- log(`Synthetic: ${generated.length} valid`, "OK");
 
1915
  }
1916
 
1917
- // ── AI BATCH ──
1918
- const nAI = n - generated.length;
1919
  const callsNeeded = Math.ceil(nAI / SAMPLES_PER_CALL);
1920
  const minInterval = 1000 / rps;
1921
- log(`AI generation: ${nAI} samples via ${callsNeeded} Bedrock calls`);
1922
 
1923
- for (let ci = 0; ci < callsNeeded; ci++) {
1924
- const t0 = Date.now();
1925
- try {
1926
- const batchN = Math.min(SAMPLES_PER_CALL, nAI - generated.filter(s=>s.metadata?.source==="ai").length + 2);
1927
- const prompt = buildPrompt(batchN, tracker.recent, maxVars);
1928
- const text = await callBedrock(prompt);
1929
- const parsed = parseAIResponse(text);
1930
- let valid = 0;
1931
- for (const raw of parsed) {
1932
- if (!validateSample(raw)) { errors++; continue; }
1933
- const norm = normalizeSample(raw);
1934
- if (!norm) { errors++; continue; }
1935
- norm.metadata = norm.metadata || {};
1936
- norm.metadata.source = "ai";
1937
- generated.push(norm);
1938
- tracker.record(norm.domain);
1939
- valid++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1940
  }
1941
- log(`Call ${ci+1}/${callsNeeded}: ${valid}/${parsed.length} valid`, "OK");
1942
- } catch(e) {
1943
- log(`Bedrock call ${ci+1} failed: ${e.message}`, "ERROR");
1944
- errors++;
1945
  }
1946
- // rate limit
1947
- const elapsed = Date.now() - t0;
1948
- if (elapsed < minInterval) await new Promise(r=>setTimeout(r, minInterval - elapsed));
1949
- }
1950
-
1951
- dataset.push(...generated);
1952
- saveDataset();
1953
-
1954
- log(`Done: ${generated.length} new | total=${dataset.length} | errors=${errors}`, "OK");
1955
- res.json({
1956
- generated: generated.length,
1957
- errors,
1958
- total_dataset: dataset.length,
1959
- samples: generated,
1960
- });
1961
  });
1962
 
1963
  app.get("/dataset/download", (req, res) => {
 
1735
  END SCOPE
1736
  `;
1737
 
1738
+ function buildPrompt(n, recentDomains,
1739
+ maxVars=8
1740
+ // maxVars=20
1741
+ ) {
1742
  const recentStr = recentDomains.slice(-10).join(", ") || "none yet";
1743
  return `${AXL_SPEC}
1744
 
 
1884
// POST /generate — kick off dataset generation.
// Query/body params (query takes precedence over body):
//   n                   total samples requested (default 50)
//   rps                 Bedrock calls per second, capped at MAX_RPS
//   synthetic_fraction  fraction of n produced by local synthetic generators (default 0.4)
//   max_vars            forwarded to buildPrompt (default 20)
// Responds immediately after the synthetic batch; AI generation continues in a
// background loop that calls saveDataset() after every successful Bedrock call,
// so progress is visible via /dataset/download even if the process dies mid-run.
app.post("/generate", async (req, res) => {
  const n = parseInt(req.query.n ?? req.body.n ?? "50");
  const rps = Math.min(parseFloat(req.query.rps ?? req.body.rps ?? DEFAULT_RPS.toString()), MAX_RPS);
  const synthFraction = parseFloat(req.query.synthetic_fraction ?? req.body.synthetic_fraction ?? "0.4");
  const maxVars = parseInt(req.query.max_vars ?? req.body.max_vars ?? "20");

  log(`Generation request: n=${n} rps=${rps} synthetic=${(synthFraction * 100).toFixed(0)}%`, "HEAD");

  let batchErrors = 0;
  let newSamplesCount = 0;

  // ── 1. SYNTHETIC BATCH (Immediate Save) ──
  // BUG FIX: this previously read `syntheticFraction`, a leftover from before the
  // rename to `synthFraction` above — it threw a ReferenceError on every request.
  const nSynthetic = Math.floor(n * synthFraction);
  if (nSynthetic > 0) {
    const gens = Object.values(SYNTHETIC_GENERATORS);
    // Over-generate per generator, then trim to exactly nSynthetic below.
    const perGen = Math.max(1, Math.ceil(nSynthetic / gens.length));
    let tempSynth = [];
    for (const genFn of gens) { tempSynth.push(...genFn(perGen)); }
    tempSynth = tempSynth.slice(0, nSynthetic);

    for (const raw of tempSynth) {
      const norm = normalizeSample(raw);
      if (norm && validateSample(norm)) {
        dataset.push(norm);
        tracker.record(norm.domain);
        newSamplesCount++;
      } else { batchErrors++; }
    }
    saveDataset(); // Commit synthetics to disk immediately
    log(`Synthetic batch committed to disk: ${newSamplesCount} samples`, "OK");
  }

  // ── 2. AI BATCH (Incremental Save per Call) ──
  const nAI = n - nSynthetic;
  const callsNeeded = Math.ceil(nAI / SAMPLES_PER_CALL);
  const minInterval = 1000 / rps; // ms between Bedrock call starts

  // Send initial response so the connection doesn't timeout,
  // but the background loop keeps writing to disk.
  res.json({ message: "Generation started. Use /dataset/download to track progress.", total_requested: n });

  (async () => {
    for (let ci = 0; ci < callsNeeded; ci++) {
      const t0 = Date.now();
      try {
        const prompt = buildPrompt(SAMPLES_PER_CALL, tracker.recent, maxVars);
        const text = await callBedrock(prompt);
        const parsed = parseAIResponse(text);

        let callValid = 0;
        for (const raw of parsed) {
          const norm = normalizeSample(raw);
          if (norm && validateSample(norm)) {
            norm.metadata = norm.metadata || {};
            norm.metadata.source = "ai";
            dataset.push(norm);
            tracker.record(norm.domain);
            callValid++;
            newSamplesCount++;
          } else { batchErrors++; }
        }

        // THE FIX: Save to disk after every successful Bedrock call
        if (callValid > 0) {
          saveDataset();
          log(`Incremental Save: Call ${ci + 1}/${callsNeeded} committed (${callValid} samples). Total dataset: ${dataset.length}`, "OK");
        }
      } catch (e) {
        log(`Bedrock call ${ci + 1} failed: ${e.message}`, "ERROR");
        batchErrors++;
      }

      // Rate limit: pad each iteration out to minInterval.
      const elapsed = Date.now() - t0;
      if (elapsed < minInterval) await new Promise(r => setTimeout(r, minInterval - elapsed));
    }
    log(`Batch Process Finished. New: ${newSamplesCount} | Errors: ${batchErrors}`, "HEAD");
  })().catch((e) => {
    // The IIFE runs after the response is sent; without this handler any
    // unexpected throw would surface as an unhandled promise rejection.
    log(`Background generation crashed: ${e.message}`, "ERROR");
  });
});
1962
 
1963
  app.get("/dataset/download", (req, res) => {