Spaces:

xleaps
/

sgo

Running

Eric Xu committed 26 days ago

Commit

d94e0d2

unverified ·

1 Parent(s): 92cf501

Use Nemotron personas when available, add dataset setup UI

Web interface now prefers census-grounded Nemotron personas (1M dataset)
over LLM-generated ones. Checks common paths on startup; if not found,
shows a setup panel where user provides a path — loads existing data or
downloads from HuggingFace (~2GB).

- Add /api/nemotron/setup endpoint (load or download to given path)
- Add /api/config nemotron_available field
- Cohort generation uses stratified sampling from Nemotron when available
- Progress log shows data source (census-grounded vs LLM-generated)

Files changed (2) hide show

web/app.py +113 -16
web/static/index.html +70 -1

web/app.py CHANGED Viewed

@@ -44,6 +44,8 @@ from bias_audit import (
     reframe_entity, add_authority_signals, reorder_entity,
     run_paired_evaluation, analyze_probe, generate_report, HUMAN_BASELINES,
 )
 app = FastAPI(title="SGO — Semantic Gradient Optimization")
 app.mount("/static", StaticFiles(directory=Path(__file__).parent / "static"), name="static")
@@ -51,6 +53,49 @@ app.mount("/static", StaticFiles(directory=Path(__file__).parent / "static"), na
 # In-memory store for active sessions
 sessions: dict = {}
 def get_client():
     return OpenAI(
@@ -102,14 +147,43 @@ async def index():
 @app.get("/api/config")
 async def get_config():
-    """Return current LLM config (model name, whether API key is set)."""
     return {
         "model": get_model(),
         "has_api_key": bool(os.getenv("LLM_API_KEY")),
         "base_url": os.getenv("LLM_BASE_URL", ""),
     }
 @app.post("/api/session")
 async def create_session(entity: EntityInput):
     """Create a new evaluation session with an entity."""
@@ -182,22 +256,42 @@ Be concrete and relevant — no generic segments."""
 @app.post("/api/cohort/generate")
 async def generate_cohort_endpoint(config: CohortConfig):
-    """Generate an LLM cohort and attach to a new session."""
     sid = uuid.uuid4().hex[:12]
-    client = get_client()
-    model = get_model()
-    all_personas = []
-    with concurrent.futures.ThreadPoolExecutor(max_workers=config.parallel) as pool:
-        futs = {
-            pool.submit(generate_segment, client, model,
-                        seg["label"], seg["count"], config.description): seg
-            for seg in config.segments
-        }
-        for fut in concurrent.futures.as_completed(futs):
-            personas = fut.result()
-            all_personas.extend(personas)
     for i, p in enumerate(all_personas):
         p["user_id"] = i
@@ -211,7 +305,10 @@ async def generate_cohort_endpoint(config: CohortConfig):
         "created": datetime.now().isoformat(),
     }
-    return {"session_id": sid, "cohort_size": len(all_personas), "cohort": all_personas}
 @app.post("/api/cohort/upload/{sid}")

     reframe_entity, add_authority_signals, reorder_entity,
     run_paired_evaluation, analyze_probe, generate_report, HUMAN_BASELINES,
 )
+from persona_loader import load_personas, filter_personas, to_profile
+from stratified_sampler import stratified_sample, age_bracket, make_occupation_fn
 app = FastAPI(title="SGO — Semantic Gradient Optimization")
 app.mount("/static", StaticFiles(directory=Path(__file__).parent / "static"), name="static")
 # In-memory store for active sessions
 sessions: dict = {}
+# Nemotron dataset — loaded once if available
+_nemotron_ds = None
+_nemotron_checked = False
+NEMOTRON_SEARCH_PATHS = [
+    PROJECT_ROOT / "data" / "nemotron",
+    Path.home() / "data" / "nvidia" / "Nemotron-Personas-USA",
+    Path.home() / "data" / "nemotron",
+    Path(os.getenv("NEMOTRON_DATA_DIR", "/nonexistent")),
+]
+def find_nemotron_path():
+    """Find Nemotron dataset on disk. Returns path or None."""
+    for path in NEMOTRON_SEARCH_PATHS:
+        if (path / "dataset_info.json").exists():
+            return path
+    return None
+def get_nemotron(data_dir=None):
+    """Load Nemotron dataset. Returns None if not found."""
+    global _nemotron_ds, _nemotron_checked
+    if data_dir:
+        # Explicit path — reset cache
+        _nemotron_checked = False
+        _nemotron_ds = None
+        NEMOTRON_SEARCH_PATHS.insert(0, Path(data_dir))
+    if _nemotron_checked:
+        return _nemotron_ds
+    _nemotron_checked = True
+    path = find_nemotron_path()
+    if path:
+        try:
+            _nemotron_ds = load_personas(data_dir=path)
+            print(f"Nemotron loaded: {len(_nemotron_ds)} personas from {path}")
+            return _nemotron_ds
+        except Exception as e:
+            print(f"Failed to load Nemotron from {path}: {e}")
+    return None
 def get_client():
     return OpenAI(
 @app.get("/api/config")
 async def get_config():
+    """Return current LLM config and Nemotron status.
+
+    The response holds the value of get_model(), whether LLM_API_KEY is set
+    to a non-empty value, LLM_BASE_URL (empty string when unset), and the
+    Nemotron dataset directory found on disk (None / False when absent).
+    """
+    # Only probes the search paths via find_nemotron_path(); the dataset
+    # itself is not loaded here.
+    nem_path = find_nemotron_path()
     return {
         "model": get_model(),
         "has_api_key": bool(os.getenv("LLM_API_KEY")),
         "base_url": os.getenv("LLM_BASE_URL", ""),
+        "nemotron_path": str(nem_path) if nem_path else None,
+        "nemotron_available": nem_path is not None,
     }
+# Request body for POST /api/nemotron/setup.
+class NemotronPathInput(BaseModel):
+    # Directory to load the dataset from, or to download it into; the
+    # endpoint expands "~" and resolves it to an absolute path.
+    path: str
+@app.post("/api/nemotron/setup")
+async def setup_nemotron(input: NemotronPathInput):
+    """Point to existing Nemotron data, or download it to the given path."""
+    p = Path(input.path).expanduser().resolve()
+    if (p / "dataset_info.json").exists():
+        # Already there — just load it
+        ds = get_nemotron(data_dir=str(p))
+        if ds is None:
+            raise HTTPException(500, "Failed to load dataset")
+        return {"status": "loaded", "path": str(p), "count": len(ds)}
+    # Not there — download to this path
+    from setup_data import setup
+    try:
+        ds = setup(data_dir=p)
+        get_nemotron(data_dir=str(p))
+        return {"status": "downloaded", "path": str(p), "count": len(ds)}
+    except Exception as e:
+        raise HTTPException(500, f"Download failed: {e}")
 @app.post("/api/session")
 async def create_session(entity: EntityInput):
     """Create a new evaluation session with an entity."""
 @app.post("/api/cohort/generate")
 async def generate_cohort_endpoint(config: CohortConfig):
+    """Generate a cohort — from Nemotron if available, else LLM-generated."""
     sid = uuid.uuid4().hex[:12]
+    total = sum(s.get("count", 8) for s in config.segments)
+    ds = get_nemotron()
+    if ds is not None:
+        # Use census-grounded Nemotron personas
+        filtered = filter_personas(ds, {}, limit=max(total * 20, 2000))
+        profiles = [to_profile(row, i) for i, row in enumerate(filtered)]
+        dim_fns = [
+            lambda p: age_bracket(p.get("age", 30)),
+            lambda p: p.get("marital_status", "unknown"),
+            lambda p: p.get("education_level", "") or "unknown",
+        ]
+        diversity_fn = lambda p: p.get("occupation", "unknown") or "unknown"
+        all_personas = stratified_sample(profiles, dim_fns, total=total,
+                                         diversity_fn=diversity_fn)
+        source = "nemotron"
+    else:
+        # Fallback: LLM-generated
+        client = get_client()
+        model = get_model()
+        all_personas = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=config.parallel) as pool:
+            futs = {
+                pool.submit(generate_segment, client, model,
+                            seg["label"], seg["count"], config.description): seg
+                for seg in config.segments
+            }
+            for fut in concurrent.futures.as_completed(futs):
+                personas = fut.result()
+                all_personas.extend(personas)
+        source = "llm-generated"
     for i, p in enumerate(all_personas):
         p["user_id"] = i
         "created": datetime.now().isoformat(),
     }
+    return {
+        "session_id": sid, "cohort_size": len(all_personas),
+        "cohort": all_personas, "source": source,
+    }
 @app.post("/api/cohort/upload/{sid}")

web/static/index.html CHANGED Viewed

@@ -308,8 +308,28 @@
     <h1>Semantic Gradient Optimization</h1>
     <p>Evaluate anything against a synthetic panel. Find what to change first.</p>
     <div id="configBadge" class="config-badge">checking...</div>
   </header>
   <!-- STEP 1: Entity + Evaluate (one click) -->
   <div class="step active" id="step1">
     <div class="step-header">
@@ -547,6 +567,7 @@ let evalResultsData = null;
 async function init() {
   const resp = await fetch('/api/config');
   const cfg = await resp.json();
   const badge = document.getElementById('configBadge');
   if (cfg.has_api_key) {
     badge.textContent = cfg.model;
@@ -556,10 +577,57 @@ async function init() {
     badge.className = 'config-badge warn';
   }
   addChange('', '');
   addChange('', '');
 }
 // ── Templates ──
 function loadTemplate(name) {
@@ -669,7 +737,8 @@ async function runFullPipeline() {
       body: JSON.stringify(cohortData.cohort),
     });
-    logStep(`${cohortData.cohort_size} evaluators generated`, 'pos');
     document.getElementById('pipelineProgressBar').style.width = '35%';
     // Phase 4: Evaluate via SSE

     <h1>Semantic Gradient Optimization</h1>
     <p>Evaluate anything against a synthetic panel. Find what to change first.</p>
     <div id="configBadge" class="config-badge">checking...</div>
+    <div id="nemotronBadge" class="config-badge" style="margin-left:8px">checking...</div>
   </header>
+  <!-- Nemotron setup (shown if not available) -->
+  <div class="step hidden" id="nemotronSetup" style="border-color:var(--yellow)">
+    <div class="step-header">
+      <div class="step-num" style="border-color:var(--yellow);color:var(--yellow)">!</div>
+      <div class="step-title">Persona dataset not found</div>
+    </div>
+    <p class="step-desc">SGO uses 1M census-grounded personas for realistic evaluations. Provide a path to the dataset or download it (~2GB).</p>
+    <div class="field">
+      <label>Dataset path</label>
+      <input type="text" id="nemotronPath" placeholder="">
+    </div>
+    <div class="btn-row">
+      <button onclick="setupNemotron()">Load or download</button>
+    </div>
+    <div id="nemotronStatus" class="hidden mt-16">
+      <div class="progress-text" id="nemotronStatusText"></div>
+    </div>
+  </div>
   <!-- STEP 1: Entity + Evaluate (one click) -->
   <div class="step active" id="step1">
     <div class="step-header">
 async function init() {
   const resp = await fetch('/api/config');
   const cfg = await resp.json();
   const badge = document.getElementById('configBadge');
   if (cfg.has_api_key) {
     badge.textContent = cfg.model;
     badge.className = 'config-badge warn';
   }
+  const nemBadge = document.getElementById('nemotronBadge');
+  if (cfg.nemotron_available) {
+    nemBadge.textContent = 'Nemotron 1M';
+    nemBadge.className = 'config-badge ok';
+  } else {
+    nemBadge.textContent = 'No persona dataset';
+    nemBadge.className = 'config-badge warn';
+    document.getElementById('nemotronSetup').classList.remove('hidden');
+    // Default path: project's data dir
+    document.getElementById('nemotronPath').value = cfg.base_url ? '' : 'data/nemotron';
+  }
   addChange('', '');
   addChange('', '');
 }
+async function setupNemotron() {
+  const path = document.getElementById('nemotronPath').value.trim();
+  if (!path) return alert('Please enter a path.');
+  const status = document.getElementById('nemotronStatus');
+  const text = document.getElementById('nemotronStatusText');
+  status.classList.remove('hidden');
+  text.textContent = 'Loading dataset (or downloading if not found — ~2GB, may take a few minutes)...';
+  try {
+    const resp = await fetch('/api/nemotron/setup', {
+      method: 'POST',
+      headers: {'Content-Type': 'application/json'},
+      body: JSON.stringify({path}),
+    });
+    const data = await resp.json();
+    if (!resp.ok) throw new Error(data.detail || 'Failed');
+    text.textContent = `${data.status === 'downloaded' ? 'Downloaded' : 'Loaded'}: ${data.count.toLocaleString()} personas`;
+    text.style.color = 'var(--green)';
+    const nemBadge = document.getElementById('nemotronBadge');
+    nemBadge.textContent = 'Nemotron 1M';
+    nemBadge.className = 'config-badge ok';
+    // Hide setup after a moment
+    setTimeout(() => {
+      document.getElementById('nemotronSetup').classList.add('hidden');
+    }, 2000);
+  } catch (e) {
+    text.textContent = `Error: ${e.message}`;
+    text.style.color = 'var(--red)';
+  }
+}
 // ── Templates ──
 function loadTemplate(name) {
       body: JSON.stringify(cohortData.cohort),
     });
+    const src = cohortData.source === 'nemotron' ? 'census-grounded (Nemotron)' : 'LLM-generated';
+    logStep(`${cohortData.cohort_size} evaluators ready — ${src}`, 'pos');
     document.getElementById('pipelineProgressBar').style.width = '35%';
     // Phase 4: Evaluate via SSE