Spaces:

deirdosh
/

curriculum_analysis_german

Sleeping

App Files Files Community

curriculum_analysis_german / app.py

deirdosh

Update app.py

c58d0a7 verified 20 days ago

raw

history blame contribute delete

37.9 kB

	"""
	Concept Atlas — Embedding Generator
	=====================================
	Generates sentence embeddings for each focus concept (mensch, verhalten, evolution)
	and pushes embeddings + metadata to a HuggingFace public dataset.

	Space: https://huggingface.co/spaces/deirdosh/curriculum_analysis_german
	Dataset: https://huggingface.co/datasets/deirdosh/curriculum_embeddings
	"""

	# ── stdlib ───────────────────────────────────────────────────────────────────
	import os, sys, json, hashlib, warnings, logging, traceback, time, threading
	import shutil
	import urllib.request
	from pathlib import Path
	from datetime import datetime

	# Suppress every Python warning so third-party chatter stays out of the logs.
	warnings.simplefilter("ignore")

	# Log records go both to stdout and, in append mode, to this file.
	LOG_FILE = Path("pipeline.log")
	logging.basicConfig(
	    format="%(asctime)s [%(levelname)s] %(message)s",
	    level=logging.INFO,
	    handlers=[
	        logging.StreamHandler(stream=sys.stdout),
	        logging.FileHandler(filename=str(LOG_FILE), mode="a", encoding="utf-8"),
	    ],
	)
	logger = logging.getLogger(__name__)

	# ── third-party ──────────────────────────────────────────────────────────────
	import numpy as np
	import pandas as pd
	import gradio as gr

	# ── lazy heavy imports ────────────────────────────────────────────────────────
	_ST_MODEL = None

	# ── directories ──────────────────────────────────────────────────────────────
	CACHE_DIR = Path("cache")
	DATA_DIR = Path("data")
	for _d in (CACHE_DIR, DATA_DIR):
	_d.mkdir(parents=True, exist_ok=True)

	# ── constants ─────────────────────────────────────────────────────────────────
	FOCUS_CONCEPTS = ["mensch", "verhalten", "evolution"]
	MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"

	# Source corpus CSV (this Space's own file)
	CSV_URL = (
	"https://huggingface.co/spaces/deirdosh/curriculum_analysis_german"
	"/resolve/main/data/curriculum_excerpts.csv"
	)

	# Target HuggingFace dataset repository
	HF_DATASET_REPO = "deirdosh/curriculum_embeddings"

	# ── pipeline state ─────────────────────────────────────────────────────────
	_P: dict = {}
	_PIPELINE_RUNNING = threading.Event()
	_PIPELINE_THREAD: threading.Thread \| None = None

	# Live log ring-buffer for the UI ticker
	_LOG_LINES: list[str] = []
	_LOG_LOCK = threading.Lock()
	_MAX_LOG = 400


	class _UILogHandler(logging.Handler):
	    """Logging handler that mirrors formatted records into the UI log buffer."""

	    def emit(self, record: logging.LogRecord) -> None:
	        """Append the formatted record to ``_LOG_LINES``, capped at ``_MAX_LOG``.

	        Any failure while formatting or storing the record is routed through
	        ``handleError`` (the standard ``logging.Handler`` convention) instead
	        of propagating into the code that issued the log call.
	        """
	        try:
	            # Format before taking the lock so readers are blocked as briefly
	            # as possible.
	            line = self.format(record)
	            with _LOG_LOCK:
	                _LOG_LINES.append(line)
	                overflow = len(_LOG_LINES) - _MAX_LOG
	                if overflow > 0:
	                    # Trim in place: other code holds a reference to this list.
	                    del _LOG_LINES[:overflow]
	        except Exception:
	            self.handleError(record)


	# Register the in-memory handler on this module's logger; UI lines carry a
	# short HH:MM:SS timestamp followed by the message.
	_ui_handler = _UILogHandler()
	_ui_handler.setFormatter(
	    logging.Formatter(fmt="%(asctime)s %(message)s", datefmt="%H:%M:%S")
	)
	logging.getLogger(__name__).addHandler(_ui_handler)


	def get_live_log() -> str:
	    """Return the most recent 100 buffered log lines as one newline-joined string."""
	    with _LOG_LOCK:
	        # Slicing copies the tail, so the join can run outside the lock.
	        tail = _LOG_LINES[-100:]
	    return "\n".join(tail)


	# =============================================================================
	# ATOMIC CACHE HELPERS
	# =============================================================================

	def _ckey(logical: str) -> str:
	    """Return the first 10 hex characters of the MD5 digest of ``logical``."""
	    digest = hashlib.md5(logical.encode())
	    return digest.hexdigest()[:10]


	def _cache_file(key: str, suffix: str) -> Path:
	    """Map a logical cache key to a file path inside ``CACHE_DIR``.

	    The file name is the key with every character other than alphanumerics,
	    ``-`` and ``_`` replaced by ``_`` (truncated to 60 characters), followed
	    by ``__``, the ``_ckey`` hash of the full key, and ``suffix``.
	    """
	    safe = "".join(c if c.isalnum() or c in "-_" else "_" for c in key)[:60]
	    return CACHE_DIR / f"{safe}__{_ckey(key)}{suffix}"


	def _npy_path(key: str) -> Path:
	    """Return the cache path of the ``.npy`` file for ``key``."""
	    return _cache_file(key, ".npy")


	def _json_path(key: str) -> Path:
	    """Return the cache path of the ``.json`` file for ``key``."""
	    return _cache_file(key, ".json")


	def _parquet_path(key: str) -> Path:
	    """Return the cache path of the ``.parquet`` file for ``key``."""
	    return _cache_file(key, ".parquet")


	def _save_npy(arr: np.ndarray, key: str) -> Path:
	    """Write ``arr`` to the ``.npy`` cache file for ``key`` and return its path.

	    The array is first saved to a sibling ``…__tmp.npy`` file and then renamed
	    over the destination with ``Path.replace``, so readers never observe a
	    partially written file. (``shutil.move`` would silently fall back to a
	    non-atomic copy if the rename failed.)

	    Raises:
	        RuntimeError: if saving or renaming fails; the temp file is removed
	            and the original exception is chained.
	    """
	    dest = _npy_path(key)
	    tmp = dest.with_name(dest.stem + "__tmp.npy")
	    try:
	        np.save(tmp, arr)  # numpy writes exactly tmp (ends in .npy)
	        tmp.replace(dest)
	        logger.info(f" [cache ✓] {dest.name} shape={arr.shape} dtype={arr.dtype}")
	        return dest
	    except Exception as exc:
	        tmp.unlink(missing_ok=True)
	        raise RuntimeError(f"_save_npy failed '{key}': {exc}") from exc


	def _load_npy(key: str, expected_rows: int | None = None) -> np.ndarray | None:
	    """Load the cached array for ``key``, or return ``None`` if unusable.

	    Args:
	        key: logical cache key (resolved with ``_npy_path``).
	        expected_rows: when given, the array's first dimension must equal it.

	    Returns:
	        The array, or ``None`` when the file is missing, unreadable, empty or
	        has the wrong row count. In the last three cases the file is deleted
	        so it gets regenerated.
	    """
	    p = _npy_path(key)
	    if not p.exists():
	        return None
	    try:
	        # allow_pickle=False: never unpickle objects from a cache file.
	        arr = np.load(p, allow_pickle=False)
	        if arr.size == 0:
	            raise ValueError("empty array")
	        if expected_rows is not None and arr.shape[0] != expected_rows:
	            raise ValueError(f"rows {arr.shape[0]} ≠ {expected_rows}")
	        logger.info(f" [cache ↑] {p.name} shape={arr.shape}")
	        return arr
	    except Exception as exc:
	        logger.warning(f" [cache ✗] {p.name}: {exc} — deleting")
	        p.unlink(missing_ok=True)
	        return None


	def _save_json(obj, key: str) -> Path:
	    """Serialise ``obj`` to the ``.json`` cache file for ``key`` and return its path.

	    The JSON (UTF-8, non-ASCII preserved, 2-space indent) is written to a
	    sibling ``…__tmp.json`` file and then renamed over the destination with
	    ``Path.replace`` so the final file is never left half-written.

	    Raises:
	        RuntimeError: if serialising, writing or renaming fails; the temp file
	            is removed and the original exception is chained.
	    """
	    p = _json_path(key)
	    tmp = p.with_name(p.stem + "__tmp.json")
	    try:
	        tmp.write_text(
	            json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
	        tmp.replace(p)
	        logger.info(f" [cache ✓] {p.name}")
	        return p
	    except Exception as exc:
	        tmp.unlink(missing_ok=True)
	        raise RuntimeError(f"_save_json failed '{key}': {exc}") from exc


	def _load_json(key: str):
	    """Return the decoded JSON cached under ``key``, or ``None``.

	    ``None`` is returned when the cache file does not exist or cannot be
	    read/parsed; an unreadable file is deleted after a warning is logged.
	    """
	    path = _json_path(key)
	    if path.exists():
	        try:
	            payload = json.loads(path.read_text(encoding="utf-8"))
	            logger.info(f" [cache ↑] {path.name}")
	            return payload
	        except Exception as exc:
	            logger.warning(f" [cache ✗] {path.name}: {exc} — deleting")
	            path.unlink(missing_ok=True)
	    return None


	def _save_parquet(df: pd.DataFrame, key: str) -> Path:
	    """Write ``df`` (without its index) to the ``.parquet`` cache file for ``key``.

	    The frame is written to a sibling ``…__tmp.parquet`` file and then renamed
	    over the destination with ``Path.replace`` so the final file is never left
	    half-written.

	    Returns:
	        The destination path.

	    Raises:
	        RuntimeError: if writing or renaming fails; the temp file is removed
	            and the original exception is chained.
	    """
	    dest = _parquet_path(key)
	    tmp = dest.with_name(dest.stem + "__tmp.parquet")
	    try:
	        df.to_parquet(tmp, index=False)
	        tmp.replace(dest)
	        logger.info(f" [cache ✓] {dest.name} rows={len(df)}")
	        return dest
	    except Exception as exc:
	        tmp.unlink(missing_ok=True)
	        raise RuntimeError(f"_save_parquet failed '{key}': {exc}") from exc


	def _text_fingerprint(texts: list[str]) -> str:
	    """Return an 8-hex-character MD5 fingerprint of the whole text list.

	    Every text contributes to the digest, so editing, adding, removing or
	    reordering any excerpt yields a different fingerprint and therefore a
	    different cache key. (Hashing only the first and last text plus the count
	    would let changes in the middle of the list reuse stale embeddings.)
	    """
	    digest = hashlib.md5()
	    digest.update(str(len(texts)).encode())
	    for text in texts:
	        data = text.encode("utf-8")
	        # Length-prefix each item so ["ab", "c"] and ["a", "bc"] differ.
	        digest.update(len(data).to_bytes(8, "big"))
	        digest.update(data)
	    return digest.hexdigest()[:8]


	# =============================================================================
	# DATA LOADING
	# =============================================================================

	def _load_csv() -> pd.DataFrame:
	    """Load the curriculum corpus CSV, downloading it first if not present.

	    The download goes to a ``.part`` file that is renamed into place only
	    after it completes, so an interrupted transfer can never be mistaken for
	    a valid corpus on the next run. Up to four attempts are made, with a
	    growing pause between them.

	    Column names are normalised (stripped, lower-cased, spaces → ``_``),
	    missing optional columns are added as empty strings, and rows whose
	    stripped ``text_excerpt`` has 20 characters or fewer are dropped.

	    Returns:
	        The cleaned frame, with an extra ``search_term_lower`` column.

	    Raises:
	        RuntimeError: if every download attempt fails.
	        ValueError: if ``search_term`` or ``text_excerpt`` is missing.
	    """
	    local = DATA_DIR / "curriculum_excerpts.csv"
	    if not local.exists():
	        logger.info("Downloading corpus CSV from HuggingFace …")
	        partial = local.with_name(local.name + ".part")
	        max_attempts = 4
	        for attempt in range(1, max_attempts + 1):
	            try:
	                urllib.request.urlretrieve(CSV_URL, partial)
	                partial.replace(local)
	                logger.info(f" Downloaded → {local}")
	                break
	            except Exception as exc:
	                partial.unlink(missing_ok=True)
	                logger.warning(f" Attempt {attempt} failed: {exc}")
	                # No point sleeping after the final attempt.
	                if attempt < max_attempts:
	                    time.sleep(3 * attempt)
	        else:
	            raise RuntimeError("Could not download CSV after 4 attempts.")

	    df = pd.read_csv(local, dtype=str).fillna("")
	    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

	    for req in ("search_term", "text_excerpt"):
	        if req not in df.columns:
	            raise ValueError(
	                f"CSV missing required column '{req}'. "
	                f"Found: {list(df.columns)}")
	    for opt in ("state", "subject", "grade", "school_type", "year", "file"):
	        if opt not in df.columns:
	            df[opt] = ""

	    df["search_term_lower"] = df["search_term"].str.lower().str.strip()
	    df["text_excerpt"] = df["text_excerpt"].str.strip()
	    df = df[df["text_excerpt"].str.len() > 20].reset_index(drop=True)
	    logger.info(f"CSV loaded: {len(df):,} rows | columns: {list(df.columns)}")
	    return df


	def _filter_concept(df: pd.DataFrame, concept: str) -> pd.DataFrame:
	    """Return the rows of ``df`` that belong to ``concept``.

	    Rows whose ``search_term_lower`` equals ``concept`` are preferred. If
	    fewer than 5 match exactly, the filter falls back to rows whose
	    ``search_term_lower`` merely contains ``concept`` as a literal substring
	    (``regex=False`` so the concept is never interpreted as a pattern).
	    The result always has a fresh 0-based index.
	    """
	    sub = df[df["search_term_lower"] == concept].reset_index(drop=True)
	    if len(sub) < 5:
	        sub = df[df["search_term_lower"].str.contains(
	            concept, na=False, regex=False)].reset_index(drop=True)
	    logger.info(f" [{concept}] {len(sub):,} rows after filtering")
	    return sub


	# =============================================================================
	# SENTENCE-TRANSFORMER
	# =============================================================================

	def _get_model():
	    """Return the shared SentenceTransformer, loading it on first call.

	    ``sentence_transformers`` is imported inside the function so the import
	    cost is only paid when a model is actually needed.
	    """
	    global _ST_MODEL
	    if _ST_MODEL is not None:
	        return _ST_MODEL

	    logger.info(f"Loading SentenceTransformer '{MODEL_NAME}' …")
	    from sentence_transformers import SentenceTransformer
	    _ST_MODEL = SentenceTransformer(MODEL_NAME)
	    logger.info(" Model ready.")
	    return _ST_MODEL


	def compute_embeddings(texts: list[str], concept: str) -> np.ndarray:
	    """
	    Return float32 embeddings for ``texts``, encoded with normalisation enabled.

	    The cache key combines the concept, the number of texts and the value of
	    ``_text_fingerprint(texts)``; a cached array is reused only if its row
	    count equals ``len(texts)``. Freshly encoded arrays are saved to the
	    cache before being returned.
	    """
	    n_texts = len(texts)
	    cache_key = f"emb_{concept}_{n_texts}_{_text_fingerprint(texts)}"

	    cached = _load_npy(cache_key, expected_rows=n_texts)
	    if cached is None:
	        logger.info(f" [{concept}] encoding {n_texts:,} texts …")
	        encoder = _get_model()
	        encoded = encoder.encode(
	            texts,
	            batch_size=32,
	            show_progress_bar=True,
	            convert_to_numpy=True,
	            normalize_embeddings=True,
	        )
	        vectors = encoded.astype(np.float32)
	        _save_npy(vectors, cache_key)
	        return vectors

	    logger.info(f" [{concept}] embeddings loaded from cache")
	    return cached


	# =============================================================================
	# BUILD PER-CONCEPT ARTEFACTS
	# =============================================================================

	def _build_concept_artefacts(
	    sub: pd.DataFrame,
	    embs: np.ndarray,
	    concept: str,
	) -> dict[str, Path]:
	    """
	    Make sure a concept's cache files exist and return their paths by role.

	    embeddings       — path of the .npy file (written by compute_embeddings,
	                       not here; only the path is derived)
	    metadata_parquet — every row of ``sub`` plus row_id, concept,
	                       embedding_dim, n_texts, model and created_at columns
	    metadata_json    — summary with the column list and the first 5 rows

	    The parquet and JSON files are only written when they are not already
	    present in the cache.
	    """
	    row_count = len(sub)
	    fingerprint = _text_fingerprint(sub["text_excerpt"].tolist())
	    suffix = f"{concept}_{row_count}_{fingerprint}"
	    dim = embs.shape[1]

	    # ── embeddings: same key scheme as compute_embeddings ────────────────────
	    emb_path = _npy_path(f"emb_{suffix}")

	    # ── metadata parquet ─────────────────────────────────────────────────────
	    meta_key = f"meta_{suffix}"
	    meta_path = _parquet_path(meta_key)
	    if meta_path.exists():
	        logger.info(f" [{concept}] metadata parquet already cached")
	    else:
	        meta_df = sub.copy()
	        meta_df.insert(0, "row_id", range(row_count))
	        meta_df.insert(1, "concept", concept)
	        meta_df = meta_df.assign(
	            embedding_dim=dim,
	            n_texts=row_count,
	            model=MODEL_NAME,
	            created_at=datetime.now().isoformat(timespec="seconds"),
	        )
	        _save_parquet(meta_df, meta_key)

	    # ── metadata JSON (first 5 rows + schema) ────────────────────────────────
	    json_key = f"meta_preview_{suffix}"
	    json_path = _json_path(json_key)
	    if json_path.exists():
	        logger.info(f" [{concept}] metadata JSON already cached")
	    else:
	        _save_json(
	            {
	                "concept": concept,
	                "n_texts": row_count,
	                "embedding_dim": int(dim),
	                "model": MODEL_NAME,
	                "created_at": datetime.now().isoformat(timespec="seconds"),
	                "columns": list(sub.columns),
	                "preview_rows": sub.head(5).to_dict(orient="records"),
	            },
	            json_key,
	        )

	    return {
	        "embeddings": emb_path,
	        "metadata_parquet": meta_path,
	        "metadata_json": json_path,
	    }


	# =============================================================================
	# HUGGINGFACE DATASET PUSH
	# =============================================================================

	def _push_concept_to_hf(
	    concept: str,
	    paths: dict[str, Path],
	    token: str,
	) -> str:
	    """
	    Upload a single concept's artefacts to the HF dataset repo.

	    Remote layout:
	    {concept}/embeddings.npy
	    {concept}/metadata.parquet
	    {concept}/metadata_preview.json

	    Args:
	        concept: concept name, used as the remote folder.
	        paths: local files keyed by "embeddings", "metadata_parquet" and
	            "metadata_json".
	        token: HuggingFace token passed to ``HfApi``.

	    Returns:
	        A one-line status string: "✓ …" when every file was uploaded,
	        "⚠ …" when some local files were missing and skipped, or "✗ …" when
	        nothing could be uploaded or an exception occurred (the traceback is
	        appended after the first line). This function never raises.
	    """
	    try:
	        from huggingface_hub import HfApi
	        api = HfApi(token=token)

	        # Ensure the dataset repo exists (creates if needed)
	        try:
	            api.repo_info(repo_id=HF_DATASET_REPO, repo_type="dataset")
	        except Exception:
	            logger.info(f" Creating dataset repo '{HF_DATASET_REPO}' …")
	            api.create_repo(
	                repo_id=HF_DATASET_REPO,
	                repo_type="dataset",
	                private=False,
	                exist_ok=True,
	            )

	        uploads = [
	            (paths["embeddings"], f"{concept}/embeddings.npy"),
	            (paths["metadata_parquet"], f"{concept}/metadata.parquet"),
	            (paths["metadata_json"], f"{concept}/metadata_preview.json"),
	        ]

	        uploaded: list[str] = []
	        missing: list[str] = []
	        for local_path, remote_path in uploads:
	            if not local_path.exists():
	                logger.warning(f" Skipping missing file: {local_path}")
	                missing.append(remote_path)
	                continue
	            logger.info(f" Uploading {local_path.name} → {remote_path} …")
	            api.upload_file(
	                path_or_fileobj=str(local_path),
	                path_in_repo=remote_path,
	                repo_id=HF_DATASET_REPO,
	                repo_type="dataset",
	                commit_message=(
	                    f"[{concept}] update embeddings "
	                    f"{datetime.now().isoformat(timespec='minutes')}"
	                ),
	            )
	            logger.info(f" ✓ {remote_path}")
	            uploaded.append(remote_path)

	        # Report skipped files honestly instead of claiming a full push.
	        if not uploaded:
	            return f"✗ [{concept}] nothing uploaded — no local artefact files found"
	        if missing:
	            return (
	                f"⚠ [{concept}] partially pushed to {HF_DATASET_REPO} "
	                f"(missing locally: {', '.join(missing)})"
	            )
	        return f"✓ [{concept}] pushed to {HF_DATASET_REPO}"

	    except Exception as exc:
	        msg = f"✗ [{concept}] HF push failed: {exc}\n{traceback.format_exc()}"
	        logger.error(msg)
	        return msg


	def _push_dataset_card(token: str, summary: dict) -> None:
	    """Build the README.md dataset card and upload it to the HF dataset repo.

	    ``summary`` maps each concept name to a dict providing ``n`` and ``dim``;
	    one heading line per concept is emitted between the fixed intro and the
	    usage section. Any failure is logged as a warning and swallowed.
	    """
	    try:
	        from huggingface_hub import HfApi
	        api = HfApi(token=token)

	        intro = [
	            "---",
	            "license: cc-by-4.0",
	            "language:",
	            "- de",
	            "tags:",
	            "- embeddings",
	            "- curriculum",
	            "- education",
	            "- german",
	            "- sentence-transformers",
	            "---",
	            "",
	            "# German Curriculum Concept Embeddings",
	            "",
	            "Sentence embeddings for three focus concepts from the German school "
	            "curriculum analysis project.",
	            "",
	            f"Model: `{MODEL_NAME}` ",
	            f"Generated: {datetime.now().isoformat(timespec='seconds')} ",
	            "",
	            "## Structure",
	            "",
	            "```",
	            "concept/",
	            " embeddings.npy # float32 (N, 768) L2-normalised",
	            " metadata.parquet # one row per excerpt, all CSV columns",
	            " metadata_preview.json # schema + first 5 rows",
	            "```",
	            "",
	            "## Concepts",
	            "",
	        ]

	        concept_headings = [
	            f"### `{name}` "
	            f"— {stats['n']:,} excerpts · dim={stats['dim']}"
	            for name, stats in summary.items()
	        ]

	        usage = [
	            "",
	            "## Usage",
	            "",
	            "```python",
	            "import numpy as np",
	            "import pandas as pd",
	            "from huggingface_hub import hf_hub_download",
	            "",
	            "concept = 'evolution'",
	            "",
	            "emb_path = hf_hub_download(",
	            f' repo_id="{HF_DATASET_REPO}",',
	            ' repo_type="dataset",',
	            ' filename=f"{concept}/embeddings.npy")',
	            "",
	            "meta_path = hf_hub_download(",
	            f' repo_id="{HF_DATASET_REPO}",',
	            ' repo_type="dataset",',
	            ' filename=f"{concept}/metadata.parquet")',
	            "",
	            "embs = np.load(emb_path) # (N, 768)",
	            "meta = pd.read_parquet(meta_path) # N rows",
	            "```",
	            "",
	            "## Source",
	            "",
	            "Generated by the "
	            "[Concept Atlas Space]"
	            "(https://huggingface.co/spaces/deirdosh/curriculum_analysis_german).",
	        ]

	        card_text = "\n".join([*intro, *concept_headings, *usage])
	        api.upload_file(
	            path_or_fileobj=card_text.encode("utf-8"),
	            path_in_repo="README.md",
	            repo_id=HF_DATASET_REPO,
	            repo_type="dataset",
	            commit_message="update dataset card",
	        )
	        logger.info(" Dataset card (README.md) updated.")
	    except Exception as exc:
	        logger.warning(f" Dataset card update failed: {exc}")


	# =============================================================================
	# PIPELINE STATE CACHE (lightweight JSON)
	# =============================================================================

	# Cache key under which the pipeline's run summary is stored as JSON.
	_META_KEY = "embedding_pipeline_meta_v1"


	def _save_state() -> None:
	    """Persist the run summary fields of ``_P`` to the JSON cache.

	    Only ``timestamp``, ``concepts_done``, ``concept_meta`` and ``hf_status``
	    are written; a field absent from ``_P`` is stored as its empty default.
	    """
	    field_defaults = (
	        ("timestamp", ""),
	        ("concepts_done", []),
	        ("concept_meta", {}),
	        ("hf_status", {}),
	    )
	    snapshot = {name: _P.get(name, empty) for name, empty in field_defaults}
	    _save_json(snapshot, _META_KEY)


	def _load_state() -> bool:
	    """Restore the run summary saved by a previous run into ``_P``.

	    Returns ``True`` when a non-empty saved state was found and applied,
	    ``False`` otherwise. Fields missing from the saved state fall back to
	    their empty defaults.
	    """
	    saved = _load_json(_META_KEY)
	    if not saved:
	        return False

	    restored = {
	        "timestamp": saved.get("timestamp", ""),
	        "concepts_done": saved.get("concepts_done", []),
	        "concept_meta": saved.get("concept_meta", {}),
	        "hf_status": saved.get("hf_status", {}),
	    }
	    for field, value in restored.items():
	        _P[field] = value
	    logger.info(f" Prior run restored: {_P['concepts_done']} [{_P['timestamp']}]")
	    return True


	# =============================================================================
	# PIPELINE WORKER
	# =============================================================================

	def _pipeline_worker(token: str) -> None:
	    """Run the full pipeline; intended as the target of the background thread.

	    Steps: load the corpus, embed and build artefacts for each focus concept
	    (checkpointing state after every concept), upload to HuggingFace when a
	    token is available, then stamp and save the final state. Any exception is
	    logged with its traceback, and the running flag is always cleared on exit.
	    """

	    def _banner(title: str) -> None:
	        # Three-line section header in the log.
	        rule = "━" * 56
	        logger.info(rule)
	        logger.info(title)
	        logger.info(rule)

	    try:
	        # ── Load corpus ──────────────────────────────────────────────────────
	        _banner("STEP 1/4 Loading corpus CSV")
	        df = _load_csv()
	        _P["df"] = df

	        # Start from copies of whatever a prior run left in _P.
	        concepts_done = list(_P.get("concepts_done", []))
	        concept_meta = dict(_P.get("concept_meta", {}))
	        hf_status = dict(_P.get("hf_status", {}))

	        # ── Per-concept embedding ────────────────────────────────────────────
	        _banner("STEP 2/4 Computing embeddings per concept")

	        for concept in FOCUS_CONCEPTS:
	            tag = concept.upper()
	            logger.info(f"\n[{tag}] ── filtering …")
	            sub = _filter_concept(df, concept)
	            n = len(sub)
	            if n < 5:
	                logger.warning(f" [{concept}] only {n} rows — skipping")
	                continue
	            texts = sub["text_excerpt"].tolist()

	            logger.info(f"[{tag}] ── embeddings ({n:,} texts) …")
	            embs = compute_embeddings(texts, concept)

	            logger.info(f"[{tag}] ── building artefact files …")
	            paths = _build_concept_artefacts(sub, embs, concept)

	            # Record what the status display and the upload step need.
	            concept_meta[concept] = {
	                "n": n,
	                "dim": int(embs.shape[1]),
	                "fp": _text_fingerprint(texts),
	                "emb_path": str(paths["embeddings"]),
	                "meta_path": str(paths["metadata_parquet"]),
	            }
	            if concept not in concepts_done:
	                concepts_done.append(concept)

	            _P.update(concepts_done=concepts_done, concept_meta=concept_meta)
	            _save_state()  # checkpoint after each concept

	        # ── Push to HF ───────────────────────────────────────────────────────
	        _banner("STEP 3/4 Pushing to HuggingFace dataset")

	        if token:
	            for concept in concepts_done:
	                logger.info(f"\n[{concept.upper()}] ── uploading …")
	                meta = concept_meta[concept]
	                preview_key = f"meta_preview_{concept}_{meta['n']}_{meta['fp']}"
	                paths_for_push = {
	                    "embeddings": Path(meta["emb_path"]),
	                    "metadata_parquet": Path(meta["meta_path"]),
	                    "metadata_json": _json_path(preview_key),
	                }
	                result = _push_concept_to_hf(concept, paths_for_push, token)
	                hf_status[concept] = result
	                logger.info(f" {result}")

	            # dataset card
	            logger.info("\nUpdating dataset README …")
	            _push_dataset_card(token, concept_meta)
	        else:
	            logger.warning(
	                " HF_TOKEN not provided — skipping upload.\n"
	                " Set HF_TOKEN as a Space secret or pass it in the UI.")

	        _P["hf_status"] = hf_status

	        # ── Finalise ─────────────────────────────────────────────────────────
	        _banner("STEP 4/4 Saving final state")

	        _P["timestamp"] = datetime.now().isoformat(timespec="seconds")
	        _save_state()

	        logger.info("\n" + "═" * 56)
	        logger.info("✓ PIPELINE COMPLETE")
	        logger.info(f" Timestamp : {_P['timestamp']}")
	        for concept in concepts_done:
	            info = concept_meta[concept]
	            logger.info(
	                f" {concept:12s}: {info['n']:,} texts · "
	                f"dim={info['dim']} · "
	                f"HF={hf_status.get(concept, 'not pushed')[:40]}")
	        logger.info("═" * 56)

	    except Exception:
	        logger.error(f"Pipeline error:\n{traceback.format_exc()}")
	    finally:
	        _PIPELINE_RUNNING.clear()


	# =============================================================================
	# PUBLIC API (called by Gradio buttons)
	# =============================================================================

	# Serialises the "is it running? → mark running" check so two near-simultaneous
	# button clicks cannot both start a worker.
	_LAUNCH_LOCK = threading.Lock()


	def launch_pipeline(hf_token: str) -> None:
	    """Start the embedding pipeline in a background thread.

	    Does nothing (apart from a log line) when a pipeline is already running.

	    Args:
	        hf_token: token typed into the UI; when blank or ``None`` the
	            ``HF_TOKEN`` environment variable is used instead.

	    Raises:
	        Whatever ``Thread.start`` raises; in that case the running flag is
	        cleared first so a later launch is still possible.
	    """
	    global _PIPELINE_THREAD
	    with _LAUNCH_LOCK:
	        if _PIPELINE_RUNNING.is_set():
	            logger.info("Pipeline already running — wait for it to finish.")
	            return
	        _PIPELINE_RUNNING.set()

	    logger.info("⏳ Pipeline launched …")
	    # The UI may hand over None for an untouched field.
	    token = (hf_token or "").strip() or os.environ.get("HF_TOKEN", "")
	    _PIPELINE_THREAD = threading.Thread(
	        target=_pipeline_worker,
	        args=(token,),
	        name="pipeline",
	        daemon=True,
	    )
	    try:
	        _PIPELINE_THREAD.start()
	    except Exception:
	        # The worker's own `finally` will never run, so reset the flag here.
	        _PIPELINE_RUNNING.clear()
	        raise


	def get_status_md() -> str:
	    """Markdown summary card shown in the UI.

	    Returns a placeholder sentence when no concept has been processed yet;
	    otherwise a table with one row per focus concept (text count, embedding
	    dimension, whether the cached embedding file exists, HF upload status)
	    followed by a link to the dataset.
	    """
	    if not _P.get("concepts_done"):
	        return "_No pipeline run yet — click ▶ Run to start._"

	    concept_meta = _P.get("concept_meta", {})
	    hf_status = _P.get("hf_status", {})

	    lines = [
	        f"Last run: {_P.get('timestamp') or '—'}",
	        "",
	        "| Concept | N texts | Dim | Local cache | HF status |",
	        "|---|---|---|---|---|",
	    ]
	    for concept in FOCUS_CONCEPTS:
	        meta = concept_meta.get(concept, {})
	        n = f"{meta.get('n', 0):,}" if meta else "—"
	        dim = str(meta.get("dim", "—"))
	        cached = "✓" if meta.get("emb_path") and Path(meta["emb_path"]).exists() else "—"
	        # A failed push stores "message\ntraceback"; only the first line fits
	        # in a table cell, and a raw pipe would split the cell.
	        status_lines = str(hf_status.get(concept, "—")).splitlines()
	        first_line = status_lines[0] if status_lines else "—"
	        hfs = first_line[:50].replace("|", "&#124;")
	        lines.append(f"| {concept.capitalize()} | {n} | {dim} | {cached} | {hfs} |")

	    # dataset link
	    lines += [
	        "",
	        f"Dataset: [{HF_DATASET_REPO}]"
	        f"(https://huggingface.co/datasets/{HF_DATASET_REPO})",
	    ]
	    return "\n".join(lines)


	def get_cache_inventory() -> str:
	    """List all cached files.

	    Returns a Markdown bullet list of the regular files directly inside
	    ``CACHE_DIR`` and ``DATA_DIR`` (name, size in KB, modification time),
	    headed by the file count and total size, or a placeholder sentence when
	    there are none.
	    """
	    entries: list[tuple[Path, int, float]] = []
	    for folder in (CACHE_DIR, DATA_DIR):
	        for path in sorted(folder.glob("*")):
	            try:
	                if not path.is_file():
	                    continue
	                info = path.stat()
	            except OSError:
	                # Temp files can disappear while a pipeline run is writing.
	                continue
	            entries.append((path, info.st_size, info.st_mtime))

	    if not entries:
	        return "_Cache is empty._"

	    total = sum(size for _, size, _ in entries)
	    lines = [f"### {len(entries)} cached files ({total/1_048_576:.2f} MB total)", ""]
	    for path, size, mtime in entries:
	        sz = size / 1024
	        ts = datetime.fromtimestamp(mtime).strftime("%m-%d %H:%M")
	        lines.append(f"- `{path.name}` — {sz:.1f} KB · {ts}")
	    return "\n".join(lines)


	def _find_artefact(recorded: str, pattern: str) -> str | None:
	    """Return ``recorded`` if it names an existing file, else the newest cache match.

	    An empty ``recorded`` is skipped explicitly: ``Path("")`` is the current
	    directory, which always "exists" and must never be handed to ``gr.File``.
	    In-progress ``…__tmp`` files are ignored by the fallback search.
	    """
	    if recorded:
	        path = Path(recorded)
	        if path.is_file():
	            return str(path)
	    candidates = [
	        p for p in CACHE_DIR.glob(pattern)
	        if p.is_file() and not p.stem.endswith("__tmp")
	    ]
	    if not candidates:
	        return None
	    return str(max(candidates, key=lambda p: p.stat().st_mtime))


	def download_concept_embeddings(concept: str) -> str | None:
	    """Return path to the concept's .npy embedding file for gr.File.

	    Prefers the path recorded by the last pipeline run; otherwise falls back
	    to the most recently modified matching file in the cache. Returns
	    ``None`` when no such file exists.
	    """
	    meta = _P.get("concept_meta", {}).get(concept, {}) or {}
	    # Cache files are named "emb_<concept>_<n>_<fp>__<hash>.npy".
	    return _find_artefact(meta.get("emb_path", ""), f"emb_{concept}_*__*.npy")


	def download_concept_metadata(concept: str) -> str | None:
	    """Return path to the concept's .parquet metadata file for gr.File.

	    Prefers the path recorded by the last pipeline run; otherwise falls back
	    to the most recently modified matching file in the cache. Returns
	    ``None`` when no such file exists.
	    """
	    meta = _P.get("concept_meta", {}).get(concept, {}) or {}
	    # Cache files are named "meta_<concept>_<n>_<fp>__<hash>.parquet".
	    return _find_artefact(meta.get("meta_path", ""), f"meta_{concept}_*__*.parquet")


	def download_all_zip() -> str | None:
	    """Bundle all concept artefacts into a single ZIP.

	    Collects, per focus concept, the embedding file, the metadata parquet and
	    the JSON preview, plus the pipeline log. Only paths that are existing
	    regular files are included. The archive is written into ``CACHE_DIR``
	    with a timestamped name.

	    Returns:
	        The archive path, or ``None`` when there is nothing to bundle.
	    """
	    import zipfile

	    wanted: list[Path] = []
	    for concept in FOCUS_CONCEPTS:
	        emb = download_concept_embeddings(concept)
	        mta = download_concept_metadata(concept)
	        if emb:
	            wanted.append(Path(emb))
	        if mta:
	            wanted.append(Path(mta))
	        # JSON preview
	        meta = _P.get("concept_meta", {}).get(concept, {})
	        if meta:
	            wanted.append(_json_path(
	                f"meta_preview_{concept}_{meta.get('n',0)}_{meta.get('fp','')}"))
	    wanted.append(LOG_FILE)

	    # is_file() also rejects directories, which would otherwise be written
	    # to the archive as empty entries.
	    files = [p for p in wanted if p.is_file()]
	    if not files:
	        return None

	    out = CACHE_DIR / f"curriculum_embeddings_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
	    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf:
	        for p in files:
	            zf.write(p, p.name)
	    logger.info(f" ZIP → {out.name} {out.stat().st_size//1024} KB")
	    return str(out)


	# =============================================================================
	# GRADIO UI
	# =============================================================================

	# HTML banner rendered at the top of the app via gr.HTML(_HDR).
	# The separator between model and dataset is a plain pipe padded with
	# non-breaking spaces (a backslash-escaped pipe would render the backslash).
	_HDR = """
	<div style="
	background: linear-gradient(135deg,#0F0B2D 0%,#1E1A56 40%,#0A2818 80%,#2D1200 100%);
	padding:28px 34px 22px; border-radius:14px; margin-bottom:12px;
	box-shadow:0 8px 32px rgba(0,0,0,.55), inset 0 1px 0 rgba(255,255,255,.07);
	">
	<h1 style="color:#fff;margin:0 0 7px;font-size:26px;font-weight:700;">
	🔭 Concept Atlas — Embedding Generator
	</h1>
	<p style="color:rgba(255,255,255,.68);margin:0;font-size:13px;line-height:1.7;">
	Generates <b>sentence embeddings</b> for
	<b style="color:#C084FC">mensch</b> ·
	<b style="color:#34D399">verhalten</b> ·
	<b style="color:#FB923C">evolution</b>
	and pushes them to a public HuggingFace dataset.
	</p>
	<p style="color:rgba(255,255,255,.35);margin:6px 0 0;font-size:11px;">
	Model: paraphrase-multilingual-mpnet-base-v2 &nbsp;|&nbsp;
	Dataset:
	<a href="https://huggingface.co/datasets/deirdosh/curriculum_embeddings"
	style="color:#C084FC;text-decoration:none;">
	deirdosh/curriculum_embeddings
	</a>
	</p>
	</div>
	"""

	# Markdown shown on the "Run Pipeline" tab. Table rows use plain pipes;
	# backslash-escaped pipes would stop the table from rendering.
	_INSTRUCTIONS = """
	### What this app does

	1. Downloads the German curriculum corpus CSV (~35k excerpts)
	2. Filters excerpts for each of the three focus concepts
	3. Encodes every excerpt with `paraphrase-multilingual-mpnet-base-v2`
	(768-dim, L2-normalised, float32)
	4. Saves per-concept artefacts:
	- `embeddings.npy` — shape `(N, 768)`
	- `metadata.parquet` — all CSV columns + `row_id`, `concept`, `model`, timestamps
	- `metadata_preview.json` — schema + first 5 rows
	5. Pushes all artefacts to
	`huggingface.co/datasets/deirdosh/curriculum_embeddings`

	### How to use

	| Step | Action |
	|---|---|
	| 1 | Paste your `HF_TOKEN` (needs write access to the dataset repo) |
	| 2 | Click ▶ Run Pipeline |
	| 3 | Watch the live log — each concept takes ~10 min on CPU |
	| 4 | Download individual files or the full ZIP below |

	> All steps are individually cached. Re-running skips already-computed embeddings.
	> The HF_TOKEN can also be set as a Space secret — leave the field blank if so.
	"""

	# Custom stylesheet handed to gr.Blocks(css=_CSS) in build_ui: dark page
	# background, violet primary / slate secondary buttons, monospace text
	# areas, tinted markdown, and a hidden footer.
	_CSS = """
	body,.gradio-container{background:#0F1120!important;}
	.gr-button-primary{background:#6D28D9!important;border-color:#6D28D9!important;color:#fff!important;}
	.gr-button-primary:hover{background:#5B21B6!important;}
	.gr-button-secondary{background:#1E293B!important;border-color:#334155!important;color:#94A3B8!important;}
	.gr-button-secondary:hover{background:#334155!important;color:#E2E8F0!important;}
	.gr-textbox textarea{background:#0F1120!important;color:#E2E8F0!important;
	border-color:#2D3555!important;font-family:monospace!important;font-size:12px!important;}
	label{color:#94A3B8!important;}
	.gr-markdown{color:#CBD5E1!important;}
	.gr-markdown h3{color:#A78BFA!important;}
	.gr-markdown code{background:#1E293B!important;color:#C084FC!important;}
	footer{display:none!important;}
	"""


	def build_ui() -> gr.Blocks:
	    """Assemble and return the Gradio Blocks app.

	    Layout: header banner, then three tabs —
	    "Run Pipeline" (token field, run button, live log polled every 3 s),
	    "Status" (status card refreshed every 5 s or on demand, plus cache
	    inventory) and "Download" (per-concept files, full ZIP, usage snippet) —
	    followed by a footer.

	    The status card itself carries the 5 s refresh; attaching the timer to a
	    separate hidden component would leave the visible card stale.
	    """
	    with gr.Blocks(
	        title="Concept Atlas — Embeddings",
	        css=_CSS,
	        theme=gr.themes.Base(
	            primary_hue="violet", secondary_hue="emerald", neutral_hue="slate",
	            font=[gr.themes.GoogleFont("Inter"), "system-ui"],
	        ),
	    ) as demo:

	        gr.HTML(_HDR)

	        with gr.Tabs():

	            # ── 0: Run ───────────────────────────────────────────────────────
	            with gr.TabItem("🚀 Run Pipeline"):
	                gr.Markdown(_INSTRUCTIONS)

	                with gr.Row():
	                    token_box = gr.Textbox(
	                        label="HuggingFace token (write access)",
	                        placeholder="hf_… (or leave blank if HF_TOKEN secret is set)",
	                        type="password",
	                        scale=3,
	                    )
	                    run_btn = gr.Button("▶ Run Pipeline", variant="primary", scale=1)

	                # Passing the function (not its result) makes Gradio re-call
	                # it on the `every` interval.
	                live_log = gr.Textbox(
	                    label="Live pipeline log",
	                    value=get_live_log,
	                    interactive=False,
	                    lines=28,
	                    every=3,
	                )

	                run_btn.click(fn=launch_pipeline, inputs=token_box, outputs=None)

	            # ── 1: Status ────────────────────────────────────────────────────
	            with gr.TabItem("📋 Status"):
	                status_btn = gr.Button("Refresh status", variant="secondary")
	                # auto-refresh every 5 s; the button forces an immediate update
	                status_md = gr.Markdown(value=get_status_md, every=5)

	                status_btn.click(fn=get_status_md, outputs=status_md)

	                gr.Markdown("---")
	                cache_btn = gr.Button("Refresh cache inventory", variant="secondary")
	                cache_md = gr.Markdown()
	                cache_btn.click(fn=get_cache_inventory, outputs=cache_md)

	            # ── 2: Download ──────────────────────────────────────────────────
	            with gr.TabItem("⬇ Download"):
	                gr.Markdown(
	                    "### Download artefacts\n"
	                    "Files are generated by the pipeline and cached locally. "
	                    "They are also available at "
	                    f"[{HF_DATASET_REPO}]"
	                    f"(https://huggingface.co/datasets/{HF_DATASET_REPO})."
	                )

	                # ── per-concept ──────────────────────────────────────────────
	                for concept, emoji, colour in [
	                    ("mensch", "🔵", "#C084FC"),
	                    ("verhalten", "🟢", "#34D399"),
	                    ("evolution", "🟠", "#FB923C"),
	                ]:
	                    gr.Markdown(
	                        f"#### {emoji} `{concept}` "
	                        f"<span style='color:{colour}'>embeddings</span>")
	                    with gr.Row():
	                        be = gr.Button(f"Download {concept} embeddings (.npy)",
	                                       variant="primary")
	                        bm = gr.Button(f"Download {concept} metadata (.parquet)",
	                                       variant="secondary")
	                    with gr.Row():
	                        fe = gr.File(label=f"{concept}_embeddings.npy")
	                        fm = gr.File(label=f"{concept}_metadata.parquet")

	                    # c=concept freezes the loop variable for each handler.
	                    be.click(fn=lambda c=concept: download_concept_embeddings(c),
	                             outputs=fe)
	                    bm.click(fn=lambda c=concept: download_concept_metadata(c),
	                             outputs=fm)

	                gr.Markdown("---")
	                gr.Markdown("#### 📦 Download everything")
	                with gr.Row():
	                    zip_btn = gr.Button("Download all artefacts as ZIP",
	                                        variant="primary")
	                zip_file = gr.File(label="curriculum_embeddings_*.zip")
	                zip_btn.click(fn=download_all_zip, outputs=zip_file)

	                gr.Markdown(
	                    "---\n"
	                    "### Dataset usage\n"
	                    "```python\n"
	                    "import numpy as np\n"
	                    "import pandas as pd\n"
	                    "from huggingface_hub import hf_hub_download\n\n"
	                    f'concept = "evolution"\n\n'
	                    "emb_path = hf_hub_download(\n"
	                    f' repo_id="{HF_DATASET_REPO}",\n'
	                    ' repo_type="dataset",\n'
	                    ' filename=f"{concept}/embeddings.npy")\n\n'
	                    "meta_path = hf_hub_download(\n"
	                    f' repo_id="{HF_DATASET_REPO}",\n'
	                    ' repo_type="dataset",\n'
	                    ' filename=f"{concept}/metadata.parquet")\n\n'
	                    "embs = np.load(emb_path) # (N, 768) float32\n"
	                    "meta = pd.read_parquet(meta_path) # N rows\n"
	                    "```"
	                )

	        gr.HTML(
	            "<div style='text-align:center;padding:12px;font-size:11px;color:#475569;"
	            "border-top:1px solid #1E293B;margin-top:8px;'>"
	            "Concept Atlas · OpenEvo/CCS · "
	            f"<a href='https://huggingface.co/datasets/{HF_DATASET_REPO}'"
	            " style='color:#7C3AED;'>Dataset</a>"
	            "</div>"
	        )

	    return demo


	# =============================================================================
	# STARTUP — try to restore a prior run's state
	# =============================================================================

	def _startup() -> None:
	    """Log a startup banner and try to restore the previous run's state.

	    A failure while restoring is logged as a warning with its traceback and
	    does not stop the app from starting.
	    """
	    rule = "=" * 56
	    for line in (rule, "Concept Atlas — Embedding Generator — startup", rule):
	        logger.info(line)
	    try:
	        if _load_state():
	            logger.info(f" Prior state restored: {_P.get('concepts_done')}")
	        else:
	            logger.info(" No prior cache — fresh start.")
	    except Exception:
	        logger.warning(f" Startup restore error:\n{traceback.format_exc()}")


	# Runs at import time so restored state is available before the UI is built.
	_startup()

	# =============================================================================
	if __name__ == "__main__":
	    # Serve on all interfaces at port 7860, without a public share link,
	    # and surface handler errors in the UI.
	    launch_options = dict(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=False,
	        show_error=True,
	    )
	    demo = build_ui()
	    demo.launch(**launch_options)