Spaces:

CGIAR
/

knowledge-value-lab

Sleeping

File size: 23,404 Bytes

"""Knowledge Value Lab — Streamlit prototype."""

from __future__ import annotations

import html
import os
import time
from datetime import datetime, timezone

import anthropic
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

from kvl import ingestor, scorer, report
from kvl.modules import novelty, retrieval, generation, attribution, demand
from kvl.config import DIMENSION_META, KVS_CLASSIFICATION, MODELS, SENSITIVITY_COLOR, model_meta

# ── Page config ───────────────────────────────────────────────────────────────
# Configure the browser tab (title, icon) and use the wide page layout.
st.set_page_config(
    page_title="Knowledge Value Lab",
    page_icon="🔬",
    layout="wide",
)

# ── CSS ───────────────────────────────────────────────────────────────────────
# Global stylesheet injected once as raw HTML. The classes are referenced by
# the HTML snippets emitted further down in this script:
#   .kvs-box / .kvs-number / .kvs-label / .kvs-class / .kvs-meta — score hero
#   .model-chip — model name pills in the sidebar
#   .sens-badge — "model sensitivity" badges
#   .warn-box   — model-relativity warnings
st.markdown("""
<style>
.kvs-box {
    background: linear-gradient(135deg, #1e3a5f 0%, #0d2137 100%);
    border-radius: 12px;
    padding: 28px 36px;
    text-align: center;
    margin-bottom: 8px;
}
.kvs-number { font-size: 64px; font-weight: 800; color: #f0f4ff; line-height: 1; }
.kvs-label  { font-size: 16px; color: #8ab4f8; margin-top: 4px; }
.kvs-class  { font-size: 22px; font-weight: 600; margin-top: 8px; }
.kvs-meta   { font-size: 12px; color: #556; margin-top: 10px; font-family: monospace; }
.model-chip {
    display: inline-block;
    background: #1a2740;
    border: 1px solid #2a4060;
    border-radius: 4px;
    padding: 2px 8px;
    font-size: 11px;
    font-family: monospace;
    color: #8ab4f8;
    margin: 2px;
}
.sens-badge {
    display: inline-block;
    border-radius: 4px;
    padding: 1px 7px;
    font-size: 11px;
    font-weight: 600;
}
.warn-box {
    background: #1a1500;
    border-left: 3px solid #f8961e;
    border-radius: 4px;
    padding: 10px 14px;
    font-size: 13px;
    margin: 8px 0 16px 0;
    color: #fff;
}
</style>
""", unsafe_allow_html=True)


# ── Sidebar ───────────────────────────────────────────────────────────────────
with st.sidebar:
    # Static sidebar: product blurb, models in use, score bands and a
    # per-dimension metric guide. Everything is read from kvl.config.
    st.markdown("## 🔬 Knowledge Value Lab")
    st.markdown(
        "KVL measures the **marginal value** of a knowledge document to an AI system "
        "across five independent dimensions, producing a single weighted **Knowledge Value Score (KVS)**."
    )

    st.divider()
    st.markdown("### Models Used")
    # Only the model records are rendered, so iterate the values directly.
    for model in MODELS.values():
        st.markdown(
            f"<span class='model-chip'>{model['display']}</span>  \n"
            f"<span style='font-size:11px;color:#888;'>{model['role']}</span>",
            unsafe_allow_html=True,
        )
        st.markdown("")

    st.markdown(
        "<div class='warn-box'>"
        "⚠️ <strong>Scores are model-relative.</strong> "
        "Knowledge Novelty and Generation Utility reflect this document's value "
        "to the <em>specific models above</em>. Scores will change when models are updated. "
        "Always report scores alongside the model names and evaluation date."
        "</div>",
        unsafe_allow_html=True,
    )

    st.divider()
    st.markdown("### Score Classifications")
    for threshold, label, desc in KVS_CLASSIFICATION:
        # NOTE(review): assumes 20-point bands with the top band starting at
        # 81 — confirm against KVS_CLASSIFICATION in kvl.config.
        hi = threshold + 19 if threshold < 81 else 100
        st.markdown(f"**{threshold}–{hi}** — {label}")
        st.caption(desc)

    st.divider()
    st.markdown("### Metric Guide")
    for dmeta in DIMENSION_META.values():
        sens = dmeta["model_sensitivity"]
        sc = SENSITIVITY_COLOR[sens]
        # round(), not int(): float products such as 0.29 * 100 evaluate to
        # 28.999..., which int() would truncate to the wrong percentage.
        weight_pct = round(dmeta["weight"] * 100)
        with st.expander(f"{dmeta['name']}  ·  {weight_pct}%"):
            st.markdown(
                f"<span class='sens-badge' style='background:{sc}22;color:{sc};border:1px solid {sc}55;'>"
                f"Model sensitivity: {sens}</span>",
                unsafe_allow_html=True,
            )
            st.markdown(dmeta["description"])
            st.markdown(f"**How measured:** {dmeta['how_measured']}")
            st.markdown(f"*{dmeta['sensitivity_note']}*")
            st.markdown(f"**High score:** {dmeta['high_means']}")
            st.markdown(f"**Low score:** {dmeta['low_means']}")


# ── Header ────────────────────────────────────────────────────────────────────
# Page heading: app title, a two-line tagline, then a horizontal rule.
st.title("🔬 Knowledge Value Lab")
_tagline = (
    "**Measuring the Marginal Value of Knowledge Assets for AI Systems**  \n"
    "Upload a Markdown document to receive a quantified Knowledge Value Score across five dimensions."
)
st.markdown(_tagline)
st.divider()


# ── Cached resources ──────────────────────────────────────────────────────────
@st.cache_resource(show_spinner="Loading embedding model (all-MiniLM-L6-v2)...")
def load_embedder():
    """Return the ``all-MiniLM-L6-v2`` SentenceTransformer instance.

    Wrapped in ``st.cache_resource`` so the model object is reused instead of
    being rebuilt on every call.
    """
    # Imported here rather than at module top, so the package is only loaded
    # when an embedder is actually requested.
    import sentence_transformers

    model = sentence_transformers.SentenceTransformer("all-MiniLM-L6-v2")
    return model


@st.cache_resource
def load_client():
    """Return an Anthropic API client built from ``ANTHROPIC_API_KEY``.

    If the environment variable is missing or empty, an error is shown in the
    app and the script run is halted via ``st.stop()``.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if api_key:
        return anthropic.Anthropic(api_key=api_key)

    # No key configured: tell the user and stop this script run.
    st.error("ANTHROPIC_API_KEY not found. Add it to your .env file.")
    st.stop()
    return anthropic.Anthropic(api_key=api_key)


# ── Upload section ────────────────────────────────────────────────────────────
# Two equal columns: uploader on the left, raw-text preview on the right.
col_upload, col_preview = st.columns([1, 1], gap="large")

with col_upload:
    st.markdown("### Upload Document")
    uploaded = st.file_uploader(
        "Choose a Markdown file",
        type=["md"],
        help="Upload a .md file to evaluate its knowledge value for AI systems.",
        label_visibility="collapsed",
    )

    if uploaded is not None:
        try:
            # getvalue() returns the whole buffer regardless of the current
            # stream position, unlike read(), which yields nothing once the
            # file has already been consumed. "utf-8-sig" also drops a leading
            # BOM so it does not end up glued to the first heading.
            md_text = uploaded.getvalue().decode("utf-8-sig")
        except UnicodeDecodeError:
            # Show a readable message instead of a traceback and end this run.
            st.error(
                "Could not read this file as UTF-8 text. "
                "Please re-save it as UTF-8 and upload it again."
            )
            st.stop()
        doc = ingestor.parse(md_text)
        st.success(
            f"**{doc.title}**  \n"
            f"{doc.word_count:,} words · {len(doc.sections)} sections · {len(doc.chunks)} chunks"
        )
        # `run` is True only during the script run triggered by this click.
        run = st.button("▶  Evaluate Knowledge Value", type="primary", use_container_width=True)
    else:
        st.info("Drag and drop a `.md` file above, or click to browse.")
        run = False

with col_preview:
    st.markdown("### Document Preview")
    if uploaded:
        # First 1200 characters of the raw Markdown, with an ellipsis when cut.
        snippet = md_text[:1200] + ("…" if len(md_text) > 1200 else "")
        # The document is untrusted input and is placed inside raw HTML
        # (unsafe_allow_html=True), so it must be escaped; otherwise any tags
        # in the upload would be rendered/executed instead of displayed.
        preview_text = html.escape(snippet)
        st.markdown(
            f"<div style='background:#0e1117;border:1px solid #2a2a3a;border-radius:8px;"
            f"padding:16px;font-size:13px;max-height:280px;overflow:auto;white-space:pre-wrap;color:#fff;'>"
            f"{preview_text}</div>",
            unsafe_allow_html=True,
        )
    else:
        # Placeholder card shown until a file is selected.
        st.markdown(
            "<div style='background:#0e1117;border:1px solid #2a2a3a;border-radius:8px;"
            "padding:40px;text-align:center;color:#555;'>No document uploaded yet</div>",
            unsafe_allow_html=True,
        )


# ── Evaluation pipeline ───────────────────────────────────────────────────────
if run and uploaded:
    st.divider()
    st.markdown("### Evaluation in Progress")

    # API client first, then the embedding model (both come from cached loaders).
    client, embedder = load_client(), load_embedder()

    # Ordered pipeline stages; each label is also a key of `step_statuses`.
    steps = [
        "Module A: Knowledge Novelty",
        "Module B: Retrieval Utility",
        "Module C: Generation Utility",
        "Module D: Attribution & Grounding",
        "Module E: Demand Utility",
        "Computing Knowledge Value Score",
    ]
    progress_bar = st.progress(0)
    # Every stage starts as "pending"; later code flips it to "running"/"done".
    step_statuses = dict.fromkeys(steps, "pending")
    # One-element list acting as a mutable cell that sub_progress() writes to.
    _sub_msg = [""]
    # Single slot that the step list is (re)drawn into.
    step_placeholder = st.empty()

    def render_steps(elapsed: int | None = None):
        """Redraw the step checklist inside ``step_placeholder``.

        One row is emitted per entry of ``step_statuses``. The row whose state
        is ``"running"`` is highlighted and, when ``_sub_msg[0]`` is non-empty,
        shows that text beneath the step name. If ``elapsed`` is given, a
        completion footer reporting that many seconds is appended.
        """
        icons = {"pending": "○", "running": "⟳", "done": "✓"}
        colours = {"pending": "#555", "running": "#e8f0fe", "done": "#4caf87"}

        rows = []
        for s, state in step_statuses.items():
            icon, fg = icons[state], colours[state]
            if state == "running":
                bg, bold, note = "background:#0d1f35;", "font-weight:600;", _sub_msg[0]
            else:
                bg, bold, note = "", "", ""

            sub = ""
            if note:
                sub = f"<div style='font-size:12px;color:#8ab4f8;margin:3px 0 0 22px;'>{note}</div>"

            rows.append(
                f"<div style='padding:7px 16px;{bg}border-bottom:1px solid #1a1a2e;'>"
                f"<span style='font-family:monospace;color:{fg};{bold}'>{icon}&nbsp;&nbsp;{s}</span>"
                f"{sub}</div>"
            )

        if elapsed is None:
            footer = ""
        else:
            footer = (
                f"<div style='padding:7px 16px;font-size:12px;color:#4caf87;'>"
                f"✓ Evaluation complete in {elapsed}s</div>"
            )

        # Outer bordered container wrapping all rows plus the optional footer.
        pieces = [
            "<div style='border:1px solid #2a2a3a;border-radius:8px;overflow:hidden;'>",
            *rows,
            footer,
            "</div>",
        ]
        step_placeholder.markdown("".join(pieces), unsafe_allow_html=True)

    def sub_progress(msg: str) -> None:
        """Store ``msg`` as the current sub-step message and redraw the step list.

        Passed to the evaluation modules as their ``progress_cb`` callback.
        """
        _sub_msg[0] = msg
        render_steps()

    def _run_step(index, done_pct, work, start_msg=""):
        """Run one pipeline stage and keep the progress UI in sync.

        Marks ``steps[index]`` as running, redraws the step list, calls
        ``work()``, marks the step done and moves the progress bar to
        ``done_pct``. Returns whatever ``work()`` returns.
        """
        name = steps[index]
        # Reset the sub-message first so the previous module's last progress
        # line is not displayed under the stage that is just starting.
        _sub_msg[0] = start_msg
        step_statuses[name] = "running"
        render_steps()
        outcome = work()
        step_statuses[name] = "done"
        progress_bar.progress(done_pct)
        return outcome

    module_results = {}
    # Timezone-aware timestamp: the report labels this time as "UTC", so it
    # must actually be UTC rather than the server's local time.
    eval_start = datetime.now(timezone.utc)
    # Monotonic clock so the elapsed figure is immune to wall-clock changes.
    t0 = time.monotonic()

    progress_bar.progress(5)
    module_results["novelty"] = _run_step(
        0, 20, lambda: novelty.evaluate(client, doc, progress_cb=sub_progress)
    )
    module_results["retrieval"] = _run_step(
        1, 40, lambda: retrieval.evaluate(client, doc, embedder, progress_cb=sub_progress)
    )
    module_results["generation"] = _run_step(
        2, 60, lambda: generation.evaluate(client, doc, progress_cb=sub_progress)
    )
    # Attribution consumes the generation module's output, so it must run after it.
    module_results["attribution"] = _run_step(
        3,
        80,
        lambda: attribution.evaluate(
            client, doc, module_results["generation"], embedder, progress_cb=sub_progress
        ),
    )
    module_results["demand"] = _run_step(
        4, 92, lambda: demand.evaluate(client, doc, progress_cb=sub_progress)
    )

    # Per-dimension scores keyed like module_results, fed to the scorer.
    dim_scores = {name: result["score"] for name, result in module_results.items()}
    kvs_result = _run_step(
        5,
        100,
        lambda: scorer.compute(dim_scores),
        start_msg="Computing weighted Knowledge Value Score...",
    )

    elapsed = round(time.monotonic() - t0)
    _sub_msg[0] = ""
    render_steps(elapsed=elapsed)
    eval_date_str = eval_start.strftime("%Y-%m-%d %H:%M UTC")
    meta = model_meta(eval_date_str)

    # ── Results ───────────────────────────────────────────────────────────────
    st.divider()
    st.markdown("## Knowledge Value Report")

    kvs, classification = kvs_result["kvs"], kvs_result["classification"]

    # Accent colour for each classification label; anything not listed here
    # keeps the default blue.
    color_map = {
        "Transformational Value": "#ffd166",
        "High Value":             "#06d6a0",
        "Moderate Value":         "#8ab4f8",
        "Incremental Value":      "#f8961e",
        "Minimal Value":          "#ef476f",
    }
    badge_color = "#8ab4f8"
    if classification in color_map:
        badge_color = color_map[classification]

    # Hero card: the score, its classification, and the evaluation date plus
    # the judge / worker / embedding model names.
    hero_markup = f"""<div class="kvs-box">
            <div class="kvs-number">{kvs}</div>
            <div class="kvs-label">Knowledge Value Score / 100</div>
            <div class="kvs-class" style="color:{badge_color};">{classification}</div>
            <div class="kvs-meta">
                Evaluated {eval_date_str}<br>
                Judge: {MODELS['judge']['display']} &nbsp;·&nbsp;
                Worker: {MODELS['worker']['display']} &nbsp;·&nbsp;
                Embeddings: {MODELS['embedder']['display']}
            </div>
        </div>"""
    st.markdown(hero_markup, unsafe_allow_html=True)

    # Reminder that two of the dimensions depend on the specific models used.
    validity_note = (
        "<div class='warn-box'>"
        "⚠️ <strong>Score validity:</strong> "
        "Knowledge Novelty and Generation Utility are <strong>model-relative</strong> — "
        "they reflect this document's marginal value to the models listed above. "
        "Scores will change if the underlying models are updated or replaced. "
        "Always report scores alongside model names and evaluation date."
        "</div>"
    )
    st.markdown(validity_note, unsafe_allow_html=True)

    # ── Dimension breakdown ───────────────────────────────────────────────────
    st.markdown("### Dimension Breakdown")
    st.caption(
        "Each dimension is scored 0–100 and weighted by its contribution to the overall KVS. "
        "The sensitivity badge shows how much the score depends on the specific AI model used."
    )

    # (display label, key into dim_scores / DIMENSION_META). Weights are read
    # from DIMENSION_META — the same source the sidebar's metric guide uses —
    # instead of being duplicated here as literals that can drift from config.
    dims = [
        ("Knowledge Novelty",  "novelty"),
        ("Retrieval Utility",  "retrieval"),
        ("Generation Utility", "generation"),
        ("Attribution",        "attribution"),
        ("Demand Utility",     "demand"),
    ]
    bar_width = 20  # number of cells in the text progress bar

    for label, key in dims:
        score = dim_scores[key]
        contrib = kvs_result["weighted_contributions"][key]
        dim_info = DIMENSION_META[key]

        # Clamp so an out-of-range score cannot produce a bar that is longer
        # or shorter than bar_width cells.
        filled = max(0, min(bar_width, round(score / 100 * bar_width)))
        bar_str = "█" * filled + "░" * (bar_width - filled)
        # round(), not int(): float products like 0.29 * 100 are 28.999...
        pct = round(dim_info["weight"] * 100)
        sens = dim_info["model_sensitivity"]
        sc_color = SENSITIVITY_COLOR[sens]

        col1, col2 = st.columns([4, 1])
        with col1:
            st.markdown(
                f"**{label}** &nbsp;"
                f"<span class='sens-badge' style='background:{sc_color}22;color:{sc_color};"
                f"border:1px solid {sc_color}55;'>sensitivity: {sens}</span>  \n"
                f"`{bar_str}` &nbsp; **{score}/100** "
                f"<span style='color:#888;font-size:13px;'> ×{pct}% = {contrib} pts</span>",
                unsafe_allow_html=True,
            )
        with col2:
            st.metric(label="score", value=str(score), label_visibility="collapsed")

    # ── Detailed analysis expanders ───────────────────────────────────────────
    def _render_dimension_intro(dmeta):
        """Render the header shared by every module expander.

        Emits the model-sensitivity badge, the description / methodology /
        models-used lines, the sensitivity note and a divider. Must be called
        inside the expander it should appear in.
        """
        sens = dmeta["model_sensitivity"]
        sc = SENSITIVITY_COLOR[sens]
        st.markdown(
            f"<span class='sens-badge' style='background:{sc}22;color:{sc};border:1px solid {sc}55;'>"
            f"Model sensitivity: {sens}</span>",
            unsafe_allow_html=True,
        )
        st.markdown(f"**What this measures:** {dmeta['description']}")
        st.markdown(f"**How it's measured:** {dmeta['how_measured']}")
        st.markdown(f"**Models used:** {', '.join(dmeta['models_used'])}")
        st.info(dmeta["sensitivity_note"])
        st.divider()

    st.markdown("### Detailed Analysis")

    # Module A
    novelty_result = module_results["novelty"]
    with st.expander(f"Module A — Knowledge Novelty  ·  {dim_scores['novelty']}/100"):
        _render_dimension_intro(DIMENSION_META["novelty"])
        st.markdown(f"**Result:** {novelty_result['summary']}")
        # `or []` also covers a "details" key that is present but null.
        details = novelty_result.get("details") or []
        if details:
            st.markdown("**Claim analysis** (🟢 novel · 🟡 partial · 🔴 already known):")
            for d in details:
                known = d["known_score"]
                known_pct = round(known * 100)
                # Below 0.4 counts as novel, below 0.7 as partial, else known.
                icon = "🟢" if known < 0.4 else ("🟡" if known < 0.7 else "🔴")
                st.markdown(
                    f"{icon} **{d['claim'][:130]}**  \n"
                    f"*Known to model: {known_pct}% — {d['reason']}*"
                )

    # Module B
    retrieval_result = module_results["retrieval"]
    with st.expander(f"Module B — Retrieval Utility  ·  {dim_scores['retrieval']}/100"):
        _render_dimension_intro(DIMENSION_META["retrieval"])
        st.markdown(f"**Result:** {retrieval_result['summary']}")
        details = retrieval_result.get("details") or []
        if details:
            st.caption(
                "**Recall@3** — fraction of queries where the correct chunk appears in top 3 results (1.0 = perfect).  \n"
                "**MRR** — Mean Reciprocal Rank; how high the correct chunk ranks on average (1.0 = always first)."
            )
            st.table({
                "Query":    [d["query"] for d in details],
                "Recall@3": [f"{d['recall_at_3']:.2f}" for d in details],
                "MRR":      [f"{d['reciprocal_rank']:.2f}" for d in details],
            })

    # Module C
    generation_result = module_results["generation"]
    with st.expander(f"Module C — Generation Utility  ·  {dim_scores['generation']}/100"):
        _render_dimension_intro(DIMENSION_META["generation"])
        st.markdown(f"**Result:** {generation_result['summary']}")
        for d in generation_result.get("details") or []:
            st.markdown(f"**Q: {d['question']}**")
            # Side-by-side answers: without the document vs. with it.
            c1, c2 = st.columns(2)
            with c1:
                st.markdown("*Baseline — no document:*")
                st.markdown(f"> {d['baseline_answer'][:350]}")
            with c2:
                st.markdown("*RAG — with document:*")
                st.markdown(f"> {d['rag_answer'][:350]}")
            st.caption(
                f"Improvement: **{d['improvement']}/100** &nbsp;|&nbsp; "
                f"Accuracy: {d['accuracy']}/5 &nbsp;|&nbsp; "
                f"Completeness: {d['completeness']}/5 &nbsp;|&nbsp; "
                f"Specificity: {d['specificity']}/5  \n{d['reason']}"
            )
            st.divider()

    # Module D
    attribution_result = module_results["attribution"]
    with st.expander(f"Module D — Attribution & Grounding  ·  {dim_scores['attribution']}/100"):
        _render_dimension_intro(DIMENSION_META["attribution"])
        st.markdown(f"**Result:** {attribution_result['summary']}")
        for d in attribution_result.get("details") or []:
            halluc = "⚠️ Hallucination detected" if d.get("hallucination_detected") else "✓ No hallucination"
            st.markdown(
                f"**Q: {d['question'][:110]}**  \n"
                f"Grounding: **{round(d['grounding_fraction']*100)}%** &nbsp;|&nbsp; "
                f"Semantic similarity: {d['semantic_similarity']} &nbsp;|&nbsp; {halluc}"
            )
            if d.get("ungrounded_claims"):
                # Show at most the first three ungrounded claims.
                st.caption("Ungrounded claims: " + "; ".join(d["ungrounded_claims"][:3]))
            if d.get("reason"):
                st.caption(d["reason"])

    # Module E
    demand_result = module_results["demand"]
    with st.expander(f"Module E — Demand Utility  ·  {dim_scores['demand']}/100"):
        _render_dimension_intro(DIMENSION_META["demand"])
        st.markdown(f"**Result:** {demand_result['summary']}")
        topics = demand_result.get("topics") or []
        if topics:
            st.caption(
                "**Query Freq** — estimated user query frequency for this topic (1 = rare, 10 = very common).  \n"
                "**Priority Domain** — whether this is a high-impact sector (health, climate, food, policy, etc.).  \n"
                "**Unmet Need** — whether existing AI models fall short in covering this topic."
            )
            st.table({
                "Topic":             [t.get("topic", "") for t in topics],
                "Query Freq (1-10)": [t.get("query_frequency", "-") for t in topics],
                "Priority Domain":   ["Yes" if t.get("priority_domain") else "No" for t in topics],
                "Unmet Need":        ["Yes" if t.get("unmet_need") else "No" for t in topics],
                # `or ""` keeps slicing safe when the rationale is present but null.
                "Rationale":         [(t.get("rationale") or "")[:80] for t in topics],
            })

    # ── Recommendations ───────────────────────────────────────────────────────
    st.markdown("### Recommended Actions")
    for rec in kvs_result["recommendations"]:
        st.markdown(f"- {rec}")

    # ── Download ──────────────────────────────────────────────────────────────
    st.divider()
    report_md = report.generate(doc.title, kvs_result, module_results, meta)
    st.download_button(
        label="⬇  Download Full Report (Markdown)",
        data=report_md,
        file_name=f"kvl_report_{doc.title[:40].replace(' ', '_')}.md",
        mime="text/markdown",
        use_container_width=True,
    )

elif not uploaded:
    st.info("Upload a `.md` file above to begin evaluation.")