# Hugging Face Space: narcis2007/ClimateBERT (status: Running)
# File size: 9,134 Bytes

"""
ClimateBERT — Greenwashing Signal Detector (Gradio demo)

Runs six specialized ClimateBERT models on a paragraph of text and returns
a proxy "cheap talk" greenwashing risk score. Aligned with the EU ECGT
Directive (applies 27 September 2026) and the proposed Green Claims Directive.

All models are Apache-2.0, from https://huggingface.co/climatebert
"""

import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Device index handed to every pipeline: CPU-only (HF Spaces free tier).
DEVICE = -1
# Token limit used both as the tokenizer's model_max_length and as the
# pipeline's max_length.
MAX_LEN = 512

# One row per classifier: (internal_key, model_repo, tokenizer_repo_or_None).
# A tokenizer entry of None means the tokenizer is loaded from the model repo.
MODELS = [
    (
        "detector",
        "climatebert/distilroberta-base-climate-detector",
        None,
    ),
    (
        "env_claims",
        "climatebert/environmental-claims",
        None,
    ),
    (
        "specificity",
        "climatebert/distilroberta-base-climate-specificity",
        None,
    ),
    (
        "commitment",
        "climatebert/distilroberta-base-climate-commitment",
        None,
    ),
    (
        "sentiment",
        "climatebert/distilroberta-base-climate-sentiment",
        None,
    ),
    (
        # netzero-reduction does not ship its own tokenizer — use the base LM
        "netzero",
        "climatebert/netzero-reduction",
        "climatebert/distilroberta-base-climate-f",
    ),
]

# Build one text-classification pipeline per MODELS entry at import time and
# store it in PIPES under the entry's internal key.
print("Loading ClimateBERT models (first run downloads ~2 GB)...")
PIPES = {}
for key, model_repo, tok_repo in MODELS:
    # Use the listed tokenizer repo when there is one; otherwise the tokenizer
    # is loaded from the model repo itself.
    tok = AutoTokenizer.from_pretrained(
        model_repo if not tok_repo else tok_repo,
        model_max_length=MAX_LEN,
    )
    mdl = AutoModelForSequenceClassification.from_pretrained(model_repo)
    PIPES[key] = pipeline(
        task="text-classification",
        model=mdl,
        tokenizer=tok,
        device=DEVICE,
        # Tokenization options: cut inputs at MAX_LEN tokens.
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
    )
    print(f"  loaded {key}: {model_repo}")
print("All models loaded.")


def _norm(label: str) -> str:
    return (label or "").strip().lower()


def _is_positive(label: str, positive_keywords=("yes", "claim", "climate", "true", "1")) -> bool:
    """Return True when a classifier label reads as an affirmative class.

    A label is affirmative when it contains any of ``positive_keywords`` as a
    substring and is not explicitly negated. Without the negation guard,
    labels such as "no claim", "not climate" or "non-climate" would match the
    "claim"/"climate" keywords and be reported as positive.

    Args:
        label: Raw label string from a pipeline result; ``None`` or empty is
            treated as not positive.
        positive_keywords: Lower-case substrings that mark an affirmative label.

    Returns:
        True if the normalized label is affirmative, otherwise False.
    """
    label = _norm(label)
    # Split on anything that is not a letter or digit so that "non-climate",
    # "not_climate" and "no claim" all expose their negation word as a token.
    tokens = "".join(ch if ch.isalnum() else " " for ch in label).split()
    # A caller may deliberately list a negation word as a keyword; honour that.
    negations = {"no", "non", "not", "none"} - set(positive_keywords)
    if any(tok in negations for tok in tokens):
        return False
    return any(kw in label for kw in positive_keywords)


def _is_non_specific(label: str) -> bool:
    """Tell whether a specificity label marks the text as non-specific.

    Any normalized label containing "non" counts, e.g. "non-specific" or
    "nonspecific".
    """
    return _norm(label).find("non") != -1


def _no_commitment(label: str) -> bool:
    """Tell whether a commitment label signals that no commitment was found.

    True when the normalized label begins with "no" (which covers the exact
    labels "no" and "none") or contains "none" anywhere.
    """
    normalized = _norm(label)
    if normalized.startswith("no"):
        return True
    return "none" in normalized


def classify(text: str):
    """Run every loaded pipeline on *text* and summarize the greenwashing signals.

    Args:
        text: Paragraph to analyze. Empty or whitespace-only input produces a
            prompt message instead of a model run.

    Returns:
        A 4-tuple ``(summary, signals, explanation, raw)``:
        Markdown verdict, dict of per-model label/confidence entries,
        Markdown bullet list of the triggered signals, and one
        ``key: result`` line per pipeline.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return "Please enter some text to analyze.", {}, "", ""

    # Top prediction of each pipeline, keyed by the pipeline's internal name.
    results = {}
    for name, pipe in PIPES.items():
        results[name] = pipe(cleaned)[0]

    detector = results["detector"]
    climate_related = _is_positive(detector["label"])

    # Greenwashing risk score (only meaningful if climate-related).
    # Weights follow the Bingler/Kraus/Leippold/Webersinke "cheap talk" pattern:
    # environmental claim + non-specific + no commitment + opportunity framing.
    # Each row: (result key, label predicate, weight, reason shown to the user).
    risk = 0.0
    reasons = []
    if climate_related:
        checks = (
            (
                "env_claims",
                _is_positive,
                0.40,
                "- **Environmental claim detected** — subject to the EU ECGT Directive (from 27 Sep 2026).",
            ),
            (
                "specificity",
                _is_non_specific,
                0.30,
                "- **Non-specific language** — a classic cheap-talk signal.",
            ),
            (
                "commitment",
                _no_commitment,
                0.20,
                "- **No concrete commitment detected** — claim without follow-through.",
            ),
            (
                "sentiment",
                lambda lbl: "opportunity" in _norm(lbl),
                0.10,
                "- **Opportunity framing** — positive cherry-picking is common in greenwashing.",
            ),
        )
        for key, triggered, weight, reason in checks:
            outcome = results[key]
            if triggered(outcome["label"]):
                # The signal contributes its weight scaled by model confidence.
                risk += weight * outcome["score"]
                reasons.append(reason)

    # Verdict summary
    if climate_related:
        risk_pct = round(risk * 100, 1)
        badge = (
            "HIGH greenwashing risk"
            if risk >= 0.5
            else "MODERATE greenwashing risk"
            if risk >= 0.25
            else "LOW greenwashing risk"
        )
        summary = "\n\n".join(
            [
                f"### Verdict: {badge}",
                f"**Risk score: {risk_pct} / 100**",
                f"Climate detector confidence: {detector['score']:.1%}",
            ]
        )
    else:
        summary = "\n\n".join(
            [
                "### Verdict: Not climate-related",
                f"Detector confidence: **{detector['score']:.1%}**",
                "_Greenwashing scoring is skipped for non-climate text. "
                "Other signals below are informational only._",
            ]
        )

    # Signal breakdown (dict for Gradio JSON component): display name -> result key.
    display_order = (
        ("climate_related", "detector"),
        ("environmental_claim", "env_claims"),
        ("specificity", "specificity"),
        ("commitment", "commitment"),
        ("sentiment", "sentiment"),
        ("netzero_reduction", "netzero"),
    )
    signals = {
        shown: {
            "label": results[key]["label"],
            "confidence": round(float(results[key]["score"]), 4),
        }
        for shown, key in display_order
    }

    explanation = "\n".join(reasons) or "_No strong greenwashing signals detected._"

    raw_lines = [f"{name}: {output}" for name, output in results.items()]
    raw = "\n".join(raw_lines)

    return summary, signals, explanation, raw


# Sample paragraphs offered in the UI, one string per example.
_EXAMPLE_TEXTS = (
    (
        "We are proud to announce our commitment to become climate neutral by 2040 "
        "through a combination of renewable energy investments and carbon offsetting."
    ),
    (
        "In 2024 we reduced our Scope 1 and Scope 2 emissions by 23% year-over-year, "
        "from 145,000 tCO2e to 111,650 tCO2e, verified by an independent third-party "
        "auditor and aligned with our SBTi-validated 1.5C pathway."
    ),
    (
        "Our eco-friendly products are designed with the planet in mind, featuring "
        "sustainable materials and a greener approach to packaging that customers love."
    ),
    (
        "The quarterly earnings report showed revenue growth of 12% driven by strong "
        "performance in our core European markets and improved operational efficiency."
    ),
    (
        "By 2030 we aim to achieve net-zero emissions across our entire value chain, "
        "aligned with a 1.5C science-based target validated by SBTi, with interim "
        "milestones of 50% absolute reduction by 2027 against a 2020 baseline."
    ),
)

# Each example is a one-item row (a single value for the single text input).
EXAMPLES = [[paragraph] for paragraph in _EXAMPLE_TEXTS]


# Markdown text for the page header: what the demo does, the EU regulations it
# relates to, the models used, and caveats. Rendered via gr.Markdown(INTRO).
INTRO = """
# ClimateBERT — Greenwashing Signal Detector

Paste a paragraph from a sustainability report, marketing copy, or corporate
disclosure. This demo runs **six specialized ClimateBERT classifiers** in parallel
to surface cheap-talk signals relevant to the upcoming EU regulations:

- **ECGT Directive** — applies 27 September 2026, bans vague green claims
  and "climate neutral via offsetting" statements.
- **Green Claims Directive** (proposed) — pre-verification of environmental claims.
- **CSRD / ESRS** — the source of text that will be scrutinized.

**Models** (all from [climatebert on Hugging Face](https://huggingface.co/climatebert), Apache-2.0):
`distilroberta-base-climate-detector`, `environmental-claims`,
`distilroberta-base-climate-specificity`, `distilroberta-base-climate-commitment`,
`distilroberta-base-climate-sentiment`, `netzero-reduction`.

> **Caveats.** Models are trained on **paragraphs** (not single sentences) and on
> **English** only. Outputs are proxy signals, not a legal verdict. Ground-truth
> greenwashing labels do not exist in any public dataset — every detector
> operationalizes proxies (specificity, commitment gap, cheap talk).
"""

# Page layout: intro text, an input column next to a results column, a footer
# with attribution, and the event wiring that routes input to classify().
with gr.Blocks(title="ClimateBERT — Greenwashing Signal Detector") as demo:
    gr.Markdown(INTRO)

    with gr.Row():
        # Input side: text box, submit button and clickable examples.
        with gr.Column(scale=2):
            text_in = gr.Textbox(
                lines=8,
                label="Text to analyze (a paragraph works best)",
                placeholder="Paste a paragraph from a sustainability report, press release, or marketing page...",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(examples=EXAMPLES, inputs=text_in, label="Try an example")
        # Output side: verdict, explanation, per-model JSON and raw results.
        with gr.Column(scale=3):
            summary_out = gr.Markdown(label="Verdict")
            explain_out = gr.Markdown(label="Why this score")
            signals_out = gr.JSON(label="Per-model signal breakdown")
            with gr.Accordion("Raw model outputs", open=False):
                raw_out = gr.Textbox(label="Raw", lines=8)

    gr.Markdown(
        "---\n"
        "Built on [ClimateBERT](https://huggingface.co/climatebert) by Webersinke, "
        "Kraus, Bingler & Leippold. Scoring heuristic inspired by Bingler et al., "
        "*Cheap talk and cherry-picking: What ClimateBERT has to say on corporate "
        "climate risk disclosures*, Finance Research Letters (2022)."
    )

    # The button click and the text box submit event run the same analysis and
    # fill the same components, in the order classify() returns its values.
    result_components = [summary_out, signals_out, explain_out, raw_out]
    for trigger in (analyze_btn.click, text_in.submit):
        trigger(
            classify,
            inputs=text_in,
            outputs=result_components,
        )

# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()