ClimateBERT / app.py
narcis2007's picture
Fix Textbox init: remove unsupported show_copy_button kwarg
6030204
"""
ClimateBERT — Greenwashing Signal Detector (Gradio demo)
Runs six specialized ClimateBERT models on a paragraph of text and returns
a proxy "cheap talk" greenwashing risk score. Aligned with the EU ECGT
Directive (applies 27 September 2026) and the proposed Green Claims Directive.
All models are Apache-2.0, from https://huggingface.co/climatebert
"""
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
# Inference is CPU-only (HF Spaces free tier).
DEVICE = -1

# Token limit handed to every tokenizer and pipeline below.
MAX_LEN = 512

# One row per classifier: (internal_key, model_repo, tokenizer_repo_or_None).
# A tokenizer repo of None means "load the tokenizer from the model repo".
MODELS = [
    ("detector", "climatebert/distilroberta-base-climate-detector", None),
    ("env_claims", "climatebert/environmental-claims", None),
    ("specificity", "climatebert/distilroberta-base-climate-specificity", None),
    ("commitment", "climatebert/distilroberta-base-climate-commitment", None),
    ("sentiment", "climatebert/distilroberta-base-climate-sentiment", None),
    # netzero-reduction does not ship its own tokenizer, so use the base LM.
    (
        "netzero",
        "climatebert/netzero-reduction",
        "climatebert/distilroberta-base-climate-f",
    ),
]
# Build one text-classification pipeline per entry in MODELS at import time;
# classify() looks the pipelines up in PIPES by their internal key.
print("Loading ClimateBERT models (first run downloads ~2 GB)...")
PIPES = {}
for key, model_repo, tok_repo in MODELS:
    # Fall back to the model's own repo when no separate tokenizer repo is set.
    tokenizer_source = tok_repo or model_repo
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_source, model_max_length=MAX_LEN
    )
    classifier = AutoModelForSequenceClassification.from_pretrained(model_repo)
    pipeline_options = {
        "model": classifier,
        "tokenizer": tokenizer,
        "truncation": True,
        "padding": True,
        "max_length": MAX_LEN,
        "device": DEVICE,
    }
    PIPES[key] = pipeline("text-classification", **pipeline_options)
    print(f" loaded {key}: {model_repo}")
print("All models loaded.")
def _norm(label: str) -> str:
return (label or "").strip().lower()
def _is_positive(label: str, positive_keywords=("yes", "claim", "climate", "true", "1")) -> bool:
    """Return True when a classifier label reads as an affirmative answer.

    The label is normalized with ``_norm`` and counts as positive when it
    contains any of *positive_keywords* as a substring.

    Plain substring matching alone would also accept negated labels such as
    ``"not climate"``, ``"non-claim"`` or ``"no claim"``, because they embed a
    positive keyword. To avoid that, the label is split into alphanumeric
    tokens and rejected when one of them is a negation word.

    Args:
        label: Raw label string returned by a classification pipeline.
        positive_keywords: Substrings that mark a label as positive.

    Returns:
        ``True`` for an affirmative label, ``False`` otherwise (including
        empty or ``None`` labels).
    """
    label = _norm(label)
    # Materialize once so a one-shot iterable can be used twice below.
    keywords = tuple(positive_keywords)
    # Split on every non-alphanumeric character so that "not_climate",
    # "non-claim" and "no claim" each expose the negation as its own token.
    tokens = "".join(ch if ch.isalnum() else " " for ch in label).split()
    # A word the caller explicitly lists as positive is never a negation.
    negations = {"no", "not", "non", "none", "false", "0"}.difference(keywords)
    if negations.intersection(tokens):
        return False
    return any(kw in label for kw in keywords)
def _is_non_specific(label: str) -> bool:
    """Return True when the normalized label contains the substring ``"non"``.

    This covers spellings such as "non-specific" and "nonspecific".
    """
    return _norm(label).find("non") != -1
def _no_commitment(label: str) -> bool:
    """Return True when the label signals that no commitment was found.

    After normalization a label matches if it starts with ``"no"`` (which
    includes the exact labels "no" and "none") or contains ``"none"``
    anywhere.
    """
    normalized = _norm(label)
    if normalized.startswith("no"):
        return True
    return "none" in normalized
def _score_risk(results):
    """Return ``(risk, reasons)`` for a paragraph judged climate-related.

    Weights follow the Bingler/Kraus/Leippold/Webersinke "cheap talk" pattern:
    environmental claim + non-specific + no commitment + opportunity framing.
    Each weight is scaled by the confidence the model reports for its label.
    The weights sum to 1.0, so the score stays within 0..1 provided the
    pipeline scores are probabilities.
    """
    # (weight, model result, predicate on the label, explanation for the user)
    checks = (
        (
            0.40,
            results["env_claims"],
            _is_positive,
            "- **Environmental claim detected** — subject to the EU ECGT Directive (from 27 Sep 2026).",
        ),
        (
            0.30,
            results["specificity"],
            _is_non_specific,
            "- **Non-specific language** — a classic cheap-talk signal.",
        ),
        (
            0.20,
            results["commitment"],
            _no_commitment,
            "- **No concrete commitment detected** — claim without follow-through.",
        ),
        (
            0.10,
            results["sentiment"],
            lambda lbl: "opportunity" in _norm(lbl),
            "- **Opportunity framing** — positive cherry-picking is common in greenwashing.",
        ),
    )
    risk = 0.0
    reasons = []
    for weight, result, fires, reason in checks:
        if fires(result["label"]):
            risk += weight * result["score"]
            reasons.append(reason)
    return risk, reasons


def _verdict_markdown(det, is_climate, risk):
    """Render the Markdown verdict header from the detector result and risk."""
    if not is_climate:
        return (
            f"### Verdict: Not climate-related\n\n"
            f"Detector confidence: **{det['score']:.1%}**\n\n"
            f"_Greenwashing scoring is skipped for non-climate text. "
            f"Other signals below are informational only._"
        )
    if risk >= 0.5:
        badge = "HIGH greenwashing risk"
    elif risk >= 0.25:
        badge = "MODERATE greenwashing risk"
    else:
        badge = "LOW greenwashing risk"
    risk_pct = round(risk * 100, 1)
    return (
        f"### Verdict: {badge}\n\n"
        f"**Risk score: {risk_pct} / 100**\n\n"
        f"Climate detector confidence: {det['score']:.1%}"
    )


def classify(text: str):
    """Run every loaded classifier on *text* and build the demo outputs.

    Args:
        text: Paragraph to analyze. ``None``, empty or whitespace-only input
            short-circuits with a prompt message and no model is run.

    Returns:
        A 4-tuple ``(summary, signals, explanation, raw)``:
        ``summary`` is the Markdown verdict, ``signals`` maps each signal name
        to ``{"label", "confidence"}``, ``explanation`` is a Markdown bullet
        list of the signals that raised the score, and ``raw`` is one line per
        model with its unprocessed pipeline result.
    """
    if not text or not text.strip():
        return "Please enter some text to analyze.", {}, "", ""

    text = text.strip()
    # Each pipeline returns a list of predictions; keep the first entry.
    results = {key: pipe(text)[0] for key, pipe in PIPES.items()}

    det = results["detector"]
    is_climate = _is_positive(det["label"])

    # The greenwashing score is only meaningful for climate-related text.
    risk, reasons = _score_risk(results) if is_climate else (0.0, [])
    summary = _verdict_markdown(det, is_climate, risk)

    # Signal breakdown (dict for the Gradio JSON component).
    def fmt(r):
        return {"label": r["label"], "confidence": round(float(r["score"]), 4)}

    signals = {
        "climate_related": fmt(det),
        "environmental_claim": fmt(results["env_claims"]),
        "specificity": fmt(results["specificity"]),
        "commitment": fmt(results["commitment"]),
        "sentiment": fmt(results["sentiment"]),
        "netzero_reduction": fmt(results["netzero"]),
    }

    explanation = "\n".join(reasons) if reasons else "_No strong greenwashing signals detected._"
    raw = "\n".join(f"{k}: {v}" for k, v in results.items())
    return summary, signals, explanation, raw
# Sample paragraphs offered in the UI. gr.Examples receives one row per
# example, and each row holds the value for the single text input.
EXAMPLES = [
    [paragraph]
    for paragraph in (
        # Neutrality pledge that relies on offsetting.
        "We are proud to announce our commitment to become climate neutral by 2040 "
        "through a combination of renewable energy investments and carbon offsetting.",
        # Quantified, third-party-verified emissions reduction.
        "In 2024 we reduced our Scope 1 and Scope 2 emissions by 23% year-over-year, "
        "from 145,000 tCO2e to 111,650 tCO2e, verified by an independent third-party "
        "auditor and aligned with our SBTi-validated 1.5C pathway.",
        # Vague marketing language.
        "Our eco-friendly products are designed with the planet in mind, featuring "
        "sustainable materials and a greener approach to packaging that customers love.",
        # Financial text with no climate content.
        "The quarterly earnings report showed revenue growth of 12% driven by strong "
        "performance in our core European markets and improved operational efficiency.",
        # Net-zero target with interim milestones and a baseline.
        "By 2030 we aim to achieve net-zero emissions across our entire value chain, "
        "aligned with a 1.5C science-based target validated by SBTi, with interim "
        "milestones of 50% absolute reduction by 2027 against a 2020 baseline.",
    )
]
# Markdown shown at the top of the page. Blank lines separate the heading,
# intro paragraph, bullet list, model list and caveat block so that Markdown
# renders each as its own element instead of folding the "Models" line into
# the last bullet.
INTRO = """
# ClimateBERT — Greenwashing Signal Detector

Paste a paragraph from a sustainability report, marketing copy, or corporate
disclosure. This demo runs **six specialized ClimateBERT classifiers** in parallel
to surface cheap-talk signals relevant to the upcoming EU regulations:

- **ECGT Directive** — applies 27 September 2026, bans vague green claims
  and "climate neutral via offsetting" statements.
- **Green Claims Directive** (proposed) — pre-verification of environmental claims.
- **CSRD / ESRS** — the source of text that will be scrutinized.

**Models** (all from [climatebert on Hugging Face](https://huggingface.co/climatebert), Apache-2.0):
`distilroberta-base-climate-detector`, `environmental-claims`,
`distilroberta-base-climate-specificity`, `distilroberta-base-climate-commitment`,
`distilroberta-base-climate-sentiment`, `netzero-reduction`.

> **Caveats.** Models are trained on **paragraphs** (not single sentences) and on
> **English** only. Outputs are proxy signals, not a legal verdict. Ground-truth
> greenwashing labels do not exist in any public dataset — every detector
> operationalizes proxies (specificity, commitment gap, cheap talk).
"""
# Page layout: intro text on top, then a two-column row with the input on the
# left and the analysis results on the right, followed by an attribution footer.
with gr.Blocks(title="ClimateBERT — Greenwashing Signal Detector") as demo:
    gr.Markdown(INTRO)

    with gr.Row():
        # Left column: text input, trigger button and canned examples.
        with gr.Column(scale=2):
            text_in = gr.Textbox(
                label="Text to analyze (a paragraph works best)",
                lines=8,
                placeholder="Paste a paragraph from a sustainability report, press release, or marketing page...",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
            gr.Examples(examples=EXAMPLES, inputs=text_in, label="Try an example")

        # Right column: verdict, explanation and per-model details.
        with gr.Column(scale=3):
            summary_out = gr.Markdown(label="Verdict")
            explain_out = gr.Markdown(label="Why this score")
            signals_out = gr.JSON(label="Per-model signal breakdown")
            with gr.Accordion("Raw model outputs", open=False):
                raw_out = gr.Textbox(label="Raw", lines=8)

    gr.Markdown(
        "---\n"
        "Built on [ClimateBERT](https://huggingface.co/climatebert) by Webersinke, "
        "Kraus, Bingler & Leippold. Scoring heuristic inspired by Bingler et al., "
        "*Cheap talk and cherry-picking: What ClimateBERT has to say on corporate "
        "climate risk disclosures*, Finance Research Letters (2022)."
    )

    # Order must match classify()'s return tuple:
    # (summary, signals, explanation, raw).
    result_outputs = [summary_out, signals_out, explain_out, raw_out]

    # The button click and the textbox submit event run the same analysis.
    analyze_btn.click(classify, inputs=text_in, outputs=result_outputs)
    text_in.submit(classify, inputs=text_in, outputs=result_outputs)

if __name__ == "__main__":
    demo.launch()