Spaces:

tlogandesigns
/

FairHousing-classifier

Sleeping

File size: 8,541 Bytes

import os
import re
import html
import yaml
from pathlib import Path
import gradio as gr

# -----------------------------
# Config
# -----------------------------
# Heading and blurb rendered at the top of the page.
TITLE = "Fair Housing Text Checker - V1.7"
DESCRIPTION = " ".join(
    (
        "Paste any ad, post, or listing text.",
        "The checker highlights potential Fair Housing risks and suggests compliant alternatives.",
        "It focuses on protected classes in the U.S. Fair Housing Act.",
        "This tool does not provide legal advice.",
    )
)

# Runtime defaults, each overridable through an environment variable.
# USE_TINY_ML: only the exact string "1" enables the classifier by default.
ENV_USE_TINY = os.environ.get("USE_TINY_ML", "1") == "1"
# ML_REPO: model repo id passed to the text-classification pipeline.
ENV_REPO = os.environ.get("ML_REPO", "tlogandesigns/fairhousing-bert-tiny")
# THRESH: minimum "Potential Violation" score that counts as a flag.
ENV_THRESH = float(os.environ.get("THRESH", "0.75"))
# PHRASES_PATH: YAML file holding the rule patterns.
PHRASES_PATH = os.environ.get("PHRASES_PATH", "phrases.yaml")

# -----------------------------
# Load patterns
# -----------------------------
# Read the phrase definitions. PHRASES is always bound to a mapping so the
# compilation loop below can run even when the file is missing or empty
# (yaml.safe_load returns None for an empty document).
_phrases_file = Path(PHRASES_PATH)
if _phrases_file.exists():
    PHRASES = yaml.safe_load(_phrases_file.read_text(encoding="utf-8")) or {}
else:
    print(f"Warning: Phrases file '{PHRASES_PATH}' not found. Using empty patterns.")
    PHRASES = {}

# COMPILED holds one (category, compiled_regex, suggestions) tuple per pattern.
# `or` fallbacks tolerate keys that are present in the YAML but left null.
COMPILED = []
for cat, data in (PHRASES.get("categories") or {}).items():
    data = data or {}
    suggestions = data.get("suggest") or []
    for p in data.get("patterns") or []:
        COMPILED.append((cat, re.compile(p, re.IGNORECASE), suggestions))

# -----------------------------
# Optional tiny transformer
# -----------------------------
# Cached classifier pipeline and the repo id it was built from.
pipe = None
_pipe_repo = None
_transformers_ok = False
try:
    from transformers import pipeline
    _transformers_ok = True
except Exception:
    # Deliberately broad: the ML path is optional, so any import-time failure
    # simply leaves the classifier disabled.
    _transformers_ok = False


def _load_ml(repo: str):
    """Ensure the module-level ``pipe`` is a classifier built from *repo*.

    The pipeline is cached between calls and rebuilt only when *repo* differs
    from the repo it was last built from, so changing the model repo takes
    effect instead of silently reusing the first model loaded.

    Args:
        repo: Model repo id passed as both ``model`` and ``tokenizer``.

    Returns:
        ``(True, None)`` when ``pipe`` is ready for *repo*; otherwise
        ``(False, message)`` describing why it could not be loaded. On a
        failed load the previously cached pipeline, if any, is left in place.
    """
    global pipe, _pipe_repo
    if not _transformers_ok:
        return False, "transformers not installed"
    if pipe is not None and repo == _pipe_repo:
        return True, None
    try:
        # Build into a local first so a failure cannot clobber a working pipe.
        candidate = pipeline(
            "text-classification",
            model=repo,
            tokenizer=repo,
            device=-1,
            return_all_scores=True,
            truncation=True,
        )
    except Exception as e:
        return False, str(e)
    pipe, _pipe_repo = candidate, repo
    return True, None

# -----------------------------
# HTML highlighting helpers
# -----------------------------
# Base highlight colour (6-digit hex) for each rule category.
CATEGORY_COLORS = dict(
    [
        ("Familial status", "#e57373"),
        ("Religion", "#64b5f6"),
        ("Disability", "#81c784"),
        ("Sex", "#ba68c8"),
        ("Race or color", "#4db6ac"),
        ("National origin", "#ffd54f"),
        ("Other preference", "#90a4ae"),
    ]
)

# Inline stylesheet for the highlighted-text markup: marks, badges, the
# legend with its swatches, and the notice box.
STYLE_BLOCK = (
    "\n"
    "<style>\n"
    ".mark { padding: 0.1em 0.25em; border-radius: 0.25rem; }\n"
    ".badge { display: inline-block; padding: 0 0.35em; border-radius: 0.4rem; font-size: 0.8em; margin-left: 0.3em; opacity: 0.9; }\n"
    ".legend { display:flex; flex-wrap:wrap; gap:8px; margin: 0.5rem 0 1rem; }\n"
    ".legend .swatch { width: 12px; height: 12px; border-radius: 3px; display:inline-block; margin-right:6px; }\n"
    ".notice { margin-top: 10px; padding: 8px 10px; border-radius: 8px; background: #ffcccb; }\n"
    "</style>\n"
)
# .hl-container { background: #ffffff; color: #000000; padding: 12px; border-radius: 8px; line-height: 1.7; }


def build_legend(categories):
    """Return the HTML legend for *categories*.

    Each category becomes a colour swatch followed by its HTML-escaped name,
    emitted in sorted order. Categories without an entry in CATEGORY_COLORS
    use the grey ``#bdbdbd``.
    """
    swatches = [
        f"<span><span class='swatch' style='background:{CATEGORY_COLORS.get(name, '#bdbdbd')}'></span>"
        f"{html.escape(name)}</span>"
        for name in sorted(categories)
    ]
    return "<div class='legend'>" + "".join(swatches) + "</div>"


def highlight_html(text, spans):
    """Render *text* as escaped HTML with each span wrapped in a coloured mark.

    Args:
        text: The raw text being annotated.
        spans: Iterable of ``(start, end, category)`` tuples indexing *text*.
            Spans may overlap or repeat (several patterns can hit the same
            words).

    Returns:
        STYLE_BLOCK followed by a ``hl-container`` div. Every character of
        *text* appears exactly once: a span that starts inside an already
        highlighted region is clipped to its uncovered tail, and a span that
        is fully covered (or empty) is skipped.
    """
    out = [STYLE_BLOCK, "<div class='hl-container'>"]
    cur = 0  # index of the first character of text not yet emitted
    # On equal starts, take the longest span first so it wins the highlight.
    for s, e, cat in sorted(spans, key=lambda span: (span[0], -span[1])):
        s = max(s, cur)  # clip the part an earlier span already rendered
        if s >= e:
            continue
        out.append(html.escape(text[cur:s]))
        frag = html.escape(text[s:e])
        color = CATEGORY_COLORS.get(cat, "#bdbdbd")
        # The 1A / 55 / 33 suffixes are hex alpha values appended to the colour.
        out.append(
            f"<span class='mark' style='background:{color}1A; outline: 1px solid {color}55'>"
            f"{frag}<span class='badge' style='background:{color}33'>{html.escape(cat)}</span></span>"
        )
        cur = e
    out.append(html.escape(text[cur:]))
    out.append("</div>")
    return "".join(out)

# -----------------------------
# Core analysis
# -----------------------------

def analyze_text(text: str, use_ml: bool, repo_override: str, threshold: float):
    """Check *text* against the rule patterns and, optionally, the ML classifier.

    Args:
        text: Listing or ad copy to analyse; ``None`` is treated as empty.
        use_ml: When true, also score the text with the classifier pipeline.
        repo_override: Model repo id to use. Blank or whitespace-only values
            fall back to ENV_REPO.
        threshold: Minimum "Potential Violation" score that counts as a flag.

    Returns:
        A 4-tuple ``(html_out, report, summary, ml_status)``: the legend plus
        highlighted text as HTML, a plain-text report of rule hits, a summary
        dict (``issues_found``, ``categories``, ``ml_score``, ``threshold``,
        ``ml_violation``), and a short status string for the ML step.
    """
    text = text or ""
    findings = []
    highlights = []
    seen = set()  # (start, end, category) hits already recorded

    # Rules-first
    for cat, pat, suggestions in COMPILED:
        for m in pat.finditer(text):
            s, e = m.span()
            # Two patterns of one category can match the same words; report once.
            if (s, e, cat) in seen:
                continue
            seen.add((s, e, cat))
            findings.append(
                {
                    "category": cat,
                    "match": m.group(0),
                    "start": s,
                    "end": e,
                    # Up to 40 characters of surrounding text on each side.
                    "context": text[max(0, s - 40) : min(len(text), e + 40)],
                    "suggestions": suggestions[:3],
                }
            )
            highlights.append((s, e, cat))

    # Optional ML
    ml_score = None
    ml_status = "ML: off"
    ml_violation = None
    # Strip before falling back so a whitespace-only override cannot become "".
    repo = (repo_override or "").strip() or ENV_REPO.strip()
    if use_ml:
        ok, err = _load_ml(repo)
        if ok:
            try:
                raw = pipe(text)
                # Accept both a nested [[{label, score}, ...]] result and a
                # flat [{label, score}, ...] one.
                # NOTE(review): which shape is returned appears to depend on
                # the transformers version and pipeline arguments - confirm.
                scores = raw[0] if raw and isinstance(raw[0], list) else raw
                ml_score = {item["label"]: float(item["score"]) for item in scores}
                pv = ml_score.get("Potential Violation", 0.0)
                ml_violation = pv >= threshold
                ml_status = f"decision={'flag' if ml_violation else 'no flag'}"
            except Exception as exc:
                # Surface the failure in the UI rather than failing the request.
                ml_score = {"error": str(exc)}
                ml_status = f"ML: error - {exc}"
        else:
            ml_status = f"ML: error - {err}"

    categories = sorted({f["category"] for f in findings})
    html_out = build_legend(categories) + highlight_html(text, highlights)

    # If ML flags and rules found nothing, show a small notice
    if use_ml and ml_violation and not findings:
        html_out += "<div class='notice'>Model flagged this text as a potential violation based on the threshold.</div>"

    summary = {
        "issues_found": len(findings),
        "categories": categories,
        "ml_score": ml_score,
        "threshold": threshold,
        "ml_violation": ml_violation,
    }

    if findings:
        rows = [
            f"- Category: {f['category']} | Phrase: \"{f['match']}\"\n"
            f"  Context: ...{f['context']}...\n"
            f"  Suggested alternatives: {', '.join(f['suggestions']) if f['suggestions'] else 'N/A'}"
            for f in findings
        ]
        report = "Potential issues:\n" + "\n".join(rows)
    else:
        report = "No obvious risk phrases found by the rules engine."

    return html_out, report, summary, ml_status

# -----------------------------
# UI
# -----------------------------
with gr.Blocks(fill_height=True) as demo:
    # Page header: title followed by the description text.
    gr.Markdown(f"# {TITLE}\n\n{DESCRIPTION}")

    with gr.Row():
        inp = gr.Textbox(
            label="Paste text",
            lines=10,
            placeholder="Paste listing or ad copy here...",
        )

    # Collapsed panel: ML controls, the JSON summary, and the ML status line.
    with gr.Accordion("Advanced Options & Summary", open=False):
        with gr.Row():
            use_ml = gr.Checkbox(label="Use tiny ML classifier", value=ENV_USE_TINY)
            repo_box = gr.Textbox(
                label="Model repo",
                value=ENV_REPO,
                info="Hugging Face repo id for a sequence classifier",
            )
            thresh = gr.Slider(
                label="Violation threshold",
                minimum=0.50,
                maximum=0.95,
                step=0.01,
                value=ENV_THRESH,
            )
        with gr.Row():
            summary = gr.JSON(label="Summary")
        with gr.Row():
            ml_status = gr.Markdown()

    with gr.Row():
        btn_check = gr.Button("Check text", variant="primary")
        btn_clear = gr.Button("Clear")

    with gr.Row():
        marked_html = gr.HTML(label="Highlighted text")

    with gr.Row():
        report = gr.Markdown()

    # Wire actions: pressing Enter in the textbox and clicking the button both
    # run the same analysis with the same inputs and outputs.
    analysis_inputs = [inp, use_ml, repo_box, thresh]
    result_outputs = [marked_html, report, summary, ml_status]
    for trigger in (inp.submit, btn_check.click):
        trigger(analyze_text, inputs=analysis_inputs, outputs=result_outputs)

    def _blank_results():
        """Return the empty values written to the result components by Clear."""
        empty_summary = {
            "issues_found": 0,
            "categories": [],
            "ml_score": None,
            "threshold": ENV_THRESH,
            "ml_violation": None,
        }
        return "", "", empty_summary, "ML: off"

    # NOTE(review): the input textbox is not among the outputs, so Clear
    # resets the results only - confirm that is intended.
    btn_clear.click(_blank_results, inputs=None, outputs=result_outputs)

    gr.Examples(
        examples=[
            ["Beautiful condo in quiet, safe neighborhood. No children please. Ladies only."],
            ["Close to multiple community centers and parks. Service animals accommodated per law."],
            ["christians only"],
        ],
        inputs=inp,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()