|
|
import os |
|
|
import re |
|
|
import html |
|
|
import yaml |
|
|
from pathlib import Path |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Heading text for the app; interpolated into the top Markdown banner of the UI.
TITLE = "Fair Housing Text Checker - V1.7"

# Introductory blurb shown with the heading. Kept as separate fragments so each
# sentence group stays on its own source line; the fragments already carry
# their own trailing spaces, so they are joined with an empty separator.
DESCRIPTION = "".join(
    [
        "Paste any ad, post, or listing text. The checker highlights potential Fair Housing risks ",
        "and suggests compliant alternatives. It focuses on protected classes in the U.S. Fair Housing Act. ",
        "This tool does not provide legal advice.",
    ]
)
|
|
|
|
|
|
|
|
# Runtime configuration, read once from the process environment at import time.

# Default state of the "use ML" checkbox: on unless USE_TINY_ML is set to
# anything other than the literal string "1".
ENV_USE_TINY = os.environ.get("USE_TINY_ML", "1") == "1"

# Default model repo id handed to the classifier loader.
ENV_REPO = os.environ.get("ML_REPO", "tlogandesigns/fairhousing-bert-tiny")

# Default violation threshold; a non-numeric THRESH raises ValueError here.
ENV_THRESH = float(os.environ.get("THRESH", "0.75"))

# Location of the YAML file holding the rule patterns.
PHRASES_PATH = os.environ.get("PHRASES_PATH", "phrases.yaml")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the rules file. The code below reads a top-level "categories" mapping
# whose values may carry a "patterns" list (regex source strings) and a
# "suggest" list (replacement wording).
_phrases_file = Path(PHRASES_PATH)
if _phrases_file.exists():
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the .get() calls below keep working.
    PHRASES = yaml.safe_load(_phrases_file.read_text(encoding="utf-8")) or {}
else:
    print(f"Warning: Phrases file '{PHRASES_PATH}' not found. Using empty patterns.")
    # Actually bind the empty rule set the warning promises; without this the
    # loop below fails with NameError when the file is missing.
    PHRASES = {}

# Flat list of (category, compiled case-insensitive regex, suggestions) rules.
COMPILED = []
for cat, data in (PHRASES.get("categories") or {}).items():
    # Tolerate categories declared with no body or with null lists.
    data = data or {}
    suggestions = data.get("suggest") or []
    for p in data.get("patterns") or []:
        COMPILED.append((cat, re.compile(p, re.IGNORECASE), suggestions))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared classification pipeline; stays None until _load_ml() builds one.
pipe = None

# transformers is an optional dependency: record whether its `pipeline`
# factory could be imported instead of failing at import time.
try:
    from transformers import pipeline
except Exception:
    _transformers_ok = False
else:
    _transformers_ok = True
|
|
|
|
|
# Repo id the cached `pipe` was built from; None while nothing is loaded.
_pipe_repo = None


def _load_ml(repo: str):
    """Make sure the shared text-classification pipeline for ``repo`` is loaded.

    The pipeline is cached in the module-level ``pipe``. It is (re)built when
    nothing is loaded yet or when ``repo`` differs from the repo the cached
    pipeline was created from, so a changed model repo actually takes effect.

    Args:
        repo: Model repo id, used for both the model and the tokenizer.

    Returns:
        ``(True, None)`` when a pipeline for ``repo`` is available, otherwise
        ``(False, message)`` where ``message`` says why (transformers missing
        or the text of the exception raised while building the pipeline).
    """
    global pipe, _pipe_repo
    if not _transformers_ok:
        return False, "transformers not installed"
    if pipe is None or _pipe_repo != repo:
        try:
            loaded = pipeline(
                "text-classification",
                model=repo,
                tokenizer=repo,
                device=-1,  # CPU, per the transformers pipeline docs
                return_all_scores=True,
                truncation=True,
            )
        except Exception as e:
            # Leave any previously loaded pipeline untouched; the caller sees
            # the failure through the returned flag and message.
            return False, str(e)
        # Publish the pipeline and its repo id together, only after success.
        pipe, _pipe_repo = loaded, repo
    return True, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Highlight colour (hex) per rule category. Lookups elsewhere in this file
# fall back to a neutral grey for categories not listed here.
CATEGORY_COLORS = dict(
    (
        ("Familial status", "#e57373"),
        ("Religion", "#64b5f6"),
        ("Disability", "#81c784"),
        ("Sex", "#ba68c8"),
        ("Race or color", "#4db6ac"),
        ("National origin", "#ffd54f"),
        ("Other preference", "#90a4ae"),
    )
)
|
|
|
|
|
# Inline stylesheet prepended to the highlighted-text HTML. It styles the
# classes emitted elsewhere in this file: .mark / .badge (highlighted spans),
# .legend / .swatch (category legend) and .notice (model-only warning).
# The leading and trailing empty entries reproduce the newline that follows
# the opening and precedes the closing of the stylesheet text.
STYLE_BLOCK = "\n".join(
    (
        "",
        "<style>",
        ".mark { padding: 0.1em 0.25em; border-radius: 0.25rem; }",
        ".badge { display: inline-block; padding: 0 0.35em; border-radius: 0.4rem; font-size: 0.8em; margin-left: 0.3em; opacity: 0.9; }",
        ".legend { display:flex; flex-wrap:wrap; gap:8px; margin: 0.5rem 0 1rem; }",
        ".legend .swatch { width: 12px; height: 12px; border-radius: 3px; display:inline-block; margin-right:6px; }",
        ".notice { margin-top: 10px; padding: 8px 10px; border-radius: 8px; background: #ffcccb; }",
        "</style>",
        "",
    )
)
|
|
|
|
|
|
|
|
|
|
|
def build_legend(categories):
    """Return an HTML legend with one colour swatch per category.

    Args:
        categories: Iterable of category names; rendered in sorted order.

    Returns:
        A ``<div class='legend'>`` string. Each entry pairs a swatch coloured
        from ``CATEGORY_COLORS`` (grey ``#bdbdbd`` when the category is not
        listed) with the HTML-escaped category name.
    """

    def _entry(name):
        swatch_color = CATEGORY_COLORS.get(name, "#bdbdbd")
        return f"<span><span class='swatch' style='background:{swatch_color}'></span>{html.escape(name)}</span>"

    body = "".join(map(_entry, sorted(categories)))
    return f"<div class='legend'>{body}</div>"
|
|
|
|
|
|
|
|
def highlight_html(text, spans):
    """Render ``text`` as HTML with the given spans highlighted.

    Args:
        text: The raw input text; every emitted piece is HTML-escaped.
        spans: Iterable of ``(start, end, category)`` tuples indexing into
            ``text``. Spans may overlap or repeat, because different rules can
            match the same region.

    Returns:
        ``STYLE_BLOCK`` followed by a ``<div class='hl-container'>`` holding
        the escaped text, where each highlighted region is wrapped in a
        ``.mark`` span with a ``.badge`` naming its category. Every character
        of ``text`` appears exactly once in the output.
    """
    if not spans:
        return STYLE_BLOCK + f"<div class='hl-container'>{html.escape(text)}</div>"
    # Among spans sharing a start, take the longest first so a shorter
    # duplicate is the one dropped rather than the wider match.
    ordered = sorted(spans, key=lambda span: (span[0], -span[1]))
    cur = 0
    out = [STYLE_BLOCK, "<div class='hl-container'>"]
    for s, e, cat in ordered:
        # Clip the span to the part not rendered yet. Without this, overlapping
        # or nested matches re-emit text and can move `cur` backwards.
        s = max(s, cur)
        if e <= s:
            # Fully covered by an earlier span (or empty): nothing to render.
            continue
        if s > cur:
            out.append(html.escape(text[cur:s]))
        frag = html.escape(text[s:e])
        color = CATEGORY_COLORS.get(cat, "#bdbdbd")
        # The two-digit suffixes are appended to the hex colour (alpha channel
        # in #RRGGBBAA notation).
        out.append(
            f"<span class='mark' style='background:{color}1A; outline: 1px solid {color}55'>"
            f"{frag}<span class='badge' style='background:{color}33'>{html.escape(cat)}</span></span>"
        )
        cur = e
    if cur < len(text):
        out.append(html.escape(text[cur:]))
    out.append("</div>")
    return "".join(out)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_text(text: str, use_ml: bool, repo_override: str, threshold: float):
    """Check ``text`` against the rule patterns and, optionally, the ML model.

    Args:
        text: Text to analyse; ``None`` or empty is treated as ``""``.
        use_ml: When true, also score the text with the classifier pipeline.
        repo_override: Model repo id to use instead of ``ENV_REPO``. Blank or
            whitespace-only values fall back to ``ENV_REPO``.
        threshold: Minimum "Potential Violation" score at which the model
            result counts as a flag.

    Returns:
        A 4-tuple ``(html_out, report, summary, ml_status)``:
        legend plus highlighted HTML, a plain-text/Markdown report of the rule
        findings, a summary dict (issue count, sorted categories, model scores,
        threshold, model decision) and a short model status string.
    """
    text = text or ""
    findings, highlights = _scan_rules(text)
    categories = sorted({f["category"] for f in findings})

    ml_score = None
    ml_status = "ML: off"
    ml_violation = None
    if use_ml:
        # Strip before falling back: a whitespace-only override is truthy and
        # would otherwise reach the loader as an empty repo id.
        repo = ((repo_override or "").strip() or ENV_REPO).strip()
        ok, err = _load_ml(repo)
        if ok:
            try:
                scores = pipe(text)[0]
                ml_score = {s["label"]: float(s["score"]) for s in scores}
                # A model without this label is treated as scoring 0.0.
                pv = ml_score.get("Potential Violation", 0.0)
                ml_violation = pv >= threshold
                ml_status = f"decision={'flag' if ml_violation else 'no flag'}"
            except Exception as e:
                # Model failures are reported in the outputs; the rule-based
                # results are still returned.
                ml_score = {"error": str(e)}
                ml_status = f"ML: error - {e}"
        else:
            ml_status = f"ML: error - {err}"

    html_out = build_legend(categories) + highlight_html(text, highlights)

    # Only mention the model when it is the sole source of a flag.
    if use_ml and ml_violation and not findings:
        html_out += "<div class='notice'>Model flagged this text as a potential violation based on the threshold.</div>"

    summary = {
        "issues_found": len(findings),
        "categories": categories,
        "ml_score": ml_score,
        "threshold": threshold,
        "ml_violation": ml_violation,
    }

    return html_out, _format_report(findings), summary, ml_status


def _scan_rules(text):
    """Run every compiled rule over ``text``; return ``(findings, highlights)``."""
    findings = []
    highlights = []
    for cat, pat, suggestions in COMPILED:
        for m in pat.finditer(text):
            s, e = m.span()
            findings.append(
                {
                    "category": cat,
                    "match": m.group(0),
                    "start": s,
                    "end": e,
                    # Up to 40 characters of surrounding text on each side.
                    "context": text[max(0, s - 40) : min(len(text), e + 40)],
                    "suggestions": suggestions[:3],
                }
            )
            highlights.append((s, e, cat))
    return findings, highlights


def _format_report(findings):
    """Format rule findings as the textual report shown under the highlights."""
    if not findings:
        return "No obvious risk phrases found by the rules engine."
    rows = []
    for f in findings:
        rows.append(
            f"- Category: {f['category']} | Phrase: \"{f['match']}\"\n"
            f" Context: ...{f['context']}...\n"
            f" Suggested alternatives: {', '.join(f['suggestions']) if f['suggestions'] else 'N/A'}"
        )
    return "Potential issues:\n" + "\n".join(rows)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI definition. Components appear in the order they are created here,
# and the event wiring must stay inside the Blocks context.
with gr.Blocks(fill_height=True) as demo:
    # Banner: title as a heading followed by the description paragraph.
    gr.Markdown(f"# {TITLE}\n\n{DESCRIPTION}")

    # Input area.
    with gr.Row():
        inp = gr.Textbox(label="Paste text", lines=10, placeholder="Paste listing or ad copy here...")

    # Collapsed-by-default panel holding the ML settings plus the JSON summary
    # and model status outputs.
    with gr.Accordion("Advanced Options & Summary", open=False):
        with gr.Row():
            use_ml = gr.Checkbox(label="Use tiny ML classifier", value=ENV_USE_TINY)
            repo_box = gr.Textbox(label="Model repo", value=ENV_REPO, info="Hugging Face repo id for a sequence classifier")
            thresh = gr.Slider(label="Violation threshold", minimum=0.50, maximum=0.95, step=0.01, value=ENV_THRESH)
        with gr.Row():
            summary = gr.JSON(label="Summary")
        with gr.Row():
            ml_status = gr.Markdown()

    # Action buttons.
    with gr.Row():
        btn_check = gr.Button("Check text", variant="primary")
        btn_clear = gr.Button("Clear")

    # Output: highlighted HTML rendering of the input.
    with gr.Row():
        marked_html = gr.HTML(label="Highlighted text")

    # Output: textual report of the rule findings.
    with gr.Row():
        report = gr.Markdown()

    # Submitting the textbox and clicking "Check text" run the same analysis
    # with the same inputs and outputs.
    inp.submit(analyze_text, inputs=[inp, use_ml, repo_box, thresh], outputs=[marked_html, report, summary, ml_status])
    btn_check.click(analyze_text, inputs=[inp, use_ml, repo_box, thresh], outputs=[marked_html, report, summary, ml_status])

    # "Clear" resets the four output components to their empty state. The input
    # textbox is not among the outputs, so the pasted text is left in place.
    btn_clear.click(
        lambda: ("", "", {"issues_found": 0, "categories": [], "ml_score": None, "threshold": ENV_THRESH, "ml_violation": None}, "ML: off"),
        inputs=None,
        outputs=[marked_html, report, summary, ml_status],
    )

    # Sample texts that fill the input textbox.
    gr.Examples(
        examples=[
            ["Beautiful condo in quiet, safe neighborhood. No children please. Ladies only."],
            ["Close to multiple community centers and parks. Service animals accommodated per law."],
            ["christians only"],
        ],
        inputs=inp,
    )

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|
|
|