# Hugging Face Space: regtech-br / app.py
# Uploaded by Fernandosr85 ("Upload app.py", commit 14c2a54, verified)
"""
RegTech BR — Hugging Face Space
================================
Compliance analyzer for Brazilian crypto asset regulation.
Uses FAISS RAG + Claude Sonnet 4.6 to produce structured risk assessments.
Setup:
1. Upload faiss_index.bin, embeddings.npy, chunks_meta.jsonl to data/index/
2. Add ANTHROPIC_API_KEY as a Space Secret
3. Deploy
"""
import html
import json
import os
import re
import time
import unicodedata
from pathlib import Path

import gradio as gr
import numpy as np
import requests
# ── Paths ──────────────────────────────────────────────────────────
# Directory that holds the three retrieval artifacts read below.
INDEX_DIR = Path("data/index")
# ── Load index at startup ──────────────────────────────────────────
# Everything in this section runs at import time; any exception raised while
# reading an artifact propagates and prevents the module from loading.
print("Loading RegTech BR index...")
import faiss
from sentence_transformers import SentenceTransformer

# One metadata dict per non-blank JSONL line, kept in file order.
# retrieve() indexes this list with the ids returned by INDEX.search, so the
# row order presumably matches the FAISS vector order — TODO confirm.
CHUNKS: list[dict] = []
with open(INDEX_DIR / "chunks_meta.jsonl", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            CHUNKS.append(json.loads(line))
# NOTE(review): EMBEDDINGS is loaded but not referenced by any code visible in
# this file — confirm whether it is still needed.
EMBEDDINGS = np.load(INDEX_DIR / "embeddings.npy")
INDEX = faiss.read_index(str(INDEX_DIR / "faiss_index.bin"))
# Sentence encoder used by retrieve() to embed the query.
MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
print(f"Index ready — {len(CHUNKS)} chunks, {INDEX.ntotal} vectors")
# ── Normalization & routing ────────────────────────────────────────
def normalize(text: str) -> str:
    """Fold *text* into a lowercase, accent-free, space-separated ASCII key.

    Combining marks are dropped after NFD decomposition, the result is
    lowercased, every run of characters outside ``a-z0-9`` becomes a single
    space, and leading/trailing spaces are removed. A falsy input (``None``
    or ``""``) yields ``""``.
    """
    decomposed = unicodedata.normalize("NFD", text or "")
    # Category "Mn" covers the combining accents split off by NFD.
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    lowered = without_marks.lower()
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.strip()
# Keywords (already in normalize() form) that signal which regulator a query
# concerns. detect_route() counts substring hits per authority.
AUTHORITY_KW = {
    "BCB": [
        "banco central",
        "bcb",
        "psav",
        "ativo virtual",
        "ativos virtuais",
        "criptoativo",
        "autorizacao",
        "cambio",
        "segregacao",
        "patrimonial",
        "ativos dos clientes",
        "circular",
        "resolucao bcb",
    ],
    "CVM": [
        "cvm",
        "valores mobiliarios",
        "valor mobiliario",
        "token",
        "tokens",
        "oferta publica",
        "dividendos",
        "direito de voto",
        "captar",
        "rwa",
        "parecer",
    ],
    "COAF": [
        "coaf",
        "pessoa exposta politicamente",
        "pep",
        "kyc",
        "lavagem",
        "terrorismo",
        "pld",
        "ftp",
        "anonimo",
        "anonima",
        "identificacao do cliente",
    ],
}

# Keywords that point at a specific source document. Each key is matched as a
# substring of a chunk's normalized ``source_id`` by route_boost().
SOURCE_KW = {
    "lei_14478": [
        "lei 14 478",
        "lei 14478",
        "marco legal",
        "lei dos criptoativos",
    ],
    "decreto_11563": [
        "decreto 11 563",
        "decreto 11563",
    ],
    "bcb_circular_3978": [
        "circular 3978",
        "pld",
        "ftp",
        "lavagem",
        "kyc",
        "anonimo",
    ],
    "bcb_in701": [
        "instrucao normativa 701",
        "certificacao tecnica",
        "segregacao",
    ],
    "bcb_res548": [
        "resolucao 548",
        "autorizacao psav",
    ],
    "cvm": [
        "cvm",
        "valores mobiliarios",
        "token",
        "dividendos",
    ],
    "coaf": [
        "coaf",
        "pep",
        "pessoa exposta politicamente",
    ],
}
def detect_route(query: str) -> dict:
    """Derive retrieval hints from the keywords present in *query*.

    Returns a dict with three de-duplicated, order-preserving lists:

    * ``authority_filters`` — authorities from ``AUTHORITY_KW`` whose keyword
      hit count equals the best count or is at least 2 (empty when nothing
      matched).
    * ``source_id_contains`` — ``SOURCE_KW`` keys with at least one keyword
      found in the query.
    * ``query_expansion`` — extra phrases added when the query mentions
      Lei 14.478 or Decreto 11.563.

    Matching is a substring test on the normalized query.
    """
    q = normalize(query)

    def mentions(keywords) -> bool:
        return any(normalize(kw) in q for kw in keywords)

    sources = []
    expansion = []
    if mentions(SOURCE_KW["lei_14478"]):
        sources.append("lei_14478")
        expansion += ["Lei 14.478 de 2022", "marco legal criptoativos"]
    if mentions(SOURCE_KW["decreto_11563"]):
        sources.append("decreto_11563")
        expansion += ["Decreto 11.563 de 2023", "Banco Central"]

    # Count keyword hits per authority, then keep the leaders.
    authority_hits = {}
    for authority, keywords in AUTHORITY_KW.items():
        authority_hits[authority] = sum(
            1 for kw in keywords if normalize(kw) in q
        )
    best = max(authority_hits.values()) if authority_hits else 0
    authorities = []
    if best:
        authorities = [
            name
            for name, count in authority_hits.items()
            if count == best or count >= 2
        ]

    sources.extend(
        key for key, keywords in SOURCE_KW.items() if mentions(keywords)
    )

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return {
        "authority_filters": list(dict.fromkeys(authorities)),
        "source_id_contains": list(dict.fromkeys(sources)),
        "query_expansion": list(dict.fromkeys(expansion)),
    }
def route_boost(chunk: dict, route: dict) -> float:
    """Score bonus for a chunk that agrees with the routing hints.

    Adds 0.35 for every ``source_id_contains`` token found in the chunk's
    normalized ``source_id`` and 0.08 when the chunk's ``authority`` is one of
    ``authority_filters``. The total is capped at 0.55.
    """
    source_key = normalize(chunk.get("source_id", ""))
    wanted_sources = route.get("source_id_contains", [])
    bonus = sum(
        (0.35 for token in wanted_sources if normalize(token) in source_key),
        0.0,
    )
    wanted_authorities = route.get("authority_filters", [])
    if chunk.get("authority") in wanted_authorities:
        bonus += 0.08
    return bonus if bonus < 0.55 else 0.55
def lexical_boost(query_norm: str, chunk: dict) -> float:
    """Score bonus (0.0 to 0.12) for query terms that occur in the chunk.

    *query_norm* is an already-normalized query. Only distinct terms of four
    or more characters count; each is tested as a substring of the chunk's
    normalized ``source_id``, ``authority``, ``tags`` and ``text``. The hit
    ratio uses a denominator of at least 6, so short queries cannot reach the
    full bonus.
    """
    fields = (
        chunk.get("source_id", ""),
        chunk.get("authority", ""),
        " ".join(chunk.get("tags", [])),
        chunk.get("text", ""),
    )
    haystack = normalize(" ".join(fields))

    distinct_terms = {word for word in query_norm.split() if len(word) >= 4}
    if not distinct_terms:
        return 0.0

    matched = sum(1 for word in distinct_terms if word in haystack)
    ratio = matched / max(6, len(distinct_terms))
    return min(0.12, ratio * 0.12)
def retrieve(query: str, top_k: int = 6) -> list[dict]:
    """Return up to *top_k* chunks ranked for *query*.

    The query is expanded with the routing phrases from ``detect_route``,
    embedded with ``MODEL`` and searched against ``INDEX`` for
    ``top_k * 20`` candidates (bounded by the corpus size). Candidates scoring
    below 0.20 are dropped. Each remaining chunk is copied and annotated with
    ``_score`` (the raw search score) and ``_final`` (raw score plus lexical
    and route boosts), then sorted by ``_final`` in descending order.

    Duplicates are removed by ``chunk_id``. A chunk without a ``chunk_id`` is
    identified by its index row instead, so chunks lacking the field are no
    longer collapsed into a single result.

    Returns an empty list when *top_k* is not positive or no chunks are
    loaded, instead of asking the index for zero neighbours.
    """
    if top_k <= 0 or not CHUNKS:
        return []

    route = detect_route(query)
    expanded = query + "\n" + "\n".join(route.get("query_expansion", []))
    q_vec = MODEL.encode([expanded], normalize_embeddings=True).astype(np.float32)
    # Over-fetch so the boosts can promote chunks the raw score ranked lower.
    scores, indices = INDEX.search(q_vec, min(len(CHUNKS), top_k * 20))
    q_norm = normalize(expanded)

    candidates = []
    for raw_score, raw_idx in zip(scores[0], indices[0]):
        row = int(raw_idx)
        score = float(raw_score)
        # Negative ids mark empty result slots; 0.20 is the relevance floor.
        if row < 0 or score < 0.20:
            continue
        chunk = CHUNKS[row].copy()
        chunk["_score"] = score
        chunk["_final"] = (
            score + lexical_boost(q_norm, chunk) + route_boost(chunk, route)
        )
        candidates.append((row, chunk))

    # list.sort is stable, so ties keep the order returned by the search.
    candidates.sort(key=lambda pair: pair[1]["_final"], reverse=True)

    seen = set()
    unique = []
    for row, chunk in candidates:
        chunk_id = chunk.get("chunk_id")
        # Fall back to the index row so id-less chunks stay distinct.
        identity = ("row", row) if chunk_id is None else ("id", chunk_id)
        if identity in seen:
            continue
        seen.add(identity)
        unique.append(chunk)
    return unique[:top_k]
def format_context(results: list[dict], max_chars: int = 700) -> str:
    """Render retrieved chunks as the plain-text context block for the LLM.

    Each chunk becomes a ``[SOURCE n]`` entry containing its label, optional
    article and normative-reference hints, authority, final score and a text
    excerpt. Entries are separated by a ``---`` rule.

    Args:
        results: Chunks as returned by ``retrieve``. A missing ``_final`` is
            shown as 0.000.
        max_chars: Maximum excerpt length. ``...`` is appended only when the
            text was actually cut, so a complete chunk is not presented as
            truncated.

    Returns:
        The formatted context, or ``""`` for an empty *results* list.
    """
    entries = []
    for position, chunk in enumerate(results, 1):
        art = f" — {chunk['article_hint']}" if chunk.get("article_hint") else ""
        norm = (
            f" [{chunk['normative_reference_hint']}]"
            if chunk.get("normative_reference_hint")
            else ""
        )
        # Treat a missing or null text as empty rather than the string "None".
        text = str(chunk.get("text") or "")
        excerpt = text[:max_chars] + "..." if len(text) > max_chars else text
        score = float(chunk.get("_final", 0.0))
        entries.append(
            f"[SOURCE {position}] {chunk.get('source_label', '')}{art}{norm}\n"
            f"Authority: {chunk.get('authority', '?')} | Score: {score:.3f}\n"
            f"{excerpt}"
        )
    return "\n\n---\n\n".join(entries)
# ── LLM ───────────────────────────────────────────────────────────
# System prompt sent with every request in call_claude(). It asks for a bare
# JSON object; render_report() reads the keys listed in this schema.
SYSTEM = """You are RegTech BR, a specialist AI in Brazilian crypto asset regulation.
Analyze the compliance query and produce a structured JSON assessment.
Respond ONLY with valid JSON — no markdown fences.
Schema:
{
"risk_level": "LOW | MEDIUM | HIGH | UNCLEAR",
"compliance_status": "COMPLIANT | NON-COMPLIANT | REQUIRES_REVIEW | INSUFFICIENT_INFO",
"applicable_regulations": ["list"],
"relevant_articles": ["list"],
"finding": "2-5 sentence assessment",
"corrective_action": "specific steps or 'No action required'",
"confidence": "HIGH | MEDIUM | LOW",
"authority": "BCB | CVM | COAF | mixed | federal"
}"""
def call_claude(query: str, context: str) -> dict | None:
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
if not api_key:
return None
prompt = (f"COMPLIANCE QUERY:\n{query}\n\n"
f"REGULATORY CONTEXT:\n\n{context}\n\n"
f"Produce a structured compliance assessment.")
try:
r = requests.post(
"https://api.anthropic.com/v1/messages",
headers={"Content-Type":"application/json",
"x-api-key": api_key,
"anthropic-version":"2023-06-01"},
json={"model":"claude-sonnet-4-6","max_tokens":1200,
"system":SYSTEM,"messages":[{"role":"user","content":prompt}]},
timeout=90,
)
r.raise_for_status()
raw = "".join(b.get("text","") for b in r.json().get("content",[])
if b.get("type")=="text")
raw = re.sub(r"^```(?:json)?|```$","",raw.strip(),flags=re.IGNORECASE).strip()
start, end = raw.find("{"), raw.rfind("}")
if start >= 0 and end > start:
return json.loads(raw[start:end+1])
except Exception as e:
print(f"Claude error: {e}")
return None
# ── Risk badge ─────────────────────────────────────────────────────
# Badge colour per risk level; unknown levels fall back to grey.
RISK_COLOR = {"HIGH":"#dc2626","MEDIUM":"#d97706","LOW":"#16a34a","UNCLEAR":"#6b7280"}
# Icon per compliance status; unknown statuses fall back to the question mark.
STATUS_ICON = {"NON-COMPLIANT":"⛔","COMPLIANT":"✅","REQUIRES_REVIEW":"⚠️","INSUFFICIENT_INFO":"❓"}


def _esc(value) -> str:
    """Return *value* as text that is safe to place inside HTML."""
    return html.escape(str(value))


def _as_items(value) -> list:
    """Coerce a model-supplied list field into a list of items."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare scalar becomes one item instead of being iterated per character.
    return [value]


def render_report(report: dict, query: str, results: list[dict]) -> str:
    """Render the model's assessment as a self-contained HTML card.

    Args:
        report: Parsed JSON assessment. Missing or null fields fall back to
            neutral defaults.
        query: The user's query. Accepted for interface compatibility; the
            card does not display it.
        results: Retrieved chunks; their ``source_id`` values are listed as
            badges at the bottom.

    Returns:
        An HTML string. Every value taken from *report* or *results* is
        HTML-escaped, because the report is model output shaped by user input
        and must not be able to inject markup into the page.
    """
    risk = str(report.get("risk_level") or "UNCLEAR")
    status = str(report.get("compliance_status") or "INSUFFICIENT_INFO")
    color = RISK_COLOR.get(risk, "#6b7280")
    icon = STATUS_ICON.get(status, "❓")
    conf = _esc(report.get("confidence") or "LOW")
    auth = _esc(report.get("authority") or "?")
    finding = _esc(report.get("finding") or "—")
    action = _esc(report.get("corrective_action") or "—")
    regs = "".join(
        f"<li>{_esc(item)}</li>"
        for item in _as_items(report.get("applicable_regulations"))
    )
    arts = "".join(
        f"<li>{_esc(item)}</li>"
        for item in _as_items(report.get("relevant_articles"))
    )
    srcs = "".join(
        f'<span style="background:#1e3a5f;color:#93c5fd;padding:2px 8px;'
        f'border-radius:4px;font-size:0.75rem;margin:2px;display:inline-block">'
        f'{_esc(r.get("source_id", ""))}</span>'
        for r in results
    )
    risk = _esc(risk)
    status = _esc(status)
    return f"""
<div style="font-family:'IBM Plex Mono',monospace;background:#0f172a;color:#e2e8f0;
padding:1.5rem;border-radius:12px;border:1px solid #1e3a5f;line-height:1.6">
<div style="display:flex;gap:1rem;align-items:center;margin-bottom:1.2rem;flex-wrap:wrap">
<span style="background:{color};color:#fff;padding:4px 14px;border-radius:6px;
font-weight:700;font-size:0.9rem;letter-spacing:0.05em">
{risk} RISK
</span>
<span style="background:#1e293b;color:#e2e8f0;padding:4px 14px;border-radius:6px;
font-size:0.9rem;border:1px solid #334155">
{icon} {status}
</span>
<span style="color:#64748b;font-size:0.8rem">
confidence: {conf} · authority: {auth}
</span>
</div>
<div style="background:#1e293b;border-radius:8px;padding:1rem;margin-bottom:1rem;
border-left:3px solid {color}">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Finding</div>
<div style="color:#e2e8f0;font-size:0.88rem">{finding}</div>
</div>
<div style="background:#1e293b;border-radius:8px;padding:1rem;margin-bottom:1rem">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Corrective Action</div>
<div style="color:#e2e8f0;font-size:0.88rem">{action}</div>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;margin-bottom:1rem">
<div style="background:#1e293b;border-radius:8px;padding:1rem">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Applicable Regulations</div>
<ul style="margin:0;padding-left:1.2rem;color:#93c5fd;font-size:0.8rem">{regs}</ul>
</div>
<div style="background:#1e293b;border-radius:8px;padding:1rem">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Relevant Articles</div>
<ul style="margin:0;padding-left:1.2rem;color:#86efac;font-size:0.8rem">{arts}</ul>
</div>
</div>
<div style="margin-top:0.8rem">
<div style="color:#64748b;font-size:0.7rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.4rem">Sources retrieved</div>
{srcs}
</div>
</div>"""
def render_error(msg: str) -> str:
    """Wrap *msg* in the red warning box shown in place of a report."""
    box_style = (
        "background:#1c1917;color:#fca5a5;padding:1.2rem;border-radius:8px;\n"
        "border:1px solid #7f1d1d;font-family:monospace"
    )
    opening_tag = f'<div style="{box_style}">'
    return "\n".join(["", opening_tag, f"⚠️ {msg}", "</div>"])
# ── Gradio interface ───────────────────────────────────────────────
# Sample queries (in Portuguese) offered as one-click examples in the UI via
# gr.Examples. These are runtime strings and are shown to the user verbatim.
EXAMPLES = [
    "Nossa plataforma permite compra e venda de criptoativos sem autorização formal do Banco Central.",
    "Realizamos KYC apenas para transações acima de R$100.000. Transações menores são anônimas.",
    "Nosso token REV distribui dividendos e dá direito de voto. Será ofertado publicamente por R$30M.",
    "Nossa exchange não realiza segregação patrimonial dos ativos dos clientes.",
    "Não possuímos diretor responsável formalmente designado para PLD/FTP.",
    "Qual é o marco legal dos criptoativos no Brasil segundo a Lei 14.478 de 2022?",
    "Qual órgão foi designado pelo Decreto 11.563/2023 para regular as PSAVs?",
]
def analyze(query: str) -> tuple[str, str]:
    """Run retrieval plus the LLM assessment for one query.

    Returns ``(report_html, context_text)``. The first element is an error
    box when the query is blank, nothing was retrieved, or no report came
    back from ``call_claude``. The context is empty for the first two cases
    and the formatted retrieval context otherwise.
    """
    stripped = query.strip() if query else ""
    if not stripped:
        return render_error("Please enter a compliance query."), ""

    hits = retrieve(stripped)
    if not hits:
        return render_error("No relevant regulatory chunks found. Try rephrasing your query."), ""

    context = format_context(hits)
    report = call_claude(query, context)
    if report:
        return render_report(report, query, hits), context

    failure = render_error(
        "Could not reach Claude API. Check that ANTHROPIC_API_KEY is set as a Space Secret."
    )
    return failure, context
# ── Layout ─────────────────────────────────────────────────────────
# Stylesheet passed to gr.Blocks(css=...). The .query-box, .analyze-btn and
# .context-box selectors match the elem_classes set on components in the
# layout, and #header matches the id in the header HTML.
# NOTE(review): ".svelte-1gfkn6j" looks like a build-generated class name and
# may stop matching after a Gradio upgrade — confirm.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;700&family=IBM+Plex+Sans:wght@300;400;600&display=swap');
body { background: #0a0f1e !important; }
.gradio-container {
background: #0a0f1e !important;
font-family: 'IBM Plex Sans', sans-serif !important;
max-width: 900px !important;
margin: 0 auto !important;
}
#header {
text-align: center;
padding: 2rem 0 1rem;
border-bottom: 1px solid #1e3a5f;
margin-bottom: 1.5rem;
}
#header h1 {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.8rem;
color: #38bdf8;
letter-spacing: -0.02em;
margin: 0 0 0.3rem;
}
#header p {
color: #64748b;
font-size: 0.85rem;
margin: 0;
}
.query-box textarea {
background: #0f172a !important;
border: 1px solid #1e3a5f !important;
color: #e2e8f0 !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.9rem !important;
border-radius: 8px !important;
}
.query-box textarea:focus {
border-color: #38bdf8 !important;
box-shadow: 0 0 0 2px rgba(56,189,248,0.15) !important;
}
.analyze-btn {
background: #0369a1 !important;
color: #fff !important;
font-family: 'IBM Plex Mono', monospace !important;
font-weight: 600 !important;
letter-spacing: 0.05em !important;
border: none !important;
border-radius: 8px !important;
height: 44px !important;
font-size: 0.85rem !important;
transition: background 0.2s !important;
}
.analyze-btn:hover {
background: #0284c7 !important;
}
.context-box textarea {
background: #0f172a !important;
border: 1px solid #1e293b !important;
color: #64748b !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.75rem !important;
}
label { color: #94a3b8 !important; font-size: 0.8rem !important; }
.svelte-1gfkn6j { background: #0a0f1e !important; }
"""
# Build the single-page UI: header, query box with analyze button, example
# queries, the rendered report, a collapsible raw-context panel and a footer.
# NOTE(review): the nesting below is reconstructed from the component order —
# confirm it matches the intended layout.
with gr.Blocks(css=CSS, title="RegTech BR") as demo:
    # NOTE(review): the "103 regulatory chunks" figure in this header is
    # hard-coded and not derived from the loaded index — confirm it is current.
    gr.HTML("""
<div id="header">
<h1>⚖ RegTech BR</h1>
<p>Brazilian Crypto Asset Compliance Analyzer · BCB · CVM · COAF · Lei 14.478/2022</p>
<p style="color:#1e3a5f;font-size:0.75rem;margin-top:0.3rem">
103 regulatory chunks · RAG + Claude Sonnet 4.6 · BLI Legal Tech Hackathon 2
</p>
</div>
""")
    with gr.Row():
        with gr.Column(scale=4):
            query_box = gr.Textbox(
                label="Compliance query or document excerpt",
                placeholder="Describe your policy, product, or compliance question in Portuguese or English...",
                lines=4,
                elem_classes=["query-box"],
            )
        with gr.Column(scale=1, min_width=120):
            analyze_btn = gr.Button("Analyze →", elem_classes=["analyze-btn"])
    # Each example is wrapped in a one-element list: one value per input.
    gr.Examples(
        examples=[[e] for e in EXAMPLES],
        inputs=[query_box],
        label="Example queries",
    )
    # Receives the HTML returned as the first element of analyze().
    report_html = gr.HTML(label="Compliance Assessment")
    with gr.Accordion("Retrieved regulatory context", open=False):
        # Receives the context string returned as the second element.
        context_box = gr.Textbox(
            label="Raw chunks retrieved by RAG",
            lines=10,
            interactive=False,
            elem_classes=["context-box"],
        )
    gr.HTML("""
<div style="text-align:center;color:#334155;font-size:0.72rem;
padding:1.5rem 0 0.5rem;font-family:'IBM Plex Mono',monospace">
⚠ Experimental pipeline. Not legal advice. Results require review by qualified professionals.<br>
RegTech BR · Fernando Rodrigues · Kaggle: fernandosr85
</div>
""")
    # Both the button click and submitting the textbox run the same analysis.
    analyze_btn.click(fn=analyze, inputs=[query_box], outputs=[report_html, context_box])
    query_box.submit(fn=analyze, inputs=[query_box], outputs=[report_html, context_box])
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()