Spaces:
Runtime error
Runtime error
| """ | |
| RegTech BR — Hugging Face Space | |
| ================================ | |
| Compliance analyzer for Brazilian crypto asset regulation. | |
| Uses FAISS RAG + Claude Sonnet 4.6 to produce structured risk assessments. | |
| Setup: | |
| 1. Upload faiss_index.bin, embeddings.npy, chunks_meta.jsonl to data/index/ | |
| 2. Add ANTHROPIC_API_KEY as a Space Secret | |
| 3. Deploy | |
| """ | |
| import os, json, re, time, unicodedata | |
| from pathlib import Path | |
| import numpy as np | |
| import gradio as gr | |
| import requests | |
| # ── Paths ────────────────────────────────────────────────────────── | |
# Directory holding the prebuilt retrieval artifacts
# (chunks_meta.jsonl, embeddings.npy, faiss_index.bin).
INDEX_DIR = Path("data/index")

# ── Load index at startup ──────────────────────────────────────────
print("Loading RegTech BR index...")
import faiss
from sentence_transformers import SentenceTransformer

# Chunk metadata: one JSON object per non-blank line of the JSONL file.
with open(INDEX_DIR / "chunks_meta.jsonl", encoding="utf-8") as meta_file:
    CHUNKS: list[dict] = [
        json.loads(record)
        for record in (raw_line.strip() for raw_line in meta_file)
        if record
    ]

# Embedding matrix saved next to the index.
EMBEDDINGS = np.load(INDEX_DIR / "embeddings.npy")

# FAISS index that retrieve() searches.
INDEX = faiss.read_index(str(INDEX_DIR / "faiss_index.bin"))

# Sentence encoder used to embed queries at request time.
MODEL = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

print(f"Index ready — {len(CHUNKS)} chunks, {INDEX.ntotal} vectors")
| # ── Normalization & routing ──────────────────────────────────────── | |
def normalize(text: str) -> str:
    """Fold *text* into a lowercase, accent-free, space-separated form.

    The text is NFD-decomposed, combining marks (category ``Mn``) are
    dropped, the result is lowercased, and every run of characters outside
    ``a-z0-9`` becomes a single space. ``None`` or an empty string yields
    ``""``.
    """
    decomposed = unicodedata.normalize("NFD", text or "")
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    alnum_only = re.sub(r"[^a-z0-9]+", " ", without_marks.lower())
    # Only single spaces remain as separators here, so split/join both
    # collapses whitespace and trims the ends.
    return " ".join(alnum_only.split())
# Keywords that hint at which authority a query concerns. They are
# substring-matched against the normalized query in detect_route().
AUTHORITY_KW = {
    "BCB": [
        "banco central",
        "bcb",
        "psav",
        "ativo virtual",
        "ativos virtuais",
        "criptoativo",
        "autorizacao",
        "cambio",
        "segregacao",
        "patrimonial",
        "ativos dos clientes",
        "circular",
        "resolucao bcb",
    ],
    "CVM": [
        "cvm",
        "valores mobiliarios",
        "valor mobiliario",
        "token",
        "tokens",
        "oferta publica",
        "dividendos",
        "direito de voto",
        "captar",
        "rwa",
        "parecer",
    ],
    "COAF": [
        "coaf",
        "pessoa exposta politicamente",
        "pep",
        "kyc",
        "lavagem",
        "terrorismo",
        "pld",
        "ftp",
        "anonimo",
        "anonima",
        "identificacao do cliente",
    ],
}

# Keywords that point at a specific source document. Each key is a fragment
# that route_boost() looks for inside a chunk's normalized ``source_id``.
SOURCE_KW = {
    "lei_14478": [
        "lei 14 478",
        "lei 14478",
        "marco legal",
        "lei dos criptoativos",
    ],
    "decreto_11563": [
        "decreto 11 563",
        "decreto 11563",
    ],
    "bcb_circular_3978": [
        "circular 3978",
        "pld",
        "ftp",
        "lavagem",
        "kyc",
        "anonimo",
    ],
    "bcb_in701": [
        "instrucao normativa 701",
        "certificacao tecnica",
        "segregacao",
    ],
    "bcb_res548": [
        "resolucao 548",
        "autorizacao psav",
    ],
    "cvm": [
        "cvm",
        "valores mobiliarios",
        "token",
        "dividendos",
    ],
    "coaf": [
        "coaf",
        "pep",
        "pessoa exposta politicamente",
    ],
}
def detect_route(query: str) -> dict:
    """Derive retrieval routing hints from keyword matches in *query*.

    Returns a dict with three de-duplicated, order-preserving lists:

    * ``authority_filters`` — authorities from ``AUTHORITY_KW`` whose
      keyword hit count equals the best count or is at least 2 (empty when
      nothing matched).
    * ``source_id_contains`` — keys of ``SOURCE_KW`` with at least one
      keyword present in the normalized query.
    * ``query_expansion`` — extra phrases added when the query mentions
      Lei 14.478 or Decreto 11.563.
    """
    q = normalize(query)

    def mentions(keywords) -> bool:
        # Substring match of each normalized keyword against the query.
        return any(normalize(kw) in q for kw in keywords)

    sources: list[str] = []
    expansions: list[str] = []

    # The two statutes that carry query expansions are checked first.
    statute_expansions = (
        ("lei_14478", ["Lei 14.478 de 2022", "marco legal criptoativos"]),
        ("decreto_11563", ["Decreto 11.563 de 2023", "Banco Central"]),
    )
    for source_key, extra_terms in statute_expansions:
        if mentions(SOURCE_KW[source_key]):
            sources.append(source_key)
            expansions.extend(extra_terms)

    # Count keyword hits per authority.
    hit_counts = {
        authority: sum(1 for kw in keywords if normalize(kw) in q)
        for authority, keywords in AUTHORITY_KW.items()
    }
    best = max(hit_counts.values()) if hit_counts else 0
    authorities = (
        [a for a, count in hit_counts.items() if count == best or count >= 2]
        if best
        else []
    )

    # Every source whose keywords appear; duplicates are removed below.
    sources.extend(key for key, keywords in SOURCE_KW.items() if mentions(keywords))

    return {
        "authority_filters": list(dict.fromkeys(authorities)),
        "source_id_contains": list(dict.fromkeys(sources)),
        "query_expansion": list(dict.fromkeys(expansions)),
    }
def route_boost(chunk: dict, route: dict) -> float:
    """Return the routing bonus for *chunk*, capped at 0.55.

    Each ``source_id_contains`` fragment found in the chunk's normalized
    ``source_id`` adds 0.35. A chunk whose ``authority`` is listed in
    ``authority_filters`` adds a further 0.08.
    """
    source_id = normalize(chunk.get("source_id", ""))
    matched_fragments = sum(
        1
        for fragment in route.get("source_id_contains", [])
        if normalize(fragment) in source_id
    )
    bonus = 0.35 * matched_fragments
    if chunk.get("authority") in route.get("authority_filters", []):
        bonus += 0.08
    return min(bonus, 0.55)
def lexical_boost(query_norm: str, chunk: dict) -> float:
    """Return a small lexical-overlap bonus (at most 0.12) for *chunk*.

    Distinct query terms of four or more characters are substring-matched
    against the normalized concatenation of the chunk's ``source_id``,
    ``authority``, ``tags`` and ``text``. The bonus is
    ``hits / max(6, distinct_terms) * 0.12``.

    Args:
        query_norm: Query text already passed through ``normalize``.
        chunk: Chunk metadata dict. Missing, ``None`` or non-string fields
            are tolerated, so one malformed record cannot abort retrieval.

    Returns:
        A float in ``[0.0, 0.12]``; ``0.0`` when the query has no term of
        four or more characters.
    """
    terms = {term for term in query_norm.split() if len(term) >= 4}
    if not terms:
        # Nothing to match; skip normalising the chunk text.
        return 0.0

    tags = chunk.get("tags") or []
    if isinstance(tags, str):
        # A bare string would otherwise be joined character by character.
        tags = [tags]

    fields = [chunk.get("source_id"), chunk.get("authority"), *tags, chunk.get("text")]
    chunk_norm = normalize(" ".join(str(field) for field in fields if field is not None))

    hits = sum(1 for term in terms if term in chunk_norm)
    return min(0.12, hits / max(6, len(terms)) * 0.12)
def retrieve(query: str, top_k: int = 6, min_score: float = 0.20) -> list[dict]:
    """Return up to *top_k* chunks for *query*, re-ranked with boosts.

    The query is expanded with the routing phrases from ``detect_route``,
    embedded, and searched against ``INDEX`` with ``top_k * 20`` candidates
    (bounded by the corpus size). Each candidate at or above *min_score* is
    copied and given ``_score`` (raw similarity) and ``_final`` (similarity
    plus lexical and routing boosts). Candidates are sorted by ``_final``
    and de-duplicated.

    Args:
        query: User query text.
        top_k: Maximum number of chunks to return.
        min_score: Raw similarity below which a candidate is discarded.

    Returns:
        A list of chunk dicts (copies), best first. Empty when ``top_k`` is
        not positive, the corpus is empty, or nothing passes the threshold.
    """
    if top_k <= 0 or not CHUNKS:
        # Do not call INDEX.search with a non-positive k.
        return []

    route = detect_route(query)
    expanded = query + "\n" + "\n".join(route.get("query_expansion", []))
    q_vec = MODEL.encode([expanded], normalize_embeddings=True).astype(np.float32)
    scores, indices = INDEX.search(q_vec, min(len(CHUNKS), top_k * 20))
    q_norm = normalize(expanded)

    ranked: list[tuple[int, dict]] = []
    for raw_score, raw_idx in zip(scores[0], indices[0]):
        idx, score = int(raw_idx), float(raw_score)
        # Negative ids are skipped as before. Ids past the end of CHUNKS
        # mean the index and the metadata are out of sync; skip those
        # instead of raising IndexError.
        if idx < 0 or idx >= len(CHUNKS) or score < min_score:
            continue
        chunk = CHUNKS[idx].copy()
        chunk["_score"] = score
        chunk["_final"] = score + lexical_boost(q_norm, chunk) + route_boost(chunk, route)
        ranked.append((idx, chunk))

    ranked.sort(key=lambda item: item[1]["_final"], reverse=True)

    seen: set = set()
    unique: list[dict] = []
    for idx, chunk in ranked:
        chunk_id = chunk.get("chunk_id")
        # Chunks without a chunk_id are keyed by their row number so they
        # do not all collapse into a single `None` entry.
        key = ("id", chunk_id) if chunk_id is not None else ("row", idx)
        if key in seen:
            continue
        seen.add(key)
        unique.append(chunk)
        if len(unique) == top_k:
            break
    return unique
def format_context(results: list[dict], max_chars: int = 700) -> str:
    """Render retrieved chunks as the plain-text context sent to the LLM.

    Each chunk becomes a ``[SOURCE n]`` block with its label, optional
    article and normative-reference hints, authority, final score and a
    text excerpt. Blocks are separated by a ``---`` rule.

    Args:
        results: Chunk dicts as returned by ``retrieve``.
        max_chars: Maximum excerpt length. ``"..."`` is appended only when
            the chunk text was actually cut, so a complete short chunk is
            not presented as truncated.

    Returns:
        The formatted context, or ``""`` for an empty result list.
    """
    blocks = []
    for position, chunk in enumerate(results, 1):
        article = f" — {chunk['article_hint']}" if chunk.get("article_hint") else ""
        reference = (
            f" [{chunk['normative_reference_hint']}]"
            if chunk.get("normative_reference_hint")
            else ""
        )
        # A missing or None text renders as empty rather than "None".
        text = str(chunk.get("text") or "")
        excerpt = (text[:max_chars] + "...") if len(text) > max_chars else text
        final_score = float(chunk.get("_final") or 0.0)
        blocks.append(
            f"[SOURCE {position}] {chunk.get('source_label', '')}{article}{reference}\n"
            f"Authority: {chunk.get('authority', '?')} | Score: {final_score:.3f}\n"
            f"{excerpt}"
        )
    return "\n\n---\n\n".join(blocks)
| # ── LLM ─────────────────────────────────────────────────────────── | |
| SYSTEM = """You are RegTech BR, a specialist AI in Brazilian crypto asset regulation. | |
| Analyze the compliance query and produce a structured JSON assessment. | |
| Respond ONLY with valid JSON — no markdown fences. | |
| Schema: | |
| { | |
| "risk_level": "LOW | MEDIUM | HIGH | UNCLEAR", | |
| "compliance_status": "COMPLIANT | NON-COMPLIANT | REQUIRES_REVIEW | INSUFFICIENT_INFO", | |
| "applicable_regulations": ["list"], | |
| "relevant_articles": ["list"], | |
| "finding": "2-5 sentence assessment", | |
| "corrective_action": "specific steps or 'No action required'", | |
| "confidence": "HIGH | MEDIUM | LOW", | |
| "authority": "BCB | CVM | COAF | mixed | federal" | |
| }""" | |
| def call_claude(query: str, context: str) -> dict | None: | |
| api_key = os.environ.get("ANTHROPIC_API_KEY", "") | |
| if not api_key: | |
| return None | |
| prompt = (f"COMPLIANCE QUERY:\n{query}\n\n" | |
| f"REGULATORY CONTEXT:\n\n{context}\n\n" | |
| f"Produce a structured compliance assessment.") | |
| try: | |
| r = requests.post( | |
| "https://api.anthropic.com/v1/messages", | |
| headers={"Content-Type":"application/json", | |
| "x-api-key": api_key, | |
| "anthropic-version":"2023-06-01"}, | |
| json={"model":"claude-sonnet-4-6","max_tokens":1200, | |
| "system":SYSTEM,"messages":[{"role":"user","content":prompt}]}, | |
| timeout=90, | |
| ) | |
| r.raise_for_status() | |
| raw = "".join(b.get("text","") for b in r.json().get("content",[]) | |
| if b.get("type")=="text") | |
| raw = re.sub(r"^```(?:json)?|```$","",raw.strip(),flags=re.IGNORECASE).strip() | |
| start, end = raw.find("{"), raw.rfind("}") | |
| if start >= 0 and end > start: | |
| return json.loads(raw[start:end+1]) | |
| except Exception as e: | |
| print(f"Claude error: {e}") | |
| return None | |
| # ── Risk badge ───────────────────────────────────────────────────── | |
# Badge background colour per risk level.
RISK_COLOR = {"HIGH":"#dc2626","MEDIUM":"#d97706","LOW":"#16a34a","UNCLEAR":"#6b7280"}
# Icon shown next to the compliance status.
STATUS_ICON = {"NON-COMPLIANT":"⛔","COMPLIANT":"✅","REQUIRES_REVIEW":"⚠️","INSUFFICIENT_INFO":"❓"}


def _as_items(value) -> list:
    """Coerce a model-supplied field into a list of items to render."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare string (or other scalar) is one item, not a sequence of chars.
    return [value]


def render_report(report: dict, query: str, results: list[dict]) -> str:
    """Render the model's assessment as an HTML card.

    Every value that comes from the model reply or from chunk metadata is
    HTML-escaped before interpolation, so markup in those values is shown
    as text instead of being injected into the page.

    Args:
        report: Parsed assessment dict (see the ``SYSTEM`` schema). Missing
            or ``None`` fields fall back to neutral defaults.
        query: The original query. Unused; kept for interface stability.
        results: Retrieved chunks; their ``source_id`` values are listed as
            badges.

    Returns:
        An HTML fragment string.
    """
    from html import escape  # local import keeps this block self-contained

    def esc(value) -> str:
        return escape(str(value))

    risk = str(report.get("risk_level") or "UNCLEAR")
    status = str(report.get("compliance_status") or "INSUFFICIENT_INFO")
    # Lookups ignore case and surrounding blanks; the label is shown as-is.
    color = RISK_COLOR.get(risk.strip().upper(), "#6b7280")
    icon = STATUS_ICON.get(status.strip().upper(), "❓")
    conf = report.get("confidence") or "LOW"
    auth = report.get("authority") or "?"
    finding = report.get("finding") or "—"
    action = report.get("corrective_action") or "—"

    regs = "".join(
        f"<li>{esc(item)}</li>"
        for item in _as_items(report.get("applicable_regulations"))
    )
    arts = "".join(
        f"<li>{esc(item)}</li>"
        for item in _as_items(report.get("relevant_articles"))
    )
    srcs = "".join(
        f'<span style="background:#1e3a5f;color:#93c5fd;padding:2px 8px;'
        f'border-radius:4px;font-size:0.75rem;margin:2px;display:inline-block">'
        f'{esc(r.get("source_id") or "")}</span>'
        for r in results
    )
    return f"""
<div style="font-family:'IBM Plex Mono',monospace;background:#0f172a;color:#e2e8f0;
padding:1.5rem;border-radius:12px;border:1px solid #1e3a5f;line-height:1.6">
<div style="display:flex;gap:1rem;align-items:center;margin-bottom:1.2rem;flex-wrap:wrap">
<span style="background:{color};color:#fff;padding:4px 14px;border-radius:6px;
font-weight:700;font-size:0.9rem;letter-spacing:0.05em">
{esc(risk)} RISK
</span>
<span style="background:#1e293b;color:#e2e8f0;padding:4px 14px;border-radius:6px;
font-size:0.9rem;border:1px solid #334155">
{icon} {esc(status)}
</span>
<span style="color:#64748b;font-size:0.8rem">
confidence: {esc(conf)} · authority: {esc(auth)}
</span>
</div>
<div style="background:#1e293b;border-radius:8px;padding:1rem;margin-bottom:1rem;
border-left:3px solid {color}">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Finding</div>
<div style="color:#e2e8f0;font-size:0.88rem">{esc(finding)}</div>
</div>
<div style="background:#1e293b;border-radius:8px;padding:1rem;margin-bottom:1rem">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Corrective Action</div>
<div style="color:#e2e8f0;font-size:0.88rem">{esc(action)}</div>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;margin-bottom:1rem">
<div style="background:#1e293b;border-radius:8px;padding:1rem">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Applicable Regulations</div>
<ul style="margin:0;padding-left:1.2rem;color:#93c5fd;font-size:0.8rem">{regs}</ul>
</div>
<div style="background:#1e293b;border-radius:8px;padding:1rem">
<div style="color:#94a3b8;font-size:0.75rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.5rem">Relevant Articles</div>
<ul style="margin:0;padding-left:1.2rem;color:#86efac;font-size:0.8rem">{arts}</ul>
</div>
</div>
<div style="margin-top:0.8rem">
<div style="color:#64748b;font-size:0.7rem;text-transform:uppercase;
letter-spacing:0.1em;margin-bottom:0.4rem">Sources retrieved</div>
{srcs}
</div>
</div>"""
def render_error(msg: str) -> str:
    """Wrap *msg* in the red warning box shown in place of a report."""
    return (
        "\n"
        '<div style="background:#1c1917;color:#fca5a5;padding:1.2rem;border-radius:8px;\n'
        'border:1px solid #7f1d1d;font-family:monospace">\n'
        f"⚠️ {msg}\n"
        "</div>"
    )
| # ── Gradio interface ─────────────────────────────────────────────── | |
# Sample queries offered in the UI (Portuguese).
EXAMPLES = [
    # Policy / product descriptions to be assessed.
    "Nossa plataforma permite compra e venda de criptoativos sem autorização formal do Banco Central.",
    "Realizamos KYC apenas para transações acima de R$100.000. Transações menores são anônimas.",
    "Nosso token REV distribui dividendos e dá direito de voto. Será ofertado publicamente por R$30M.",
    "Nossa exchange não realiza segregação patrimonial dos ativos dos clientes.",
    "Não possuímos diretor responsável formalmente designado para PLD/FTP.",
    # Direct questions about the legal framework.
    "Qual é o marco legal dos criptoativos no Brasil segundo a Lei 14.478 de 2022?",
    "Qual órgão foi designado pelo Decreto 11.563/2023 para regular as PSAVs?",
]
def analyze(query: str) -> tuple[str, str]:
    """Run retrieval and the LLM assessment for the UI.

    Returns:
        ``(report_html, context_text)``. ``report_html`` is an error box
        when the query is blank, nothing was retrieved, or the model call
        produced no report. ``context_text`` is the retrieved context, or
        ``""`` when retrieval did not produce any.
    """
    cleaned = query.strip() if query else ""
    if not cleaned:
        return render_error("Please enter a compliance query."), ""

    hits = retrieve(cleaned)
    if not hits:
        return render_error("No relevant regulatory chunks found. Try rephrasing your query."), ""

    context = format_context(hits)
    # The model and the renderer receive the query exactly as typed.
    assessment = call_claude(query, context)
    if assessment:
        return render_report(assessment, query, hits), context

    failure = render_error(
        "Could not reach Claude API. Check that ANTHROPIC_API_KEY is set as a Space Secret."
    )
    return failure, context
| # ── Layout ───────────────────────────────────────────────────────── | |
# Stylesheet passed to gr.Blocks: dark theme, IBM Plex fonts, and rules for
# the elem_classes / ids used in the layout (query-box, analyze-btn,
# context-box, #header).
CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;700&family=IBM+Plex+Sans:wght@300;400;600&display=swap');
body { background: #0a0f1e !important; }
.gradio-container {
background: #0a0f1e !important;
font-family: 'IBM Plex Sans', sans-serif !important;
max-width: 900px !important;
margin: 0 auto !important;
}
#header {
text-align: center;
padding: 2rem 0 1rem;
border-bottom: 1px solid #1e3a5f;
margin-bottom: 1.5rem;
}
#header h1 {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.8rem;
color: #38bdf8;
letter-spacing: -0.02em;
margin: 0 0 0.3rem;
}
#header p {
color: #64748b;
font-size: 0.85rem;
margin: 0;
}
.query-box textarea {
background: #0f172a !important;
border: 1px solid #1e3a5f !important;
color: #e2e8f0 !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.9rem !important;
border-radius: 8px !important;
}
.query-box textarea:focus {
border-color: #38bdf8 !important;
box-shadow: 0 0 0 2px rgba(56,189,248,0.15) !important;
}
.analyze-btn {
background: #0369a1 !important;
color: #fff !important;
font-family: 'IBM Plex Mono', monospace !important;
font-weight: 600 !important;
letter-spacing: 0.05em !important;
border: none !important;
border-radius: 8px !important;
height: 44px !important;
font-size: 0.85rem !important;
transition: background 0.2s !important;
}
.analyze-btn:hover {
background: #0284c7 !important;
}
.context-box textarea {
background: #0f172a !important;
border: 1px solid #1e293b !important;
color: #64748b !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.75rem !important;
}
label { color: #94a3b8 !important; font-size: 0.8rem !important; }
.svelte-1gfkn6j { background: #0a0f1e !important; }
"""
# Static banner rendered above the form.
HEADER_HTML = """
<div id="header">
<h1>⚖ RegTech BR</h1>
<p>Brazilian Crypto Asset Compliance Analyzer · BCB · CVM · COAF · Lei 14.478/2022</p>
<p style="color:#1e3a5f;font-size:0.75rem;margin-top:0.3rem">
103 regulatory chunks · RAG + Claude Sonnet 4.6 · BLI Legal Tech Hackathon 2
</p>
</div>
"""

# Disclaimer and credits rendered below the results.
FOOTER_HTML = """
<div style="text-align:center;color:#334155;font-size:0.72rem;
padding:1.5rem 0 0.5rem;font-family:'IBM Plex Mono',monospace">
⚠ Experimental pipeline. Not legal advice. Results require review by qualified professionals.<br>
RegTech BR · Fernando Rodrigues · Kaggle: fernandosr85
</div>
"""

with gr.Blocks(css=CSS, title="RegTech BR") as demo:
    gr.HTML(HEADER_HTML)

    # Query input with the submit button beside it.
    with gr.Row():
        with gr.Column(scale=4):
            query_box = gr.Textbox(
                label="Compliance query or document excerpt",
                placeholder="Describe your policy, product, or compliance question in Portuguese or English...",
                lines=4,
                elem_classes=["query-box"],
            )
        with gr.Column(scale=1, min_width=120):
            analyze_btn = gr.Button("Analyze →", elem_classes=["analyze-btn"])

    gr.Examples(
        examples=[[example] for example in EXAMPLES],
        inputs=[query_box],
        label="Example queries",
    )

    # Outputs: the rendered report and the raw retrieved context.
    report_html = gr.HTML(label="Compliance Assessment")
    with gr.Accordion("Retrieved regulatory context", open=False):
        context_box = gr.Textbox(
            label="Raw chunks retrieved by RAG",
            lines=10,
            interactive=False,
            elem_classes=["context-box"],
        )

    gr.HTML(FOOTER_HTML)

    # The button click and Enter in the textbox run the same handler.
    run_analysis = dict(
        fn=analyze,
        inputs=[query_box],
        outputs=[report_html, context_box],
    )
    analyze_btn.click(**run_analysis)
    query_box.submit(**run_analysis)

if __name__ == "__main__":
    demo.launch()