Spaces:

Fernandosr85
/

regtech-br

Running

App Files Files Community

Fernandosr85 committed 3 days ago

Commit

7ac75d0

verified ·

1 Parent(s): 1ba2913

Update app.py

Browse files

Files changed (1) hide show

app.py +272 -58

app.py CHANGED Viewed

@@ -240,28 +240,26 @@ def format_context(results: list[dict]) -> str:
 SYSTEM = """You are RegTech BR, a specialist AI in Brazilian crypto asset regulation.
 Analyze the compliance query and produce a structured JSON assessment.
-Respond ONLY with a valid JSON object — no markdown fences, no preamble, no extra text.
-Use EXACTLY these key names (snake_case, no variations):
 {
   "risk_level": "LOW | MEDIUM | HIGH | UNCLEAR",
   "compliance_status": "COMPLIANT | NON-COMPLIANT | REQUIRES_REVIEW | INSUFFICIENT_INFO",
-  "applicable_regulations": ["regulation name 1", "regulation name 2"],
-  "relevant_articles": ["Article reference 1", "Article reference 2"],
   "finding": "2-5 sentence assessment",
   "corrective_action": "specific steps or 'No action required'",
   "confidence": "HIGH | MEDIUM | LOW",
   "authority": "BCB | CVM | COAF | mixed | federal"
 }
-CRITICAL rules:
-- Key names must be EXACTLY: applicable_regulations, relevant_articles (snake_case only).
-- Both applicable_regulations and relevant_articles MUST be non-empty arrays.
-- Do NOT use: applicableRegulations, regulations, articles, artigos, or any variant.
-- If operating without required authorization: HIGH risk, NON-COMPLIANT.
-- If weak KYC or anonymous transactions: HIGH risk, NON-COMPLIANT.
-- If no segregation of client assets: HIGH risk, NON-COMPLIANT.
-- If tokens with dividends, voting rights, or public fundraising: HIGH risk, CVM securities.
 - Base the answer strictly on the retrieved regulatory context.
 """
@@ -279,36 +277,223 @@ def extract_json_object(raw: str) -> str:
     return raw
-# Key aliases the model sometimes uses instead of the correct snake_case names
-_REG_ALIASES = [
-    "applicable_regulations", "applicableRegulations", "regulations",
-    "applicable_regulation", "regulacoes_aplicaveis", "regulacoes", "legal_references",
-]
-_ART_ALIASES = [
-    "relevant_articles", "relevantArticles", "articles", "relevant_article",
-    "artigos_relevantes", "artigos", "article_references",
-]
-def normalize_report_keys(parsed: dict) -> dict:
-    """Ensure applicable_regulations and relevant_articles use canonical key names."""
-    if not parsed.get("applicable_regulations"):
-        for alias in _REG_ALIASES:
-            val = parsed.get(alias)
-            if val and isinstance(val, list) and len(val) > 0:
-                parsed["applicable_regulations"] = val
-                print(f"KEY-FIX: mapped {alias!r} -> applicable_regulations", flush=True)
-                break
-    if not parsed.get("relevant_articles"):
-        for alias in _ART_ALIASES:
-            val = parsed.get(alias)
-            if val and isinstance(val, list) and len(val) > 0:
-                parsed["relevant_articles"] = val
-                print(f"KEY-FIX: mapped {alias!r} -> relevant_articles", flush=True)
-                break
-    return parsed
 def call_claude(query: str, context: str) -> dict | None:
@@ -316,13 +501,17 @@ def call_claude(query: str, context: str) -> dict | None:
     if not api_key:
         print("Missing ANTHROPIC_API_KEY.", flush=True)
         return None
     prompt = (
         f"COMPLIANCE QUERY:\n{query}\n\n"
         f"REGULATORY CONTEXT:\n\n{context}\n\n"
-        f"Produce a structured compliance assessment. "
-        f"Use EXACTLY these key names: applicable_regulations, relevant_articles (snake_case). "
-        f"Both must be non-empty arrays."
     )
     try:
         response = requests.post(
             "https://api.anthropic.com/v1/messages",
@@ -339,19 +528,27 @@ def call_claude(query: str, context: str) -> dict | None:
             },
             timeout=90,
         )
         response.raise_for_status()
         raw = "".join(
             block.get("text", "")
-            for block in response.json().get("content", [])
             if block.get("type") == "text"
         )
-        print(f"CLAUDE JSON KEYS: {list(json.loads(extract_json_object(raw)).keys()) if raw else 'empty'}", flush=True)
         clean = extract_json_object(raw)
-        parsed = json.loads(clean)
-        parsed = normalize_report_keys(parsed)
-        print(f"REGS({len(parsed.get('applicable_regulations') or [])}): {(parsed.get('applicable_regulations') or [])[:2]}", flush=True)
-        print(f"ARTS({len(parsed.get('relevant_articles') or [])}): {(parsed.get('relevant_articles') or [])[:2]}", flush=True)
-        return parsed
     except Exception as exc:
         print(f"Claude error: {type(exc).__name__}: {exc}", flush=True)
         return None
@@ -376,14 +573,6 @@ STATUS_ICON = {
 }
-def as_list(value) -> list[str]:
-    if value is None:
-        return []
-    if isinstance(value, list):
-        return [str(v) for v in value if v]
-    return [str(value)]
 def esc(value) -> str:
     return html.escape("" if value is None else str(value))
@@ -499,14 +688,39 @@ EXAMPLES = [
 def analyze(query: str) -> tuple[str, str]:
     if not query or not query.strip():
         return render_error("Please enter a compliance query."), ""
     query = query.strip()
     results = retrieve(query)
     if not results:
         return render_error("No relevant regulatory chunks found. Try rephrasing your query."), ""
     context = format_context(results)
     report = call_claude(query, context)
     if not report:
         return render_error("Could not reach Claude API. Check that ANTHROPIC_API_KEY is set as a Space Secret."), context
     return render_report(report, query, results), context

 # Instruction text for the model: it demands a bare JSON object (no markdown
 # fences) with eight snake_case keys, enumerates the allowed values for the
 # enum-like fields, and lists the rules for populating the two legal-reference
 # arrays and for flagging risk.
 # NOTE(review): how this constant is handed to the API request is not visible
 # in this view — confirm against call_claude.
 SYSTEM = """You are RegTech BR, a specialist AI in Brazilian crypto asset regulation.
 Analyze the compliance query and produce a structured JSON assessment.
+Respond ONLY with valid JSON — no markdown fences.
+Use EXACTLY these snake_case keys:
 {
   "risk_level": "LOW | MEDIUM | HIGH | UNCLEAR",
   "compliance_status": "COMPLIANT | NON-COMPLIANT | REQUIRES_REVIEW | INSUFFICIENT_INFO",
+  "applicable_regulations": ["list of regulation names"],
+  "relevant_articles": ["list of specific article references"],
   "finding": "2-5 sentence assessment",
   "corrective_action": "specific steps or 'No action required'",
   "confidence": "HIGH | MEDIUM | LOW",
   "authority": "BCB | CVM | COAF | mixed | federal"
 }
+Rules:
+- Always populate applicable_regulations and relevant_articles as non-empty arrays.
+- Use only regulation/article references present in the retrieved context.
+- If an exact article is unclear, cite the closest source/article_hint from the retrieved context instead of leaving the array empty.
+- If the query describes operating without required authorization, flag high risk.
+- If the query describes weak KYC or anonymous transactions, flag high risk.
+- If the query describes no segregation of client assets, flag high risk.
+- If the query describes tokens with dividends, voting rights, or public fundraising, flag CVM securities risk.
 - Base the answer strictly on the retrieved regulatory context.
 """
     return raw
+# ============================================================
+# Claude output normalization and safety fallback
+# ============================================================
+KEY_ALIASES = {
+    "risk_level": [
+        "risk_level", "riskLevel", "risk", "level", "nivel_risco", "nível_risco",
+        "nivel_de_risco", "nível_de_risco",
+    ],
+    "compliance_status": [
+        "compliance_status", "complianceStatus", "status", "compliance",
+        "status_conformidade", "conformidade",
+    ],
+    "applicable_regulations": [
+        "applicable_regulations", "applicableRegulations", "applicable regulation",
+        "applicable regulations", "regulations", "regulation", "laws", "legal_basis",
+        "legalBasis", "normas_aplicaveis", "normas_aplicáveis", "regulacoes_aplicaveis",
+        "regulações_aplicáveis", "regulamentacoes", "regulamentações",
+    ],
+    "relevant_articles": [
+        "relevant_articles", "relevantArticles", "relevant articles", "articles",
+        "article_references", "legal_references", "citations", "references",
+        "artigos_relevantes", "artigos", "dispositivos", "dispositivos_relevantes",
+    ],
+    "finding": [
+        "finding", "findings", "assessment", "analysis", "analise", "análise",
+        "conclusao", "conclusão", "avaliacao", "avaliação",
+    ],
+    "corrective_action": [
+        "corrective_action", "correctiveAction", "action", "recommended_action",
+        "recommendation", "recomendacao", "recomendação", "acao_corretiva", "ação_corretiva",
+    ],
+    "confidence": [
+        "confidence", "confidence_level", "confidenceLevel", "confianca", "confiança",
+    ],
+    "authority": [
+        "authority", "authority_type", "regulator", "agency", "orgao", "órgão",
+        "autoridade", "autoridade_competente",
+    ],
+}
+def _norm_key(key: str) -> str:
+    key = unicodedata.normalize("NFD", str(key or ""))
+    key = "".join(c for c in key if unicodedata.category(c) != "Mn")
+    key = re.sub(r"[^a-zA-Z0-9]+", "_", key).strip("_").lower()
+    return key
+def _lookup_alias(data: dict, canonical_key: str):
+    if not isinstance(data, dict):
+        return None
+    direct_aliases = KEY_ALIASES.get(canonical_key, [])
+    for alias in direct_aliases:
+        if alias in data:
+            return data.get(alias)
+    norm_to_original = {_norm_key(k): k for k in data.keys()}
+    for alias in direct_aliases:
+        norm_alias = _norm_key(alias)
+        if norm_alias in norm_to_original:
+            return data.get(norm_to_original[norm_alias])
+    return None
def as_list(value) -> list[str]:
    """Coerce a model-produced value into a clean list of non-empty strings.

    Handles ``None``, scalars, (nested) lists/tuples and dicts such as
    ``[{"name": "Lei 14.478/2022"}, {"article": "Art. 7º"}]``.

    * Sequences are flattened recursively and de-duplicated, keeping the
      order of first occurrence.
    * For a dict, the first preferred label key whose value resolves to
      something non-empty is used; otherwise the truthy items are summarised
      as ``"key: value; ..."``.
    * Every returned string is stripped; blanks are never returned, so a dict
      without usable values gives ``[]`` rather than ``[""]``.
    """
    if value is None:
        return []

    if isinstance(value, (list, tuple)):
        flattened: list[str] = []
        for item in value:
            flattened.extend(as_list(item))
        # Leaves are already stripped and non-empty; only duplicates remain.
        return list(dict.fromkeys(flattened))

    if isinstance(value, dict):
        preferred = (
            "name", "title", "reference", "article", "regulation", "law",
            "text", "label", "value", "source", "source_label",
        )
        for key in preferred:
            if value.get(key):
                extracted = as_list(value[key])
                if extracted:
                    return extracted
        summary = "; ".join(f"{k}: {v}" for k, v in value.items() if v).strip()
        return [summary] if summary else []

    text_value = str(value).strip()
    return [text_value] if text_value else []
def infer_regulations_from_results(results: list[dict], max_items: int = 4) -> list[str]:
    """Build fallback regulation names from the retrieved chunks.

    For each chunk the source label is preferred, extended with the normative
    reference hint when the label does not already contain it; without a
    label the hint is used, and without a hint the source id. Empty entries
    are skipped, duplicates are dropped (first occurrence kept) and at most
    *max_items* entries are returned.
    """
    def _text(chunk: dict, field: str) -> str:
        return str(chunk.get(field) or "").strip()

    unique: dict[str, None] = {}
    for chunk in results or []:
        label = _text(chunk, "source_label")
        hint = _text(chunk, "normative_reference_hint")
        fallback_id = _text(chunk, "source_id")

        if not label:
            entry = hint or fallback_id
        elif hint and hint not in label:
            entry = f"{label} — {hint}"
        else:
            entry = label

        if entry:
            unique.setdefault(entry, None)
    return list(unique)[:max_items]
def infer_articles_from_results(results: list[dict], max_items: int = 6) -> list[str]:
    """Build fallback article references from the retrieved chunks.

    A chunk with an article hint yields ``"<reference> — <article>"``, where
    the reference is the normative reference hint or, failing that, the
    source label; with neither, the bare article hint is used. A chunk
    without an article hint contributes its normative reference hint or,
    failing that, its source id. Empty entries are skipped, duplicates are
    dropped (first occurrence kept) and at most *max_items* are returned.
    """
    def _text(chunk: dict, field: str) -> str:
        return str(chunk.get(field) or "").strip()

    seen: set[str] = set()
    ordered: list[str] = []
    for chunk in results or []:
        art = _text(chunk, "article_hint")
        ref = _text(chunk, "normative_reference_hint")
        lbl = _text(chunk, "source_label")
        sid = _text(chunk, "source_id")

        if art:
            prefix = ref or lbl
            entry = f"{prefix} — {art}" if prefix else art
        else:
            entry = ref or sid

        if entry and entry not in seen:
            seen.add(entry)
            ordered.append(entry)
    return ordered[:max_items]
def canonicalize_report(report: dict, results: list[dict]) -> dict:
    """Normalize Claude response keys and guarantee non-empty legal-reference arrays.

    Args:
        report: Parsed model response; anything that is not a dict is treated
            as an empty response.
        results: Retrieved chunks, used to fill in regulations, articles and
            authority when the model left them out.

    Returns:
        A new dict that keeps every original key and additionally carries the
        eight canonical keys. ``risk_level``, ``compliance_status`` and
        ``confidence`` are upper-cased and default to ``UNCLEAR``,
        ``INSUFFICIENT_INFO`` and ``LOW`` when missing, null or blank.
        ``compliance_status`` is emitted in the spelling the prompt defines:
        underscores everywhere except ``NON-COMPLIANT``.
    """
    if not isinstance(report, dict):
        report = {}
    canonical = dict(report)
    for key in KEY_ALIASES:
        value = _lookup_alias(report, key)
        if value is not None:
            canonical[key] = value

    def _enum_token(key: str, default: str) -> str:
        # A JSON null or blank string must fall back to the default instead
        # of being rendered as the literal text "NONE" / "".
        raw = canonical.get(key)
        text = "" if raw is None else str(raw).strip()
        return (text or default).upper()

    canonical["risk_level"] = _enum_token("risk_level", "UNCLEAR").replace("-", "_")

    # The prompt vocabulary mixes separators: NON-COMPLIANT uses a hyphen while
    # REQUIRES_REVIEW and INSUFFICIENT_INFO use underscores. Collapse every
    # separator to "_" first, then restore the single hyphenated value.
    # NOTE(review): assumes downstream lookups are keyed by the prompt's
    # spellings — confirm against STATUS_ICON.
    status = _enum_token("compliance_status", "INSUFFICIENT_INFO")
    status = status.replace("-", "_").replace(" ", "_")
    canonical["compliance_status"] = "NON-COMPLIANT" if status == "NON_COMPLIANT" else status

    canonical["confidence"] = _enum_token("confidence", "LOW")

    regs = as_list(canonical.get("applicable_regulations"))
    if not regs:
        regs = infer_regulations_from_results(results)
        print(
            "[WARN] applicable_regulations empty or missing in Claude response; "
            f"filled from retrieved sources: {regs}",
            flush=True,
        )
    articles = as_list(canonical.get("relevant_articles"))
    if not articles:
        articles = infer_articles_from_results(results)
        print(
            "[WARN] relevant_articles empty or missing in Claude response; "
            f"filled from retrieved sources: {articles}",
            flush=True,
        )
    canonical["applicable_regulations"] = regs
    canonical["relevant_articles"] = articles

    if not canonical.get("finding"):
        canonical["finding"] = "Assessment generated from the retrieved regulatory context."
    if not canonical.get("corrective_action"):
        canonical["corrective_action"] = "Review the cited regulatory sources and update the compliance procedure accordingly."
    if not canonical.get("authority"):
        # Derive the authority from the retrieved chunks: "mixed" when they
        # disagree, the single value when they agree, "?" when none is given.
        authorities = [str(r.get("authority")) for r in results or [] if r.get("authority")]
        canonical["authority"] = "mixed" if len(set(authorities)) > 1 else (authorities[0] if authorities else "?")
    return canonical
def debug_print_claude(raw: str, clean: str, parsed: dict | None = None) -> None:
    """Dump the raw model response, the extracted JSON text and, when *parsed*
    is a dict, its sorted keys and the two legal-reference arrays to stdout.

    Each line is printed with ``flush=True``; empty *raw* / *clean* values are
    replaced by placeholder markers.
    """
    heavy_rule = "=" * 72
    light_rule = "-" * 72

    def _lines():
        # Generated lazily so each line is printed before the next is built.
        yield "\n" + heavy_rule
        yield "CLAUDE RAW RESPONSE START"
        yield raw or "<EMPTY RAW RESPONSE>"
        yield "CLAUDE RAW RESPONSE END"
        yield light_rule
        yield "CLAUDE EXTRACTED JSON START"
        yield clean or "<EMPTY EXTRACTED JSON>"
        yield "CLAUDE EXTRACTED JSON END"
        if isinstance(parsed, dict):
            yield light_rule
            yield f"CLAUDE PARSED KEYS: {sorted(parsed.keys())}"
            yield (
                "CLAUDE LEGAL ARRAYS: "
                f"applicable_regulations={parsed.get('applicable_regulations')!r}; "
                f"relevant_articles={parsed.get('relevant_articles')!r}"
            )
        yield heavy_rule + "\n"

    for line in _lines():
        print(line, flush=True)
 def call_claude(query: str, context: str) -> dict | None:
     if not api_key:
         print("Missing ANTHROPIC_API_KEY.", flush=True)
         return None
     prompt = (
         f"COMPLIANCE QUERY:\n{query}\n\n"
         f"REGULATORY CONTEXT:\n\n{context}\n\n"
+        "Produce a structured compliance assessment. "
+        "Return ONLY valid JSON using EXACTLY these keys: "
+        "risk_level, compliance_status, applicable_regulations, relevant_articles, "
+        "finding, corrective_action, confidence, authority. "
+        "The arrays applicable_regulations and relevant_articles must be non-empty."
     )
     try:
         response = requests.post(
             "https://api.anthropic.com/v1/messages",
             },
             timeout=90,
         )
+        print(f"Claude HTTP status: {response.status_code}", flush=True)
         response.raise_for_status()
+        payload = response.json()
         raw = "".join(
             block.get("text", "")
+            for block in payload.get("content", [])
             if block.get("type") == "text"
         )
         clean = extract_json_object(raw)
+        try:
+            parsed = json.loads(clean)
+            debug_print_claude(raw, clean, parsed)
+            return parsed
+        except json.JSONDecodeError as json_exc:
+            debug_print_claude(raw, clean, None)
+            print(f"Claude JSON parse error: {json_exc}", flush=True)
+            return None
     except Exception as exc:
         print(f"Claude error: {type(exc).__name__}: {exc}", flush=True)
         return None
 }
 def esc(value) -> str:
     """HTML-escape *value* after converting it to text; ``None`` becomes ``""``."""
     if value is None:
         return ""
     return html.escape(str(value))
 def analyze(query: str) -> tuple[str, str]:
     """Run one compliance query: retrieve chunks, ask Claude, render the result.

     Returns a pair ``(rendered, context)``. ``rendered`` comes from
     ``render_report`` on success or ``render_error`` on failure; ``context``
     is the formatted retrieval context, or ``""`` when the query is blank or
     nothing was retrieved.
     """
     if not query or not query.strip():
         return render_error("Please enter a compliance query."), ""
     query = query.strip()
     print("\n" + "=" * 72, flush=True)
     print(f"NEW QUERY: {query}", flush=True)
     # Treat a None result like "nothing found" so the debug logging below
     # cannot raise before the guard that reports it to the user.
     results = retrieve(query) or []
     print(f"Retrieved chunks: {len(results)}", flush=True)
     for i, r in enumerate(results, 1):
         # Logging must never abort the request: a missing or non-numeric
         # score is shown as "n/a" instead of raising from float().
         try:
             score = f"{float(r.get('_final', 0.0)):.3f}"
         except (TypeError, ValueError):
             score = "n/a"
         print(
             f"[RAG {i}] source_id={r.get('source_id')} | "
             f"authority={r.get('authority')} | "
             f"article_hint={r.get('article_hint')} | "
             f"normative_reference_hint={r.get('normative_reference_hint')} | "
             f"final_score={score}",
             flush=True,
         )
     if not results:
         return render_error("No relevant regulatory chunks found. Try rephrasing your query."), ""
     context = format_context(results)
     report = call_claude(query, context)
     if not report:
         return render_error("Could not reach Claude API. Check that ANTHROPIC_API_KEY is set as a Space Secret."), context
     # Map alias keys to canonical names and backfill the legal-reference
     # arrays from the retrieved chunks before rendering.
     report = canonicalize_report(report, results)
     print(
         "FINAL NORMALIZED REPORT LEGAL ARRAYS: "
         f"applicable_regulations={report.get('applicable_regulations')!r}; "
         f"relevant_articles={report.get('relevant_articles')!r}",
         flush=True,
     )
     return render_report(report, query, results), context