Fernandosr85 committed on
Commit
7ac75d0
·
verified ·
1 Parent(s): 1ba2913

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -58
app.py CHANGED
@@ -240,28 +240,26 @@ def format_context(results: list[dict]) -> str:
240
 
241
  SYSTEM = """You are RegTech BR, a specialist AI in Brazilian crypto asset regulation.
242
  Analyze the compliance query and produce a structured JSON assessment.
243
- Respond ONLY with a valid JSON object — no markdown fences, no preamble, no extra text.
244
-
245
- Use EXACTLY these key names (snake_case, no variations):
246
  {
247
  "risk_level": "LOW | MEDIUM | HIGH | UNCLEAR",
248
  "compliance_status": "COMPLIANT | NON-COMPLIANT | REQUIRES_REVIEW | INSUFFICIENT_INFO",
249
- "applicable_regulations": ["regulation name 1", "regulation name 2"],
250
- "relevant_articles": ["Article reference 1", "Article reference 2"],
251
  "finding": "2-5 sentence assessment",
252
  "corrective_action": "specific steps or 'No action required'",
253
  "confidence": "HIGH | MEDIUM | LOW",
254
  "authority": "BCB | CVM | COAF | mixed | federal"
255
  }
256
-
257
- CRITICAL rules:
258
- - Key names must be EXACTLY: applicable_regulations, relevant_articles (snake_case only).
259
- - Both applicable_regulations and relevant_articles MUST be non-empty arrays.
260
- - Do NOT use: applicableRegulations, regulations, articles, artigos, or any variant.
261
- - If operating without required authorization: HIGH risk, NON-COMPLIANT.
262
- - If weak KYC or anonymous transactions: HIGH risk, NON-COMPLIANT.
263
- - If no segregation of client assets: HIGH risk, NON-COMPLIANT.
264
- - If tokens with dividends, voting rights, or public fundraising: HIGH risk, CVM securities.
265
  - Base the answer strictly on the retrieved regulatory context.
266
  """
267
 
@@ -279,36 +277,223 @@ def extract_json_object(raw: str) -> str:
279
  return raw
280
 
281
 
282
- # Key aliases the model sometimes uses instead of the correct snake_case names
283
- _REG_ALIASES = [
284
- "applicable_regulations", "applicableRegulations", "regulations",
285
- "applicable_regulation", "regulacoes_aplicaveis", "regulacoes", "legal_references",
286
- ]
287
- _ART_ALIASES = [
288
- "relevant_articles", "relevantArticles", "articles", "relevant_article",
289
- "artigos_relevantes", "artigos", "article_references",
290
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
 
 
 
 
292
 
293
- def normalize_report_keys(parsed: dict) -> dict:
294
- """Ensure applicable_regulations and relevant_articles use canonical key names."""
295
- if not parsed.get("applicable_regulations"):
296
- for alias in _REG_ALIASES:
297
- val = parsed.get(alias)
298
- if val and isinstance(val, list) and len(val) > 0:
299
- parsed["applicable_regulations"] = val
300
- print(f"KEY-FIX: mapped {alias!r} -> applicable_regulations", flush=True)
301
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
- if not parsed.get("relevant_articles"):
304
- for alias in _ART_ALIASES:
305
- val = parsed.get(alias)
306
- if val and isinstance(val, list) and len(val) > 0:
307
- parsed["relevant_articles"] = val
308
- print(f"KEY-FIX: mapped {alias!r} -> relevant_articles", flush=True)
309
- break
 
310
 
311
- return parsed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
 
314
  def call_claude(query: str, context: str) -> dict | None:
@@ -316,13 +501,17 @@ def call_claude(query: str, context: str) -> dict | None:
316
  if not api_key:
317
  print("Missing ANTHROPIC_API_KEY.", flush=True)
318
  return None
 
319
  prompt = (
320
  f"COMPLIANCE QUERY:\n{query}\n\n"
321
  f"REGULATORY CONTEXT:\n\n{context}\n\n"
322
- f"Produce a structured compliance assessment. "
323
- f"Use EXACTLY these key names: applicable_regulations, relevant_articles (snake_case). "
324
- f"Both must be non-empty arrays."
 
 
325
  )
 
326
  try:
327
  response = requests.post(
328
  "https://api.anthropic.com/v1/messages",
@@ -339,19 +528,27 @@ def call_claude(query: str, context: str) -> dict | None:
339
  },
340
  timeout=90,
341
  )
 
 
342
  response.raise_for_status()
 
 
343
  raw = "".join(
344
  block.get("text", "")
345
- for block in response.json().get("content", [])
346
  if block.get("type") == "text"
347
  )
348
- print(f"CLAUDE JSON KEYS: {list(json.loads(extract_json_object(raw)).keys()) if raw else 'empty'}", flush=True)
349
  clean = extract_json_object(raw)
350
- parsed = json.loads(clean)
351
- parsed = normalize_report_keys(parsed)
352
- print(f"REGS({len(parsed.get('applicable_regulations') or [])}): {(parsed.get('applicable_regulations') or [])[:2]}", flush=True)
353
- print(f"ARTS({len(parsed.get('relevant_articles') or [])}): {(parsed.get('relevant_articles') or [])[:2]}", flush=True)
354
- return parsed
 
 
 
 
 
355
  except Exception as exc:
356
  print(f"Claude error: {type(exc).__name__}: {exc}", flush=True)
357
  return None
@@ -376,14 +573,6 @@ STATUS_ICON = {
376
  }
377
 
378
 
379
- def as_list(value) -> list[str]:
380
- if value is None:
381
- return []
382
- if isinstance(value, list):
383
- return [str(v) for v in value if v]
384
- return [str(value)]
385
-
386
-
387
  def esc(value) -> str:
388
  return html.escape("" if value is None else str(value))
389
 
@@ -499,14 +688,39 @@ EXAMPLES = [
499
  def analyze(query: str) -> tuple[str, str]:
500
  if not query or not query.strip():
501
  return render_error("Please enter a compliance query."), ""
 
502
  query = query.strip()
 
 
 
503
  results = retrieve(query)
 
 
 
 
 
 
 
 
 
 
 
504
  if not results:
505
  return render_error("No relevant regulatory chunks found. Try rephrasing your query."), ""
 
506
  context = format_context(results)
507
  report = call_claude(query, context)
508
  if not report:
509
  return render_error("Could not reach Claude API. Check that ANTHROPIC_API_KEY is set as a Space Secret."), context
 
 
 
 
 
 
 
 
 
510
  return render_report(report, query, results), context
511
 
512
 
 
240
 
241
  SYSTEM = """You are RegTech BR, a specialist AI in Brazilian crypto asset regulation.
242
  Analyze the compliance query and produce a structured JSON assessment.
243
+ Respond ONLY with valid JSON — no markdown fences.
244
+ Use EXACTLY these snake_case keys:
 
245
  {
246
  "risk_level": "LOW | MEDIUM | HIGH | UNCLEAR",
247
  "compliance_status": "COMPLIANT | NON-COMPLIANT | REQUIRES_REVIEW | INSUFFICIENT_INFO",
248
+ "applicable_regulations": ["list of regulation names"],
249
+ "relevant_articles": ["list of specific article references"],
250
  "finding": "2-5 sentence assessment",
251
  "corrective_action": "specific steps or 'No action required'",
252
  "confidence": "HIGH | MEDIUM | LOW",
253
  "authority": "BCB | CVM | COAF | mixed | federal"
254
  }
255
+ Rules:
256
+ - Always populate applicable_regulations and relevant_articles as non-empty arrays.
257
+ - Use only regulation/article references present in the retrieved context.
258
+ - If an exact article is unclear, cite the closest source/article_hint from the retrieved context instead of leaving the array empty.
259
+ - If the query describes operating without required authorization, flag high risk.
260
+ - If the query describes weak KYC or anonymous transactions, flag high risk.
261
+ - If the query describes no segregation of client assets, flag high risk.
262
+ - If the query describes tokens with dividends, voting rights, or public fundraising, flag CVM securities risk.
 
263
  - Base the answer strictly on the retrieved regulatory context.
264
  """
265
 
 
277
  return raw
278
 
279
 
280
+ # ============================================================
281
+ # Claude output normalization and safety fallback
282
+ # ============================================================
283
+
284
# Maps each canonical report key to the spellings accepted for it in a Claude
# response. Every list starts with the canonical snake_case name, followed by
# camelCase, space-separated, English-synonym and Portuguese variants (with and
# without diacritics). _lookup_alias() tries the entries of a list in order,
# so earlier spellings take precedence when several are present.
KEY_ALIASES = {
    "risk_level": [
        "risk_level", "riskLevel", "risk", "level", "nivel_risco", "nível_risco",
        "nivel_de_risco", "nível_de_risco",
    ],
    "compliance_status": [
        "compliance_status", "complianceStatus", "status", "compliance",
        "status_conformidade", "conformidade",
    ],
    "applicable_regulations": [
        "applicable_regulations", "applicableRegulations", "applicable regulation",
        "applicable regulations", "regulations", "regulation", "laws", "legal_basis",
        "legalBasis", "normas_aplicaveis", "normas_aplicáveis", "regulacoes_aplicaveis",
        "regulações_aplicáveis", "regulamentacoes", "regulamentações",
    ],
    "relevant_articles": [
        "relevant_articles", "relevantArticles", "relevant articles", "articles",
        "article_references", "legal_references", "citations", "references",
        "artigos_relevantes", "artigos", "dispositivos", "dispositivos_relevantes",
    ],
    "finding": [
        "finding", "findings", "assessment", "analysis", "analise", "análise",
        "conclusao", "conclusão", "avaliacao", "avaliação",
    ],
    "corrective_action": [
        "corrective_action", "correctiveAction", "action", "recommended_action",
        "recommendation", "recomendacao", "recomendação", "acao_corretiva", "ação_corretiva",
    ],
    "confidence": [
        "confidence", "confidence_level", "confidenceLevel", "confianca", "confiança",
    ],
    "authority": [
        "authority", "authority_type", "regulator", "agency", "orgao", "órgão",
        "autoridade", "autoridade_competente",
    ],
}
320
+
321
+
322
+ def _norm_key(key: str) -> str:
323
+ key = unicodedata.normalize("NFD", str(key or ""))
324
+ key = "".join(c for c in key if unicodedata.category(c) != "Mn")
325
+ key = re.sub(r"[^a-zA-Z0-9]+", "_", key).strip("_").lower()
326
+ return key
327
+
328
+
329
def _is_blank_value(value) -> bool:
    """Return True for None, whitespace-only strings, and empty containers."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    if isinstance(value, (list, tuple, dict, set)):
        return not value
    return False


def _lookup_alias(data: dict, canonical_key: str):
    """Find the value stored in *data* under any accepted spelling of a key.

    Candidate keys are collected in priority order: exact alias spellings
    first (in KEY_ALIASES order), then keys whose normalized form matches a
    normalized alias. The first candidate holding a non-blank value wins, so
    an empty canonical entry (e.g. ``"applicable_regulations": []``) does not
    hide a populated alias such as ``"regulations"``.

    Args:
        data: Parsed response object; anything that is not a dict yields None.
        canonical_key: Canonical key whose aliases should be searched.

    Returns:
        The first non-blank matching value; if every match is blank, the value
        of the highest-priority match; None when no alias is present.
    """
    if not isinstance(data, dict):
        return None

    aliases = KEY_ALIASES.get(canonical_key, [])

    # Exact spellings take priority over normalized (case/accent/separator
    # insensitive) matches, mirroring the original two-pass lookup.
    matched_keys = [alias for alias in aliases if alias in data]
    norm_to_original = {_norm_key(k): k for k in data}
    for alias in aliases:
        norm_alias = _norm_key(alias)
        if norm_alias in norm_to_original:
            matched_keys.append(norm_to_original[norm_alias])

    if not matched_keys:
        return None

    for key in matched_keys:
        value = data.get(key)
        if not _is_blank_value(value):
            return value

    # Every match was blank: keep the historical result (first match's value).
    return data.get(matched_keys[0])
345
+
346
+
347
def as_list(value) -> list[str]:
    """Coerce Claude output into a clean, de-duplicated list of strings.

    Handles arrays, strings, numbers, and arrays of objects such as:
    [{"name": "Lei 14.478/2022"}, {"article": "Art. 7º"}]

    Args:
        value: Any JSON-decoded value (None, scalar, list/tuple, or dict).

    Returns:
        Stripped, non-empty strings in first-seen order. None, blank strings
        and objects without any truthy value all produce an empty list, so
        callers can rely on ``not as_list(x)`` to detect missing data.
    """
    if value is None:
        return []

    if isinstance(value, (list, tuple)):
        flattened = []
        for item in value:
            flattened.extend(as_list(item))
        # Each recursive call already returns stripped, non-empty strings;
        # dict.fromkeys removes duplicates while preserving order.
        return list(dict.fromkeys(flattened))

    if isinstance(value, dict):
        preferred = [
            "name", "title", "reference", "article", "regulation", "law",
            "text", "label", "value", "source", "source_label",
        ]
        for key in preferred:
            if value.get(key):
                return as_list(value[key])
        # No well-known field: summarize the truthy fields. An object with no
        # truthy fields must yield [] rather than [""], otherwise the result
        # looks non-empty to callers.
        summary = "; ".join(f"{k}: {v}" for k, v in value.items() if v).strip()
        return [summary] if summary else []

    text_value = str(value).strip()
    return [text_value] if text_value else []
375
+
376
+
377
def infer_regulations_from_results(results: list[dict], max_items: int = 4) -> list[str]:
    """Build fallback regulation names from the retrieved chunks.

    For each chunk the source label is preferred, extended with the normative
    reference hint when the label does not already contain it; otherwise the
    hint alone, and finally the source id. Empty entries are skipped and
    duplicates are removed (first occurrence kept) before truncating to
    *max_items*.
    """
    def _field(chunk: dict, name: str) -> str:
        return str(chunk.get(name) or "").strip()

    unique: dict[str, None] = {}
    for chunk in results or []:
        label = _field(chunk, "source_label")
        norm_ref = _field(chunk, "normative_reference_hint")

        if not label:
            entry = norm_ref or _field(chunk, "source_id")
        elif norm_ref and norm_ref not in label:
            entry = f"{label} — {norm_ref}"
        else:
            entry = label

        if entry:
            unique.setdefault(entry, None)

    return list(unique)[:max_items]
397
+
398
+
399
def infer_articles_from_results(results: list[dict], max_items: int = 6) -> list[str]:
    """Build fallback article references from the retrieved chunks.

    When a chunk carries an article hint it is prefixed with the normative
    reference hint or, failing that, the source label. Chunks without an
    article hint contribute their normative reference hint or source id.
    Empty entries are skipped and duplicates are removed (first occurrence
    kept) before truncating to *max_items*.
    """
    def _field(chunk: dict, name: str) -> str:
        return str(chunk.get(name) or "").strip()

    unique: dict[str, None] = {}
    for chunk in results or []:
        article = _field(chunk, "article_hint")
        norm_ref = _field(chunk, "normative_reference_hint")

        if article:
            prefix = norm_ref or _field(chunk, "source_label")
            entry = f"{prefix} — {article}" if prefix else article
        else:
            entry = norm_ref or _field(chunk, "source_id")

        if entry:
            unique.setdefault(entry, None)

    return list(unique)[:max_items]
424
+
425
+
426
def _normalize_enum_token(value, default: str) -> str:
    """Upper-case *value* and join its words with underscores.

    Hyphens, underscores and whitespace are all treated as word separators.
    None and blank input yield *default*.
    """
    text = str(value if value is not None else "").upper()
    words = text.replace("-", " ").replace("_", " ").split()
    return "_".join(words) or default


def canonicalize_report(report: dict, results: list[dict]) -> dict:
    """Normalize Claude response keys and guarantee non-empty legal-reference arrays.

    Args:
        report: Parsed Claude response; a non-dict is treated as an empty report.
        results: Retrieved chunks, used to fill in legal references and the
            authority when the response omits them.

    Returns:
        A new dict containing every original entry plus the canonical keys.
        ``risk_level``, ``compliance_status`` and ``confidence`` are upper-cased
        and spelled as in the SYSTEM prompt (``NON-COMPLIANT`` hyphenated,
        ``REQUIRES_REVIEW`` / ``INSUFFICIENT_INFO`` with underscores);
        ``applicable_regulations`` and ``relevant_articles`` are lists of
        strings, inferred from *results* when the response provides none.
    """
    if not isinstance(report, dict):
        report = {}

    canonical = dict(report)

    for key in KEY_ALIASES:
        value = _lookup_alias(report, key)
        if value is not None:
            canonical[key] = value

    # `or`-style defaults: a key that is present but null/blank must fall back
    # to the default instead of being rendered as the string "NONE".
    canonical["risk_level"] = _normalize_enum_token(canonical.get("risk_level"), "UNCLEAR")

    status = _normalize_enum_token(canonical.get("compliance_status"), "INSUFFICIENT_INFO")
    # Only NON-COMPLIANT is hyphenated in the prompt vocabulary; the other
    # multi-word statuses keep their underscores.
    if status in ("NON_COMPLIANT", "NONCOMPLIANT"):
        status = "NON-COMPLIANT"
    canonical["compliance_status"] = status

    canonical["confidence"] = str(canonical.get("confidence") or "LOW").strip().upper()

    # Drop blank entries so a list such as [""] still triggers the fallback.
    regs = [item for item in as_list(canonical.get("applicable_regulations")) if item]
    if not regs:
        regs = infer_regulations_from_results(results)
        print(
            "[WARN] applicable_regulations empty or missing in Claude response; "
            f"filled from retrieved sources: {regs}",
            flush=True,
        )

    articles = [item for item in as_list(canonical.get("relevant_articles")) if item]
    if not articles:
        articles = infer_articles_from_results(results)
        print(
            "[WARN] relevant_articles empty or missing in Claude response; "
            f"filled from retrieved sources: {articles}",
            flush=True,
        )

    canonical["applicable_regulations"] = regs
    canonical["relevant_articles"] = articles

    if not canonical.get("finding"):
        canonical["finding"] = "Assessment generated from the retrieved regulatory context."
    if not canonical.get("corrective_action"):
        canonical["corrective_action"] = "Review the cited regulatory sources and update the compliance procedure accordingly."
    if not canonical.get("authority"):
        authorities = [str(r.get("authority")) for r in results or [] if r.get("authority")]
        # Several distinct authorities -> "mixed"; exactly one -> that one;
        # none known -> "?".
        canonical["authority"] = "mixed" if len(set(authorities)) > 1 else (authorities[0] if authorities else "?")

    return canonical
476
+
477
+
478
+ def debug_print_claude(raw: str, clean: str, parsed: dict | None = None) -> None:
479
+ print("\n" + "=" * 72, flush=True)
480
+ print("CLAUDE RAW RESPONSE START", flush=True)
481
+ print(raw or "<EMPTY RAW RESPONSE>", flush=True)
482
+ print("CLAUDE RAW RESPONSE END", flush=True)
483
+ print("-" * 72, flush=True)
484
+ print("CLAUDE EXTRACTED JSON START", flush=True)
485
+ print(clean or "<EMPTY EXTRACTED JSON>", flush=True)
486
+ print("CLAUDE EXTRACTED JSON END", flush=True)
487
+ if isinstance(parsed, dict):
488
+ print("-" * 72, flush=True)
489
+ print(f"CLAUDE PARSED KEYS: {sorted(parsed.keys())}", flush=True)
490
+ print(
491
+ "CLAUDE LEGAL ARRAYS: "
492
+ f"applicable_regulations={parsed.get('applicable_regulations')!r}; "
493
+ f"relevant_articles={parsed.get('relevant_articles')!r}",
494
+ flush=True,
495
+ )
496
+ print("=" * 72 + "\n", flush=True)
497
 
498
 
499
  def call_claude(query: str, context: str) -> dict | None:
 
501
  if not api_key:
502
  print("Missing ANTHROPIC_API_KEY.", flush=True)
503
  return None
504
+
505
  prompt = (
506
  f"COMPLIANCE QUERY:\n{query}\n\n"
507
  f"REGULATORY CONTEXT:\n\n{context}\n\n"
508
+ "Produce a structured compliance assessment. "
509
+ "Return ONLY valid JSON using EXACTLY these keys: "
510
+ "risk_level, compliance_status, applicable_regulations, relevant_articles, "
511
+ "finding, corrective_action, confidence, authority. "
512
+ "The arrays applicable_regulations and relevant_articles must be non-empty."
513
  )
514
+
515
  try:
516
  response = requests.post(
517
  "https://api.anthropic.com/v1/messages",
 
528
  },
529
  timeout=90,
530
  )
531
+
532
+ print(f"Claude HTTP status: {response.status_code}", flush=True)
533
  response.raise_for_status()
534
+
535
+ payload = response.json()
536
  raw = "".join(
537
  block.get("text", "")
538
+ for block in payload.get("content", [])
539
  if block.get("type") == "text"
540
  )
 
541
  clean = extract_json_object(raw)
542
+
543
+ try:
544
+ parsed = json.loads(clean)
545
+ debug_print_claude(raw, clean, parsed)
546
+ return parsed
547
+ except json.JSONDecodeError as json_exc:
548
+ debug_print_claude(raw, clean, None)
549
+ print(f"Claude JSON parse error: {json_exc}", flush=True)
550
+ return None
551
+
552
  except Exception as exc:
553
  print(f"Claude error: {type(exc).__name__}: {exc}", flush=True)
554
  return None
 
573
  }
574
 
575
 
 
 
 
 
 
 
 
 
576
def esc(value) -> str:
    """HTML-escape *value* after converting it to text; None becomes ""."""
    if value is None:
        return html.escape("")
    return html.escape(str(value))
578
 
 
688
def analyze(query: str) -> tuple[str, str]:
    """Run retrieval, the Claude assessment and rendering for one query.

    Args:
        query: Free-text compliance question entered by the user.

    Returns:
        A pair ``(rendered, context)``: the output of render_report (or of
        render_error on failure) and the regulatory context passed to Claude.
        The context is "" when the query is blank or nothing was retrieved.
    """
    if not query or not query.strip():
        return render_error("Please enter a compliance query."), ""

    query = query.strip()
    print("\n" + "=" * 72, flush=True)
    print(f"NEW QUERY: {query}", flush=True)

    # Treat a None result like "nothing retrieved" so the logging below cannot
    # fail before the empty-result check runs.
    results = retrieve(query) or []
    print(f"Retrieved chunks: {len(results)}", flush=True)
    for i, r in enumerate(results, 1):
        # Diagnostics only: a missing, null or non-numeric score must not
        # abort the request.
        try:
            final_score = float(r.get('_final') or 0.0)
        except (TypeError, ValueError):
            final_score = 0.0
        print(
            f"[RAG {i}] source_id={r.get('source_id')} | "
            f"authority={r.get('authority')} | "
            f"article_hint={r.get('article_hint')} | "
            f"normative_reference_hint={r.get('normative_reference_hint')} | "
            f"final_score={final_score:.3f}",
            flush=True,
        )

    if not results:
        return render_error("No relevant regulatory chunks found. Try rephrasing your query."), ""

    context = format_context(results)
    report = call_claude(query, context)
    if not report:
        return render_error("Could not reach Claude API. Check that ANTHROPIC_API_KEY is set as a Space Secret."), context

    # Map alias keys to canonical names and backfill empty legal-reference
    # arrays from the retrieved chunks before rendering.
    report = canonicalize_report(report, results)
    print(
        "FINAL NORMALIZED REPORT LEGAL ARRAYS: "
        f"applicable_regulations={report.get('applicable_regulations')!r}; "
        f"relevant_articles={report.get('relevant_articles')!r}",
        flush=True,
    )

    return render_report(report, query, results), context
725
 
726