Spaces:

Danielfonseca1212
/

assistentjuri

Runtime error

App Files Files Community

Danielfonseca1212 committed on Mar 25

Commit

e24d656

verified ·

1 Parent(s): 3c70859

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -306

app.py CHANGED Viewed

@@ -1,327 +1,134 @@
-"""
-lex-mcp — Assistente Jurídico via MCP + Hugging Face
-Expõe ferramentas especializadas em direito para qualquer LLM host compatível com MCP.
-"""
-from __future__ import annotations
 import os
-import re
-from typing import Any
 import httpx
-from fastmcp import FastMCP
-from huggingface_hub import HfApi, ModelFilter, list_models
-from huggingface_hub.utils import RepositoryNotFoundError
-from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
-# ── Bootstrap ─────────────────────────────────────────────────────────────────
-mcp = FastMCP(
-    name="lex-mcp",
-    instructions="""
-    Você é LEX, um assistente jurídico especializado alimentado por modelos e datasets
-    do Hugging Face Hub. Você possui quatro ferramentas:
-    • search_legal_models   — encontra modelos de NLP treinados em domínio jurídico
-    • explore_legal_dataset — inspeciona datasets jurídicos (jurisprudência, leis, contratos)
-    • analyze_legal_text    — roda inferência NLP em texto jurídico (classificação, NER, resumo)
-    • find_jurisprudence    — busca decisões e ementas em datasets de jurisprudência
-    IMPORTANTE: Sempre use as ferramentas para buscar dados atuais do Hub.
-    Nunca invente modelos ou citações. Indique limitações quando relevante.
-    Responda em português quando o usuário escrever em português.
-    """,
-)
 HF_TOKEN = os.getenv("HF_TOKEN")
-api = HfApi(token=HF_TOKEN)
-# Modelos jurídicos de referência no HF Hub (curados)
-LEGAL_MODEL_HINTS = [
-    "legal", "juridico", "jurídico", "law", "legislation",
-    "bert-legal", "legalbert", "law-bert", "contracts", "court",
-    "nlp-laval", "legal-xlm", "legalbench", "saul", "brazilianLegal",
-]
-LEGAL_DATASET_HINTS = [
-    "legal", "law", "court", "jurisprudence", "legislation",
-    "contracts", "case-law", "oab", "stf", "stj", "tjsp",
-]
-# ── Tool 1 — search_legal_models ──────────────────────────────────────────────
-@mcp.tool(
-    description=(
-        "Busca modelos de NLP especializados em domínio jurídico no Hugging Face Hub. "
-        "Filtre por língua (ex: 'pt' para português), tarefa (ex: 'text-classification', "
-        "'token-classification', 'summarization') e palavras-chave. "
-        "Retorna os modelos mais baixados com metadados completos."
-    )
-)
-def search_legal_models(
-    query: str = "legal",
-    language: str = "pt",
-    task: str = "",
-    limit: int = 8,
-) -> list[dict[str, Any]]:
-    """Retorna modelos jurídicos ordenados por downloads."""
-    # Enriquecer query com termos jurídicos se necessário
-    legal_query = query if any(h in query.lower() for h in LEGAL_MODEL_HINTS) else f"legal {query}"
-    filters = ModelFilter(
-        task=task or None,
-        language=language or None,
-    )
-    results = list(
-        list_models(
-            filter=filters,
-            search=legal_query,
-            sort="downloads",
-            direction=-1,
-            limit=limit,
-            token=HF_TOKEN,
-            cardData=True,
-        )
-    )
-    return [
-        {
-            "id": m.modelId,
-            "task": m.pipeline_tag,
-            "downloads": m.downloads,
-            "likes": m.likes,
-            "last_modified": str(m.lastModified)[:10],
-            "tags": [t for t in (m.tags or []) if len(t) < 40][:8],
-            "language": getattr(m, "language", None),
-            "hf_url": f"https://huggingface.co/{m.modelId}",
-        }
-        for m in results
-    ]
-# ── Tool 2 — explore_legal_dataset ───────────────────────────────────────────
-@mcp.tool(
-    description=(
-        "Inspeciona um dataset jurídico no Hugging Face Hub. "
-        "Retorna configs disponíveis, splits, schema de colunas e exemplos de registros. "
-        "Ideal para entender datasets de jurisprudência, legislação e contratos. "
-        "Use dataset_id como 'joelniklaus/MultiLegalPile', 'lexlms/lex_glue', etc."
-    )
-)
-def explore_legal_dataset(
-    dataset_id: str,
-    config: str = "default",
-    split: str = "train",
-    n_samples: int = 3,
-) -> dict[str, Any]:
-    """Retorna schema + amostras de um dataset jurídico."""
-    try:
-        configs = get_dataset_config_names(dataset_id, token=HF_TOKEN)
-    except Exception:
-        configs = [config]
-    resolved_config = config if config in configs else (configs[0] if configs else None)
-    try:
-        splits = get_dataset_split_names(dataset_id, config_name=resolved_config, token=HF_TOKEN)
-    except Exception:
-        splits = [split]
-    resolved_split = split if split in splits else (splits[0] if splits else "train")
     try:
-        ds = load_dataset(
-            dataset_id,
-            name=resolved_config,
-            split=f"{resolved_split}[:{n_samples}]",
-            token=HF_TOKEN,
-            trust_remote_code=False,
         )
-        features = {k: str(v) for k, v in ds.features.items()}
-        samples = ds.to_list()
-        # Truncar textos longos para não explodir o contexto
-        for sample in samples:
-            for key, val in sample.items():
-                if isinstance(val, str) and len(val) > 600:
-                    sample[key] = val[:600] + "…"
     except Exception as e:
-        features = {}
-        samples = []
-        return {
-            "dataset_id": dataset_id,
-            "error": str(e),
-            "configs_available": configs,
-            "splits_available": splits,
-        }
-    return {
-        "dataset_id": dataset_id,
-        "hf_url": f"https://huggingface.co/datasets/{dataset_id}",
-        "configs_available": configs,
-        "splits_available": splits,
-        "resolved": {"config": resolved_config, "split": resolved_split},
-        "total_features": len(features),
-        "features": features,
-        "samples": samples,
-    }
-# ── Tool 3 — analyze_legal_text ──────────────────────────────────────────────
-@mcp.tool(
-    description=(
-        "Roda inferência NLP em texto jurídico usando a Hugging Face Inference API. "
-        "Tarefas suportadas: 'summarization' (resumo de decisões), "
-        "'text-classification' (classificação de matéria/área do direito), "
-        "'token-classification' (NER: partes, datas, valores), "
-        "'question-answering' (responde perguntas sobre o texto). "
-        "Se model_id não for fornecido, usa modelos jurídicos recomendados."
-    )
-)
-def analyze_legal_text(
-    text: str,
-    task: str = "summarization",
-    model_id: str = "",
-    context: str = "",
-) -> dict[str, Any]:
-    """Executa análise NLP jurídica via Inference API."""
-    # Modelos padrão por tarefa (jurídicos ou multilíngues de qualidade)
-    DEFAULT_MODELS: dict[str, str] = {
-        "summarization": "facebook/bart-large-cnn",
-        "text-classification": "nlpaueb/legal-bert-base-uncased",
-        "token-classification": "nlpaueb/legal-bert-base-uncased",
-        "question-answering": "deepset/roberta-base-squad2",
-        "fill-mask": "nlpaueb/legal-bert-base-uncased",
-    }
-    resolved_model = model_id or DEFAULT_MODELS.get(task, "facebook/bart-large-cnn")
-    url = f"https://api-inference.huggingface.co/models/{resolved_model}"
-    headers = {"Content-Type": "application/json"}
-    if HF_TOKEN:
-        headers["Authorization"] = f"Bearer {HF_TOKEN}"
-    if task:
-        headers["X-Task"] = task
-    # Montar payload conforme a tarefa
-    if task == "question-answering" and context:
-        payload: dict[str, Any] = {"inputs": {"question": text, "context": context}}
-    elif task == "summarization":
-        # Truncar para evitar erros de tamanho máximo
-        payload = {
-            "inputs": text[:1024],
-            "parameters": {"max_length": 200, "min_length": 40, "do_sample": False},
         }
-    else:
         payload = {"inputs": text[:512]}
-    with httpx.Client(timeout=60.0) as client:
-        resp = client.post(url, headers=headers, json=payload)
-    if resp.status_code == 503:
-        return {
-            "status": "model_loading",
-            "model_id": resolved_model,
-            "message": "Modelo está carregando. Tente novamente em 20-30 segundos.",
-        }
-    if resp.status_code != 200:
-        return {
-            "error": f"HTTP {resp.status_code}",
-            "model_id": resolved_model,
-            "detail": resp.text[:400],
-        }
-    try:
-        result = resp.json()
-    except Exception:
-        result = resp.text
-    return {
-        "model_id": resolved_model,
-        "task": task,
-        "hf_url": f"https://huggingface.co/{resolved_model}",
-        "result": result,
-    }
-# ── Tool 4 — find_jurisprudence ───────────────────────────────────────────────
-@mcp.tool(
-    description=(
-        "Busca decisões judiciais e ementas em datasets de jurisprudência disponíveis "
-        "no Hugging Face Hub. Pesquisa por palavras-chave no texto das decisões. "
-        "Retorna ementas, tribunal, data e número do processo quando disponíveis. "
-        "Datasets suportados: 'joelniklaus/brazilian_court_decisions', "
-        "'lagepaul/jurisprudencia-brasil' e outros datasets jurídicos brasileiros."
-    )
-)
-def find_jurisprudence(
-    keywords: str,
-    dataset_id: str = "joelniklaus/brazilian_court_decisions",
-    max_results: int = 5,
-    split: str = "train",
-) -> dict[str, Any]:
-    """Busca decisões judiciais por palavras-chave."""
-    try:
-        configs = get_dataset_config_names(dataset_id, token=HF_TOKEN)
-        resolved_config = configs[0] if configs else None
-    except Exception:
-        resolved_config = None
     try:
-        # Carregar slice generoso para fazer busca textual
         ds = load_dataset(
             dataset_id,
-            name=resolved_config,
-            split=f"{split}[:500]",
             token=HF_TOKEN,
             trust_remote_code=False,
         )
     except Exception as e:
-        return {"error": str(e), "dataset_id": dataset_id}
-    # Identificar coluna de texto principal
-    text_cols = [
-        col for col in ds.column_names
-        if any(kw in col.lower() for kw in ["text", "ementa", "decision", "body", "content", "acordao"])
-    ]
-    text_col = text_cols[0] if text_cols else ds.column_names[0]
-    # Busca por keywords (case-insensitive)
-    kw_pattern = re.compile("|".join(re.escape(k.strip()) for k in keywords.split(",")), re.IGNORECASE)
-    matches = []
-    for row in ds:
-        haystack = str(row.get(text_col, ""))
-        if kw_pattern.search(haystack):
-            snippet = haystack[:500] + ("…" if len(haystack) > 500 else "")
-            matches.append({
-                "snippet": snippet,
-                "columns": {k: str(v)[:200] for k, v in row.items() if k != text_col},
-            })
-        if len(matches) >= max_results:
-            break
-    return {
-        "dataset_id": dataset_id,
-        "hf_url": f"https://huggingface.co/datasets/{dataset_id}",
-        "keywords_searched": keywords,
-        "text_column_used": text_col,
-        "total_matches": len(matches),
-        "results": matches,
-    }
-# ── Entry point ───────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    mcp.run()

+# app.py
+import gradio as gr
 import os
 import httpx
+from huggingface_hub import list_models  # ← CORREÇÃO: sem ModelFilter
+from datasets import load_dataset
+# Optional Hugging Face access token, read once from the environment (None when unset).
 HF_TOKEN = os.getenv("HF_TOKEN")
+def search_models(query="legal", language="pt", limit=5):
+    """Search the Hugging Face Hub for models and format them as Markdown text.
+
+    Args:
+        query: Free-text search term passed to ``list_models(search=...)``.
+        language: Language filter; an empty value disables the filter.
+        limit: Maximum number of models to list. Coerced with ``int`` once,
+            since the caller may deliver the number as a float.
+
+    Returns:
+        One Markdown entry per model (id, task, downloads, URL), the string
+        "Nenhum modelo encontrado." when nothing matches, or "Erro: ..." when
+        the lookup raises.
+    """
     try:
+        max_models = int(limit)
+        results = list(
+            list_models(
+                search=query,
+                language=language or None,
+                sort="downloads",
+                direction=-1,
+                limit=max_models,
+                token=HF_TOKEN,
+            )
         )
+        entries = []
+        for m in results[:max_models]:
+            # Formatting None with "," raises, and the broad except below
+            # would then throw away every entry already collected.
+            downloads = f"{m.downloads:,}" if m.downloads is not None else "n/a"
+            entries.append(
+                f"**{m.modelId}**\n"
+                f"- Task: {m.pipeline_tag}\n"
+                f"- Downloads: {downloads}\n"
+                f"- URL: https://huggingface.co/{m.modelId}\n\n"
+            )
+        return "".join(entries) or "Nenhum modelo encontrado."
     except Exception as e:
+        return f"Erro: {str(e)}"
+def analyze_text(text, task="summarization"):
+    """Send a text to a hosted model and return the raw response as a string.
+
+    Args:
+        text: Input text; only the first 512 characters are sent.
+        task: "summarization" or "text-classification". Any other value falls
+            back to the summarization model.
+
+    Returns:
+        ``str()`` of the decoded JSON response on HTTP 200, an
+        "Erro HTTP <status>: ..." message for any other status, or
+        "Erro: ..." when the input is blank or the request/decoding raises.
+    """
+    try:
+        # Blank input cannot produce a useful answer, so skip the network call.
+        if not text or not text.strip():
+            return "Erro: informe um texto para analisar."
+        models = {
+            "summarization": "facebook/bart-large-cnn",
+            "text-classification": "nlpaueb/legal-bert-base-uncased",
         }
+        model_id = models.get(task, "facebook/bart-large-cnn")
+        # NOTE(review): confirm this host is still the supported inference endpoint.
+        url = f"https://api-inference.huggingface.co/models/{model_id}"
+        headers = {"Content-Type": "application/json"}
+        if HF_TOKEN:
+            # Reuse the module-level token instead of reading the environment again.
+            headers["Authorization"] = f"Bearer {HF_TOKEN}"
         payload = {"inputs": text[:512]}
+        with httpx.Client(timeout=60.0) as client:
+            resp = client.post(url, headers=headers, json=payload)
+        if resp.status_code != 200:
+            return f"Erro HTTP {resp.status_code}: {resp.text[:200]}"
+        return str(resp.json())
+    except Exception as e:
+        return f"Erro: {str(e)}"
+def explore_dataset(dataset_id, n_samples=3):
+    """Load the first rows of a dataset's "train" split and render them as text.
+
+    Args:
+        dataset_id: Hub dataset identifier passed to ``load_dataset``.
+        n_samples: Number of rows to show. Coerced with ``int`` once so a
+            float such as ``3.0`` never ends up inside the split expression.
+
+    Returns:
+        A text report with the column names and each sample (string values
+        longer than 300 characters are truncated), or "Erro: ..." when
+        loading or formatting raises.
+    """
     try:
+        count = int(n_samples)
         ds = load_dataset(
             dataset_id,
+            split=f"train[:{count}]",
             token=HF_TOKEN,
             trust_remote_code=False,
         )
+        parts = [
+            f"**Dataset:** {dataset_id}\n\n",
+            f"**Colunas:** {', '.join(ds.column_names)}\n\n",
+        ]
+        for i, sample in enumerate(ds.to_list()[:count], start=1):
+            parts.append(f"--- Amostra {i} ---\n")
+            for key, val in sample.items():
+                # Keep long text fields short so the report stays readable.
+                if isinstance(val, str) and len(val) > 300:
+                    val = val[:300] + "..."
+                parts.append(f"{key}: {val}\n")
+            parts.append("\n")
+        return "".join(parts)
     except Exception as e:
+        return f"Erro: {str(e)}"
+# Gradio interface: three tabs, each wiring one helper function to a button.
+with gr.Blocks(title="LEX - Assistente Jurídico", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# ⚖️ LEX - Assistente Jurídico MCP")
+    # NOTE(review): the text below mentions FastMCP, but the imports visible in
+    # this version of the file do not include it; confirm the label is accurate.
+    gr.Markdown("Powered by Hugging Face Hub + FastMCP")
+    # Tab 1: model search. Inputs on the left column, results on the right.
+    with gr.Tab("🔍 Buscar Modelos"):
+        with gr.Row():
+            with gr.Column():
+                model_query = gr.Textbox(label="Query", value="legal")
+                model_lang = gr.Textbox(label="Idioma", value="pt")
+                model_limit = gr.Slider(1, 20, value=5, step=1, label="Limite")
+                model_btn = gr.Button("Buscar", variant="primary")
+            with gr.Column():
+                model_output = gr.Textbox(label="Resultados", lines=15, interactive=False)
+        # The input list order matches search_models(query, language, limit).
+        model_btn.click(fn=search_models, inputs=[model_query, model_lang, model_limit], outputs=model_output)
+    # Tab 2: text analysis. The dropdown offers the two tasks analyze_text maps to models.
+    with gr.Tab("📝 Analisar Texto"):
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(label="Texto Jurídico", lines=5)
+                task_type = gr.Dropdown(
+                    choices=["summarization", "text-classification"],
+                    value="summarization",
+                    label="Tarefa"
+                )
+                analyze_btn = gr.Button("Analisar", variant="primary")
+            with gr.Column():
+                analyze_output = gr.Textbox(label="Resultado", lines=10, interactive=False)
+        # The input list order matches analyze_text(text, task).
+        analyze_btn.click(fn=analyze_text, inputs=[text_input, task_type], outputs=analyze_output)
+    # Tab 3: dataset preview. The slider sets how many samples are shown.
+    with gr.Tab("📊 Explorar Dataset"):
+        with gr.Row():
+            with gr.Column():
+                dataset_input = gr.Textbox(
+                    label="Dataset ID",
+                    value="joelniklaus/brazilian_court_decisions"
+                )
+                sample_count = gr.Slider(1, 10, value=3, step=1, label="Amostras")
+                dataset_btn = gr.Button("Explorar", variant="primary")
+            with gr.Column():
+                dataset_output = gr.Textbox(label="Dataset Info", lines=15, interactive=False)
+        # The input list order matches explore_dataset(dataset_id, n_samples).
+        dataset_btn.click(fn=explore_dataset, inputs=[dataset_input, sample_count], outputs=dataset_output)
+# Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
+    demo.launch()