Upload 6 files

- app.py +137 -3
- requirements.txt +2 -0

app.py
CHANGED
@@ -1,6 +1,6 @@
 # ==============================================================================
-# API do AetherMap — VERSÃO 7.
-# Backend com RAG Híbrido, CSV
+# API do AetherMap — VERSÃO 7.3 (KNOWLEDGE GRAPH EDITION)
+# Backend com RAG Híbrido, CSV, Tavily, NER Entity Graph
 # ==============================================================================

 import numpy as np

@@ -15,10 +15,11 @@ import logging
 import time
 import nltk
 from nltk.corpus import stopwords
+from collections import defaultdict

 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.responses import JSONResponse
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Tuple
 from functools import lru_cache

 # Ferramentas de Alquimia (ML & NLP)

@@ -30,6 +31,10 @@ from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from scipy.stats import entropy

+# NER & Language Detection
+import spacy
+from langdetect import detect, LangDetectException
+
 # Monitoramento (O Toque da Berta)
 from prometheus_fastapi_instrumentator import Instrumentator
 from prometheus_client import Histogram
@@ -151,6 +156,88 @@ def load_reranker():
     logging.info(f"Carregando Reranker '{RERANKER_MODEL}' em: {device}")
     return CrossEncoder(RERANKER_MODEL, device=device)

+# Cache for spaCy models
+_spacy_models = {}
+
+def load_spacy_model(lang: str):
+    """Carrega modelo spaCy com cache."""
+    if lang not in _spacy_models:
+        model_name = "pt_core_news_sm" if lang == "pt" else "en_core_web_sm"
+        try:
+            _spacy_models[lang] = spacy.load(model_name)
+            logging.info(f"Modelo spaCy '{model_name}' carregado.")
+        except OSError:
+            logging.warning(f"Modelo {model_name} não encontrado. Baixando...")
+            spacy.cli.download(model_name)
+            _spacy_models[lang] = spacy.load(model_name)
+    return _spacy_models[lang]
+
+def detect_language(texts: List[str]) -> str:
+    """Detecta idioma predominante nos textos."""
+    sample = " ".join(texts[:10])[:1000]
+    try:
+        lang = detect(sample)
+        return "pt" if lang == "pt" else "en"
+    except LangDetectException:
+        return "en"
+
+def extract_entities(textos: List[str]) -> List[List[Tuple[str, str]]]:
+    """Extrai entidades nomeadas de cada texto."""
+    lang = detect_language(textos)
+    nlp = load_spacy_model(lang)
+
+    entities_by_doc = []
+    for text in textos:
+        # Limitar tamanho do texto para performance
+        doc = nlp(text[:2000])
+        entities = [(ent.text.lower().strip(), ent.label_) for ent in doc.ents
+                    if len(ent.text.strip()) > 2 and ent.label_ in ("PERSON", "PER", "ORG", "GPE", "LOC")]
+        entities_by_doc.append(entities)
+
+    return entities_by_doc
+
+def build_entity_graph(entities_by_doc: List[List[Tuple[str, str]]],
+                       positions: List[Dict]) -> Dict[str, Any]:
+    """Constrói grafo de conexões baseado em entidades compartilhadas."""
+    # Inverter: entidade -> lista de doc indices
+    entity_to_docs = defaultdict(set)
+
+    for doc_idx, entities in enumerate(entities_by_doc):
+        for entity_text, entity_type in entities:
+            entity_to_docs[(entity_text, entity_type)].add(doc_idx)
+
+    # Construir arestas (conexões entre docs que compartilham entidades)
+    edges = []
+    seen_pairs = set()
+
+    for (entity_text, entity_type), doc_indices in entity_to_docs.items():
+        if len(doc_indices) < 2:
+            continue
+
+        doc_list = sorted(doc_indices)
+        for i in range(len(doc_list)):
+            for j in range(i + 1, len(doc_list)):
+                pair = (doc_list[i], doc_list[j])
+                if pair not in seen_pairs:
+                    seen_pairs.add(pair)
+                    edges.append({
+                        "source": doc_list[i],
+                        "target": doc_list[j],
+                        "entity": entity_text,
+                        "entity_type": entity_type
+                    })
+
+    # Contar entidades mais frequentes
+    entity_counts = [(k, len(v)) for k, v in entity_to_docs.items() if len(v) >= 2]
+    top_entities = sorted(entity_counts, key=lambda x: x[1], reverse=True)[:20]
+
+    return {
+        "edges": edges,
+        "edge_count": len(edges),
+        "connected_pairs": len(seen_pairs),
+        "top_entities": [{"entity": e[0][0], "type": e[0][1], "docs": e[1]} for e in top_entities]
+    }
+

 # ==============================================================================
 # PIPELINE DE PROCESSAMENTO DE DADOS
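To make the graph builder's contract concrete, here is a small worked example. It is an illustrative sketch, not part of the commit, and it assumes app.py imports cleanly enough that build_entity_graph can be pulled in directly:

# Illustrative only: assumes `from app import build_entity_graph` works
# without triggering heavy import-time setup in app.py.
from app import build_entity_graph

# Three documents; docs 0 and 2 both mention the ORG "petrobras".
entities_by_doc = [
    [("petrobras", "ORG"), ("rio de janeiro", "GPE")],
    [("ibama", "ORG")],
    [("petrobras", "ORG")],
]
positions = [{"x": 0.0, "y": 0.0, "z": 0.0} for _ in range(3)]  # accepted but unused

graph = build_entity_graph(entities_by_doc, positions)
print(graph["edge_count"])    # 1: only docs 0 and 2 share an entity
print(graph["edges"][0])      # {'source': 0, 'target': 2, 'entity': 'petrobras', 'entity_type': 'ORG'}
print(graph["top_entities"])  # [{'entity': 'petrobras', 'type': 'ORG', 'docs': 2}]

Since build_entity_graph accepts positions but never reads it, the endpoint in the next hunk attaches the coordinates to the response itself.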
@@ -643,4 +730,51 @@ async def search_web_api(

     except Exception as e:
         logging.error(f"ERRO EM /search_web/: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# ==============================================================================
+# ENDPOINT KNOWLEDGE GRAPH (NER)
+# ==============================================================================
+@app.post("/entity_graph/")
+async def entity_graph_api(job_id: str = Form(...)):
+    """
+    Extrai entidades nomeadas e constrói grafo de conexões entre documentos.
+    """
+    if job_id not in cache:
+        raise HTTPException(status_code=404, detail="Job ID não encontrado.")
+
+    logging.info(f"Construindo Knowledge Graph para Job: {job_id}")
+
+    try:
+        cached_data = cache[job_id]
+        df = cached_data["df"]
+        textos = df["full_text"].tolist()
+
+        # Extrair entidades
+        logging.info(f"Extraindo entidades de {len(textos)} documentos...")
+        entities_by_doc = extract_entities(textos)
+
+        # Construir posições dos pontos
+        positions = df[["x", "y", "z"]].to_dict("records")
+
+        # Construir grafo
+        graph_data = build_entity_graph(entities_by_doc, positions)
+
+        # Adicionar posições ao resultado
+        graph_data["positions"] = positions
+        graph_data["num_documents"] = len(textos)
+
+        # Entidades por documento (para tooltip)
+        graph_data["entities_by_doc"] = [
+            [{"text": e[0], "type": e[1]} for e in ents]
+            for ents in entities_by_doc
+        ]
+
+        logging.info(f"Grafo construído: {graph_data['edge_count']} arestas, {len(graph_data['top_entities'])} entidades principais")
+
+        return graph_data
+
+    except Exception as e:
+        logging.error(f"ERRO EM /entity_graph/: {e}", exc_info=True)
         raise HTTPException(status_code=500, detail=str(e))
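For completeness, a client-side sketch of exercising the new endpoint. The base URL is a placeholder for wherever the API is deployed, and the job_id must reference a dataset already processed and cached by an earlier call:

import requests

# Placeholder deployment URL; adjust as needed.
BASE_URL = "http://localhost:8000"

# job_id must come from a prior processing job cached by the API.
resp = requests.post(f"{BASE_URL}/entity_graph/", data={"job_id": "my-job-id"})
resp.raise_for_status()

graph = resp.json()
print(graph["num_documents"], "documents,", graph["edge_count"], "edges")
for ent in graph["top_entities"][:5]:
    print(f"{ent['entity']} ({ent['type']}): appears in {ent['docs']} docs")

Note that a document pair connected by several shared entities still yields a single edge, since seen_pairs keeps only the first entity found for each pair.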
requirements.txt
CHANGED

@@ -16,6 +16,8 @@ scipy
 umap-learn
 hdbscan
 nltk
+spacy
+langdetect

 # --- TORCH CPU ---
 torch
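One deployment caveat: the spacy package does not bundle the language models that load_spacy_model expects, so the first call to /entity_graph/ may trigger a download at runtime. A sketch of pre-fetching them instead, for example in a build step (the helper file name is hypothetical):

# prefetch_models.py: hypothetical build-time helper, not part of this commit.
import spacy.cli

# The two models load_spacy_model() in app.py can request.
for model_name in ("pt_core_news_sm", "en_core_web_sm"):
    spacy.cli.download(model_name)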