Create routes .py
Browse files- routes .py +148 -0
routes .py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
data/routes.py
|
| 3 |
+
Descoberta automática de Rotas Atômicas a partir das chaves estrangeiras.
|
| 4 |
+
|
| 5 |
+
Uma "Rota Atômica" é um caminho entre tabelas conectadas via FKs.
|
| 6 |
+
NÃO convertemos para grafo — as rotas são inferidas do schema relacional.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import List, Dict, Tuple, Optional
|
| 11 |
+
import numpy as np
|
| 12 |
+
from collections import deque
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ─── SCHEMA HARDCODED TPC-H ──────────────────────────────────────────────────
|
| 16 |
+
|
| 17 |
+
# Foreign-key edges of the hard-coded TPC-H schema.
# Each entry reads (source_table, fk_column, target_table, pk_column).
# The order of entries determines the order in which neighbours are expanded
# during route discovery, so it is kept stable.
TPCH_FOREIGN_KEYS: List[Tuple[str, str, str, str]] = [
    # orders -> customers
    ("orders", "o_custkey", "customers", "c_custkey"),
    # lineitem -> orders / supplier / part
    ("lineitem", "l_orderkey", "orders", "o_orderkey"),
    ("lineitem", "l_suppkey", "supplier", "s_suppkey"),
    ("lineitem", "l_partkey", "part", "p_partkey"),
    # customers / supplier -> nation
    ("customers", "c_nationkey", "nation", "n_nationkey"),
    ("supplier", "s_nationkey", "nation", "n_nationkey"),
]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class RouteConfig:
    """Settings consumed by route discovery.

    Field order is part of the interface: positional construction is
    ``RouteConfig(max_hops, target_table, min_attention_weight)``.
    """

    # Upper bound on the number of FK hops a discovered route may span.
    max_hops: int = 3
    # Table the search starts from (the entity being predicted).
    target_table: str = "customers"
    # Attention-weight threshold. No code visible in this file reads it;
    # presumably used by a caller to prune routes -- TODO confirm.
    min_attention_weight: float = 0.1
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class AtomicRoute:
    """A chain of tables linked by foreign keys, plus its attention metadata."""

    # Table names in traversal order, e.g. ["customers", "orders", "lineitem"].
    path: List[str]
    # One tuple per hop: [(from, fk_col, to, pk_col), ...].
    fk_edges: List[Tuple]
    # Derived from ``path`` in __post_init__; any value passed in is overwritten.
    n_hops: int = 0
    attention_weight: float = 1.0
    active: bool = True

    def __post_init__(self):
        # A path listing k tables spans k - 1 hops.
        table_count = len(self.path)
        self.n_hops = table_count - 1

    def __repr__(self):
        # Compact form: arrow-joined path and the weight to three decimals.
        joined_path = " → ".join(self.path)
        return f"Route({joined_path}, α={self.attention_weight:.3f})"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _build_adjacency(fk_list: List[Tuple]) -> Dict[str, List[Tuple]]:
    """Build a two-way adjacency list from foreign-key tuples.

    Args:
        fk_list: ``(src_table, src_col, dst_table, dst_col)`` tuples.

    Returns:
        A plain dict mapping each table name to a list of
        ``(neighbour, own_col, neighbour_col, direction)`` tuples, where
        ``direction`` is ``"forward"`` when following the FK from its source
        table and ``"backward"`` when following it from its target table.
    """
    neighbours: Dict[str, List[Tuple]] = {}
    for fk in fk_list:
        src, src_col, dst, dst_col = fk
        # Register both endpoints first (source, then target) so that key
        # insertion order is source-before-target for every FK.
        outgoing = neighbours.setdefault(src, [])
        incoming = neighbours.setdefault(dst, [])
        outgoing.append((dst, src_col, dst_col, "forward"))
        incoming.append((src, dst_col, src_col, "backward"))
    return neighbours
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def discover_atomic_routes(
    tables: Dict,
    config: RouteConfig,
    fk_list: Optional[List[Tuple]] = None,
) -> List[AtomicRoute]:
    """Discover every atomic route reachable from the target table.

    Runs a breadth-first search over the foreign-key adjacency, starting at
    ``config.target_table`` and going no further than ``config.max_hops``
    hops. The relational schema is traversed directly; no graph object is
    built.

    Args:
        tables: Mapping keyed by table name. Only key membership is used
            here: a neighbouring table is expanded only if its name is a key.
            The start table itself is not checked against this mapping.
        config: Supplies ``target_table`` (search start) and ``max_hops``.
        fk_list: ``(src_table, fk_col, dst_table, pk_col)`` tuples. Falls back
            to ``TPCH_FOREIGN_KEYS`` when ``None``.

    Returns:
        All routes with at least one hop, sorted by descending initial
        weight, with ``attention_weight`` replaced by its softmax-normalised
        value across the returned routes. Routes longer than two hops are
        created with ``active=False``. An empty list is returned when the
        start table has no eligible neighbour.
    """
    if fk_list is None:
        fk_list = TPCH_FOREIGN_KEYS

    adj = _build_adjacency(fk_list)
    start = config.target_table
    routes: List[AtomicRoute] = []

    # Each queue entry is (path so far, edges travelled, tables on the path).
    # The visited set is per path, so every route is a simple path but the
    # same table may appear on several different routes.
    queue = deque([([start], [], {start})])

    while queue:
        path, edges, visited = queue.popleft()
        current = path[-1]
        n_hops = len(path) - 1

        # The zero-hop entry (just the start table) is not a route.
        if n_hops >= 1:
            routes.append(
                AtomicRoute(
                    path=list(path),
                    fk_edges=list(edges),
                    attention_weight=_initial_attention(n_hops),
                    active=(n_hops <= 2),
                )
            )

        # Hop budget exhausted: record the route but do not expand further.
        if n_hops >= config.max_hops:
            continue

        for neighbor, col_a, col_b, _direction in adj.get(current, []):
            if neighbor in visited or neighbor not in tables:
                continue
            # Edges are stored in traversal direction:
            # (from_table, from_col, to_table, to_col).
            queue.append((
                path + [neighbor],
                edges + [(current, col_a, neighbor, col_b)],
                visited | {neighbor},
            ))

    if not routes:
        # Nothing to rank or normalise.
        return routes

    # Shorter routes receive larger initial weights, so they sort first.
    routes.sort(key=lambda r: -r.attention_weight)

    # Softmax over the initial weights so the final weights sum to 1.
    exp_weights = np.exp(np.array([r.attention_weight for r in routes]))
    normalised = exp_weights / exp_weights.sum()
    for route, weight in zip(routes, normalised):
        route.attention_weight = float(weight)

    return routes
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _initial_attention(n_hops: int) -> float:
    """Return the heuristic starting weight for a route of ``n_hops`` hops.

    The weight is the reciprocal of ``n_hops`` raised to the power 1.5, so it
    shrinks as the route gets longer.
    """
    decay = n_hops ** 1.5
    return 1.0 / decay
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def routes_to_dataframe(routes: List[AtomicRoute]):
    """Build a display-oriented DataFrame with one row per route.

    Args:
        routes: Routes to tabulate; each must expose ``path``, ``n_hops``,
            ``attention_weight`` and ``active``.

    Returns:
        A pandas DataFrame with the columns ``Rota`` (arrow-joined path),
        ``Hops``, ``Peso α`` (weight rounded to four decimals) and ``Ativa``.
    """
    # Imported lazily so pandas is only needed when this helper is called.
    import pandas as pd

    records = []
    for route in routes:
        records.append({
            "Rota": " → ".join(route.path),
            "Hops": route.n_hops,
            "Peso α": round(route.attention_weight, 4),
            "Ativa": route.active,
        })
    return pd.DataFrame(records)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def get_feature_tables_for_route(
    route: AtomicRoute,
    tables: Dict,
) -> List:
    """Return the table objects for a route, in path order.

    Args:
        route: Route whose ``path`` lists the table names to fetch.
        tables: Mapping from table name to table object (DataFrames,
            according to the original docstring).

    Returns:
        The values of ``tables`` for each name on the path, in order. Names
        that are not keys of ``tables`` are skipped silently.
    """
    selected = []
    for table_name in route.path:
        if table_name not in tables:
            continue
        selected.append(tables[table_name])
    return selected
|