Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,8 +18,6 @@ import joblib
|
|
| 18 |
|
| 19 |
import torch
|
| 20 |
import torch.nn as nn
|
| 21 |
-
from datasets import load_dataset
|
| 22 |
-
|
| 23 |
|
| 24 |
# ============================
|
| 25 |
# Configuración general
|
|
@@ -34,7 +32,6 @@ META_LOGIT_FILENAME = "logreg_rrf_savant_15.joblib" # versión 15 features
|
|
| 34 |
|
| 35 |
# Dataset central con TODOS los artefactos RRF/Savant
|
| 36 |
RRF_DATASET_REPO = "antonypamo/savant_rrf1_curated"
|
| 37 |
-
RRF_TUTOR_DATASET_ID = RRF_DATASET_REPO # mismo repo para Tutor
|
| 38 |
|
| 39 |
|
| 40 |
def hf_data_path(filename: str) -> str:
|
|
@@ -533,66 +530,77 @@ def apply_role_profile(scores: Dict[str, float], role_name: Optional[str]) -> Di
|
|
| 533 |
|
| 534 |
|
| 535 |
# ============================
|
| 536 |
-
# RRF Tutor
|
| 537 |
# ============================
|
| 538 |
|
| 539 |
-
print(f"🔄 [Startup] Cargando dataset para RRF Tutor: {RRF_TUTOR_DATASET_ID}...", flush=True)
|
| 540 |
-
ds_rrf = None
|
| 541 |
rrf_corpus_texts: List[str] = []
|
| 542 |
rrf_corpus_prompts: List[str] = []
|
| 543 |
rrf_corpus_completions: List[str] = []
|
| 544 |
rrf_corpus_embeds = None
|
| 545 |
rrf_tutor_ready = False
|
| 546 |
|
| 547 |
-
try:
|
| 548 |
-
# Cargamos todos los splits y elegimos 'train' o el primero disponible
|
| 549 |
-
ds_dict = load_dataset(RRF_TUTOR_DATASET_ID)
|
| 550 |
-
if "train" in ds_dict:
|
| 551 |
-
ds_rrf = ds_dict["train"]
|
| 552 |
-
split_name = "train"
|
| 553 |
-
else:
|
| 554 |
-
split_name = list(ds_dict.keys())[0]
|
| 555 |
-
ds_rrf = ds_dict[split_name]
|
| 556 |
|
| 557 |
-
|
|
|
|
| 558 |
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
rrf_corpus_embeds = None
|
| 589 |
-
rrf_tutor_ready = False
|
| 590 |
-
print("⚠️ [RRF Tutor] Endpoint /v1/rrf_tutor devolverá error 503 si se usa.", flush=True)
|
| 591 |
|
| 592 |
|
| 593 |
def rrf_tutor_retrieve_examples(query: str, top_k: int = 3):
|
| 594 |
"""
|
| 595 |
-
Recupera los ejemplos más similares desde
|
| 596 |
usando embeddings del encoder RRF.
|
| 597 |
"""
|
| 598 |
if (not rrf_tutor_ready) or rrf_corpus_embeds is None or len(rrf_corpus_embeds) == 0:
|
|
@@ -624,7 +632,7 @@ def rrf_tutor_build_answer(query: str, retrieved_examples):
|
|
| 624 |
if not retrieved_examples:
|
| 625 |
return (
|
| 626 |
"No encontré ejemplos relevantes en el dataset RRF Tutor para tu consulta. "
|
| 627 |
-
"Verifica que
|
| 628 |
)
|
| 629 |
|
| 630 |
best = retrieved_examples[0]
|
|
@@ -634,7 +642,7 @@ def rrf_tutor_build_answer(query: str, retrieved_examples):
|
|
| 634 |
"🔎 Respuesta basada en el ejemplo más cercano del corpus RRF:\n\n"
|
| 635 |
f"{base_completion}\n\n"
|
| 636 |
"💡 Nota: Esta es una versión mínima que reutiliza directamente la 'completion' "
|
| 637 |
-
"del ejemplo más similar en
|
| 638 |
"se conectaría un LLM pequeño que combine varios ejemplos como contexto."
|
| 639 |
)
|
| 640 |
return answer
|
|
@@ -793,14 +801,28 @@ def root():
|
|
| 793 |
|
| 794 |
@app.get("/health")
|
| 795 |
def health():
|
|
|
|
|
|
|
|
|
|
| 796 |
return {
|
| 797 |
"status": "ok",
|
| 798 |
"encoder_model_id": ENCODER_MODEL_ID,
|
| 799 |
"meta_logit_filename": META_LOGIT_FILENAME,
|
| 800 |
"N_sites": N,
|
| 801 |
-
"rrf_tutor_examples": len(
|
|
|
|
| 802 |
"cnn_loaded": savant_cnn is not None,
|
| 803 |
"rrf_nodes_loaded": rrf_nodes is not None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
}
|
| 805 |
|
| 806 |
|
|
@@ -836,7 +858,7 @@ def evaluate_endpoint(req: EvaluateRequest):
|
|
| 836 |
"entropy_initial": float(sim["entropy"][0]),
|
| 837 |
"entropy_final": float(sim["entropy"][-1]),
|
| 838 |
"chirality_initial": float(sim["chirality"][0]),
|
| 839 |
-
"chirality_final": float(sim["chirality"][-1]),
|
| 840 |
"energy_mean": float(np.mean(sim["energy"])),
|
| 841 |
"energy_std": float(np.std(sim["energy"])),
|
| 842 |
"N_sites": int(N),
|
|
@@ -902,7 +924,7 @@ def rrf_tutor_endpoint(body: RRFTutorRequest):
|
|
| 902 |
status_code=503,
|
| 903 |
detail=(
|
| 904 |
"RRF Tutor no está listo: embeddings no cargados. "
|
| 905 |
-
"Verifica
|
| 906 |
),
|
| 907 |
)
|
| 908 |
|
|
|
|
| 18 |
|
| 19 |
import torch
|
| 20 |
import torch.nn as nn
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# ============================
|
| 23 |
# Configuración general
|
|
|
|
| 32 |
|
| 33 |
# Dataset central con TODOS los artefactos RRF/Savant
|
| 34 |
RRF_DATASET_REPO = "antonypamo/savant_rrf1_curated"
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
def hf_data_path(filename: str) -> str:
|
|
|
|
| 530 |
|
| 531 |
|
| 532 |
# ============================
|
| 533 |
+
# RRF Tutor desde JSONL curado
|
| 534 |
# ============================
|
| 535 |
|
|
|
|
|
|
|
| 536 |
rrf_corpus_texts: List[str] = []
|
| 537 |
rrf_corpus_prompts: List[str] = []
|
| 538 |
rrf_corpus_completions: List[str] = []
|
| 539 |
rrf_corpus_embeds = None
|
| 540 |
rrf_tutor_ready = False
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
+
def _load_rrf_tutor_from_jsonl(path: Optional[str]):
    """Load the RRF Tutor corpus from a JSONL file and build its embeddings.

    Each non-empty line of the file is parsed as JSON. A line is kept only
    when it decodes to an object whose ``prompt`` and ``completion`` values
    are non-empty strings; every other line is skipped and counted. The
    retrieval text of an example is ``prompt + "\\n\\n" + completion``.

    The module-level corpus state (``rrf_corpus_texts``,
    ``rrf_corpus_prompts``, ``rrf_corpus_completions``,
    ``rrf_corpus_embeds``, ``rrf_tutor_ready``) is replaced only after the
    embeddings have been computed, so calling this function again reloads
    the corpus instead of appending duplicates to it.

    Args:
        path: Path of the JSONL file, or ``None`` when no file is available.

    Returns:
        None. Nothing is raised to the caller: on any failure the corpus
        state is reset to empty and ``rrf_tutor_ready`` is set to ``False``.
    """
    global rrf_corpus_texts, rrf_corpus_prompts, rrf_corpus_completions, rrf_corpus_embeds, rrf_tutor_ready

    if path is None:
        print("⚠️ [RRF Tutor] No se encontró ruta para rrf_tutor_curated.jsonl", flush=True)
        rrf_tutor_ready = False
        return

    print(f"🔄 [RRF Tutor] Cargando ejemplos desde JSONL: {path}", flush=True)
    try:
        prompts = []
        completions = []
        skipped = 0
        with open(path, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    ex = json.loads(line)
                except (ValueError, RecursionError):
                    # json.JSONDecodeError is a ValueError: skip the bad line
                    # instead of aborting the whole load.
                    skipped += 1
                    continue
                # A valid JSON line may still be a scalar or a list; only
                # objects can carry the two required fields.
                if not isinstance(ex, dict):
                    skipped += 1
                    continue
                p = ex.get("prompt")
                c = ex.get("completion")
                # Both fields are concatenated below, so they must be
                # non-empty strings.
                if isinstance(p, str) and isinstance(c, str) and p and c:
                    prompts.append(p)
                    completions.append(c)
                else:
                    skipped += 1

        if skipped:
            print(f"⚠️ [RRF Tutor] Se omitieron {skipped} líneas inválidas del JSONL.", flush=True)

        if not prompts:
            raise ValueError("No se encontraron ejemplos válidos con 'prompt' y 'completion' en el JSONL.")

        texts = [p + "\n\n" + c for p, c in zip(prompts, completions)]

        print(f"🔄 [RRF Tutor] Construyendo embeddings para {len(texts)} ejemplos...", flush=True)
        embeds = encoder.encode(
            texts,
            convert_to_numpy=True,
            show_progress_bar=True,
            normalize_embeddings=True,
        )

        # Publish everything together so the three lists and the embedding
        # matrix always describe the same set of examples.
        rrf_corpus_prompts = prompts
        rrf_corpus_completions = completions
        rrf_corpus_texts = texts
        rrf_corpus_embeds = embeds
        rrf_tutor_ready = True
        print("✅ [RRF Tutor] Embeddings construidos y listos.", flush=True)

    except Exception as e:
        # Startup boundary: a broken corpus must not take the app down, so
        # report the error and leave the tutor disabled.
        print(f"❌ [RRF Tutor] Error cargando JSONL: {e}", flush=True)
        rrf_corpus_texts = []
        rrf_corpus_prompts = []
        rrf_corpus_completions = []
        rrf_corpus_embeds = None
        rrf_tutor_ready = False
        print("⚠️ [RRF Tutor] Endpoint /v1/rrf_tutor devolverá 503 si se usa.", flush=True)
|
| 595 |
|
| 596 |
+
|
| 597 |
+
# Cargar RRF Tutor en startup
|
| 598 |
+
_load_rrf_tutor_from_jsonl(RRF_TUTOR_JSONL_PATH)
|
|
|
|
|
|
|
|
|
|
| 599 |
|
| 600 |
|
| 601 |
def rrf_tutor_retrieve_examples(query: str, top_k: int = 3):
|
| 602 |
"""
|
| 603 |
+
Recupera los ejemplos más similares desde el JSONL curado
|
| 604 |
usando embeddings del encoder RRF.
|
| 605 |
"""
|
| 606 |
if (not rrf_tutor_ready) or rrf_corpus_embeds is None or len(rrf_corpus_embeds) == 0:
|
|
|
|
| 632 |
if not retrieved_examples:
|
| 633 |
return (
|
| 634 |
"No encontré ejemplos relevantes en el dataset RRF Tutor para tu consulta. "
|
| 635 |
+
"Verifica que rrf_tutor_curated.jsonl contenga 'prompt' y 'completion'."
|
| 636 |
)
|
| 637 |
|
| 638 |
best = retrieved_examples[0]
|
|
|
|
| 642 |
"🔎 Respuesta basada en el ejemplo más cercano del corpus RRF:\n\n"
|
| 643 |
f"{base_completion}\n\n"
|
| 644 |
"💡 Nota: Esta es una versión mínima que reutiliza directamente la 'completion' "
|
| 645 |
+
"del ejemplo más similar en el corpus curado. En una versión extendida, aquí "
|
| 646 |
"se conectaría un LLM pequeño que combine varios ejemplos como contexto."
|
| 647 |
)
|
| 648 |
return answer
|
|
|
|
| 801 |
|
| 802 |
@app.get("/health")
def health():
    """
    Health endpoint: summarise the load state of every module of the service.

    Returns a dict with the configured encoder id and meta-logit filename,
    the value of ``N``, the number of RRF Tutor prompts and the tutor
    readiness flag, plus booleans telling whether the CNN, the RRF nodes
    and each physics artifact are set (i.e. are not ``None``).
    """
    # (label, object) pairs, in the order they appear in the response.
    physics_objects = (
        ("rrf_resonance_matrix", PHYS_RRF_RESONANCE_MATRIX),
        ("rrf_energy_profile", PHYS_RRF_ENERGY_PROFILE),
        ("rrf_eigen_spectrum", PHYS_RRF_EIGEN_SPECTRUM),
        ("resonance_matrix_13", PHYS_RES_MATRIX_13),
        ("nodes_13", PHYS_NODES_13),
        ("energy_logphi_13", PHYS_ENERGY_LOGPHI_13),
        ("degree_13", PHYS_DEGREE_13),
        ("adjacency_13", PHYS_ADJ_13),
    )
    physics_status = {label: artifact is not None for label, artifact in physics_objects}

    report = {
        "status": "ok",
        "encoder_model_id": ENCODER_MODEL_ID,
        "meta_logit_filename": META_LOGIT_FILENAME,
        "N_sites": N,
        "rrf_tutor_examples": len(rrf_corpus_prompts),
        "rrf_tutor_ready": rrf_tutor_ready,
        "cnn_loaded": savant_cnn is not None,
        "rrf_nodes_loaded": rrf_nodes is not None,
        "physics_artifacts": physics_status,
    }
    return report
|
| 827 |
|
| 828 |
|
|
|
|
| 858 |
"entropy_initial": float(sim["entropy"][0]),
|
| 859 |
"entropy_final": float(sim["entropy"][-1]),
|
| 860 |
"chirality_initial": float(sim["chirality"][0]),
|
| 861 |
+
"chirality_final": float(sim["chirility"][-1]) if "chirility" in sim else float(sim["chirality"][-1]),
|
| 862 |
"energy_mean": float(np.mean(sim["energy"])),
|
| 863 |
"energy_std": float(np.std(sim["energy"])),
|
| 864 |
"N_sites": int(N),
|
|
|
|
| 924 |
status_code=503,
|
| 925 |
detail=(
|
| 926 |
"RRF Tutor no está listo: embeddings no cargados. "
|
| 927 |
+
"Verifica rrf_tutor_curated.jsonl en antonypamo/savant_rrf1_curated y reinicia el Space."
|
| 928 |
),
|
| 929 |
)
|
| 930 |
|