Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -531,89 +531,63 @@ def apply_role_profile(scores: Dict[str, float], role_name: Optional[str]) -> Di
|
|
| 531 |
|
| 532 |
|
| 533 |
# ============================
|
| 534 |
-
# RRF Tutor
|
| 535 |
# ============================
|
| 536 |
|
| 537 |
-
|
|
|
|
|
|
|
|
|
|
| 538 |
rrf_corpus_texts: List[str] = []
|
| 539 |
rrf_corpus_prompts: List[str] = []
|
| 540 |
rrf_corpus_completions: List[str] = []
|
|
|
|
|
|
|
| 541 |
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
if p and c:
|
| 552 |
-
rrf_corpus_prompts.append(p)
|
| 553 |
-
rrf_corpus_completions.append(c)
|
| 554 |
-
rrf_corpus_texts.append(p + "\n\n" + c)
|
| 555 |
-
|
| 556 |
-
if rrf_corpus_texts:
|
| 557 |
-
print(f"✅ RRF Tutor: {len(rrf_corpus_texts)} ejemplos cargados.", flush=True)
|
| 558 |
-
rrf_corpus_embeds = encoder.encode(
|
| 559 |
-
rrf_corpus_texts,
|
| 560 |
-
convert_to_numpy=True,
|
| 561 |
-
show_progress_bar=True,
|
| 562 |
-
normalize_embeddings=True,
|
| 563 |
-
)
|
| 564 |
-
print("✅ [RRF Tutor] Embeddings construidos.", flush=True)
|
| 565 |
-
else:
|
| 566 |
-
print("⚠️ RRF Tutor JSONL no tiene ejemplos válidos.", file=sys.stderr, flush=True)
|
| 567 |
-
rrf_corpus_embeds = np.zeros((0, 384), dtype=np.float32)
|
| 568 |
-
except Exception as e:
|
| 569 |
-
print(f"❌ Error cargando/parsing RRF Tutor JSONL: {e}", file=sys.stderr, flush=True)
|
| 570 |
-
rrf_corpus_embeds = np.zeros((0, 384), dtype=np.float32)
|
| 571 |
-
else:
|
| 572 |
-
print("⚠️ No se encontró RRF_TUTOR_JSONL_PATH.", file=sys.stderr, flush=True)
|
| 573 |
-
rrf_corpus_embeds = np.zeros((0, 384), dtype=np.float32)
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
def rrf_tutor_retrieve_examples(query: str, top_k: int = 3) -> list:
    """Return the corpus examples that score highest against ``query``.

    The query is embedded with the module-level ``encoder`` and scored against
    every row of ``rrf_corpus_embeds`` with a dot product. Both the query and
    the corpus are encoded with ``normalize_embeddings=True``, so the score is
    presumably a cosine similarity (confirm against the encoder in use).

    Args:
        query: Free-text question to look up.
        top_k: Maximum number of examples to return. Values larger than the
            corpus are capped to the corpus size; zero or negative values
            yield an empty list.

    Returns:
        A list of dicts ordered from highest to lowest score, each with the
        keys ``"idx"`` (row index in the corpus), ``"score"``, ``"prompt"``
        and ``"completion"``.

    Raises:
        RuntimeError: If the corpus embeddings are missing or empty.
    """
    if rrf_corpus_embeds is None or len(rrf_corpus_embeds) == 0:
        raise RuntimeError("Embeddings de RRF Tutor no están disponibles.")

    # A negative top_k would make the slice below mean "everything except the
    # last |top_k| rows" rather than "nothing", so clamp it to [0, corpus size].
    top_k = max(0, min(top_k, len(rrf_corpus_embeds)))
    if top_k == 0:
        # Nothing requested: skip the (comparatively expensive) query encoding.
        return []

    q_emb = encoder.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    sims = np.dot(rrf_corpus_embeds, q_emb)

    # Negate the scores so that the ascending argsort yields best-first order.
    top_idx = np.argsort(-sims)[:top_k]

    return [
        {
            "idx": int(idx),
            "score": float(sims[idx]),
            "prompt": rrf_corpus_prompts[idx],
            "completion": rrf_corpus_completions[idx],
        }
        for idx in top_idx
    ]
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
def rrf_tutor_build_answer(query: str, retrieved_examples):
|
| 600 |
-
if not retrieved_examples:
|
| 601 |
-
return (
|
| 602 |
-
"No encontré ejemplos relevantes en el dataset RRF Tutor para tu consulta. "
|
| 603 |
-
"Intenta reformular la pregunta o revisar la configuración del dataset."
|
| 604 |
-
)
|
| 605 |
|
| 606 |
-
|
| 607 |
-
base_completion = best["completion"]
|
| 608 |
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
"💡 Nota: Esta es una versión mínima que reutiliza directamente la 'completion' "
|
| 613 |
-
"del ejemplo más similar en savant_rrf1_curated. En una versión extendida, aquí se "
|
| 614 |
-
"conectaría un LLM pequeño que use varios ejemplos como contexto."
|
| 615 |
)
|
| 616 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
|
| 618 |
|
| 619 |
# ============================
|
|
|
|
| 531 |
|
| 532 |
|
| 533 |
# ============================
|
| 534 |
+
# RRF Tutor: carga de dataset savant_rrf1_curated
|
| 535 |
# ============================
|
| 536 |
|
| 537 |
+
from datasets import load_dataset
|
| 538 |
+
|
| 539 |
+
# Startup loading of the RRF Tutor retrieval corpus.
#
# Module-level state produced here:
#   ds_rrf                 - the filtered dataset split, or None on failure
#   rrf_corpus_prompts     - prompt of each example
#   rrf_corpus_completions - completion of each example (same index as prompts)
#   rrf_corpus_texts       - prompt + blank line + completion, the text that is embedded
#   rrf_corpus_embeds      - embeddings of rrf_corpus_texts, or None on failure
#   rrf_tutor_ready        - True only once every step below has succeeded
print(f"🔄 [Startup] Cargando dataset para RRF Tutor: {RRF_TUTOR_DATASET_ID}...", flush=True)
ds_rrf = None
rrf_corpus_texts: List[str] = []
rrf_corpus_prompts: List[str] = []
rrf_corpus_completions: List[str] = []
rrf_corpus_embeds = None
rrf_tutor_ready = False

try:
    # Load every split, then prefer 'train' and fall back to the first one available.
    ds_dict = load_dataset(RRF_TUTOR_DATASET_ID)
    split_name = "train" if "train" in ds_dict else list(ds_dict.keys())[0]
    ds_rrf = ds_dict[split_name]

    print(f"✅ Dataset RRF Tutor cargado desde split '{split_name}'. Ejemplos totales: {len(ds_rrf)}", flush=True)

    # Keep only the examples that carry both a prompt and a completion.
    ds_rrf = ds_rrf.filter(
        lambda ex: ex.get("prompt") is not None and ex.get("completion") is not None
    )
    print(f"✅ Dataset filtrado a ejemplos con 'prompt' y 'completion': {len(ds_rrf)}", flush=True)

    if len(ds_rrf) == 0:
        raise ValueError("Dataset filtrado quedó vacío (sin columnas 'prompt' y 'completion').")

    print("🔄 [RRF Tutor] Construyendo textos y embeddings...", flush=True)
    for ex in ds_rrf:
        p = ex["prompt"]
        c = ex["completion"]
        rrf_corpus_prompts.append(p)
        rrf_corpus_completions.append(c)
        rrf_corpus_texts.append(p + "\n\n" + c)

    rrf_corpus_embeds = encoder.encode(
        rrf_corpus_texts,
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    print("✅ [RRF Tutor] Embeddings construidos.", flush=True)
    # Flip the flag last so it can never be True with a half-built corpus.
    rrf_tutor_ready = True

except Exception as e:
    # Deliberately broad: any failure here (download, schema, encoding) should
    # disable the tutor feature rather than abort application startup.
    print(f"❌ Error cargando dataset/embeddings de RRF Tutor: {e}", flush=True)
    ds_rrf = None
    rrf_corpus_embeds = None
    rrf_tutor_ready = False
    # The lists may already be (partially) filled if the failure happened while
    # building texts or embeddings; empty them in place so the corpus state is
    # consistent with rrf_corpus_embeds being None.
    rrf_corpus_texts.clear()
    rrf_corpus_prompts.clear()
    rrf_corpus_completions.clear()
    print("⚠️ [RRF Tutor] Endpoint /v1/rrf_tutor devolverá error 503 si se usa.", flush=True)
|
| 591 |
|
| 592 |
|
| 593 |
# ============================
|