Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

aadisawant2912 committed on May 2

Commit

ce6755c

verified ·

1 Parent(s): f083a2e

Update tools_v2.py

Browse files

Files changed (1) hide show

tools_v2.py +204 -114

tools_v2.py CHANGED Viewed

@@ -1,67 +1,89 @@
 # =============================================================================
-# V2 TOOL 3 — label_clusters_council_of_3  (TRUE multi-LLM ensemble)
 # =============================================================================
 @tool
 def label_clusters_council_of_3(batch_size: int = 5) -> str:
-    """Label each cluster using a TRUE council of 3 DIFFERENT LLMs:
-      1. Mistral (mistral-small-latest)
-      2. OpenAI  (gpt-4o-mini)
-      3. Groq    (llama3-70b-8192)
-    Each model receives the SAME prompt independently.
-    Final label = mode (most common) of the 3 responses.
-    Vote agreement = unanimous / majority / split.
-    Saves enriched summaries + full audit CSV (one row per paper) to data/v2/.
-    API keys are read automatically from environment variables:
       MISTRAL_API_KEY, OPENAI_API_KEY, GROQ_API_KEY
     Set these in HuggingFace Space → Settings → Variables and Secrets.
     Args:
         batch_size: Clusters per LLM call (default 5).
     """
     import time
-    import os
-    # ── NEW: import all 3 LangChain integrations ──────────────────────────────
     from langchain_mistralai import ChatMistralAI
     from langchain_openai import ChatOpenAI
     from langchain_groq import ChatGroq
-    # ─────────────────────────────────────────────────────────────────────────
     p        = _p2()
     clusters = json.loads(p["clusters"].read_text())
-    # ── NEW: define 3 real LLMs (keys picked up from env automatically) ───────
     COUNCIL = [
         {
-            "name":  "MISTRAL",
-            "model": ChatMistralAI(
-                model="mistral-small-latest",
-                temperature=0.2,
-                # api_key read from MISTRAL_API_KEY env var automatically
-            ),
         },
         {
-            "name":  "OPENAI",
-            "model": ChatOpenAI(
-                model="gpt-4o-mini",
-                temperature=0.2,
-                # api_key read from OPENAI_API_KEY env var automatically
-            ),
         },
         {
-            "name":  "GROQ",
-            "model": ChatGroq(
-                model="llama3-70b-8192",
-                temperature=0.2,
-                # api_key read from GROQ_API_KEY env var automatically
-            ),
         },
     ]
     # ─────────────────────────────────────────────────────────────────────────
-    # ── UNCHANGED: single shared prompt builder (same prompt for all 3 LLMs) ──
-    def make_prompt(batch):
         mini = [
             {
                 "cluster_id":     c["cluster_id"],
@@ -83,61 +105,116 @@ def label_clusters_council_of_3(batch_size: int = 5) -> str:
         )
     # ─────────────────────────────────────────────────────────────────────────
-    # ── NEW: run each LLM independently across all batches ───────────────────
-    # persona_results[i] = { cluster_id: {label, confidence, reasoning} }
-    # shape is identical to before so all downstream code is UNCHANGED
-    persona_results = [{}, {}, {}]
-    batch_starts    = list(range(0, len(clusters), batch_size))
-    for pi, member in enumerate(COUNCIL):
-        llm        = member["model"]
-        llm_name   = member["name"]
-        all_labels = []
-        print(f"Council member {pi+1}/3 ({llm_name}) labeling {len(clusters)} clusters...")
         for bi, start in enumerate(batch_starts):
             batch  = clusters[start: start + batch_size]
-            prompt = make_prompt(batch)          # same prompt for every LLM
-            # ── NEW: per-model error handling so one failure doesn't kill all ─
-            try:
-                result = _call_llm_json(llm, prompt)
-                all_labels.extend(result)
-            except Exception as e:
-                print(f"  WARNING: {llm_name} batch {bi} failed: {e}. Using fallback labels.")
-                for c in batch:
-                    all_labels.append({
-                        "cluster_id": c["cluster_id"],
-                        "label":      f"Cluster {c['cluster_id']} ({llm_name} error)",
-                        "confidence": "Low",
-                        "reasoning":  f"Fallback — {llm_name} error: {str(e)[:80]}",
-                    })
-            # ─────────────────────────────────────────────────────────────────
-            # small delay between batches to respect rate limits
             if bi < len(batch_starts) - 1:
-                time.sleep(8)
-        for item in all_labels:
-            cid = int(item.get("cluster_id", 0))
-            persona_results[pi][cid] = item
-        # delay between council members (Groq is fast, Mistral/OpenAI need breathing room)
-        if pi < len(COUNCIL) - 1:
-            time.sleep(10)
     # ─────────────────────────────────────────────────────────────────────────
-    # ── UNCHANGED from here down: voting + enrichment + CSV export ───────────
     def enrich(cluster):
-        cid = cluster["cluster_id"]
         raw_votes = [
-            str(persona_results[pi].get(cid, {}).get("label", "")).strip()
-            for pi in range(3)
         ]
         votes = [
             v if v and v.lower() not in ("", "none", "null")
-            else "Cluster {}".format(cid)
             for v in raw_votes
         ]
         final     = _mode_label(votes)
@@ -149,62 +226,75 @@ def label_clusters_council_of_3(batch_size: int = 5) -> str:
         return {
             **cluster,
             "label":                  final,
-            "llm_vote_1_MISTRAL":     votes[0],   # key renamed to match real model
-            "llm_vote_2_OPENAI":      votes[1],   # key renamed to match real model
-            "llm_vote_3_GROQ":        votes[2],   # key renamed to match real model
-            "confidence_1": persona_results[0].get(cid, {}).get("confidence", ""),
-            "confidence_2": persona_results[1].get(cid, {}).get("confidence", ""),
-            "confidence_3": persona_results[2].get(cid, {}).get("confidence", ""),
-            "reasoning_1":  persona_results[0].get(cid, {}).get("reasoning", ""),
-            "reasoning_2":  persona_results[1].get(cid, {}).get("reasoning", ""),
-            "reasoning_3":  persona_results[2].get(cid, {}).get("reasoning", ""),
             "vote_agreement": agreement,
         }
     enriched = list(map(enrich, clusters))
     p["summaries"].write_text(json.dumps(enriched, indent=2, ensure_ascii=False))
-    # Audit CSV — one row per paper in cluster
     rows = []
     for c in enriched:
         cid = c["cluster_id"]
         for li, paper in enumerate(c["papers"]):
             rows.append({
-                "cluster_id":             cid,
-                "final_label":            c["label"],
-                "vote_agreement":         c["vote_agreement"],
-                "llm1_MISTRAL_label":     c["llm_vote_1_MISTRAL"],   # renamed
-                "llm2_OPENAI_label":      c["llm_vote_2_OPENAI"],    # renamed
-                "llm3_GROQ_label":        c["llm_vote_3_GROQ"],      # renamed
-                "llm1_confidence":        c["confidence_1"],
-                "llm2_confidence":        c["confidence_2"],
-                "llm3_confidence":        c["confidence_3"],
-                "llm1_reasoning":         c["reasoning_1"],
-                "llm2_reasoning":         c["reasoning_2"],
-                "llm3_reasoning":         c["reasoning_3"],
-                "paper_doi":              paper.get("doi", ""),
-                "paper_title":            paper.get("title", ""),
-                "paper_year":             paper.get("year", ""),
-                "paper_journal":          paper.get("journal", ""),
-                "abstract_preview":       paper.get("abstract", "")[:300],
-                "combined_preview":       paper.get("combined", "")[:200],
-                "centroid_cosine_sim":    round(float(
                     c["centroid_sims"][li] if li < len(c["centroid_sims"]) else 0.0), 4),
-                "hdbscan_probability":    round(float(
                     c["hdbscan_probs"][li] if li < len(c["hdbscan_probs"]) else 0.0), 4),
-                "is_top3_centroid":       "YES" if li in c["top3_paper_idx"] else "no",
             })
     pd.DataFrame(rows).to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")
     unanimous = sum(1 for c in enriched if c["vote_agreement"] == "unanimous")
     majority  = sum(1 for c in enriched if c["vote_agreement"] == "majority")
     return json.dumps({
-        "clusters_labeled": len(enriched),
-        "unanimous":        unanimous,
-        "majority":         majority,
-        "split":            len(enriched) - unanimous - majority,
-        "audit_csv_rows":   len(rows),
-        "council_members":  [m["name"] for m in COUNCIL],           # NEW: visible in output
-        "note": "True 3-LLM ensemble (Mistral+OpenAI+Groq). Audit CSV ready ({} rows).".format(len(rows)),
-    })

 # =============================================================================
+# V2 TOOL 3 — label_clusters_council_of_3  (parallel + cached multi-LLM)
 # =============================================================================
 @tool
 def label_clusters_council_of_3(batch_size: int = 5) -> str:
+    """Label clusters using a TRUE council of 3 LLMs running IN PARALLEL:
+      1. Mistral  (mistral-small-latest)
+      2. OpenAI   (gpt-4o-mini)
+      3. Groq     (llama3-70b-8192)
+    SPEED:   All 3 LLMs run concurrently via ThreadPoolExecutor → ~3x faster.
+    COST:    SHA-256 disk cache — identical prompts are NEVER sent twice.
+             Re-runs, retries, and reruns after crashes cost $0 for cached batches.
+    LIMITS:  Per-model retry with exponential backoff. Groq gets a small stagger
+             delay so all 3 don't burst simultaneously on the first call.
+    API keys auto-read from env:
       MISTRAL_API_KEY, OPENAI_API_KEY, GROQ_API_KEY
     Set these in HuggingFace Space → Settings → Variables and Secrets.
+    Cache lives at:  data/v2/llm_cache/
+    Clear the cache: delete that folder to force fresh API calls.
     Args:
         batch_size: Clusters per LLM call (default 5).
     """
     import time
+    import hashlib
+    import threading
+    from concurrent.futures import ThreadPoolExecutor, as_completed
     from langchain_mistralai import ChatMistralAI
     from langchain_openai import ChatOpenAI
     from langchain_groq import ChatGroq
     p        = _p2()
     clusters = json.loads(p["clusters"].read_text())
+    # ── 1. DISK CACHE SETUP ──────────────────────────────────────────────────
+    # Each unique (model_name + prompt) gets its own JSON file.
+    # Hit  → free, instant, no API call.
+    # Miss → call API, save result, never pay again for that prompt.
+    CACHE_DIR = p["dir"] / "llm_cache"
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    cache_lock = threading.Lock()          # safe for concurrent reads/writes
+    def _cache_key(model_name: str, prompt: str) -> str:
+        digest = hashlib.sha256(f"{model_name}::{prompt}".encode()).hexdigest()
+        return digest
+    def _cache_get(model_name: str, prompt: str):
+        key  = _cache_key(model_name, prompt)
+        path = CACHE_DIR / f"{key}.json"
+        with cache_lock:
+            if path.exists():
+                return json.loads(path.read_text(encoding="utf-8"))
+        return None                        # cache miss
+    def _cache_set(model_name: str, prompt: str, result):
+        key  = _cache_key(model_name, prompt)
+        path = CACHE_DIR / f"{key}.json"
+        with cache_lock:
+            path.write_text(json.dumps(result, ensure_ascii=False), encoding="utf-8")
+    # ─────────────────────────────────────────────────────────────────────────
+    # ── 2. COUNCIL DEFINITION ────────────────────────────────────────────────
     COUNCIL = [
         {
+            "name":    "MISTRAL",
+            "model":   ChatMistralAI(model="mistral-small-latest", temperature=0.2),
+            "stagger": 0,                  # seconds to wait before first call
         },
         {
+            "name":    "OPENAI",
+            "model":   ChatOpenAI(model="gpt-4o-mini", temperature=0.2),
+            "stagger": 1,                  # slight stagger so 3 don't burst at t=0
         },
         {
+            "name":    "GROQ",
+            "model":   ChatGroq(model="llama3-70b-8192", temperature=0.2),
+            "stagger": 2,
         },
     ]
     # ─────────────────────────────────────────────────────────────────────────
+    # ── 3. SHARED PROMPT BUILDER (same for all 3 LLMs) ───────────────────────
+    def make_prompt(batch: list) -> str:
         mini = [
             {
                 "cluster_id":     c["cluster_id"],
         )
     # ─────────────────────────────────────────────────────────────────────────
+    # ── 4. SINGLE-MEMBER WORKER (runs in its own thread) ─────────────────────
+    # Handles: cache check → stagger → retry-with-backoff → cache save
+    # Returns: { cluster_id: {label, confidence, reasoning} }
+    def run_one_member(member: dict) -> tuple[str, dict]:
+        """Returns (member_name, {cid: result_dict})"""
+        name      = member["name"]
+        llm       = member["model"]
+        stagger   = member["stagger"]
+        results   = {}
+        # small stagger so 3 threads don't all burst at the exact same millisecond
+        if stagger:
+            time.sleep(stagger)
+        batch_starts = list(range(0, len(clusters), batch_size))
         for bi, start in enumerate(batch_starts):
             batch  = clusters[start: start + batch_size]
+            prompt = make_prompt(batch)
+            # ── cache check (free) ──────────────────────────────────────────
+            cached = _cache_get(name, prompt)
+            if cached is not None:
+                print(f"  [{name}] batch {bi+1}/{len(batch_starts)} → CACHE HIT (free)")
+                for item in cached:
+                    results[int(item.get("cluster_id", 0))] = item
+                continue                   # skip API call entirely
+            # ───────────────────────────────────────────────────────────────
+            # ── API call with exponential backoff ───────────────────────────
+            MAX_RETRIES = 4
+            for attempt in range(MAX_RETRIES):
+                try:
+                    print(f"  [{name}] batch {bi+1}/{len(batch_starts)} attempt {attempt+1}")
+                    batch_result = _call_llm_json(llm, prompt)
+                    # save to cache immediately on success
+                    _cache_set(name, prompt, batch_result)
+                    for item in batch_result:
+                        results[int(item.get("cluster_id", 0))] = item
+                    break                  # success → exit retry loop
+                except Exception as e:
+                    wait = (2 ** attempt) * 15    # 15s, 30s, 60s, 120s
+                    print(f"  [{name}] batch {bi+1} attempt {attempt+1} FAILED: {e}")
+                    if attempt < MAX_RETRIES - 1:
+                        print(f"  [{name}] retrying in {wait}s...")
+                        time.sleep(wait)
+                    else:
+                        # all retries exhausted → use fallback, do NOT crash
+                        print(f"  [{name}] all retries exhausted, using fallback for batch {bi+1}")
+                        for c in batch:
+                            cid = c["cluster_id"]
+                            results[cid] = {
+                                "cluster_id": cid,
+                                "label":      f"Cluster {cid} ({name} error)",
+                                "confidence": "Low",
+                                "reasoning":  f"Fallback — {name} failed: {str(e)[:80]}",
+                            }
+            # ───────────────────────────────────────────────────────────────
+            # ── inter-batch delay (only for non-cached batches) ─────────────
+            # Groq is very fast but strict on RPM; Mistral/OpenAI need breathing room.
+            # We sleep INSIDE each thread so they don't interfere with each other.
+            BATCH_DELAYS = {"MISTRAL": 12, "OPENAI": 10, "GROQ": 20}
             if bi < len(batch_starts) - 1:
+                time.sleep(BATCH_DELAYS.get(name, 12))
+            # ───────────────────────────────────────────────────────────────
+        return name, results
+    # ─────────────────────────────────────────────────────────────────────────
+    # ── 5. PARALLEL DISPATCH ─────────────────────────────────────────────────
+    # All 3 threads run simultaneously. Wall time ≈ slowest single member,
+    # not sum of all three. Thread count = 3 (one per LLM).
+    persona_results = {}                   # { "MISTRAL": {cid: ...}, ... }
+    cache_hits      = 0
+    cache_misses    = 0
+    print("Dispatching 3 LLMs in parallel...")
+    with ThreadPoolExecutor(max_workers=3) as executor:
+        futures = {executor.submit(run_one_member, m): m["name"] for m in COUNCIL}
+        for future in as_completed(futures):
+            member_name = futures[future]
+            try:
+                name, result_dict = future.result()
+                persona_results[name] = result_dict
+                print(f"[DONE] {name} finished with {len(result_dict)} cluster labels")
+            except Exception as e:
+                # should never reach here (worker handles its own errors),
+                # but belt-and-suspenders just in case
+                print(f"[ERROR] {member_name} thread crashed unexpectedly: {e}")
+                persona_results[member_name] = {}
     # ─────────────────────────────────────────────────────────────────────────
+    # ── 6. VOTING + ENRICHMENT (unchanged logic) ─────────────────────────────
+    LLM_NAMES = ["MISTRAL", "OPENAI", "GROQ"]
     def enrich(cluster):
+        cid       = cluster["cluster_id"]
         raw_votes = [
+            str(persona_results.get(name, {}).get(cid, {}).get("label", "")).strip()
+            for name in LLM_NAMES
         ]
         votes = [
             v if v and v.lower() not in ("", "none", "null")
+            else f"Cluster {cid}"
             for v in raw_votes
         ]
         final     = _mode_label(votes)
         return {
             **cluster,
             "label":                  final,
+            "llm_vote_1_MISTRAL":     votes[0],
+            "llm_vote_2_OPENAI":      votes[1],
+            "llm_vote_3_GROQ":        votes[2],
+            "confidence_1": persona_results.get("MISTRAL", {}).get(cid, {}).get("confidence", ""),
+            "confidence_2": persona_results.get("OPENAI",  {}).get(cid, {}).get("confidence", ""),
+            "confidence_3": persona_results.get("GROQ",    {}).get(cid, {}).get("confidence", ""),
+            "reasoning_1":  persona_results.get("MISTRAL", {}).get(cid, {}).get("reasoning", ""),
+            "reasoning_2":  persona_results.get("OPENAI",  {}).get(cid, {}).get("reasoning", ""),
+            "reasoning_3":  persona_results.get("GROQ",    {}).get(cid, {}).get("reasoning", ""),
             "vote_agreement": agreement,
         }
     enriched = list(map(enrich, clusters))
     p["summaries"].write_text(json.dumps(enriched, indent=2, ensure_ascii=False))
+    # ─────────────────────────────────────────────────────────────────────────
+    # ── 7. AUDIT CSV (unchanged format) ──────────────────────────────────────
     rows = []
     for c in enriched:
         cid = c["cluster_id"]
         for li, paper in enumerate(c["papers"]):
             rows.append({
+                "cluster_id":            cid,
+                "final_label":           c["label"],
+                "vote_agreement":        c["vote_agreement"],
+                "llm1_MISTRAL_label":    c["llm_vote_1_MISTRAL"],
+                "llm2_OPENAI_label":     c["llm_vote_2_OPENAI"],
+                "llm3_GROQ_label":       c["llm_vote_3_GROQ"],
+                "llm1_confidence":       c["confidence_1"],
+                "llm2_confidence":       c["confidence_2"],
+                "llm3_confidence":       c["confidence_3"],
+                "llm1_reasoning":        c["reasoning_1"],
+                "llm2_reasoning":        c["reasoning_2"],
+                "llm3_reasoning":        c["reasoning_3"],
+                "paper_doi":             paper.get("doi", ""),
+                "paper_title":           paper.get("title", ""),
+                "paper_year":            paper.get("year", ""),
+                "paper_journal":         paper.get("journal", ""),
+                "abstract_preview":      paper.get("abstract", "")[:300],
+                "combined_preview":      paper.get("combined", "")[:200],
+                "centroid_cosine_sim":   round(float(
                     c["centroid_sims"][li] if li < len(c["centroid_sims"]) else 0.0), 4),
+                "hdbscan_probability":   round(float(
                     c["hdbscan_probs"][li] if li < len(c["hdbscan_probs"]) else 0.0), 4),
+                "is_top3_centroid":      "YES" if li in c["top3_paper_idx"] else "no",
             })
     pd.DataFrame(rows).to_csv(p["audit_csv"], index=False, encoding="utf-8-sig")
     unanimous = sum(1 for c in enriched if c["vote_agreement"] == "unanimous")
     majority  = sum(1 for c in enriched if c["vote_agreement"] == "majority")
+    # count cache hits by checking what's in cache_dir vs how many API calls were made
+    total_batches   = len(list(range(0, len(clusters), batch_size))) * 3
+    cached_files    = len(list(CACHE_DIR.glob("*.json")))
     return json.dumps({
+        "clusters_labeled":   len(enriched),
+        "unanimous":          unanimous,
+        "majority":           majority,
+        "split":              len(enriched) - unanimous - majority,
+        "audit_csv_rows":     len(rows),
+        "council_members":    LLM_NAMES,
+        "execution":          "parallel (ThreadPoolExecutor, 3 workers)",
+        "cache_files_on_disk": cached_files,
+        "cache_dir":          str(CACHE_DIR),
+        "note": (
+            "Parallel 3-LLM ensemble done. "
+            f"Cache has {cached_files} entries — re-runs use these for free. "
+            "Audit CSV ready ({} rows).".format(len(rows))
+        ),
+    })

-- How do I use this, and where should I paste it?