hchevva commited on
Commit
42ffefc
·
verified ·
1 Parent(s): c73a29e

Update quread/llm_explain.py

Browse files
Files changed (1) hide show
  1. quread/llm_explain.py +22 -59
quread/llm_explain.py CHANGED
@@ -4,25 +4,19 @@ from __future__ import annotations
4
  from dataclasses import dataclass
5
  from typing import Any, Dict, List, Optional, Tuple
6
 
7
- import torch
8
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
 
10
 
11
  @dataclass
12
  class ExplainConfig:
13
- """
14
- Local (in-Space) explainer config.
15
- - Default model is small + reliable on CPU.
16
- - You can upgrade later (e.g., flan-t5-base/large) if performance allows.
17
- """
18
- model_id: str = "google/flan-t5-small"
19
  max_new_tokens: int = 220
20
- temperature: float = 0.2 # kept for future; seq2seq generate doesn't always use it
21
- device: str = "cpu" # Spaces free tier is CPU
22
 
23
 
24
- # --- simple in-memory cache so the model loads once per container ---
25
- _LOCAL_CACHE: Dict[str, Any] = {"model_id": None, "tokenizer": None, "model": None}
26
 
27
 
28
  def _build_grounded_prompt(
@@ -32,10 +26,6 @@ def _build_grounded_prompt(
32
  probs_top: List[Tuple[str, float]],
33
  shots: Optional[int] = None,
34
  ) -> str:
35
- """
36
- Prompt is explicitly grounded: it includes only circuit + computed outputs.
37
- The model is instructed not to invent values.
38
- """
39
  ops_lines = []
40
  for op in history:
41
  if op.get("type") == "single":
@@ -74,24 +64,16 @@ Return a concise explanation with bullet points and short paragraphs.
74
  """.strip()
75
 
76
 
77
- def _load_local_model(cfg: ExplainConfig):
78
- """
79
- Loads tokenizer+model once and caches them.
80
- Uses Seq2Seq model family (FLAN-T5) which is CPU-friendly.
81
- """
82
- if _LOCAL_CACHE["model"] is not None and _LOCAL_CACHE["model_id"] == cfg.model_id:
83
- return _LOCAL_CACHE["tokenizer"], _LOCAL_CACHE["model"]
84
 
85
  tok = AutoTokenizer.from_pretrained(cfg.model_id)
86
- model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_id)
87
 
88
- # Force CPU unless you later add GPU space
89
- model.to(cfg.device)
90
- model.eval()
91
-
92
- _LOCAL_CACHE["model_id"] = cfg.model_id
93
- _LOCAL_CACHE["tokenizer"] = tok
94
- _LOCAL_CACHE["model"] = model
95
  return tok, model
96
 
97
 
@@ -104,10 +86,6 @@ def explain_circuit_with_hf(
104
  shots: Optional[int] = None,
105
  cfg: Optional[ExplainConfig] = None,
106
  ) -> str:
107
- """
108
- Local explainer (runs inside the HF Space).
109
- Kept function name for compatibility with your app.py imports.
110
- """
111
  cfg = cfg or ExplainConfig()
112
 
113
  prompt = _build_grounded_prompt(
@@ -119,9 +97,8 @@ def explain_circuit_with_hf(
119
  )
120
 
121
  try:
122
- tok, model = _load_local_model(cfg)
123
 
124
- # Tokenize
125
  inputs = tok(
126
  prompt,
127
  return_tensors="pt",
@@ -129,31 +106,17 @@ def explain_circuit_with_hf(
129
  max_length=1024,
130
  )
131
 
132
- # Move tensors to device (CPU)
133
- for k in inputs:
134
- inputs[k] = inputs[k].to(cfg.device)
135
-
136
- # Generate
137
- with torch.no_grad():
138
- out_ids = model.generate(
139
- **inputs,
140
- max_new_tokens=int(cfg.max_new_tokens),
141
- )
142
 
143
  text = tok.decode(out_ids[0], skip_special_tokens=True).strip()
144
- if not text:
145
- return (
146
- "LLM call failed (local model returned empty output).\n\n"
147
- f"Local model: {cfg.model_id}\n"
148
- "Try increasing max_new_tokens or using flan-t5-base."
149
- )
150
- return text
151
 
152
  except Exception as e:
153
  return (
154
- "LLM call failed (local inference).\n\n"
155
- f"Local model: {cfg.model_id}\n"
156
- f"Error: {repr(e)}\n\n"
157
- "If this is an out-of-memory error, use google/flan-t5-small.\n"
158
- "If it is a missing dependency, confirm transformers + torch are in requirements.txt."
159
  )
 
4
  from dataclasses import dataclass
5
  from typing import Any, Dict, List, Optional, Tuple
6
 
7
+ from transformers import AutoTokenizer
8
+ from optimum.onnxruntime import ORTModelForSeq2SeqLM
9
 
10
 
11
@dataclass
class ExplainConfig:
    """Configuration for the local ONNX explainer.

    Attributes are plain dataclass fields; all have CPU-friendly defaults.
    """
    # ONNX export of FLAN-T5 small — runs via onnxruntime, no torch needed.
    model_id: str = "onnx-community/flan-t5-small-ONNX"
    # Upper bound on generated tokens per explanation.
    max_new_tokens: int = 220
    # Not consumed by ORT generate(); retained so callers passing it keep working.
    temperature: float = 0.2
 
17
 
18
 
19
+ _CACHE: Dict[str, Any] = {"model_id": None, "tok": None, "model": None}
 
20
 
21
 
22
  def _build_grounded_prompt(
 
26
  probs_top: List[Tuple[str, float]],
27
  shots: Optional[int] = None,
28
  ) -> str:
 
 
 
 
29
  ops_lines = []
30
  for op in history:
31
  if op.get("type") == "single":
 
64
  """.strip()
65
 
66
 
67
def _load_onnx(cfg: ExplainConfig):
    """Return a (tokenizer, model) pair for ``cfg.model_id``, loading at most once.

    Results are memoized in the module-level ``_CACHE`` so repeated calls
    (e.g. per request in a Space) reuse the already-loaded ONNX model.
    """
    # Cache hit: same model id and a model object is present.
    if _CACHE["model_id"] == cfg.model_id and _CACHE["model"] is not None:
        return _CACHE["tok"], _CACHE["model"]

    tok = AutoTokenizer.from_pretrained(cfg.model_id)
    model = ORTModelForSeq2SeqLM.from_pretrained(cfg.model_id)

    # Populate the cache in one shot before handing the pair back.
    _CACHE.update(model_id=cfg.model_id, tok=tok, model=model)
    return tok, model
78
 
79
 
 
86
  shots: Optional[int] = None,
87
  cfg: Optional[ExplainConfig] = None,
88
  ) -> str:
 
 
 
 
89
  cfg = cfg or ExplainConfig()
90
 
91
  prompt = _build_grounded_prompt(
 
97
  )
98
 
99
  try:
100
+ tok, model = _load_onnx(cfg)
101
 
 
102
  inputs = tok(
103
  prompt,
104
  return_tensors="pt",
 
106
  max_length=1024,
107
  )
108
 
109
+ out_ids = model.generate(
110
+ **inputs,
111
+ max_new_tokens=int(cfg.max_new_tokens),
112
+ )
 
 
 
 
 
 
113
 
114
  text = tok.decode(out_ids[0], skip_special_tokens=True).strip()
115
+ return text if text else "LLM returned empty response (ONNX). Try increasing max_new_tokens."
 
 
 
 
 
 
116
 
117
  except Exception as e:
118
  return (
119
+ "LLM call failed (ONNX local inference).\n\n"
120
+ f"Model: {cfg.model_id}\n"
121
+ f"Error: {repr(e)}"
 
 
122
  )