hchevva committed on
Commit
8a60fe7
·
verified ·
1 Parent(s): 6b69023

Update quread/llm_explain.py

Browse files
Files changed (1) hide show
  1. quread/llm_explain.py +70 -65
quread/llm_explain.py CHANGED
@@ -1,19 +1,28 @@
1
  # quread/llm_explain.py
2
  from __future__ import annotations
3
 
4
- import os
5
  from dataclasses import dataclass
6
  from typing import Any, Dict, List, Optional, Tuple
7
 
8
- from huggingface_hub import InferenceClient
 
9
 
10
 
11
  @dataclass
12
  class ExplainConfig:
13
- model_id: str = "HuggingFaceH4/zephyr-7b-beta" # you can change later
14
- provider: str = "hf-inference"
15
- max_new_tokens: int = 280
16
- temperature: float = 0.2
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  def _build_grounded_prompt(
@@ -37,7 +46,6 @@ def _build_grounded_prompt(
37
  ops_lines.append(f"- {op}")
38
 
39
  top_lines = [f"- {b}: {p:.4f}" for b, p in probs_top]
40
-
41
  shots_line = f"Shots: {shots}\n" if shots is not None else ""
42
 
43
  return f"""
@@ -66,8 +74,25 @@ Return a concise explanation with bullet points and short paragraphs.
66
  """.strip()
67
 
68
 
69
- def _get_token() -> Optional[str]:
70
- return os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  def explain_circuit_with_hf(
@@ -79,12 +104,11 @@ def explain_circuit_with_hf(
79
  shots: Optional[int] = None,
80
  cfg: Optional[ExplainConfig] = None,
81
  ) -> str:
 
 
 
 
82
  cfg = cfg or ExplainConfig()
83
- token = _get_token()
84
- if not token:
85
- return "HF_TOKEN is not set (Space Settings → Secrets → HF_TOKEN)."
86
-
87
- client = InferenceClient(provider=cfg.provider, token=token)
88
 
89
  prompt = _build_grounded_prompt(
90
  n_qubits=n_qubits,
@@ -94,61 +118,42 @@ def explain_circuit_with_hf(
94
  shots=shots,
95
  )
96
 
97
- last_error = None
98
-
99
- # 1) Chat completion
100
  try:
101
- resp = client.chat_completion(
102
- model=cfg.model_id,
103
- messages=[
104
- {"role": "system", "content": "You are a helpful quantum tutor."},
105
- {"role": "user", "content": prompt},
106
- ],
107
- max_tokens=cfg.max_new_tokens,
108
- temperature=cfg.temperature,
109
  )
110
- text = resp.choices[0].message.content if resp and resp.choices else ""
111
- if text and text.strip():
112
- return text.strip()
113
- last_error = ValueError("chat_completion returned empty text")
114
- except Exception as e:
115
- last_error = e
116
 
117
- # 2) Text generation
118
- try:
119
- out = client.text_generation(
120
- model=cfg.model_id,
121
- prompt=prompt,
122
- max_new_tokens=cfg.max_new_tokens,
123
- temperature=cfg.temperature,
124
- )
125
- if out and str(out).strip():
126
- return str(out).strip()
127
- last_error = ValueError("text_generation returned empty text")
128
- except Exception as e:
129
- last_error = e
130
 
131
- # 3) Text2Text only if the method exists AND the model looks like T5/FLAN
132
- try:
133
- is_t5_family = any(x in cfg.model_id.lower() for x in ["t5", "flan"])
134
- fn = getattr(client, "text2text_generation", None)
135
-
136
- if is_t5_family and fn is not None:
137
- out = fn(
138
- model=cfg.model_id,
139
- prompt=prompt,
140
- max_new_tokens=cfg.max_new_tokens,
141
  )
142
- if out and str(out).strip():
143
- return str(out).strip()
144
- last_error = ValueError("text2text_generation returned empty text")
 
 
 
 
 
 
145
 
146
  except Exception as e:
147
- last_error = e
148
-
149
- return (
150
- "LLM call failed.\n\n"
151
- f"Model: {cfg.model_id}\n"
152
- f"Provider: {cfg.provider}\n"
153
- f"Error: {repr(last_error)}"
154
- )
 
1
  # quread/llm_explain.py
2
  from __future__ import annotations
3
 
 
4
  from dataclasses import dataclass
5
  from typing import Any, Dict, List, Optional, Tuple
6
 
7
+ import torch
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
 
10
 
11
@dataclass
class ExplainConfig:
    """
    Configuration for the local (in-Space) explainer.

    The default model is small enough to run reliably on CPU; swap in a
    larger checkpoint (e.g. flan-t5-base/large) later if the hardware allows.
    """
    model_id: str = "google/flan-t5-small"
    max_new_tokens: int = 220
    temperature: float = 0.2  # currently unused; seq2seq generate() may ignore it
    device: str = "cpu"       # free-tier Spaces only provide CPU
22
+
23
+
24
+ # --- simple in-memory cache so the model loads once per container ---
25
+ _LOCAL_CACHE: Dict[str, Any] = {"model_id": None, "tokenizer": None, "model": None}
26
 
27
 
28
  def _build_grounded_prompt(
 
46
  ops_lines.append(f"- {op}")
47
 
48
  top_lines = [f"- {b}: {p:.4f}" for b, p in probs_top]
 
49
  shots_line = f"Shots: {shots}\n" if shots is not None else ""
50
 
51
  return f"""
 
74
  """.strip()
75
 
76
 
77
def _load_local_model(cfg: ExplainConfig):
    """
    Load and cache the tokenizer + model pair for ``cfg.model_id``.

    The pair is loaded at most once per process per model id and reused on
    subsequent calls.  Uses the Seq2Seq (FLAN-T5 style) model family, which
    is CPU-friendly.

    Args:
        cfg: Explainer configuration; ``model_id`` selects the checkpoint and
            ``device`` selects where the model lives ("cpu" on free Spaces).

    Returns:
        ``(tokenizer, model)`` with the model on ``cfg.device`` in eval mode.
    """
    if _LOCAL_CACHE["model"] is not None and _LOCAL_CACHE["model_id"] == cfg.model_id:
        # Cache hit. The cache key is the model id only, so the requested
        # device may have changed since the model was first loaded — move it
        # (a no-op when it is already there) before handing it back.
        _LOCAL_CACHE["model"].to(cfg.device)
        return _LOCAL_CACHE["tokenizer"], _LOCAL_CACHE["model"]

    # Switching models: drop any previously cached weights first so their
    # memory can be reclaimed before the new checkpoint is loaded (this
    # matters on small CPU Spaces).
    _LOCAL_CACHE["model_id"] = None
    _LOCAL_CACHE["tokenizer"] = None
    _LOCAL_CACHE["model"] = None

    tok = AutoTokenizer.from_pretrained(cfg.model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_id)

    # Force the configured device (CPU unless a GPU Space is added later).
    model.to(cfg.device)
    model.eval()  # inference only — disables dropout etc.

    _LOCAL_CACHE["model_id"] = cfg.model_id
    _LOCAL_CACHE["tokenizer"] = tok
    _LOCAL_CACHE["model"] = model
    return tok, model
96
 
97
 
98
  def explain_circuit_with_hf(
 
104
  shots: Optional[int] = None,
105
  cfg: Optional[ExplainConfig] = None,
106
  ) -> str:
107
+ """
108
+ Local explainer (runs inside the HF Space).
109
+ Kept function name for compatibility with your app.py imports.
110
+ """
111
  cfg = cfg or ExplainConfig()
 
 
 
 
 
112
 
113
  prompt = _build_grounded_prompt(
114
  n_qubits=n_qubits,
 
118
  shots=shots,
119
  )
120
 
 
 
 
121
  try:
122
+ tok, model = _load_local_model(cfg)
123
+
124
+ # Tokenize
125
+ inputs = tok(
126
+ prompt,
127
+ return_tensors="pt",
128
+ truncation=True,
129
+ max_length=1024,
130
  )
 
 
 
 
 
 
131
 
132
+ # Move tensors to device (CPU)
133
+ for k in inputs:
134
+ inputs[k] = inputs[k].to(cfg.device)
 
 
 
 
 
 
 
 
 
 
135
 
136
+ # Generate
137
+ with torch.no_grad():
138
+ out_ids = model.generate(
139
+ **inputs,
140
+ max_new_tokens=int(cfg.max_new_tokens),
 
 
 
 
 
141
  )
142
+
143
+ text = tok.decode(out_ids[0], skip_special_tokens=True).strip()
144
+ if not text:
145
+ return (
146
+ "LLM call failed (local model returned empty output).\n\n"
147
+ f"Local model: {cfg.model_id}\n"
148
+ "Try increasing max_new_tokens or using flan-t5-base."
149
+ )
150
+ return text
151
 
152
  except Exception as e:
153
+ return (
154
+ "LLM call failed (local inference).\n\n"
155
+ f"Local model: {cfg.model_id}\n"
156
+ f"Error: {repr(e)}\n\n"
157
+ "If this is an out-of-memory error, use google/flan-t5-small.\n"
158
+ "If it is a missing dependency, confirm transformers + torch are in requirements.txt."
159
+ )