Slaiwala commited on
Commit
7ff267a
·
verified ·
1 Parent(s): cec50ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +239 -249
app.py CHANGED
@@ -1,29 +1,19 @@
1
  #!/usr/bin/env python3
2
  from __future__ import annotations
3
 
 
4
  import os, re, json, time, sys, csv, uuid, datetime
5
  from typing import List, Dict, Any, Optional
6
  from functools import lru_cache
7
  from xml.etree import ElementTree as ET
8
- from transformers import AutoTokenizer, AutoModelForCausalLM
 
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
10
  try:
11
- from transformers import BitsAndBytesConfig # exists even if bitsandbytes isn't installed
12
  except Exception:
13
  BitsAndBytesConfig = None
14
 
15
- # Normalize QUANTIZE env
16
- QUANTIZE = os.environ.get("QUANTIZE", "none").strip().lower()
17
-
18
- # Detect bitsandbytes presence
19
- try:
20
- import bitsandbytes as _bnb # noqa: F401
21
- _BNB_AVAILABLE = True
22
- except Exception:
23
- _BNB_AVAILABLE = False
24
-
25
-
26
-
27
  import numpy as np
28
  import requests
29
  import gradio as gr
@@ -33,11 +23,13 @@ ASSETS_DIR = os.environ.get("ASSETS_DIR", "assets")
33
  FAISS_PATH = os.environ.get("FAISS_PATH", f"{ASSETS_DIR}/index.faiss")
34
  META_PATH = os.environ.get("META_PATH", f"{ASSETS_DIR}/index_meta.filtered.jsonl")
35
  REL_CONFIG_PATH = os.environ.get("REL_CONFIG_PATH", f"{ASSETS_DIR}/relevance_config.json")
36
- QUANTIZE = os.environ.get("QUANTIZE", "4bit") # "none" | "8bit" | "4bit"
37
- # --- Turn logging ---
38
- TRANSCRIPT_PATH = os.environ.get("TRANSCRIPT_PATH", "transcripts.jsonl")
39
- PUSH_TRANSCRIPTS = os.environ.get("PUSH_TRANSCRIPTS", "1") == "1" # set to "0" to disable
40
 
 
 
 
 
 
 
41
 
42
  # Models
43
  BASE_MODEL = os.environ.get("BASE_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
@@ -53,12 +45,11 @@ NCBI_TOOL = os.environ.get("NCBI_TOOL", "askstein")
53
  NCBI_APIKEY = os.environ.get("NCBI_APIKEY", "")
54
 
55
  # Feedback logging
56
- FEEDBACK_PATH = os.environ.get("FEEDBACK_PATH", "feedback.csv")
57
- PUSH_FEEDBACK = os.environ.get("PUSH_FEEDBACK", "0") == "1" # set to "1" to enable Hub upload
58
- HF_READ_TOKEN = os.environ.get("HF_READ_TOKEN", os.environ.get("HF_TOKEN", ""))
59
- HF_WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN", HF_READ_TOKEN)
60
- SPACE_REPO_ID = os.environ.get("SPACE_REPO_ID", "")
61
-
62
 
63
  # Generation / toggles
64
  ALLOW_WIKIPEDIA = False
@@ -72,7 +63,6 @@ AUTO_CONTINUE = True
72
  AUTO_CONT_MAX_STEPS = 2 # continue up to 2 extra chunks
73
  AUTO_CONT_NEW_TOKENS = 256 # tokens per continuation step
74
 
75
-
76
  def dlog(tag, msg):
77
  if DEBUG: print(f"[{tag}] {msg}")
78
 
@@ -80,12 +70,14 @@ def dlog(tag, msg):
80
  import faiss
81
  from sentence_transformers import SentenceTransformer
82
  import torch
83
- from transformers import AutoTokenizer, AutoModelForCausalLM
84
  from peft import PeftModel
85
  import wikipedia
86
  from wikipedia.exceptions import DisambiguationError, PageError
87
  from huggingface_hub import login, snapshot_download, HfApi
88
 
 
 
 
89
  # ================== GPU CHECK ==================
90
  if not torch.cuda.is_available():
91
  with gr.Blocks() as demo:
@@ -96,6 +88,8 @@ if not torch.cuda.is_available():
96
  device = "cuda"
97
  dtype = torch.float16
98
  torch.manual_seed(42)
 
 
99
 
100
  # ================== RELEVANCE CONFIG ==================
101
  DEFAULT_REL_CONFIG = {
@@ -206,7 +200,7 @@ except Exception as e:
206
 
207
  _IS_IP = isinstance(index, faiss.IndexFlatIP) or "IndexFlatIP" in type(index).__name__
208
 
209
- # ================== LOAD LLM (BASE + LORA) ==================
210
  if HF_READ_TOKEN:
211
  try:
212
  login(token=HF_READ_TOKEN)
@@ -214,17 +208,24 @@ if HF_READ_TOKEN:
214
  except Exception as e:
215
  dlog("HF", f"Login issue: {e}")
216
 
217
-
218
  if ADAPTER_REPO:
219
  ADAPTER_PATH = snapshot_download(repo_id=ADAPTER_REPO, allow_patterns=["*"])
220
 
221
- # --- LLM load (quantized optional) ---
 
 
 
 
 
 
 
222
  dlog("LLM", f"Loading base model: {BASE_MODEL}")
223
  tokenizer_lm = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
224
 
225
  use_bnb = QUANTIZE in {"8bit", "4bit"} and BitsAndBytesConfig is not None and _BNB_AVAILABLE
226
 
227
  if use_bnb:
 
228
  bnb_config = BitsAndBytesConfig(
229
  load_in_8bit=(QUANTIZE == "8bit"),
230
  load_in_4bit=(QUANTIZE == "4bit"),
@@ -238,23 +239,28 @@ if use_bnb:
238
  quantization_config=bnb_config,
239
  )
240
  else:
241
- # Default / fallback: fp16 (no bitsandbytes required)
242
- base_model = AutoModelForCausalLM.from_pretrained(
243
- BASE_MODEL,
244
- torch_dtype=dtype,
245
- device_map="auto",
246
- )
247
-
248
-
249
-
 
 
 
 
 
250
 
251
  dlog("LLM", f"Loading LoRA adapter from: {ADAPTER_PATH}")
252
  model_lm = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
253
  model_lm.eval()
254
 
255
-
256
  GEN_ARGS_GROUNDED = dict(
257
  max_new_tokens=MAX_NEW_TOKENS_GROUNDED,
 
258
  do_sample=False,
259
  num_beams=1,
260
  no_repeat_ngram_size=3,
@@ -263,6 +269,7 @@ GEN_ARGS_GROUNDED = dict(
263
  )
264
  GEN_ARGS_FALLBACK = dict(
265
  max_new_tokens=MAX_NEW_TOKENS_FALLBACK,
 
266
  do_sample=False,
267
  num_beams=1,
268
  no_repeat_ngram_size=3,
@@ -270,38 +277,6 @@ GEN_ARGS_FALLBACK = dict(
270
  eos_token_id=tokenizer_lm.eos_token_id,
271
  )
272
 
273
- def _generate(inputs, grounded: bool):
274
- args = GEN_ARGS_GROUNDED if grounded else GEN_ARGS_FALLBACK
275
- in_len = inputs["input_ids"].shape[-1]
276
- with torch.inference_mode():
277
- out = model_lm.generate(**inputs, **args)
278
-
279
- if not AUTO_CONTINUE:
280
- return out
281
-
282
- steps = 0
283
- while steps < AUTO_CONT_MAX_STEPS:
284
- seq = out[0]
285
- ended_with_eos = (seq[-1].item() == tokenizer_lm.eos_token_id)
286
- hit_cap = (seq.shape[0] - in_len) >= args["max_new_tokens"]
287
- if ended_with_eos or not hit_cap:
288
- break
289
-
290
- # continue generation from the current sequence
291
- cont_inputs = {
292
- "input_ids": seq.unsqueeze(0),
293
- "attention_mask": torch.ones_like(seq).unsqueeze(0),
294
- }
295
- cont_inputs = {k: v.to(device) for k, v in cont_inputs.items()}
296
- cont_args = dict(args)
297
- cont_args["max_new_tokens"] = AUTO_CONT_NEW_TOKENS
298
-
299
- out = model_lm.generate(**cont_inputs, **cont_args)
300
- steps += 1
301
-
302
- return out
303
-
304
-
305
  # ================== UTILITIES ==================
306
  _SANITIZE = re.compile(r"```.*?```|<\s*script[^>]*>.*?<\s*/\s*script\s*>", re.DOTALL|re.IGNORECASE)
307
  def _to_text(rec: Any) -> str:
@@ -437,7 +412,9 @@ _ANATOMY_OR_HISTORY = re.compile(
437
  re.I
438
  )
439
  _PAPERS_INTENT = re.compile(r"\b(key\s+papers|suggest\s+papers|landmark|seminal|important|top\s+papers)\b", re.I)
 
440
 
 
441
  def fetch_pubmed_chunks(query_or_pmid: str, max_papers: int = 3) -> List[Dict[str, Any]]:
442
  retries = 1
443
  chunks: List[Dict[str, Any]] = []
@@ -617,7 +594,6 @@ def retrieve_context(query: str, top_k: int = 10) -> List[Dict[str, Any]]:
617
  if results:
618
  dlog("PUBMED", "PubMed search hit")
619
  return results
620
- # Wikipedia fallback (unconditional after PubMed miss)
621
  wiki = wiki_summary_allow(q, sentences=3)
622
  if wiki:
623
  dlog("WIKI", "Wikipedia fallback hit")
@@ -625,7 +601,6 @@ def retrieve_context(query: str, top_k: int = 10) -> List[Dict[str, Any]]:
625
  dlog("RETRIEVAL", "No results found")
626
  return []
627
 
628
-
629
  # FAISS path
630
  q_emb = embed_model.encode([q], convert_to_numpy=True).astype("float32")
631
  if _IS_IP:
@@ -666,7 +641,6 @@ def retrieve_context(query: str, top_k: int = 10) -> List[Dict[str, Any]]:
666
  dlog("PUBMED", "PubMed search hit")
667
  return results
668
 
669
- # Wikipedia fallback (unconditional after PubMed miss)
670
  wiki = wiki_summary_allow(q, sentences=3)
671
  if wiki:
672
  dlog("WIKI", "Wikipedia fallback hit")
@@ -675,7 +649,6 @@ def retrieve_context(query: str, top_k: int = 10) -> List[Dict[str, Any]]:
675
  dlog("RETRIEVAL", "No results at all")
676
  return []
677
 
678
-
679
  def build_prompt(chunks: List[Dict[str, Any]], question: str) -> str:
680
  header = (
681
  "You are Askstein (orthopedic biomechanics). Use ONLY the [Context] to answer. "
@@ -684,7 +657,7 @@ def build_prompt(chunks: List[Dict[str, Any]], question: str) -> str:
684
  "Do not discuss cardiology, neurology, or unrelated domains."
685
  )
686
  cleaned = []
687
- per_chunk_chars = 1600
688
  for c in chunks:
689
  t = _to_text(c)
690
  if t: cleaned.append(t[:per_chunk_chars])
@@ -695,22 +668,64 @@ def _decode_generated(out_ids, in_len: int) -> str:
695
  gen = out_ids[0][in_len:]
696
  return tokenizer_lm.decode(gen, skip_special_tokens=True).lstrip(". \n").strip()
697
 
698
- @lru_cache(maxsize=None)
699
- def direct_llm_fallback(question: str) -> str:
700
- sys_prompt = (
701
- "You are Askstein (orthopedic biomechanics). If you lack enough domain context, say you don’t know. "
702
- "Avoid discussing non-musculoskeletal systems (cardiology, neurology). Do NOT invent references."
703
- )
704
- llm_prompt = f"{sys_prompt}\n\nQuestion: {question}\nAnswer:"
705
- inputs = tokenizer_lm(llm_prompt, return_tensors="pt").to(device)
706
- out = _generate(inputs, grounded=False)
707
  in_len = inputs["input_ids"].shape[-1]
708
- ans = _post_clean(_decode_generated(out, in_len))
709
- # Strip any made-up reference sections the model might add
710
- ans = re.sub(r"(?is)(^|\n)\s*references?:.*$", "", ans).strip()
711
- return "[LLM fallback — ungrounded]\n\n" + ans
712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
  def _synthesize_answer(chunks: List[Dict[str, Any]], question: str) -> str:
715
  prompt = build_prompt(chunks, question)
716
  inputs = tokenizer_lm(prompt, return_tensors="pt").to(device)
@@ -726,35 +741,21 @@ def _answer_from_chunks(chunks: List[Dict[str, Any]], question: str) -> str:
726
  return _synthesize_answer(chunks, question)
727
  return _synthesize_answer(chunks, question)
728
 
729
- def deterministic_definitions_text(core_q: str) -> Optional[str]:
730
- q_lower = core_q.lower()
731
- if "define axial rigidity" in q_lower or "what is axial rigidity" in q_lower:
732
- return ("Axial rigidity (EA) is Σ(Eᵢ·dAᵢ) across a CT slice; units: N. "
733
- "Modulus E per voxel comes from a density–modulus calibration; areas dAᵢ are voxel areas.")
734
- if "define bending rigidity" in q_lower or "what is bending rigidity" in q_lower:
735
- return ("Bending rigidity (EI) is Σ(Eᵢ·dAᵢ·yᵢ²) about a given axis; units: N·mm². "
736
- "yᵢ is distance to the neutral axis; computed slice-by-slice from QCT.")
737
- if ("define torsional rigidity" in q_lower) or ("what is torsional rigidity" in q_lower) or ("define gj" in q_lower):
738
- return ("Torsional rigidity (GJ) = shear modulus G times polar moment J. "
739
- "In QCT, J ≈ Σ(dAᵢ·rᵢ²) about the centroid; G ≈ E/(2(1+ν)).")
740
- if "qct" in q_lower and ("torsional" in q_lower or "gj" in q_lower):
741
- return ("From QCT, torsional rigidity is estimated as GJ, where J ≈ Σ(dAᵢ·rᵢ²) about the slice centroid and "
742
- "G = E/(2(1+ν)) from the voxel E map (ν≈0.3). Compute per-slice and report the minimum.")
743
- if re.search(r"\b(outline|steps|workflow|protocol)\b.*\b(ct|qct).*(rigidity|ea|ei|gj)", q_lower):
744
- return (
745
- "CT-based structural rigidity (CTRA/QCT) workflow:\n"
746
- "1) Acquire QCT (≤1 mm; density phantom).\n"
747
- "2) Preprocess & segment bone.\n"
748
- "3) HU→ρ; ρ→E calibration.\n"
749
- "4) Cross-sections along neck axis.\n"
750
- "5) EA, EI_x/EI_y, GJ (G≈E/(2(1+ν))).\n"
751
- "6) Extract minima & validate vs FEA/mech tests."
752
- )
753
- if re.search(r"\b(modulus)\b.*\brigidity\b|\bdefine\s+modulus\b", q_lower):
754
- return ("Elastic modulus (E) is a material property (Pa). "
755
- "Rigidity is structural (EA, EI, GJ). Modulus ≠ rigidity.")
756
- return None
757
 
 
758
  def ask(question: str) -> str:
759
  q = question.strip()
760
  m = re.search(r"pmid[:\s]*(\d+)", q, re.IGNORECASE)
@@ -763,8 +764,11 @@ def ask(question: str) -> str:
763
  chunks = fetch_pubmed_chunks(pmid, max_papers=1)
764
  return "\n".join(c.get("text", "") for c in chunks) or "Sorry, no abstract found."
765
 
766
- if _PAPERS_INTENT.search(q):
767
- core_q = re.sub(_PAPERS_INTENT, "", q, flags=re.I).strip() or "CT/QCT structural rigidity femur hip finite element"
 
 
 
768
  compact = _compact_terms(core_q)
769
  pm_query = (
770
  f'(({compact}) AND (hip[TiAb] OR femur[TiAb] OR femoral[TiAb])) AND '
@@ -772,82 +776,75 @@ def ask(question: str) -> str:
772
  'AND ("2000"[DP] : "2025"[DP])'
773
  )
774
  cits = fetch_pubmed_citations(pm_query, max_results=5)
775
- return "Recommended papers:\n" + "\n".join(f"- {c}" for c in cits) if cits else "Sorry, no good matches."
776
-
777
- comp = re.match(r"(.+?)\s+and\s+(?:cite|references?|studies?|papers?)", q, flags=re.IGNORECASE)
778
- if comp:
779
- core_q = comp.group(1).strip()
780
- det_text = deterministic_definitions_text(core_q)
781
- used_term = None
782
- if det_text:
783
- explanation = det_text
784
- lq = core_q.lower()
785
- if ("torsional" in lq) or ("gj" in lq):
786
- used_term = "GJ"
787
- pm_query = ('(torsion[TiAb] OR "polar moment"[TiAb] OR GJ[TiAb]) AND '
788
- '("Bone and Bones"[MeSH] OR Femur[TiAb]) AND '
789
- '("Finite Element Analysis"[MeSH] OR QCT[TiAb] OR CT[TiAb]) AND '
790
- '("2000"[DP] : "2025"[DP])')
791
- elif ("bending" in lq) or ("ei" in lq):
792
- used_term = "EI"
793
- pm_query = ('(bending[TiAb] OR "second moment"[TiAb] OR EI[TiAb]) AND '
794
- '("Bone and Bones"[MeSH] OR Femur[TiAb]) AND '
795
- '("Finite Element Analysis"[MeSH] OR QCT[TiAb] OR CT[TiAb]) AND '
796
- '("2000"[DP] : "2025"[DP])')
797
- else:
798
- used_term = "EA"
799
- pm_query = ('("axial rigidity"[TiAb] OR EA[TiAb] OR "axial stiffness"[TiAb]) AND '
800
- '("Bone and Bones"[MeSH] OR Femur[TiAb]) AND '
801
- '("Finite Element Analysis"[MeSH] OR QCT[TiAb] OR CT[TiAb]) AND '
802
- '("2000"[DP] : "2025"[DP])')
803
- citations = fetch_pubmed_citations(pm_query, max_results=5)
804
- if not citations and used_term:
805
- dlog("CITE", f"PubMed empty → fallback {used_term}")
806
- citations = _fallback_cits_for(used_term)
807
- else:
808
- explanation = _answer_from_chunks(retrieve_context(core_q, top_k=5), core_q)
809
- pm_query = f'"{core_q}"[Title/Abstract]'
810
- citations = fetch_pubmed_citations(pm_query, max_results=5)
811
- if not citations:
812
- lab = detect_lab(core_q)
813
- pm_query = build_lab_query(core_q, lab=lab)
814
- citations = fetch_pubmed_citations(pm_query, max_results=5)
815
- if not citations:
816
- compact = _compact_terms(core_q)
817
- pm_query = (
818
- f'({compact}) AND ("Bone and Bones"[MeSH] OR Femur[TiAb] OR Hip[TiAb] '
819
- f'OR Rigidity[TiAb] OR "Tomography, X-Ray Computed"[MeSH] OR "Finite Element Analysis"[MeSH]) '
820
- f'NOT (heart[TiAb] OR cardiac[TiAb] OR brain[TiAb] OR skull[TiAb] OR EGFR[TiAb]) '
821
- f'AND ("2000"[DP] : "2025"[DP])'
822
- )
823
- citations = fetch_pubmed_citations(pm_query, max_results=5)
824
- resp = explanation
825
- if citations:
826
- resp += "\n\nCitations:\n" + "\n".join(citations)
827
- else:
828
- resp += f"\n\nSorry, no relevant citations found for “{core_q}.”"
829
- return _ensure_min_answer(_post_clean(resp))
830
-
831
- det_answer = deterministic_definitions_text(q)
832
- if det_answer:
833
  dlog("ASK", "Deterministic definition/workflow fired")
834
- return det_answer
835
 
836
  if not (_MSK_MUST.search(q) or _is_fe_override(q)):
837
- chunks = retrieve_context(q, top_k=5)
838
  if chunks:
839
- dlog("CLEAN", "Post-clean applied")
840
  answer = _answer_from_chunks(chunks, q)
 
 
 
 
 
841
  return _ensure_min_answer(_post_clean(answer)) or direct_llm_fallback(q)
842
  return direct_llm_fallback(q)
843
 
844
- chunks = retrieve_context(q, top_k=5)
845
  if not chunks:
846
  return direct_llm_fallback(q)
847
- dlog("CLEAN", "Post-clean applied")
848
  answer = _answer_from_chunks(chunks, q)
 
 
 
 
849
  return _ensure_min_answer(_post_clean(answer)) or direct_llm_fallback(q)
850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
  # ================== UI: NAME GATE + PER-ANSWER FEEDBACK ==================
852
  def _now_iso():
853
  return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()
@@ -870,63 +867,7 @@ def enter_app(first_name, last_name, state):
870
  state["last_name"] = last_name
871
  return gr.update(visible=False), gr.update(visible=True), state, f"Welcome, {first_name}! You can start chatting."
872
 
873
- def _log_turn(state: Dict[str, Any], question: str, answer: str):
874
- rec = {
875
- "timestamp_utc": _now_iso(),
876
- "session_id": state.get("session_id", ""),
877
- "first_name": state.get("first_name", ""),
878
- "last_name": state.get("last_name", ""),
879
- "question": question,
880
- "answer": answer,
881
- }
882
- with open(TRANSCRIPT_PATH, "a", encoding="utf-8") as f:
883
- f.write(json.dumps(rec, ensure_ascii=False) + "\n")
884
-
885
- if PUSH_TRANSCRIPTS:
886
- _push_file_to_hub(TRANSCRIPT_PATH, "analytics/transcripts.jsonl")
887
-
888
-
889
- def predict(message, chat_history, state):
890
- msg = (message or "").strip()
891
- if not msg:
892
- # No input → don't show feedback, just return current state
893
- return chat_history, "", gr.update(visible=False), None, "", state
894
-
895
- try:
896
- answer = ask(msg)
897
- except Exception as e:
898
- answer = f"Sorry — something went wrong: {e!r}"
899
-
900
- chat_history = (chat_history or []) + [(msg, answer)]
901
- state["last_q"] = msg
902
- state["last_a"] = answer
903
-
904
- # Log every turn (safe if _log_turn isn't defined)
905
- try:
906
- _log_turn(state, msg, answer)
907
- except Exception:
908
- pass
909
-
910
- return (
911
- chat_history,
912
- "", # clear input
913
- gr.update(visible=True), # show feedback pane
914
- gr.update(value=None), # reset rating
915
- gr.update(value=""), # reset comment
916
- state
917
- )
918
-
919
-
920
- # --- Hub upload helper --------------------------------------------------------
921
  def _push_file_to_hub(local_path: str, repo_path: str) -> None:
922
- """
923
- Upload a local file to your Space repo.
924
-
925
- Requires:
926
- - PUSH_FEEDBACK=1
927
- - HF_WRITE_TOKEN (write access to the Space)
928
- - SPACE_REPO_ID (e.g., "username/YourSpace")
929
- """
930
  if not PUSH_FEEDBACK:
931
  return
932
  if not os.path.exists(local_path):
@@ -938,7 +879,6 @@ def _push_file_to_hub(local_path: str, repo_path: str) -> None:
938
  if not HF_WRITE_TOKEN:
939
  dlog("UPLOAD", "Skip: HF_WRITE_TOKEN not set")
940
  return
941
-
942
  try:
943
  api = HfApi(token=HF_WRITE_TOKEN)
944
  api.upload_file(
@@ -952,13 +892,28 @@ def _push_file_to_hub(local_path: str, repo_path: str) -> None:
952
  except Exception as e:
953
  dlog("UPLOAD", f"Upload failed: {e}")
954
 
955
- # --- Feedback uploader --------------------------------------------------------
956
  def _push_feedback_to_hub() -> None:
957
- """Upload feedback.csv to analytics/feedback.csv in this Space repo (if enabled)."""
958
  _push_file_to_hub(FEEDBACK_PATH, "analytics/feedback.csv")
959
 
960
-
961
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
962
 
963
  def save_feedback(rating, comment, state):
964
  if rating is None:
@@ -986,6 +941,41 @@ def save_feedback(rating, comment, state):
986
  except Exception as e:
987
  return f"Failed to save feedback: {e}", gr.update(visible=True)
988
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
989
  with gr.Blocks(theme="soft") as demo:
990
  gr.Markdown("# Askstein — Orthopedic Biomechanics Chat (CT/QCT Rigidity, FE)")
991
  gr.Markdown("Grounded answers (FAISS + PubMed). Please enter your name to continue.")
@@ -997,9 +987,9 @@ with gr.Blocks(theme="soft") as demo:
997
  with gate:
998
  with gr.Row():
999
  first_tb = gr.Textbox(label="First name", placeholder="e.g., Shubh", scale=1)
1000
- last_tb = gr.Textbox(label="Last name", placeholder="e.g., Laiwala", scale=1)
1001
  enter_btn = gr.Button("Enter", variant="primary")
1002
- gate_msg = gr.Markdown("", elem_classes=["text-sm"])
1003
 
1004
  # ---- App (hidden until gate passes) ----
1005
  app = gr.Group(visible=False)
@@ -1013,12 +1003,12 @@ with gr.Blocks(theme="soft") as demo:
1013
  feedback_grp = gr.Group(visible=False)
1014
  with feedback_grp:
1015
  gr.Markdown("### How helpful was this answer?")
1016
- rating = gr.Radio(choices=[1, 2, 3, 4, 5], label="Rating (1=poor, 5=great)")
1017
  comment = gr.Textbox(label="Optional comment", placeholder="What was good or missing?")
1018
  submit_fb = gr.Button("Submit feedback")
1019
  fb_status = gr.Markdown("")
1020
 
1021
- # ---- Wiring (MUST stay inside the Blocks context) ----
1022
  enter_btn.click(
1023
  fn=enter_app,
1024
  inputs=[first_tb, last_tb, state],
@@ -1053,6 +1043,6 @@ with gr.Blocks(theme="soft") as demo:
1053
  concurrency_limit=4,
1054
  )
1055
 
1056
- # Queue & launch (outside the Blocks)
1057
- demo.queue(max_size=64)
1058
  demo.launch(max_threads=8)
 
1
  #!/usr/bin/env python3
2
  from __future__ import annotations
3
 
4
+ # ================== STD / CORE ==================
5
  import os, re, json, time, sys, csv, uuid, datetime
6
  from typing import List, Dict, Any, Optional
7
  from functools import lru_cache
8
  from xml.etree import ElementTree as ET
9
+
10
+ # ================== TRANSFORMERS / TORCH ==================
11
  from transformers import AutoTokenizer, AutoModelForCausalLM
12
  try:
13
+ from transformers import BitsAndBytesConfig # may exist even if bnb isn't installed
14
  except Exception:
15
  BitsAndBytesConfig = None
16
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  import numpy as np
18
  import requests
19
  import gradio as gr
 
23
  FAISS_PATH = os.environ.get("FAISS_PATH", f"{ASSETS_DIR}/index.faiss")
24
  META_PATH = os.environ.get("META_PATH", f"{ASSETS_DIR}/index_meta.filtered.jsonl")
25
  REL_CONFIG_PATH = os.environ.get("REL_CONFIG_PATH", f"{ASSETS_DIR}/relevance_config.json")
 
 
 
 
26
 
27
+ # Normalize QUANTIZE env (default: no quantization)
28
+ QUANTIZE = os.environ.get("QUANTIZE", "none").strip().lower() # "none" | "8bit" | "4bit"
29
+
30
+ # Turn logging
31
+ TRANSCRIPT_PATH = os.environ.get("TRANSCRIPT_PATH", "transcripts.jsonl")
32
+ PUSH_TRANSCRIPTS = os.environ.get("PUSH_TRANSCRIPTS", "1") == "1" # set "0" to disable
33
 
34
  # Models
35
  BASE_MODEL = os.environ.get("BASE_MODEL", "mistralai/Mistral-7B-Instruct-v0.2")
 
45
  NCBI_APIKEY = os.environ.get("NCBI_APIKEY", "")
46
 
47
  # Feedback logging
48
+ FEEDBACK_PATH = os.environ.get("FEEDBACK_PATH", "feedback.csv")
49
+ PUSH_FEEDBACK = os.environ.get("PUSH_FEEDBACK", "0") == "1" # set "1" to enable Hub upload
50
+ HF_READ_TOKEN = os.environ.get("HF_READ_TOKEN", os.environ.get("HF_TOKEN", ""))
51
+ HF_WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN", HF_READ_TOKEN)
52
+ SPACE_REPO_ID = os.environ.get("SPACE_REPO_ID", "")
 
53
 
54
  # Generation / toggles
55
  ALLOW_WIKIPEDIA = False
 
63
  AUTO_CONT_MAX_STEPS = 2 # continue up to 2 extra chunks
64
  AUTO_CONT_NEW_TOKENS = 256 # tokens per continuation step
65
 
 
66
  def dlog(tag, msg):
67
  if DEBUG: print(f"[{tag}] {msg}")
68
 
 
70
  import faiss
71
  from sentence_transformers import SentenceTransformer
72
  import torch
 
73
  from peft import PeftModel
74
  import wikipedia
75
  from wikipedia.exceptions import DisambiguationError, PageError
76
  from huggingface_hub import login, snapshot_download, HfApi
77
 
78
+ # ================== LOW-VRAM RUNTIME KNOBS ==================
79
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128,expandable_segments:True")
80
+
81
  # ================== GPU CHECK ==================
82
  if not torch.cuda.is_available():
83
  with gr.Blocks() as demo:
 
88
  device = "cuda"
89
  dtype = torch.float16
90
  torch.manual_seed(42)
91
+ torch.backends.cuda.matmul.allow_tf32 = True
92
+ torch.backends.cudnn.allow_tf32 = True
93
 
94
  # ================== RELEVANCE CONFIG ==================
95
  DEFAULT_REL_CONFIG = {
 
200
 
201
  _IS_IP = isinstance(index, faiss.IndexFlatIP) or "IndexFlatIP" in type(index).__name__
202
 
203
+ # ================== HUGGING FACE LOGIN & ADAPTER PATH ==================
204
  if HF_READ_TOKEN:
205
  try:
206
  login(token=HF_READ_TOKEN)
 
208
  except Exception as e:
209
  dlog("HF", f"Login issue: {e}")
210
 
 
211
  if ADAPTER_REPO:
212
  ADAPTER_PATH = snapshot_download(repo_id=ADAPTER_REPO, allow_patterns=["*"])
213
 
214
+ # ================== QUANTIZATION AVAILABILITY ==================
215
+ try:
216
+ import bitsandbytes as _bnb # noqa: F401
217
+ _BNB_AVAILABLE = True
218
+ except Exception:
219
+ _BNB_AVAILABLE = False
220
+
221
+ # ================== LLM (BASE + LORA) ==================
222
  dlog("LLM", f"Loading base model: {BASE_MODEL}")
223
  tokenizer_lm = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
224
 
225
  use_bnb = QUANTIZE in {"8bit", "4bit"} and BitsAndBytesConfig is not None and _BNB_AVAILABLE
226
 
227
  if use_bnb:
228
+ # Quantized path (only if explicitly requested and bnb is installed)
229
  bnb_config = BitsAndBytesConfig(
230
  load_in_8bit=(QUANTIZE == "8bit"),
231
  load_in_4bit=(QUANTIZE == "4bit"),
 
239
  quantization_config=bnb_config,
240
  )
241
  else:
242
+ # fp16 path with SDPA attention (lower VRAM). Fallback if not supported.
243
+ try:
244
+ base_model = AutoModelForCausalLM.from_pretrained(
245
+ BASE_MODEL,
246
+ torch_dtype=dtype,
247
+ device_map="auto",
248
+ attn_implementation="sdpa",
249
+ )
250
+ except TypeError:
251
+ base_model = AutoModelForCausalLM.from_pretrained(
252
+ BASE_MODEL,
253
+ torch_dtype=dtype,
254
+ device_map="auto",
255
+ )
256
 
257
  dlog("LLM", f"Loading LoRA adapter from: {ADAPTER_PATH}")
258
  model_lm = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
259
  model_lm.eval()
260
 
 
261
  GEN_ARGS_GROUNDED = dict(
262
  max_new_tokens=MAX_NEW_TOKENS_GROUNDED,
263
+ min_new_tokens=220,
264
  do_sample=False,
265
  num_beams=1,
266
  no_repeat_ngram_size=3,
 
269
  )
270
  GEN_ARGS_FALLBACK = dict(
271
  max_new_tokens=MAX_NEW_TOKENS_FALLBACK,
272
+ min_new_tokens=120,
273
  do_sample=False,
274
  num_beams=1,
275
  no_repeat_ngram_size=3,
 
277
  eos_token_id=tokenizer_lm.eos_token_id,
278
  )
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  # ================== UTILITIES ==================
281
  _SANITIZE = re.compile(r"```.*?```|<\s*script[^>]*>.*?<\s*/\s*script\s*>", re.DOTALL|re.IGNORECASE)
282
  def _to_text(rec: Any) -> str:
 
412
  re.I
413
  )
414
  _PAPERS_INTENT = re.compile(r"\b(key\s+papers|suggest\s+papers|landmark|seminal|important|top\s+papers)\b", re.I)
415
+ CITE_TRIGGER = re.compile(r"\b(cite|citations?|references?)\b", re.I)
416
 
417
+ # ================== PUBMED & RETRIEVAL ==================
418
  def fetch_pubmed_chunks(query_or_pmid: str, max_papers: int = 3) -> List[Dict[str, Any]]:
419
  retries = 1
420
  chunks: List[Dict[str, Any]] = []
 
594
  if results:
595
  dlog("PUBMED", "PubMed search hit")
596
  return results
 
597
  wiki = wiki_summary_allow(q, sentences=3)
598
  if wiki:
599
  dlog("WIKI", "Wikipedia fallback hit")
 
601
  dlog("RETRIEVAL", "No results found")
602
  return []
603
 
 
604
  # FAISS path
605
  q_emb = embed_model.encode([q], convert_to_numpy=True).astype("float32")
606
  if _IS_IP:
 
641
  dlog("PUBMED", "PubMed search hit")
642
  return results
643
 
 
644
  wiki = wiki_summary_allow(q, sentences=3)
645
  if wiki:
646
  dlog("WIKI", "Wikipedia fallback hit")
 
649
  dlog("RETRIEVAL", "No results at all")
650
  return []
651
 
 
652
  def build_prompt(chunks: List[Dict[str, Any]], question: str) -> str:
653
  header = (
654
  "You are Askstein (orthopedic biomechanics). Use ONLY the [Context] to answer. "
 
657
  "Do not discuss cardiology, neurology, or unrelated domains."
658
  )
659
  cleaned = []
660
+ per_chunk_chars = 900 # lower prompt length = lower KV memory
661
  for c in chunks:
662
  t = _to_text(c)
663
  if t: cleaned.append(t[:per_chunk_chars])
 
668
  gen = out_ids[0][in_len:]
669
  return tokenizer_lm.decode(gen, skip_special_tokens=True).lstrip(". \n").strip()
670
 
671
+ def _gen_once(inputs, args) -> Any:
672
+ with torch.inference_mode():
673
+ return model_lm.generate(**inputs, **args, use_cache=True)
674
+
675
+ def _generate(inputs, grounded: bool):
676
+ args = GEN_ARGS_GROUNDED if grounded else GEN_ARGS_FALLBACK
 
 
 
677
  in_len = inputs["input_ids"].shape[-1]
 
 
 
 
678
 
679
+ # First attempt
680
+ try:
681
+ out = _gen_once(inputs, args)
682
+ except torch.cuda.OutOfMemoryError:
683
+ try:
684
+ torch.cuda.empty_cache()
685
+ except Exception:
686
+ pass
687
+ small_args = dict(args)
688
+ small_args["max_new_tokens"] = min(256, args.get("max_new_tokens", 256))
689
+ # disable cache to save VRAM
690
+ with torch.inference_mode():
691
+ out = model_lm.generate(**inputs, **small_args, use_cache=False)
692
+
693
+ if not AUTO_CONTINUE:
694
+ return out
695
 
696
+ # Auto-continue if we hit cap without EOS
697
+ steps = 0
698
+ while steps < AUTO_CONT_MAX_STEPS:
699
+ seq = out[0]
700
+ ended_with_eos = (seq[-1].item() == tokenizer_lm.eos_token_id)
701
+ hit_cap = (seq.shape[0] - in_len) >= args["max_new_tokens"]
702
+ if ended_with_eos or not hit_cap:
703
+ break
704
+
705
+ cont_inputs = {
706
+ "input_ids": seq.unsqueeze(0),
707
+ "attention_mask": torch.ones_like(seq).unsqueeze(0),
708
+ }
709
+ cont_inputs = {k: v.to(device) for k, v in cont_inputs.items()}
710
+ cont_args = dict(args)
711
+ cont_args["max_new_tokens"] = AUTO_CONT_NEW_TOKENS
712
+
713
+ try:
714
+ with torch.inference_mode():
715
+ out = model_lm.generate(**cont_inputs, **cont_args, use_cache=True)
716
+ except torch.cuda.OutOfMemoryError:
717
+ try:
718
+ torch.cuda.empty_cache()
719
+ except Exception:
720
+ pass
721
+ with torch.inference_mode():
722
+ out = model_lm.generate(**cont_inputs, **cont_args, use_cache=False)
723
+
724
+ steps += 1
725
+
726
+ return out
727
+
728
+ # ================== ANSWER SYNTHESIS ==================
729
  def _synthesize_answer(chunks: List[Dict[str, Any]], question: str) -> str:
730
  prompt = build_prompt(chunks, question)
731
  inputs = tokenizer_lm(prompt, return_tensors="pt").to(device)
 
741
  return _synthesize_answer(chunks, question)
742
  return _synthesize_answer(chunks, question)
743
 
744
+ @lru_cache(maxsize=None)
745
+ def direct_llm_fallback(question: str) -> str:
746
+ sys_prompt = (
747
+ "You are Askstein (orthopedic biomechanics). If you lack enough domain context, say you don’t know. "
748
+ "Avoid discussing non-musculoskeletal systems (cardiology, neurology). Do NOT invent references."
749
+ )
750
+ llm_prompt = f"{sys_prompt}\n\nQuestion: {question}\nAnswer:"
751
+ inputs = tokenizer_lm(llm_prompt, return_tensors="pt").to(device)
752
+ out = _generate(inputs, grounded=False)
753
+ in_len = inputs["input_ids"].shape[-1]
754
+ ans = _post_clean(_decode_generated(out, in_len))
755
+ ans = re.sub(r"(?is)(^|\n)\s*references?:.*$", "", ans).strip()
756
+ return "[LLM fallback ungrounded]\n\n" + ans
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
 
758
+ # ================== PUBLIC API ==================
759
  def ask(question: str) -> str:
760
  q = question.strip()
761
  m = re.search(r"pmid[:\s]*(\d+)", q, re.IGNORECASE)
 
764
  chunks = fetch_pubmed_chunks(pmid, max_papers=1)
765
  return "\n".join(c.get("text", "") for c in chunks) or "Sorry, no abstract found."
766
 
767
+ if _PAPERS_INTENT.search(q) or CITE_TRIGGER.search(q):
768
+ core_q = re.sub(CITE_TRIGGER, "", q, count=1, flags=re.I).strip().rstrip(".")
769
+ core_q = re.sub(_PAPERS_INTENT, "", core_q, flags=re.I).strip()
770
+ if not core_q:
771
+ core_q = "CT/QCT structural rigidity femur hip finite element"
772
  compact = _compact_terms(core_q)
773
  pm_query = (
774
  f'(({compact}) AND (hip[TiAb] OR femur[TiAb] OR femoral[TiAb])) AND '
 
776
  'AND ("2000"[DP] : "2025"[DP])'
777
  )
778
  cits = fetch_pubmed_citations(pm_query, max_results=5)
779
+ if not cits:
780
+ lab = detect_lab(core_q)
781
+ pm_query = build_lab_query(core_q, lab=lab)
782
+ cits = fetch_pubmed_citations(pm_query, max_results=5)
783
+ if not cits:
784
+ cits = _fallback_cits_for("EA")
785
+ # Provide a short explanation + citations
786
+ explanation = _answer_from_chunks(retrieve_context(core_q, top_k=3), core_q) or direct_llm_fallback(core_q)
787
+ explanation = _post_clean(explanation)
788
+ if cits:
789
+ explanation += "\n\nCitations:\n" + "\n".join(cits)
790
+ return _ensure_min_answer(explanation)
791
+
792
+ det = deterministic_definitions_text(q)
793
+ if det:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  dlog("ASK", "Deterministic definition/workflow fired")
795
+ return det
796
 
797
  if not (_MSK_MUST.search(q) or _is_fe_override(q)):
798
+ chunks = retrieve_context(q, top_k=3)
799
  if chunks:
 
800
  answer = _answer_from_chunks(chunks, q)
801
+ # tiny safety to release VRAM between turns
802
+ try:
803
+ torch.cuda.empty_cache()
804
+ except Exception:
805
+ pass
806
  return _ensure_min_answer(_post_clean(answer)) or direct_llm_fallback(q)
807
  return direct_llm_fallback(q)
808
 
809
+ chunks = retrieve_context(q, top_k=3)
810
  if not chunks:
811
  return direct_llm_fallback(q)
 
812
  answer = _answer_from_chunks(chunks, q)
813
+ try:
814
+ torch.cuda.empty_cache()
815
+ except Exception:
816
+ pass
817
  return _ensure_min_answer(_post_clean(answer)) or direct_llm_fallback(q)
818
 
819
def deterministic_definitions_text(core_q: str) -> Optional[str]:
    """Return a canned definition/workflow answer for common rigidity questions.

    The checks run in a fixed order so that the explicit "define ..." phrasings
    take precedence over the looser QCT/torsional and workflow patterns.
    Returns None when no deterministic answer applies.
    """
    q = core_q.lower()

    def mentions(*phrases: str) -> bool:
        # True when any exact phrase occurs in the lowercased question.
        return any(p in q for p in phrases)

    if mentions("define axial rigidity", "what is axial rigidity"):
        return ("Axial rigidity (EA) is Σ(Eᵢ·dAᵢ) across a CT slice; units: N. "
                "Modulus E per voxel comes from a density–modulus calibration; areas dAᵢ are voxel areas.")

    if mentions("define bending rigidity", "what is bending rigidity"):
        return ("Bending rigidity (EI) is Σ(Eᵢ·dAᵢ·yᵢ²) about a given axis; units: N·mm². "
                "yᵢ is distance to the neutral axis; computed slice-by-slice from QCT.")

    if mentions("define torsional rigidity", "what is torsional rigidity", "define gj"):
        return ("Torsional rigidity (GJ) = shear modulus G times polar moment J. "
                "In QCT, J ≈ Σ(dAᵢ·rᵢ²) about the centroid; G ≈ E/(2(1+ν)).")

    if "qct" in q and mentions("torsional", "gj"):
        return ("From QCT, torsional rigidity is estimated as GJ, where J ≈ Σ(dAᵢ·rᵢ²) about the slice centroid and "
                "G = E/(2(1+ν)) from the voxel E map (ν≈0.3). Compute per-slice and report the minimum.")

    if re.search(r"\b(outline|steps|workflow|protocol)\b.*\b(ct|qct).*(rigidity|ea|ei|gj)", q):
        return (
            "CT-based structural rigidity (CTRA/QCT) workflow:\n"
            "1) Acquire QCT (≤1 mm; density phantom).\n"
            "2) Preprocess & segment bone.\n"
            "3) HU→ρ; ρ→E calibration.\n"
            "4) Cross-sections along neck axis.\n"
            "5) EA, EI_x/EI_y, GJ (G≈E/(2(1+ν))).\n"
            "6) Extract minima & validate vs FEA/mech tests."
        )

    if re.search(r"\b(modulus)\b.*\brigidity\b|\bdefine\s+modulus\b", q):
        return ("Elastic modulus (E) is a material property (Pa). "
                "Rigidity is structural (EA, EI, GJ). Modulus ≠ rigidity.")

    return None
847
+
848
  # ================== UI: NAME GATE + PER-ANSWER FEEDBACK ==================
849
  def _now_iso():
850
  return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()
 
867
  state["last_name"] = last_name
868
  return gr.update(visible=False), gr.update(visible=True), state, f"Welcome, {first_name}! You can start chatting."
869
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
870
  def _push_file_to_hub(local_path: str, repo_path: str) -> None:
 
 
 
 
 
 
 
 
871
  if not PUSH_FEEDBACK:
872
  return
873
  if not os.path.exists(local_path):
 
879
  if not HF_WRITE_TOKEN:
880
  dlog("UPLOAD", "Skip: HF_WRITE_TOKEN not set")
881
  return
 
882
  try:
883
  api = HfApi(token=HF_WRITE_TOKEN)
884
  api.upload_file(
 
892
  except Exception as e:
893
  dlog("UPLOAD", f"Upload failed: {e}")
894
 
 
895
def _push_feedback_to_hub() -> None:
    """Best-effort upload of the local feedback CSV to the Hub at analytics/feedback.csv.

    Thin wrapper around _push_file_to_hub; gating (PUSH_FEEDBACK, token presence,
    file existence) is handled by that helper, so this call is always safe.
    """
    _push_file_to_hub(FEEDBACK_PATH, "analytics/feedback.csv")
897
 
898
def _log_turn(state: Dict[str, Any], question: str, answer: str):
    """Append one Q/A turn to the local JSONL transcript and mirror it to the Hub.

    Both the local write and the Hub upload are best-effort: logging must never
    break the chat turn, so all failures are swallowed.
    """
    entry = {
        "timestamp_utc": _now_iso(),
        "session_id": state.get("session_id", ""),
        "first_name": state.get("first_name", ""),
        "last_name": state.get("last_name", ""),
        "question": question,
        "answer": answer,
    }
    serialized = json.dumps(entry, ensure_ascii=False)
    try:
        with open(TRANSCRIPT_PATH, "a", encoding="utf-8") as fh:
            fh.write(serialized + "\n")
    except Exception:
        pass  # local transcript write is best-effort
    try:
        if PUSH_TRANSCRIPTS:
            _push_file_to_hub(TRANSCRIPT_PATH, "analytics/transcripts.jsonl")
    except Exception:
        pass  # Hub mirroring is best-effort
917
 
918
  def save_feedback(rating, comment, state):
919
  if rating is None:
 
941
  except Exception as e:
942
  return f"Failed to save feedback: {e}", gr.update(visible=True)
943
 
944
def predict(message, chat_history, state):
    """Handle one chat turn: answer the message, log it, and refresh the feedback UI.

    Returns the 6-tuple the Gradio wiring expects:
    (chat history, cleared input, feedback-pane visibility, rating reset,
    comment reset, session state).
    """
    text = (message or "").strip()
    if not text:
        # Empty submit: leave history alone and keep the feedback pane hidden.
        return chat_history, "", gr.update(visible=False), None, "", state

    try:
        reply = ask(text)
    except Exception as e:
        reply = f"Sorry — something went wrong: {e!r}"

    history = (chat_history or []) + [(text, reply)]
    state["last_q"] = text
    state["last_a"] = reply

    try:
        _log_turn(state, text, reply)
    except Exception:
        pass  # transcript logging is best-effort

    # free a bit of VRAM between turns
    try:
        torch.cuda.empty_cache()
    except Exception:
        pass

    return (
        history,
        "",                        # clear input
        gr.update(visible=True),   # show feedback pane
        gr.update(value=None),     # reset rating
        gr.update(value=""),       # reset comment
        state,
    )
977
+
978
+ # ================== UI ==================
979
  with gr.Blocks(theme="soft") as demo:
980
  gr.Markdown("# Askstein — Orthopedic Biomechanics Chat (CT/QCT Rigidity, FE)")
981
  gr.Markdown("Grounded answers (FAISS + PubMed). Please enter your name to continue.")
 
987
  with gate:
988
  with gr.Row():
989
  first_tb = gr.Textbox(label="First name", placeholder="e.g., Shubh", scale=1)
990
+ last_tb = gr.Textbox(label="Last name", placeholder="e.g., Laiwala", scale=1)
991
  enter_btn = gr.Button("Enter", variant="primary")
992
+ gate_msg = gr.Markdown("", elem_classes=["text-sm"])
993
 
994
  # ---- App (hidden until gate passes) ----
995
  app = gr.Group(visible=False)
 
1003
  feedback_grp = gr.Group(visible=False)
1004
  with feedback_grp:
1005
  gr.Markdown("### How helpful was this answer?")
1006
+ rating = gr.Radio(choices=[1,2,3,4,5], label="Rating (1=poor, 5=great)")
1007
  comment = gr.Textbox(label="Optional comment", placeholder="What was good or missing?")
1008
  submit_fb = gr.Button("Submit feedback")
1009
  fb_status = gr.Markdown("")
1010
 
1011
+ # ---- Wiring (must stay inside Blocks) ----
1012
  enter_btn.click(
1013
  fn=enter_app,
1014
  inputs=[first_tb, last_tb, state],
 
1043
  concurrency_limit=4,
1044
  )
1045
 
1046
# ================== QUEUE & LAUNCH ==================
# Bounded request queue; the deprecated concurrency_count argument is deliberately not passed.
demo.queue(max_size=64)  # no deprecated concurrency_count
demo.launch(max_threads=8)