Rajan Sharma committed on
Commit b23412f · verified · 1 Parent(s): 88b3626

Update app.py

Files changed (1)
  1. app.py +150 -32
app.py CHANGED
@@ -4,24 +4,43 @@ import time
 from datetime import datetime, timezone
 from functools import lru_cache
 
-import torch
 import gradio as gr
+import torch
+
+# Try to import Cohere SDK if present (for hosted path)
+try:
+    import cohere  # pip install cohere
+    _HAS_COHERE = True
+except Exception:
+    _HAS_COHERE = False
+
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login, HfApi
 
-# ---- Config ----
+
+# -------------------
+# Configuration
+# -------------------
 MODEL_ID = os.getenv("MODEL_ID", "CohereLabs/c4ai-command-r7b-12-2024")
+
 HF_TOKEN = (
-    os.getenv("HUGGINGFACE_HUB_TOKEN")  # canonical name in HF Spaces
+    os.getenv("HUGGINGFACE_HUB_TOKEN")  # official Spaces name
     or os.getenv("HF_TOKEN")
 )
 
-def utc_now() -> str:
+COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+
+USE_HOSTED_COHERE = bool(COHERE_API_KEY and _HAS_COHERE)
+
+# -------------------
+# Helpers
+# -------------------
+def utc_now():
     return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
 
-def header(processing_time=None) -> str:
+def header(processing_time=None):
     s = (
-        f"Current Date and Time (UTC - YYYY-MM-DD HH:MM:SS formatted): {utc_now()}\n"
+        f"Current Date and Time (UTC - YYYY-MM-DD HH:MM:SS formatted): {utc_now()} "
         f"Current User's Login: Raj-VedAI\n"
     )
     if processing_time is not None:
@@ -33,39 +52,99 @@ def pick_dtype_and_map():
         return torch.float16, "auto"
     if torch.backends.mps.is_available():
         return torch.float16, {"": "mps"}
-    return torch.float32, "cpu"
+    return torch.float32, "cpu"  # CPU path (likely too big for R7B)
+
+
+# -------------------
+# Cohere Hosted Path
+# -------------------
+_co_client = None
+if USE_HOSTED_COHERE:
+    _co_client = cohere.Client(api_key=COHERE_API_KEY)
+
+def _cohere_parse(resp):
+    """
+    Handle both Cohere SDK styles:
+    - responses.create(...): resp.output_text or resp.message.content[0].text
+    - chat(...): resp.text
+    """
+    # v5+ responses.create
+    if hasattr(resp, "output_text") and resp.output_text:
+        return resp.output_text.strip()
+    if getattr(resp, "message", None) and getattr(resp.message, "content", None):
+        parts = resp.message.content
+        # pick first text part
+        for p in parts:
+            if hasattr(p, "text") and p.text:
+                return p.text.strip()
+    # v4 chat
+    if hasattr(resp, "text") and resp.text:
+        return resp.text.strip()
+    return "Sorry, I couldn't parse the response from Cohere."
+
+def cohere_chat(message, history):
+    # Build structured messages from the Gradio history
+    # (the older chat fallback below sends only the latest message)
+    try:
+        # Try the newer responses-style API first
+        try:
+            msgs = []
+            for u, a in (history or []):
+                msgs.append({"role": "user", "content": u})
+                msgs.append({"role": "assistant", "content": a})
+            msgs.append({"role": "user", "content": message})
+            resp = _co_client.responses.create(
+                model="command-r7b-12-2024",
+                messages=msgs,
+                temperature=0.3,
+                max_tokens=350,
+            )
+        except Exception:
+            # Fall back to the older chat API
+            resp = _co_client.chat(
+                model="command-r7b-12-2024",
+                message=message,
+                temperature=0.3,
+                max_tokens=350,
+            )
+        return _cohere_parse(resp)
+    except Exception as e:
+        return f"Error calling Cohere API: {e}"
+
 
+# -------------------
+# Local HF Path
+# -------------------
 @lru_cache(maxsize=1)
-def load_model():
-    # Login (optional for public models; safe if token is unset)
-    if HF_TOKEN:
-        login(token=HF_TOKEN, add_to_git_credential=False)
+def load_local_model():
+    if not HF_TOKEN:
+        raise RuntimeError(
+            "HUGGINGFACE_HUB_TOKEN (or HF_TOKEN) is not set. "
+            "Either set it, or provide COHERE_API_KEY to use Cohere's hosted API."
+        )
 
-    dtype, device_map = pick_dtype_and_map()
+    login(token=HF_TOKEN, add_to_git_credential=False)
 
-    tokenizer = AutoTokenizer.from_pretrained(
+    dtype, device_map = pick_dtype_and_map()
+    tok = AutoTokenizer.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN,
        use_fast=True,
         model_max_length=4096,
         padding_side="left",
-        trust_remote_code=True,  # <- allow custom model code
+        trust_remote_code=True,
     )
-
-    model = AutoModelForCausalLM.from_pretrained(
+    mdl = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN,
         device_map=device_map,
         low_cpu_mem_usage=True,
         torch_dtype=dtype,
-        trust_remote_code=True,  # <- allow custom model code
+        trust_remote_code=True,
     )
-
-    # Ensure EOS configured
-    if model.config.eos_token_id is None and tokenizer.eos_token_id is not None:
-        model.config.eos_token_id = tokenizer.eos_token_id
-
-    return model, tokenizer
+    if mdl.config.eos_token_id is None and tok.eos_token_id is not None:
+        mdl.config.eos_token_id = tok.eos_token_id
+    return mdl, tok
 
 def build_inputs(tokenizer, message, history):
     msgs = []
@@ -74,13 +153,10 @@ def build_inputs(tokenizer, message, history):
         msgs.append({"role": "assistant", "content": a})
     msgs.append({"role": "user", "content": message})
     return tokenizer.apply_chat_template(
-        msgs,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_tensors="pt",
+        msgs, tokenize=True, add_generation_prompt=True, return_tensors="pt"
     )
 
-def generate_reply(model, tokenizer, input_ids, max_new_tokens=300):
+def local_generate(model, tokenizer, input_ids, max_new_tokens=350):
     input_ids = input_ids.to(model.device)
     with torch.no_grad():
         out = model.generate(
@@ -97,37 +173,77 @@ def generate_reply(model, tokenizer, input_ids, max_new_tokens=300):
     text = tokenizer.decode(gen_only, skip_special_tokens=True)
     return text.strip()
 
+
+# -------------------
+# Chat callback
+# -------------------
 def chat_fn(message, history):
     t0 = time.time()
     try:
-        model, tokenizer = load_model()
+        if USE_HOSTED_COHERE:
+            reply = cohere_chat(message, history)
+            return f"{header(time.time() - t0)}{reply}"
+
+        # Local load (GPU strongly recommended; CPU likely OOM for R7B)
+        model, tokenizer = load_local_model()
         inputs = build_inputs(tokenizer, message, history)
-        reply = generate_reply(model, tokenizer, inputs, max_new_tokens=350)
+        reply = local_generate(model, tokenizer, inputs, max_new_tokens=350)
         return f"{header(time.time() - t0)}{reply}"
+
+    except RuntimeError as e:
+        emsg = str(e)
+        if "out of memory" in emsg.lower() or "cuda" in emsg.lower():
+            return (
+                f"{header(time.time() - t0)}Local load likely OOM. "
+                "Use a GPU Space or set COHERE_API_KEY to run via the Cohere hosted API."
+            )
+        return f"{header(time.time() - t0)}Error during chat: {e}"
     except Exception as e:
         return f"{header(time.time() - t0)}Error during chat: {e}"
 
+
+# -------------------
+# Connection check
+# -------------------
 def check_connection():
     try:
+        mode = "Cohere API (hosted)" if USE_HOSTED_COHERE else "Local HF"
+        if USE_HOSTED_COHERE:
+            return (
+                f"{header()}"
+                f"Connection Status: ✅ Using Cohere hosted API\n"
+                f"Mode: {mode}\n"
+                f"Model: command-r7b-12-2024\n"
+            )
+        # Local HF metadata
         api = HfApi(token=HF_TOKEN)
         mi = api.model_info(MODEL_ID)
         return (
             f"{header()}"
             f"Connection Status: ✅ Connected\n"
+            f"Mode: {mode}\n"
             f"Model: {mi.modelId}\n"
             f"Last Modified: {mi.lastModified}\n"
         )
     except Exception as e:
         return f"{header()}Connection Status: ❌ Error\nDetails: {e}"
 
+
+# -------------------
+# UI
+# -------------------
 with gr.Blocks(theme=gr.themes.Default()) as demo:
     gr.Markdown(f"# Medical Decision Support AI\n{header()}")
 
     with gr.Row():
         btn = gr.Button("Check Connection Status")
-        status = gr.Textbox(label="Connection Status", lines=6, value="Click to check…")
+        status = gr.Textbox(label="Connection Status", lines=7, value="Click to check…")
 
-    gr.Markdown("⚙️ First response may take a moment while the model warms up.")
+    gr.Markdown(
+        "⚙️ First response may take a moment while the model warms up. "
+        "Currently configured to use **Cohere hosted API** if `COHERE_API_KEY` is set; "
+        "otherwise, tries **local HF**."
+    )
 
     chat = gr.ChatInterface(
         fn=chat_fn,
@@ -143,6 +259,8 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
     btn.click(fn=check_connection, outputs=status)
 
 if __name__ == "__main__":
+    # You can disable SSR (demo.launch(ssr_mode=False)) if it conflicts in your Space:
     demo.launch()
 
 
+
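
A note on the commit's central switch: USE_HOSTED_COHERE is computed once at import time, and the hosted path is taken only when COHERE_API_KEY is set and the cohere SDK actually imports. A minimal standalone sketch of that decision (env-var names come from the diff; the function name pick_backend is made up for illustration):

import os

def pick_backend(has_cohere_sdk: bool) -> str:
    # Hosted requires BOTH the API key and an importable SDK;
    # anything else falls back to local transformers loading,
    # which in turn requires HUGGINGFACE_HUB_TOKEN or HF_TOKEN.
    use_hosted = bool(os.getenv("COHERE_API_KEY") and has_cohere_sdk)
    return "cohere-hosted" if use_hosted else "local-hf"

print(pick_backend(has_cohere_sdk=True))   # "cohere-hosted" only if COHERE_API_KEY is set
print(pick_backend(has_cohere_sdk=False))  # always "local-hf"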
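The new _cohere_parse helper duck-types across two SDK response shapes rather than pinning a version. The stand-in classes below are hypothetical, not real Cohere SDK types, and the import assumes this snippet runs in the same directory as app.py; they simply exercise the parser's two branches:

from app import _cohere_parse  # assumes app.py is importable from the same directory

class V4StyleResp:
    # mimics the older chat(...) response: a plain .text attribute
    text = "hello from the v4-style chat API"

class TextPart:
    def __init__(self, text):
        self.text = text

class Message:
    def __init__(self, parts):
        self.content = parts

class V5StyleResp:
    # mimics the newer shape: .message.content is a list of text parts;
    # output_text is left empty so the parser falls through to the parts
    output_text = ""
    message = Message([TextPart("hello from the v5-style API")])

print(_cohere_parse(V4StyleResp()))  # -> hello from the v4-style chat API
print(_cohere_parse(V5StyleResp()))  # -> hello from the v5-style API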
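chat_fn also gains a separate RuntimeError branch so CUDA and out-of-memory failures surface actionable advice instead of a raw traceback. A minimal sketch of that classification in isolation (same substring test as the committed code; classify_runtime_error is a hypothetical name):

def classify_runtime_error(e: RuntimeError) -> str:
    # Matches the check in chat_fn: treat CUDA or out-of-memory
    # messages as a likely OOM and suggest a GPU Space or the hosted API.
    emsg = str(e).lower()
    if "out of memory" in emsg or "cuda" in emsg:
        return "likely-oom"
    return "other"

print(classify_runtime_error(RuntimeError("CUDA error: out of memory")))  # likely-oom
print(classify_runtime_error(RuntimeError("tokenizer mismatch")))         # other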