Spaces:
Sleeping
Rajan Sharma
committed
Update app.py
app.py
CHANGED
@@ -1,33 +1,24 @@
-
 import os
 
-
-shutil.rmtree(os.path.expanduser("~/.cache/huggingface"), ignore_errors=True)
-shutil.rmtree("offload", ignore_errors=True)  # Or whatever folder you use for offloading/cache
-
-
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from datetime import datetime, timezone
-import os
 from huggingface_hub import login, HfApi
-from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError
-import time
-import requests
-from tenacity import retry, stop_after_attempt, wait_exponential
-from functools import lru_cache
-import torch
 
-
-
-
 
 def get_timestamp():
-
-    return datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%SS')
 
 def format_system_info(processing_time=None):
-    """Format system information header"""
     info = (
         f"Current Date and Time (UTC - YYYY-MM-DD HH:MM:SS formatted): {get_timestamp()}\n"
         f"Current User's Login: Raj-VedAI\n"
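Note: the removed startup lines wiped the entire Hugging Face cache with shutil.rmtree on every boot, forcing a full re-download of the model. If cache cleanup is ever needed again, huggingface_hub has an inspectable API for this; a minimal sketch (the revision hash is a placeholder):

from huggingface_hub import scan_cache_dir

report = scan_cache_dir()  # inspects ~/.cache/huggingface/hub
print(f"Cache size on disk: {report.size_on_disk_str}")

# Delete specific revisions rather than the whole cache:
# strategy = report.delete_revisions("<revision-sha>")
# strategy.execute()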
@@ -36,170 +27,123 @@ def format_system_info(processing_time=None):
         info += f"Processing Time: {processing_time:.2f} seconds\n"
     return info
 
 @lru_cache(maxsize=1)
 def load_model():
-
-
-
-    if global_model is not None and global_tokenizer is not None:
-        return global_model, global_tokenizer
-
-    try:
-        token = os.getenv("HUGGING_FACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
-        if not token:
-            raise ValueError("No token found. Please set HUGGING_FACE_HUB_TOKEN or HF_TOKEN in Space secrets.")
-
-        login(token=token, add_to_git_credential=False)
-
-        model_id = "CohereLabs/c4ai-command-a-03-2025"
-
-        # Load tokenizer with optimizations
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            token=token,
-            use_fast=True,
-            model_max_length=2048
-        )
-
-        # Load model with optimizations
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            token=token,
-            device_map="auto",
-            low_cpu_mem_usage=True,
-            torch_dtype=torch.float16,  # Use float16 for faster inference
-            offload_folder="offload"  # Enable model offloading if needed
-        )
-
-        global_model = model
-        global_tokenizer = tokenizer
-        return model, tokenizer
-
-    except Exception as e:
-        raise Exception(f"Error loading model: {str(e)}")
 
-
-    """Generate response with timeout"""
-    try:
-        with torch.no_grad():
-            output = model.generate(
-                input_ids,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=0.3,
-                pad_token_id=model.config.eos_token_id,
-                attention_mask=input_ids.new_ones(input_ids.shape),
-                top_p=0.9,
-                repetition_penalty=1.2,
-                timeout_seconds=timeout
-            )
-        return output
-    except Exception as e:
-        raise Exception(f"Generation timeout or error: {str(e)}")
 
-
-
-
-
     try:
-        # Load or get cached model
         model, tokenizer = load_model()
-
-
-
-
-
-        messages = [{"role": "user", "content": message}]
-        input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
-
-        # Generate response with timeout
-        gen_tokens = generate_with_timeout(model, input_ids)
-
-        # Decode response
-        gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
-
-        # Calculate processing time
-        processing_time = time.time() - start_time
-        system_info = format_system_info(processing_time)
-
-        # Format response
-        history.append({"role": "user", "content": message})
-        history.append({"role": "assistant", "content": f"{system_info}{gen_text}"})
-        return history
-
     except Exception as e:
-
-        system_info = format_system_info(processing_time)
-        error_msg = f"{system_info}Error during chat: {str(e)}\nAttempting reconnection..."
-
-        if history is None:
-            history = []
-        history.append({"role": "user", "content": message})
-        history.append({"role": "assistant", "content": error_msg})
-        return history
 
 def check_connection():
     try:
-
-
-
-
-
-
-
-
-Model Status: {'Loaded' if global_model is not None else 'Not Loaded'}
-"""
     except Exception as e:
-        return f"{format_system_info()}Connection Status: ❌ Error\nDetails: {str(e)}"
 
-# Create the Gradio interface with loading indicator
 with gr.Blocks(theme=gr.themes.Default()) as demo:
     gr.Markdown(f"# Medical Decision Support AI\n{format_system_info()}")
-
-    with gr.Row():
-        connection_btn = gr.Button("Check Connection Status")
-        connection_status = gr.Textbox(label="Connection Status", lines=6)
-
-    # Add loading configuration
     with gr.Row():
-        gr.ChatInterface(
-
-
-
-
        examples=[
            "What are the symptoms of hypertension?",
            "What are common drug interactions with aspirin?",
            "What are the warning signs of diabetes?",
        ],
-        # Buttons below are not valid in Gradio 4.x+:
-        # retry_btn="Retry ↺",
-        # undo_btn="Undo ↶",
-        # clear_btn="Clear 🗑️"
-        # type='messages'
-        # To customize buttons, see: https://www.gradio.app/docs/chatinterface/
    )
-
-
-
-
-
-
-    try:
-        load_model()
-    except Exception as e:
-        gr.Warning(f"Model pre-loading failed: {str(e)}")
-
-    # Update requirements
-    requirements = """
-gradio>=3.50.2
-transformers
-torch
-accelerate
-huggingface_hub
-requests
-tenacity
-"""
-
-    demo.launch()
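Note: the removed generate_with_timeout helper passed timeout_seconds to model.generate(), which is not a parameter transformers accepts, so every call would raise a TypeError that was then re-raised as a "timeout". If a wall-clock budget is genuinely needed, a StoppingCriteria is one workable approach; a sketch, with illustrative names:

import time
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class WallClockBudget(StoppingCriteria):
    """Stops generation once a wall-clock budget is exhausted."""
    def __init__(self, seconds: float):
        self.deadline = time.monotonic() + seconds

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return time.monotonic() >= self.deadline  # True => stop generating

# usage:
# out = model.generate(input_ids, max_new_tokens=300,
#                      stopping_criteria=StoppingCriteriaList([WallClockBudget(30.0)]))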
+# app.py
 import os
+import time
+from datetime import datetime, timezone
+from functools import lru_cache
 
+import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import login, HfApi
 
+MODEL_ID = os.getenv("MODEL_ID", "CohereLabs/c4ai-command-a-03-2025")  # change if needed
+HF_TOKEN = (
+    os.getenv("HF_TOKEN")  # canonical variable read by huggingface_hub
+    or os.getenv("HUGGING_FACE_HUB_TOKEN")  # legacy name, kept for compatibility
+)
 
 def get_timestamp():
+    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
 
 def format_system_info(processing_time=None):
     info = (
         f"Current Date and Time (UTC - YYYY-MM-DD HH:MM:SS formatted): {get_timestamp()}\n"
         f"Current User's Login: Raj-VedAI\n"
         info += f"Processing Time: {processing_time:.2f} seconds\n"
     return info
 
+def _pick_dtype_and_map():
+    if torch.cuda.is_available():
+        return torch.float16, "auto"
+    if torch.backends.mps.is_available():
+        # Apple Silicon (MPS) prefers float16/bfloat16 depending on the model; float16 is usually OK.
+        return torch.float16, {"": "mps"}
+    return torch.float32, "cpu"  # CPU-safe
+
 @lru_cache(maxsize=1)
 def load_model():
+    if HF_TOKEN:
+        # In Spaces this isn’t strictly necessary if the secret is set, but it doesn’t hurt.
+        login(token=HF_TOKEN, add_to_git_credential=False)
 
+    dtype, device_map = _pick_dtype_and_map()
 
+    tok = AutoTokenizer.from_pretrained(
+        MODEL_ID,
+        token=HF_TOKEN,
+        use_fast=True,
+        model_max_length=4096,
+        padding_side="left",  # safer for some chat templates
+    )
+
+    mdl = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        token=HF_TOKEN,
+        device_map=device_map,
+        low_cpu_mem_usage=True,
+        torch_dtype=dtype,
+    )
+
+    # Fallback for models without an EOS defined
+    if mdl.config.eos_token_id is None and tok.eos_token_id is not None:
+        mdl.config.eos_token_id = tok.eos_token_id
+
+    return mdl, tok
+
+def build_inputs(tokenizer, message, history):
+    # Convert Gradio's (message, history) into a chat template.
+    # With type="messages", history arrives as [{"role": ..., "content": ...}, ...];
+    # the tuple branch keeps the older (user, assistant) pair format working too.
+    msgs = []
+    for turn in history or []:
+        if isinstance(turn, dict):
+            msgs.append({"role": turn["role"], "content": turn["content"]})
+        else:
+            u, a = turn
+            msgs.append({"role": "user", "content": u})
+            msgs.append({"role": "assistant", "content": a})
+    msgs.append({"role": "user", "content": message})
+    inputs = tokenizer.apply_chat_template(
+        msgs,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt",
+    )
+    return inputs
+
+def generate_reply(model, tokenizer, input_ids, max_new_tokens=256):
+    input_ids = input_ids.to(model.device)
+    with torch.no_grad():
+        out = model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=0.3,
+            top_p=0.9,
+            repetition_penalty=1.2,
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+    # Slice off the prompt so we only return new tokens
+    gen_only = out[0, input_ids.shape[-1]:]
+    text = tokenizer.decode(gen_only, skip_special_tokens=True)
+    return text.strip()
+
+def chat_fn(message, history):
+    start = time.time()
     try:
         model, tokenizer = load_model()
+        inputs = build_inputs(tokenizer, message, history)
+        reply = generate_reply(model, tokenizer, inputs, max_new_tokens=300)
+        # Optional: prepend system info once per turn
+        reply = f"{format_system_info(time.time() - start)}{reply}"
+        return reply
     except Exception as e:
+        return f"{format_system_info(time.time() - start)}Error during chat: {e}"
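For reference, with type="messages" (set on the ChatInterface below) Gradio calls chat_fn with history shaped like this, which is why build_inputs above accepts dict turns; a small illustrative example:

history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
]
# chat_fn("What are the symptoms of hypertension?", history) -> reply string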
 
 def check_connection():
     try:
+        api = HfApi(token=HF_TOKEN)
+        mi = api.model_info(MODEL_ID)
+        return (
+            f"{format_system_info()}"
+            f"Connection Status: ✅ Connected\n"
+            f"Model: {mi.modelId}\n"
+            f"Last Modified: {mi.lastModified}\n"
+        )
     except Exception as e:
+        return f"{format_system_info()}Connection Status: ❌ Error\nDetails: {e}"
 
 with gr.Blocks(theme=gr.themes.Default()) as demo:
     gr.Markdown(f"# Medical Decision Support AI\n{format_system_info()}")
     with gr.Row():
+        btn = gr.Button("Check Connection Status")
+        status = gr.Textbox(label="Connection Status", lines=6, value="Click to check…")
+    gr.Markdown("⚙️ Model is loading on first request. Please wait for the first answer.")
+
+    chat = gr.ChatInterface(
+        fn=chat_fn,
+        type="messages",  # use the modern message format
+        description="A medical decision support system that provides healthcare-related information and guidance.",
        examples=[
            "What are the symptoms of hypertension?",
            "What are common drug interactions with aspirin?",
            "What are the warning signs of diabetes?",
        ],
    )
+
+    btn.click(fn=check_connection, outputs=status)
+
+if __name__ == "__main__":
+    demo.launch()
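The commit also drops the old inline requirements string. A plausible requirements.txt for the new file, inferred from its imports (version pins are assumptions):

gradio>=4.0       # ChatInterface with type="messages"
transformers
torch
accelerate        # needed for device_map="auto"
huggingface_hub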
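One behavioral note: the old version pre-loaded the model at import time, while the new one lazy-loads on the first chat turn (hence the "loading on first request" banner). If cold-start latency matters, a pre-warm before launch is a small addition; a sketch mirroring the removed behavior:

if __name__ == "__main__":
    try:
        load_model()  # lru_cache keeps it warm for the first request
    except Exception as e:
        print(f"Model pre-loading failed: {e}")
    demo.launch()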