Spaces: Running on Zero

Update app.py
app.py CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import re
 import os
 from typing import List, Tuple
@@ -36,23 +36,23 @@ class SinaReasonMedicalChat:
         # The PixtralProcessor requires an image argument, even if it's None.
         # This is a mandatory part of the call signature.
         self.dummy_image = None
-
+        self.load_model()
 
     def load_model(self):
         """Load the SinaReason medical model and tokenizer using Unsloth"""
         try:
-            from unsloth import FastLanguageModel
             print(f"Loading medical model with Unsloth: {MODEL_NAME}")
             print("cuda" if torch.cuda.is_available() else "cpu")
 
-
-
-
-
-                load_in_4bit=True,  # Or False if you have enough VRAM for 16-bit
-                device_map="cuda",
+            self.model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                torch_dtype=torch.bfloat16,  # Use bfloat16 for modern GPUs
+                device_map="auto",  # Automatically map to the available GPU
             )
 
+            # Load the standard tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
             print("SinaReason medical model loaded successfully with Unsloth!")
 
         except Exception as e:
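For reference, the loading path this commit switches to can be exercised outside the Space. A minimal sketch, assuming a placeholder MODEL_NAME (the app defines its own constant) and that accelerate is installed so device_map="auto" works:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "your-org/your-model"  # placeholder; the app defines its own MODEL_NAME

# Plain transformers loading, no Unsloth required.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,  # half precision on modern GPUs; use float16 on older cards
    device_map="auto",           # accelerate places weights on whatever device is available
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Only the loading mechanics change here; the surrounding docstring and log strings still say Unsloth.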
@@ -76,12 +76,13 @@ class SinaReasonMedicalChat:
     def medical_chat(self, message: str, history: List[List[str]], max_tokens: int = 1024,
                      temperature: float = 0.7, top_p: float = 0.95) -> Tuple[str, List[List[str]]]:
         """Generate medical reasoning responses using the Unsloth model."""
-
-        self.load_model()
-        self.model.eval()
+
         if not message.strip():
             return "", history
 
+        self.model.to("cuda")
+        self.model.eval()
+
         # Apply the chat template with the medical system prompt
         messages = [{"role": "system", "content": MEDICAL_SYSTEM_PROMPT}]
         for user_msg, assistant_msg in history:
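This reshuffle pairs with the "Running on Zero" badge up top: on a ZeroGPU Space the hardware is attached per call, so the weights load once at startup and only move to CUDA inside the request path. A minimal sketch of that pattern, assuming the usual @spaces.GPU decorator (not visible in this hunk):

import spaces  # ZeroGPU helper, preinstalled in Spaces runtimes

@spaces.GPU  # a GPU is attached only for the duration of this call
def run_generation(model, generation_kwargs):
    model.to("cuda")  # move the eagerly loaded weights onto the just-attached GPU
    model.eval()
    return model.generate(**generation_kwargs)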
@@ -90,24 +91,16 @@ class SinaReasonMedicalChat:
             messages.append({"role": "assistant", "content": raw_assistant_msg})
         messages.append({"role": "user", "content": message})
 
-        # Format the prompt using the chat template
         formatted_prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
+            messages, tokenize=False, add_generation_prompt=True,
         )
 
-        #
-        inputs = self.tokenizer(
-            text=formatted_prompt,
-            images=self.dummy_image,
-            return_tensors="pt"
-        ).to(self.model.device)
+        # THE HACK IS GONE: Standard tokenization without any 'images' argument.
+        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
 
-        #
+        # THE HACK IS GONE: Standard generation call.
         generation_kwargs = {
             **inputs,
-            "images": self.dummy_image,  # This MUST be passed to model.generate
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
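With the dummy-image plumbing gone, prompt assembly and generation follow the stock text-only transformers recipe. A condensed sketch, assuming the model and tokenizer from the loading example above and an illustrative system prompt (the app's MEDICAL_SYSTEM_PROMPT is not shown in this diff):

messages = [
    {"role": "system", "content": "You are a careful clinical reasoning assistant."},
    {"role": "user", "content": "A 54-year-old presents with exertional chest pain."},
]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=1024,
    do_sample=True,                       # temperature/top_p only take effect when sampling
    temperature=0.7,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
)[0]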
@@ -115,10 +108,7 @@ class SinaReasonMedicalChat:
             "pad_token_id": self.tokenizer.eos_token_id,
         }
 
-        # Generate the full response
         output = self.model.generate(**generation_kwargs)[0]
-
-        # Decode only the newly generated tokens
         full_response = self.tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
        # Extract thinking and clinical summary
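model.generate returns the prompt tokens followed by the completion, which is why the decode call slices at the prompt length. Continuing the sketch above:

prompt_len = inputs.input_ids.shape[1]  # number of tokens that were fed in
new_tokens = output[prompt_len:]        # everything past the prompt is newly generated
reply = tokenizer.decode(new_tokens, skip_special_tokens=True)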
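Finally, since app.py imports gradio, the class presumably backs a chat UI. A hypothetical wiring using the names from the diff, with gr.ChatInterface and pair-style history (the UI details are assumptions, not part of this commit):

import gradio as gr

chat = SinaReasonMedicalChat()  # loads the model eagerly, per the __init__ change above

def respond(message, history):
    reply, _history = chat.medical_chat(message, history or [])
    return reply

gr.ChatInterface(respond, title="SinaReason Medical Chat").launch()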