Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -5,6 +5,7 @@ import re
 import os
 from typing import List, Tuple
 import spaces
+from unsloth import FastLanguageModel
 
 
 
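The new import feeds the rewritten `load_model` in the next hunk. Note that the new code also passes `dtype=torch.bfloat16`, and this hunk does not add `import torch`, so presumably `torch` is already imported earlier in app.py. A minimal, hedged sketch of the Unsloth loading path (the model name below is a placeholder, not the Space's actual `MODEL_NAME`):

```python
# Sketch only: placeholder model name; the Space's real MODEL_NAME is defined
# elsewhere in app.py and is not shown in this diff.
import torch
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",  # placeholder
    dtype=torch.bfloat16,   # or None to let Unsloth auto-detect
    load_in_4bit=True,      # 4-bit quantization; set False with enough VRAM
)
FastLanguageModel.for_inference(model)  # switch Unsloth into inference mode
```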
@@ -25,39 +26,39 @@ After closing </think>, provide a clear, self-contained medical summary appropri
 - Suggest next steps for investigation or management.
 """
 
+
+
 class SinaReasonMedicalChat:
     def __init__(self):
         self.tokenizer = None
         self.model = None
+        # The PixtralProcessor requires an image argument, even if it's None.
+        # This is a mandatory part of the call signature.
+        self.dummy_image = None
         self.load_model()
 
     def load_model(self):
-        """Load the SinaReason medical model and tokenizer"""
+        """Load the SinaReason medical model and tokenizer using Unsloth"""
         try:
-            print(f"Loading medical model: {MODEL_NAME}")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_NAME, tokenizer_type="mistral"
-            )
-
-            # Add padding token if not present
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
+            print(f"Loading medical model with Unsloth: {MODEL_NAME}")
 
-
-
-
+            # Use FastLanguageModel from Unsloth to load the model and tokenizer
+            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+                model_name=MODEL_NAME,
+                dtype=torch.bfloat16,
+                load_in_4bit=True,  # Or False if you have enough VRAM for 16-bit
+                #device_map="auto",
             )
 
-            print("SinaReason medical model loaded successfully!")
+            print("SinaReason medical model loaded successfully with Unsloth!")
 
         except Exception as e:
-            print(f"Error loading model: {e}")
+            print(f"Error loading model with Unsloth: {e}")
             raise e
 
     def extract_thinking_and_response(self, text: str) -> Tuple[str, str]:
         """Extract thinking process from <think>...</think> tags and clinical response"""
         think_pattern = r'<think>(.*?)</think>'
-
         thinking = ""
         response = text
 
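Both sides of the hunk keep `extract_thinking_and_response`, but its body falls outside the diff context. For reference, a hedged sketch of what such a helper typically does with the `think_pattern` shown above — an illustration, not the file's actual implementation:

```python
import re
from typing import Tuple

def extract_thinking_and_response(text: str) -> Tuple[str, str]:
    # Sketch only: the diff shows just the pattern, not the method body.
    # re.DOTALL lets '.' span newlines inside the <think>...</think> block.
    match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    thinking = match.group(1).strip() if match else ""
    # The response is whatever remains once the thinking block is removed.
    response = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
    return thinking, response
```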
@@ -71,53 +72,51 @@ class SinaReasonMedicalChat:
     @spaces.GPU(duration=120)
     def medical_chat(self, message: str, history: List[List[str]], max_tokens: int = 1024,
                      temperature: float = 0.7, top_p: float = 0.95) -> Tuple[str, List[List[str]]]:
-        """Generate medical reasoning responses
-
+        """Generate medical reasoning responses using the Unsloth model."""
+        # No need for model.to(DEVICE), Unsloth's device_map handles it.
+        self.model.to(DEVICE)
+        self.model.eval()
         if not message.strip():
             return "", history
 
         # Apply the chat template with the medical system prompt
-        messages = [
-            {"role": "system", "content": MEDICAL_SYSTEM_PROMPT},
-        ]
-
-        # Add conversation history
+        messages = [{"role": "system", "content": MEDICAL_SYSTEM_PROMPT}]
         for user_msg, assistant_msg in history:
-            # We need to reconstruct the full assistant message for the model
-            # For simplicity, we'll just use the user message and the final response part
-            # This part might need adjustment depending on how history is formatted
-            # For this modification, let's assume the assistant message is just the clinical summary
-            # A more robust solution might store the full generated text.
             raw_assistant_msg = assistant_msg.split("🩺 **Clinical Summary**")[-1].strip()
             messages.append({"role": "user", "content": user_msg})
             messages.append({"role": "assistant", "content": raw_assistant_msg})
-
-
-        # Add current message
         messages.append({"role": "user", "content": message})
 
-
+        # Format the prompt using the chat template
+        formatted_prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
 
-
-
+        # Tokenize the input, correctly passing images=None
+        inputs = self.tokenizer(
+            text=formatted_prompt,
+            images=self.dummy_image,
+            return_tensors="pt"
+        ).to(self.model.device)
 
         # Generation parameters
         generation_kwargs = {
-
-            "
+            **inputs,
+            "images": self.dummy_image,  # This MUST be passed to model.generate
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
             "do_sample": True,
             "pad_token_id": self.tokenizer.eos_token_id,
-            "repetition_penalty": 1.1
         }
 
         # Generate the full response
         output = self.model.generate(**generation_kwargs)[0]
 
-        # Decode the
-        full_response = self.tokenizer.decode(output[
+        # Decode only the newly generated tokens
+        full_response = self.tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
         # Extract thinking and clinical summary
         thinking, response = self.extract_thinking_and_response(full_response)
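Taken together, the added lines implement a build-prompt, tokenize, generate, decode-only-the-new-tokens pipeline. A condensed, hedged sketch of that flow for a plain text tokenizer (the diff's Pixtral-style processor additionally requires `images=None`, per the comments above); `model` and `tokenizer` are assumed loaded as in the earlier sketch, and the messages are stand-ins:

```python
messages = [
    {"role": "system", "content": "You are a careful clinical assistant."},  # stand-in prompt
    {"role": "user", "content": "55M with crushing chest pain for 30 minutes."},
]

# Render the conversation into the model's prompt format, then tokenize.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text=prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)[0]

# Slice off the prompt tokens so decode() returns only newly generated text
# instead of echoing the whole prompt back.
new_text = tokenizer.decode(output[inputs["input_ids"].shape[1]:], skip_special_tokens=True)
```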
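One detail shared by both versions: history reconstruction keys off the literal `🩺 **Clinical Summary**` marker. A hedged illustration of that round trip — the surrounding message format is an assumption, since only the split marker appears in the diff:

```python
# Hypothetical assistant turn as the UI might render it; only the marker
# "🩺 **Clinical Summary**" is confirmed by the diff, the rest is illustrative.
assistant_msg = (
    "🧠 **Reasoning**\n(model's hidden <think> trace rendered here)\n\n"
    "🩺 **Clinical Summary**\nLikely viral pharyngitis; supportive care, review in 48h."
)

# split(...)[-1] keeps only what follows the last marker, so the reasoning
# trace is not replayed into the model's conversation history.
raw_assistant_msg = assistant_msg.split("🩺 **Clinical Summary**")[-1].strip()
assert raw_assistant_msg.startswith("Likely viral pharyngitis")
```

If the marker is absent, `split(...)[-1]` simply returns the whole string, so plain assistant messages pass through unchanged.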