yasserrmd committed (verified)
Commit 21ddb34 · Parent: 7cc675b

Update app.py

Files changed (1): app.py +97 -59
app.py CHANGED
@@ -1,15 +1,15 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoTokenizer, Mistral3ForConditionalGeneration, TextIteratorStreamer
+from threading import Thread
 import re
+import time
 import os
-from typing import List, Tuple
+from typing import Iterator, List, Tuple
 import spaces



-
-
 # Model configuration
 MODEL_NAME = "yasserrmd/SinaReason-Magistral-2509"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -19,49 +20,52 @@ MEDICAL_SYSTEM_PROMPT = """
 You are SinaReason, a medical reasoning assistant for educational and clinical support.
 Your goal is to carefully reason through clinical problems for a professional audience (clinicians, students).
 **Never provide medical advice directly to a patient.**
+
 First, draft your detailed thought process (inner monologue) inside <think> ... </think>.
 - Use this section to work through symptoms, differential diagnoses, and investigation plans.
 - Be explicit and thorough in your reasoning.
+
 After closing </think>, provide a clear, self-contained medical summary appropriate for a clinical professional.
 - Summarize the most likely diagnosis and your reasoning.
 - Suggest next steps for investigation or management.
 """

-
-
 class SinaReasonMedicalChat:
     def __init__(self):
         self.tokenizer = None
         self.model = None
-        # The PixtralProcessor requires an image argument, even if it's None.
-        # This is a mandatory part of the call signature.
-        self.dummy_image = None
         self.load_model()
-
+
     def load_model(self):
-        """Load the SinaReason medical model and tokenizer using Unsloth"""
+        """Load the SinaReason medical model and tokenizer"""
         try:
-            print(f"Loading medical model with Unsloth: {MODEL_NAME}")
-            print("cuda" if torch.cuda.is_available() else "cpu")
+            print(f"Loading medical model: {MODEL_NAME}")
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                "mistralai/Magistral-Small-2509"
+            )
+
+            # Add padding token if not present
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token

-            self.model = AutoModelForCausalLM.from_pretrained(
+            self.model = Mistral3ForConditionalGeneration.from_pretrained(
                 MODEL_NAME,
-                torch_dtype=torch.bfloat16,  # Use bfloat16 for modern GPUs
-                device_map="auto",  # Automatically map to the available GPU
+                dtype=torch.bfloat16
             )

-            # Load the standard tokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
-            print("SinaReason medical model loaded successfully with Unsloth!")
+            print("SinaReason medical model loaded successfully!")

         except Exception as e:
-            print(f"Error loading model with Unsloth: {e}")
+            print(f"Error loading model: {e}")
             raise e

     def extract_thinking_and_response(self, text: str) -> Tuple[str, str]:
         """Extract thinking process from <think>...</think> tags and clinical response"""
+        # Look for the <think>...</think> pattern used by SinaReason
         think_pattern = r'<think>(.*?)</think>'
+
         thinking = ""
         response = text
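Only the pattern and the fallback defaults of `extract_thinking_and_response` are visible in these hunks; the matching logic between them is unchanged and not shown. A standalone sketch of the same extraction idea, where the function name and the `re.DOTALL` flag are assumptions rather than a copy of the hidden body:

```python
import re

THINK_PATTERN = r"<think>(.*?)</think>"  # non-greedy: stop at the first </think>

def split_thinking(text: str) -> tuple[str, str]:
    # Assumed behavior: pull the <think>...</think> block out of a generation
    # and treat everything outside it as the clinical summary.
    match = re.search(THINK_PATTERN, text, re.DOTALL)  # DOTALL lets reasoning span lines
    if not match:
        return "", text
    thinking = match.group(1).strip()
    response = re.sub(THINK_PATTERN, "", text, count=1, flags=re.DOTALL).strip()
    return thinking, response

print(split_thinking("<think>ddx: PE vs pneumonia</think>Most likely PE; order CTPA."))
# -> ('ddx: PE vs pneumonia', 'Most likely PE; order CTPA.')
```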
@@ -73,32 +77,48 @@ class SinaReasonMedicalChat:
         return thinking, response

     @spaces.GPU(duration=120)
-    def medical_chat(self, message: str, history: List[List[str]], max_tokens: int = 1024,
-                     temperature: float = 0.7, top_p: float = 0.95) -> Tuple[str, List[List[str]]]:
-        """Generate medical reasoning responses using the Unsloth model."""
-
+    def medical_chat_stream(self, message: str, history: List[List[str]], max_tokens: int = 1024,
+                            temperature: float = 0.7, top_p: float = 0.95) -> Iterator[Tuple[str, List[List[str]]]]:
+        """Stream medical reasoning responses with a live thinking display."""
+        self.model.to(DEVICE).eval()
         if not message.strip():
-            return "", history
-
-        self.model.to("cuda")
-        self.model.eval()
+            return

         # Apply the chat template with the medical system prompt
-        messages = [{"role": "system", "content": MEDICAL_SYSTEM_PROMPT}]
+        messages = [
+            {"role": "system", "content": MEDICAL_SYSTEM_PROMPT},
+        ]
+
+        # Add conversation history
         for user_msg, assistant_msg in history:
-            raw_assistant_msg = assistant_msg.split("🩺 **Clinical Summary**")[-1].strip()
             messages.append({"role": "user", "content": user_msg})
-            messages.append({"role": "assistant", "content": raw_assistant_msg})
+            messages.append({"role": "assistant", "content": assistant_msg})
+
+        # Add current message
         messages.append({"role": "user", "content": message})

-        formatted_prompt = self.tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True,
+        # Apply chat template
+        prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
         )

-        # THE HACK IS GONE: Standard tokenization without any 'images' argument.
-        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)
-
-        # THE HACK IS GONE: Standard generation call.
+        # Tokenize input and move to the same device as the model
+        inputs = self.tokenizer(
+            text=prompt,
+            return_tensors="pt"
+        ).to(DEVICE)
+
+        # Set up the streamer
+        streamer = TextIteratorStreamer(
+            self.tokenizer,
+            timeout=30.0,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+
+        # Generation parameters for medical reasoning
         generation_kwargs = {
             **inputs,
             "max_new_tokens": max_tokens,
@@ -106,30 +126,48 @@
             "top_p": top_p,
             "do_sample": True,
             "pad_token_id": self.tokenizer.eos_token_id,
+            "streamer": streamer,
+            "repetition_penalty": 1.1,
         }

-        output = self.model.generate(**generation_kwargs)[0]
-        full_response = self.tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-        # Extract thinking and clinical summary
-        thinking, response = self.extract_thinking_and_response(full_response)
-
-        # Format the final display
-        final_display = ""
-        if thinking:
-            final_display += f"""🧠 **Medical Reasoning Process**
-<details>
-<summary>🔍 Click to view detailed thinking process</summary>
-*{thinking}*
-</details>
----
-"""
-
-        final_display += f"""🩺 **Clinical Summary**
-{response}"""
-
-        new_history = history + [[message, final_display]]
-        return "", new_history
+        # model.generate blocks until the completion is finished, so run it in
+        # a worker thread and drain the streamer from this one.
+        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        # Stream the response
+        partial_response = ""
+        current_thinking = ""
+        current_response = ""
+
+        for new_token in streamer:
+            partial_response += new_token
+
+            # Extract thinking and response
+            thinking, response = self.extract_thinking_and_response(partial_response)
+
+            # Show the thinking phase while it is being generated
+            if thinking and thinking != current_thinking:
+                current_thinking = thinking
+                display_text = f"🧠 **Medical Reasoning in Progress...**\n\n<details>\n<summary>🔍 Click to see thinking process</summary>\n\n*{current_thinking}*\n\n</details>"
+                new_history = history + [[message, display_text]]
+                yield "", new_history
+                time.sleep(0.1)  # Smooth streaming
+
+            # Show the clinical response as it is generated
+            if response and response != current_response:
+                current_response = response
+
+                final_display = f"""🧠 **Medical Reasoning Process**
+<details>
+<summary>🔍 Click to view detailed thinking process</summary>
+*{current_thinking}*
+</details>
+---
+🩺 **Clinical Summary**
+{current_response}"""
+
+                new_history = history + [[message, final_display]]
+                yield "", new_history
+
+        thread.join()


 # Initialize the medical chat model
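`TextIteratorStreamer` is designed to be drained from a different thread than the one running `generate`, since `generate` does not return until the whole completion is done. The hunk above therefore starts generation on a worker `Thread`; the reusable shape of that pattern looks roughly like this, with `model`, `tokenizer`, and pre-tokenized `inputs` assumed to exist:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, inputs, **gen_kwargs):
    # The worker thread runs the blocking generate() call; this thread
    # yields decoded text chunks as the streamer receives them.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
    )
    thread = Thread(
        target=model.generate,
        kwargs={**inputs, **gen_kwargs, "streamer": streamer},
    )
    thread.start()
    try:
        for chunk in streamer:
            yield chunk
    finally:
        thread.join()  # generation is finished once the streamer is exhausted
```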
@@ -137,7 +175,8 @@ medical_chat_model = SinaReasonMedicalChat()

 def respond(message, history, max_tokens, temperature, top_p):
     """Gradio response function for medical reasoning"""
-    return medical_chat_model.medical_chat(message, history, max_tokens, temperature, top_p)
+    for response in medical_chat_model.medical_chat_stream(message, history, max_tokens, temperature, top_p):
+        yield response

 # Custom CSS for medical interface
 css = """
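One behavioral change worth noting: the old `medical_chat` stripped everything before "🩺 **Clinical Summary**" from prior assistant turns before replaying them, while `medical_chat_stream` appends `assistant_msg` verbatim, so the rendered display text (emoji headers, `<details>` blocks) is now fed back into the prompt on multi-turn chats. A sketch of the message assembly as now written (the helper name is illustrative):

```python
from typing import Dict, List

def build_messages(system_prompt: str,
                   history: List[List[str]],
                   message: str) -> List[Dict[str, str]]:
    # System prompt first, then alternating user/assistant turns from the
    # Gradio history ([[user, assistant], ...]), then the new user message.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        # assistant_msg is the rendered display string, markdown and all.
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    return messages

# prompt = tokenizer.apply_chat_template(
#     build_messages(MEDICAL_SYSTEM_PROMPT, history, message),
#     tokenize=False, add_generation_prompt=True)
```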
@@ -322,7 +361,6 @@ with gr.Blocks(css=css, title="SinaReason Medical Reasoning", theme=gr.themes.So
     </div>
     """)

-
 # Launch configuration for HF Spaces
 if __name__ == "__main__":
     demo.launch(
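Since `respond` is now a generator yielding `("", new_history)` pairs, the Blocks wiring (outside this diff) must register it as a streaming handler whose outputs line up with those two values. A minimal compatible wiring, where every component name is an assumption about the unshown UI code:

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()                              # receives new_history on each yield
    msg = gr.Textbox(placeholder="Describe the case...")
    max_tokens = gr.Slider(64, 4096, value=1024, label="Max new tokens")
    temperature = gr.Slider(0.0, 1.5, value=0.7, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.95, label="Top-p")

    # Gradio streams generator handlers automatically: each yield of
    # ("", new_history) clears the textbox and refreshes the chat window.
    msg.submit(respond, [msg, chatbot, max_tokens, temperature, top_p], [msg, chatbot])
```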
 