Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,16 +1,11 @@
 import gradio as gr
-import gradio as gr
 import torch
-from transformers import AutoTokenizer, Mistral3ForConditionalGeneration, TextIteratorStreamer
+from transformers import AutoTokenizer, Mistral3ForConditionalGeneration
-from threading import Thread
 import re
-import time
 import os
-from typing import
+from typing import List, Tuple
 import spaces
 
-
-
 # Model configuration
 MODEL_NAME = "yasserrmd/SinaReason-Magistral-2509"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
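The three imports dropped here (`TextIteratorStreamer`, `Thread`, `time`) existed only to support the old token-streaming path that this commit removes. For reference, a minimal sketch of that pattern in plain `transformers` — generic usage with a placeholder model, not the exact code removed below:

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# placeholder model purely for the sketch
tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("A patient presents with", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while the main thread
# drains the streamer and repaints the UI after each chunk
Thread(target=model.generate,
       kwargs={**inputs, "max_new_tokens": 40, "streamer": streamer}).start()

for chunk in streamer:
    print(chunk, end="", flush=True)
```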
@@ -20,11 +15,9 @@ MEDICAL_SYSTEM_PROMPT = """
 You are SinaReason, a medical reasoning assistant for educational and clinical support.
 Your goal is to carefully reason through clinical problems for a professional audience (clinicians, students).
 **Never provide medical advice directly to a patient.**
-
 First, draft your detailed thought process (inner monologue) inside <think> ... </think>.
 - Use this section to work through symptoms, differential diagnoses, and investigation plans.
 - Be explicit and thorough in your reasoning.
-
 After closing </think>, provide a clear, self-contained medical summary appropriate for a clinical professional.
 - Summarize the most likely diagnosis and your reasoning.
 - Suggest next steps for investigation or management.
@@ -53,8 +46,6 @@ class SinaReasonMedicalChat:
                 dtype=torch.bfloat16
             )
 
-
-
             print("SinaReason medical model loaded successfully!")
 
         except Exception as e:
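This hunk only trims blank lines around the loader, but it shows the load path: `Mistral3ForConditionalGeneration.from_pretrained(...)` with `dtype=torch.bfloat16`. The full call sits outside the hunk; a plausible shape, assuming standard `from_pretrained` arguments:

```python
import torch
from transformers import AutoTokenizer, Mistral3ForConditionalGeneration

MODEL_NAME = "yasserrmd/SinaReason-Magistral-2509"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# bfloat16 halves memory versus float32; on ZeroGPU the model is loaded on
# CPU first and moved to CUDA inside the @spaces.GPU-decorated call
model = Mistral3ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    dtype=torch.bfloat16,
)
```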
@@ -63,7 +54,6 @@ class SinaReasonMedicalChat:
 
     def extract_thinking_and_response(self, text: str) -> Tuple[str, str]:
         """Extract thinking process from <think>...</think> tags and clinical response"""
-        # Look for the specific <think>...</think> pattern used by SinaReason
         think_pattern = r'<think>(.*?)</think>'
 
         thinking = ""
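Only the first lines of the extractor appear in this hunk. Given the visible `think_pattern`, the split presumably works like the sketch below; `re.DOTALL` is assumed so `.*?` can span the multi-line reasoning block:

```python
import re
from typing import Tuple

def extract_thinking_and_response(text: str) -> Tuple[str, str]:
    """Split model output into the <think>...</think> block and the summary."""
    think_pattern = r'<think>(.*?)</think>'
    match = re.search(think_pattern, text, re.DOTALL)
    thinking = match.group(1).strip() if match else ""
    # whatever remains outside the tags is the clinical response
    response = re.sub(think_pattern, "", text, flags=re.DOTALL).strip()
    return thinking, response

# extract_thinking_and_response("<think>ddx: URI vs flu</think>Likely viral URI.")
# -> ("ddx: URI vs flu", "Likely viral URI.")
```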
@@ -77,89 +67,75 @@ class SinaReasonMedicalChat:
         return thinking, response
 
     @spaces.GPU(duration=120)
-    def
-
-    """
+    def medical_chat(self, message: str, history: List[List[str]], max_tokens: int = 1024,
+                     temperature: float = 0.7, top_p: float = 0.95) -> Tuple[str, List[List[str]]]:
+        """Generate medical reasoning responses without streaming."""
         self.model.to(DEVICE).eval()
         if not message.strip():
-            return
+            return "", history
 
         # Apply the chat template with the medical system prompt
         messages = [
-            {"role": "system", "content":
+            {"role": "system", "content": MEDICAL_SYSTEM_PROMPT},
         ]
 
         # Add conversation history
         for user_msg, assistant_msg in history:
+            # History entries hold the formatted display text; recover the raw
+            # reply by taking everything after the "🩺 **Clinical Summary**" marker.
+            # A more robust approach would store the raw generated text separately.
+            raw_assistant_msg = assistant_msg.split("🩺 **Clinical Summary**")[-1].strip()
             messages.append({"role": "user", "content": user_msg})
-            messages.append({"role": "assistant", "content":
+            messages.append({"role": "assistant", "content": raw_assistant_msg})
 
         # Add current message
         messages.append({"role": "user", "content": message})
 
-        tokenized = self.tokenizer.apply_chat_template(messages,
+        tokenized = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_dict=True, return_tensors="pt")
 
-        input_ids =
-        attention_mask =
+        input_ids = tokenized.input_ids.to(DEVICE)
+        attention_mask = tokenized.attention_mask.to(DEVICE)
 
-        #
-        streamer = TextIteratorStreamer(
-            self.tokenizer,
-            timeout=30.0,
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
-
-        # Generation parameters optimized for medical reasoning
+        # Generation parameters
         generation_kwargs = {
-            "input_ids": input_ids,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
             "do_sample": True,
             "pad_token_id": self.tokenizer.eos_token_id,
-            "streamer": streamer,
             "repetition_penalty": 1.1
         }
 
-        # Start generation in a separate thread
-        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        partial_response = ""
-        current_thinking = ""
-        current_response = ""
-
-        for new_token in streamer:
-            partial_response += new_token
-
-            # Extract thinking and response
-            thinking, response = self.extract_thinking_and_response(partial_response)
-
-            # Show thinking phase while it's being generated
-            if thinking and thinking != current_thinking:
-                current_thinking = thinking
-                display_text = f"🧠 **Medical Reasoning in Progress...**\n\n<details>\n<summary>🔍 Click to see thinking process</summary>\n\n*{current_thinking}*\n\n</details>"
-                new_history = history + [[message, display_text]]
-                yield "", new_history
-                time.sleep(0.1)  # Smooth streaming
 
-
-
-        current_response = response
+        # Generate the full response
+        generated_ids = self.model.generate(**generation_kwargs)[0]
 
-
-
-
-
-
-
-
-
+        # Decode only the newly generated tokens (slice the prompt off by length)
+        full_response = self.tokenizer.decode(generated_ids[input_ids.shape[1]:], skip_special_tokens=True)
 
-
-
+        # Extract thinking and clinical summary
+        thinking, response = self.extract_thinking_and_response(full_response)
+
+        # Format the final display
+        final_display = ""
+        if thinking:
+            final_display += f"""🧠 **Medical Reasoning Process**
+<details>
+<summary>🔍 Click to view detailed thinking process</summary>
+*{thinking}*
+</details>
+---
+"""
+
+        final_display += f"""🩺 **Clinical Summary**
+{response}"""
 
+        new_history = history + [[message, final_display]]
+        return "", new_history
 
 
 # Initialize the medical chat model
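The one subtle step in the new non-streaming path is separating the completion from the echoed prompt: `generate()` returns prompt plus completion, so the slice must use the prompt's token count (`input_ids.shape[1]`), not `len()` of a 2-D tensor, which would give the batch size. A self-contained sketch of the pattern, again with a placeholder model:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder model for the sketch
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The differential for acute chest pain includes", return_tensors="pt")
prompt_len = inputs.input_ids.shape[1]  # token count, not batch size

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=30)

# out[0] holds prompt + completion; drop the first prompt_len tokens and
# let skip_special_tokens remove the trailing EOS
completion = tok.decode(out[0, prompt_len:], skip_special_tokens=True)
print(completion)
```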
@@ -167,8 +143,7 @@ medical_chat_model = SinaReasonMedicalChat()
 
 def respond(message, history, max_tokens, temperature, top_p):
     """Gradio response function for medical reasoning"""
-
-        yield response
+    return medical_chat_model.medical_chat(message, history, max_tokens, temperature, top_p)
 
 # Custom CSS for medical interface
 css = """