Update app.py

app.py
CHANGED
@@ -80,54 +80,53 @@ class SinaReasonMedicalChat:
         return thinking, response
 
     @spaces.GPU(duration=120)
-    def medical_chat_stream(self, message: str, history: List[List[str]], max_tokens: int = 1024,
-                            temperature: float = 0.7, top_p: float = 0.95):
-        """Stream medical reasoning responses with thinking display"""
+    def medical_chat_stream(self, message: str, history: List[List[str]], max_tokens: int = 1024,
+                            temperature: float = 0.7, top_p: float = 0.95) -> Iterator[Tuple[str, List[List[str]]]]:
+        """Stream medical reasoning responses with thinking display without threading."""
 
         if not message.strip():
             return
 
-        #
+        # Ensure the model is on the correct device (e.g., CUDA)
+        self.model = self.model.to(DEVICE)
 
         # Apply the chat template with the medical system prompt
         messages = [
-            {"role": "system", "content": MEDICAL_SYSTEM_PROMPT},
+            {"role": "system", "content": "MEDICAL_SYSTEM_PROMPT"},  # Replace with your actual prompt
         ]
 
         # Add conversation history
         for user_msg, assistant_msg in history:
             messages.append({"role": "user", "content": user_msg})
             messages.append({"role": "assistant", "content": assistant_msg})
 
         # Add current message
         messages.append({"role": "user", "content": message})
 
         # Apply chat template
         prompt = self.tokenizer.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True,
         )
 
-        # Tokenize input
+        # Tokenize input and move to the same device as the model
         inputs = self.tokenizer(
             text=prompt,
-            #images=None,  # Required for this multimodal architecture
             return_tensors="pt"
         ).to(DEVICE)
 
         # Setup streamer
         streamer = TextIteratorStreamer(
             self.tokenizer,
             timeout=30.0,
             skip_prompt=True,
             skip_special_tokens=True
         )
 
         # Generation parameters optimized for medical reasoning
         generation_kwargs = {
             **inputs,
-            "images": None,  # Also required here for text-only inference
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
@@ -136,23 +135,22 @@ class SinaReasonMedicalChat:
             "streamer": streamer,
             "repetition_penalty": 1.1
         }
 
-        # Start generation
-        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
-        thread.start()
+        # Start generation directly (no background thread).
+        # Note: generate() blocks until it finishes; the streamer's queue is
+        # filled during the call and only drained by the loop below.
+        self.model.generate(**generation_kwargs)
 
         # Stream the response
         partial_response = ""
         current_thinking = ""
         current_response = ""
 
         for new_token in streamer:
             partial_response += new_token
 
             # Extract thinking and response
             thinking, response = self.extract_thinking_and_response(partial_response)
 
             # Show thinking phase while it's being generated
             if thinking and thinking != current_thinking:
                 current_thinking = thinking
@@ -160,33 +158,23 @@ class SinaReasonMedicalChat:
                 new_history = history + [[message, display_text]]
                 yield "", new_history
                 time.sleep(0.1)  # Smooth streaming
 
             # Show clinical response as it's generated
             if response and response != current_response:
                 current_response = response
 
-                final_display = f"🩺 **Clinical Analysis**\n\n{current_response}"
-
-                if current_thinking:
-                    final_display = f"""🧠 **Medical Reasoning Process**
-
-<details>
-<summary>🔍 Click to view detailed thinking process</summary>
-
-*{current_thinking}*
-
-</details>
-
----
-
-🩺 **Clinical Summary**
-
-{current_response}"""
+                final_display = f"""🧠 **Medical Reasoning Process**
+<details>
+<summary>🔍 Click to view detailed thinking process</summary>
+*{current_thinking}*
+</details>
+---
+🩺 **Clinical Summary**
+{current_response}"""
 
                 new_history = history + [[message, final_display]]
                 yield "", new_history
-
-        thread.join()
 
 # Initialize the medical chat model
 medical_chat_model = SinaReasonMedicalChat()
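For reference, TextIteratorStreamer is designed to be consumed from a different thread than the one running generate(): called directly, as this commit now does, generate() blocks until the full completion is ready, so the loop over the streamer only runs afterwards and tokens reach the UI in one burst rather than incrementally. Below is a minimal sketch of the threaded pattern the previous revision used; the "gpt2" checkpoint and the prompt are placeholders, not this Space's actual model or inputs.

    from threading import Thread

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
    model = AutoModelForCausalLM.from_pretrained("gpt2").to(DEVICE)

    inputs = tokenizer("Patient presents with chest pain.", return_tensors="pt").to(DEVICE)
    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True,
                                    skip_special_tokens=True)

    # Run generate() on a worker thread so this thread can drain the
    # streamer's queue while generation is still in flight.
    thread = Thread(target=model.generate,
                    kwargs={**inputs, "streamer": streamer, "max_new_tokens": 64})
    thread.start()

    for token_text in streamer:
        print(token_text, end="", flush=True)  # arrives incrementally

    thread.join()  # ensure the worker has finished before continuing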
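The streaming loop relies on self.extract_thinking_and_response(partial_response) to split the accumulated text, but that helper sits outside this diff. The sketch below shows how such a splitter is commonly written, assuming the model wraps its reasoning in <think>...</think> tags; the actual delimiters this model emits are an assumption.

    import re
    from typing import Tuple

    def extract_thinking_and_response(text: str) -> Tuple[str, str]:
        """Split partially streamed output into (thinking, response).

        Assumes reasoning is wrapped in <think>...</think>; the real
        model's delimiters may differ.
        """
        # Closed tag: text inside is thinking, text after is the response.
        match = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
        if match:
            return match.group(1).strip(), text[match.end():].strip()
        # Tag opened but not yet closed: everything so far is still thinking.
        if "<think>" in text:
            return text.split("<think>", 1)[1].strip(), ""
        # No tag at all: treat the whole stream as the clinical response.
        return "", text.strip()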
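The yield "", new_history pairs suggest the method is wired into a Gradio Blocks app as a generator callback: each yield clears the input textbox and redraws the chatbot with the partial answer. A sketch of that wiring follows; the component names and labels are illustrative, not taken from this diff.

    import gradio as gr

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(label="SinaReason Medical Chat")  # illustrative label
        msg = gr.Textbox(placeholder="Describe the clinical case...")

        # Each ("", new_history) the generator yields clears the textbox
        # and updates the chatbot with the partial answer.
        msg.submit(
            medical_chat_model.medical_chat_stream,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot],
        )

    demo.queue().launch()  # queue() enables streaming generator callbacks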