yasserrmd committed (verified)
Commit 393cb4e · 1 Parent(s): c7bf587

Update app.py

Files changed (1):
  1. app.py +37 -49

app.py CHANGED
```diff
@@ -80,54 +80,53 @@ class SinaReasonMedicalChat:
         return thinking, response
 
     @spaces.GPU(duration=120)
     def medical_chat_stream(self, message: str, history: List[List[str]], max_tokens: int = 1024,
                             temperature: float = 0.7, top_p: float = 0.95) -> Iterator[Tuple[str, List[List[str]]]]:
-        """Stream medical reasoning responses with thinking display"""
-
+        """Stream medical reasoning responses with thinking display without threading."""
+
         if not message.strip():
             return
-
-        #self.model = self.model.to(DEVICE)
-
+
+        # Ensure the model is on the correct device (e.g., CUDA)
+        self.model = self.model.to(DEVICE)
+
         # Apply the chat template with the medical system prompt
         messages = [
-            {"role": "system", "content": MEDICAL_SYSTEM_PROMPT},
+            {"role": "system", "content": "MEDICAL_SYSTEM_PROMPT"},  # Replace with your actual prompt
         ]
-
+
         # Add conversation history
         for user_msg, assistant_msg in history:
             messages.append({"role": "user", "content": user_msg})
             messages.append({"role": "assistant", "content": assistant_msg})
-
+
         # Add current message
         messages.append({"role": "user", "content": message})
-
+
         # Apply chat template
         prompt = self.tokenizer.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True,
         )
-
-        # Tokenize input
+
+        # Tokenize input and move to the same device as the model
         inputs = self.tokenizer(
             text=prompt,
-            #images=None,  # Required for this multimodal architecture
             return_tensors="pt"
         ).to(DEVICE)
-
+
         # Setup streamer
         streamer = TextIteratorStreamer(
-            self.tokenizer,
-            timeout=30.0,
-            skip_prompt=True,
+            self.tokenizer,
+            timeout=30.0,
+            skip_prompt=True,
             skip_special_tokens=True
         )
-
+
         # Generation parameters optimized for medical reasoning
         generation_kwargs = {
             **inputs,
-            "images": None,  # Also required here for text-only inference
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
```
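One detail worth noting in this hunk: the old code passed the `MEDICAL_SYSTEM_PROMPT` variable, while the new code passes the literal string `"MEDICAL_SYSTEM_PROMPT"`, so the model receives that placeholder text verbatim until it is swapped out as the inline comment suggests. For readers unfamiliar with the templating step, here is a minimal sketch of what `apply_chat_template` does with such a messages list; the checkpoint name is a placeholder, not the model this Space actually loads:

```python
from transformers import AutoTokenizer

# Placeholder checkpoint; any chat-tuned tokenizer illustrates the pattern.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

messages = [
    {"role": "system", "content": "You are a careful clinical assistant."},
    {"role": "user", "content": "What are common causes of chest pain?"},
]

# add_generation_prompt=True appends the assistant turn header so the
# model starts answering instead of continuing the user message.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)  # flat string with role markers, ready for tokenization
```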
```diff
@@ -136,23 +135,22 @@ class SinaReasonMedicalChat:
             "streamer": streamer,
             "repetition_penalty": 1.1
         }
-
-        # Start generation in a separate thread
-        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
-        thread.start()
-
+
+        # Start generation directly.
+        # This will return immediately and the streamer will be populated in the background.
+        self.model.generate(**generation_kwargs)
+
         # Stream the response
         partial_response = ""
         current_thinking = ""
         current_response = ""
-        thinking_phase = True
-
+
         for new_token in streamer:
             partial_response += new_token
-
+
             # Extract thinking and response
             thinking, response = self.extract_thinking_and_response(partial_response)
-
+
             # Show thinking phase while it's being generated
             if thinking and thinking != current_thinking:
                 current_thinking = thinking
```
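The removed lines show the pattern `TextIteratorStreamer` is built for: `generate` runs on a worker thread while the caller drains the streamer. In stock `transformers`, `generate` is synchronous and only returns once generation is complete, so the direct call added here blocks first and the loop below starts only afterwards; the new comment's claim that it "will return immediately" does not hold for the standard API. A minimal self-contained sketch of the threaded pattern (checkpoint name again a placeholder):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Symptoms of dehydration include", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs on a worker thread
# while the main thread consumes tokens as they arrive.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 64, "streamer": streamer},
)
thread.start()

for token_text in streamer:
    print(token_text, end="", flush=True)

thread.join()
```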
```diff
@@ -160,33 +158,23 @@ class SinaReasonMedicalChat:
                 new_history = history + [[message, display_text]]
                 yield "", new_history
                 time.sleep(0.1)  # Smooth streaming
-
+
             # Show clinical response as it's generated
             if response and response != current_response:
                 current_response = response
-
-                final_display = f"🩺 **Clinical Analysis**\n\n{current_response}"
-
-                if current_thinking:
-                    final_display = f"""🧠 **Medical Reasoning Process**
-
-<details>
-<summary>🔍 Click to view detailed thinking process</summary>
-
-*{current_thinking}*
-
-</details>
-
----
-
-🩺 **Clinical Summary**
-
-{current_response}"""
-
+
+                final_display = f"""🧠 **Medical Reasoning Process**
+<details>
+<summary>🔍 Click to view detailed thinking process</summary>
+*{current_thinking}*
+</details>
+---
+🩺 **Clinical Summary**
+{current_response}"""
+
                 new_history = history + [[message, final_display]]
                 yield "", new_history
-
-        thread.join()
+
 
 # Initialize the medical chat model
 medical_chat_model = SinaReasonMedicalChat()
```
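Both sides of this hunk call `extract_thinking_and_response`, which is defined earlier in app.py and sits outside the diff. A sketch of what such a splitter typically looks like, assuming the model wraps its reasoning in `<think>...</think>` tags; that tag format is an assumption, not something this diff confirms:

```python
import re
from typing import Tuple

THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)

def extract_thinking_and_response(text: str) -> Tuple[str, str]:
    """Split partially generated text into (thinking, response).

    The <think> delimiter is a guess for illustration; the real
    implementation in app.py may use different markers.
    """
    match = THINK_RE.search(text)
    if match:
        # Closed thinking block: everything after </think> is the answer.
        return match.group(1).strip(), text[match.end():].strip()
    if "<think>" in text:
        # Still streaming inside the thinking block.
        return text.split("<think>", 1)[1].strip(), ""
    # No thinking markers: treat everything as response text.
    return "", text.strip()
```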
 
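Finally, for context on how a generator method like `medical_chat_stream` is typically wired into the UI: a minimal Gradio sketch, assuming the tuple-style `gr.Chatbot` history that matches the `List[List[str]]` annotation above. Component names here are illustrative and not taken from the rest of app.py:

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask a medical question...")

    # The generator yields ("", updated_history) on each chunk, which
    # clears the textbox and refreshes the chat as tokens stream in.
    msg.submit(
        medical_chat_model.medical_chat_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
    )

demo.launch()
```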