yasserrmd committed
Commit cee4540 · verified · Parent: 592c30f

Update app.py

Files changed (1):
  1. app.py +44 -69
app.py CHANGED
@@ -1,16 +1,11 @@
 import gradio as gr
-import gradio as gr
 import torch
-from transformers import AutoTokenizer, Mistral3ForConditionalGeneration, TextIteratorStreamer
-from threading import Thread
+from transformers import AutoTokenizer, Mistral3ForConditionalGeneration
 import re
-import time
 import os
-from typing import Iterator, List, Tuple
+from typing import List, Tuple
 import spaces

-
-
 # Model configuration
 MODEL_NAME = "yasserrmd/SinaReason-Magistral-2509"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -20,11 +15,9 @@ MEDICAL_SYSTEM_PROMPT = """
 You are SinaReason, a medical reasoning assistant for educational and clinical support.
 Your goal is to carefully reason through clinical problems for a professional audience (clinicians, students).
 **Never provide medical advice directly to a patient.**
-
 First, draft your detailed thought process (inner monologue) inside <think> ... </think>.
 - Use this section to work through symptoms, differential diagnoses, and investigation plans.
 - Be explicit and thorough in your reasoning.
-
 After closing </think>, provide a clear, self-contained medical summary appropriate for a clinical professional.
 - Summarize the most likely diagnosis and your reasoning.
 - Suggest next steps for investigation or management.
@@ -53,8 +46,6 @@ class SinaReasonMedicalChat:
                 dtype=torch.bfloat16
             )

-
-
             print("SinaReason medical model loaded successfully!")

         except Exception as e:
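This hunk only trims blank lines, and the diff shows just a fragment of the loader it sits in. For orientation, a plausible reconstruction of the surrounding `__init__` block is sketched below; only the `dtype=torch.bfloat16` argument and the success message are confirmed by the diff, everything else is an assumption.

```python
# Hypothetical reconstruction of the loading block; only the dtype argument
# and the print() line appear in the diff above.
import torch
from transformers import AutoTokenizer, Mistral3ForConditionalGeneration

MODEL_NAME = "yasserrmd/SinaReason-Magistral-2509"

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # assumed
    model = Mistral3ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        dtype=torch.bfloat16,  # shown in the diff
    )
    print("SinaReason medical model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")  # error handling assumed
    raise
```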
@@ -63,7 +54,6 @@ class SinaReasonMedicalChat:

     def extract_thinking_and_response(self, text: str) -> Tuple[str, str]:
         """Extract thinking process from <think>...</think> tags and clinical response"""
-        # Look for the specific <think>...</think> pattern used by SinaReason
         think_pattern = r'<think>(.*?)</think>'

         thinking = ""
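The hunk cuts off after the opening of `extract_thinking_and_response`. Given the regex and the `Tuple[str, str]` signature shown, the helper presumably completes along these lines; this is a sketch, not the committed body:

```python
import re
from typing import Tuple

def extract_thinking_and_response(text: str) -> Tuple[str, str]:
    """Split a generation into its <think>...</think> monologue and the final summary."""
    think_pattern = r'<think>(.*?)</think>'

    thinking = ""
    response = text
    match = re.search(think_pattern, text, re.DOTALL)  # DOTALL so '.' spans newlines
    if match:
        thinking = match.group(1).strip()
        # Everything after the closing tag is the clinical summary
        response = text[match.end():].strip()
    return thinking, response

# Example with a hypothetical model output:
out = "<think>Consider ACS vs PE; ECG and troponin first.</think>Most likely ACS..."
thinking, summary = extract_thinking_and_response(out)
```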
@@ -77,89 +67,75 @@ class SinaReasonMedicalChat:
         return thinking, response

     @spaces.GPU(duration=120)
-    def medical_chat_stream(self, message: str, history: List[List[str]], max_tokens: int = 1024,
-                            temperature: float = 0.7, top_p: float = 0.95) -> Iterator[Tuple[str, List[List[str]]]]:
-        """Stream medical reasoning responses with thinking display without threading."""
+    def medical_chat(self, message: str, history: List[List[str]], max_tokens: int = 1024,
+                     temperature: float = 0.7, top_p: float = 0.95) -> Tuple[str, List[List[str]]]:
+        """Generate medical reasoning responses without streaming."""
         self.model.to(DEVICE).eval()
         if not message.strip():
-            return
+            return "", history

         # Apply the chat template with the medical system prompt
         messages = [
-            {"role": "system", "content": "MEDICAL_SYSTEM_PROMPT"},  # Replace with your actual prompt
+            {"role": "system", "content": MEDICAL_SYSTEM_PROMPT},
         ]

         # Add conversation history
         for user_msg, assistant_msg in history:
+            # We need to reconstruct the full assistant message for the model.
+            # For simplicity, we'll just use the user message and the final response part;
+            # this might need adjustment depending on how history is formatted.
+            # For this modification, let's assume the assistant message is just the clinical summary.
+            # A more robust solution might store the full generated text.
+            raw_assistant_msg = assistant_msg.split("🩺 **Clinical Summary**")[-1].strip()
             messages.append({"role": "user", "content": user_msg})
-            messages.append({"role": "assistant", "content": assistant_msg})
+            messages.append({"role": "assistant", "content": raw_assistant_msg})
+

         # Add current message
         messages.append({"role": "user", "content": message})

-        tokenized = self.tokenizer.apply_chat_template(messages, return_dict=True)
+        tokenized = self.tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True)

-        input_ids = torch.tensor(tokenized.input_ids, device="cuda").unsqueeze(0)
-        attention_mask = torch.tensor(tokenized.attention_mask, device="cuda").unsqueeze(0)
+        input_ids = tokenized.input_ids.to(DEVICE)
+        attention_mask = tokenized.attention_mask.to(DEVICE)

-        # Setup streamer
-        streamer = TextIteratorStreamer(
-            self.tokenizer,
-            timeout=30.0,
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
-
-        # Generation parameters optimized for medical reasoning
+        # Generation parameters
        generation_kwargs = {
-            "input_ids" :input_ids,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
             "do_sample": True,
             "pad_token_id": self.tokenizer.eos_token_id,
-            "streamer": streamer,
             "repetition_penalty": 1.1
         }

-        # Start generation directly.
-        # This will return immediately and the streamer will be populated in the background.
-        self.model.generate(**generation_kwargs)
-
-        # Stream the response
-        partial_response = ""
-        current_thinking = ""
-        current_response = ""
-
-        for new_token in streamer:
-            partial_response += new_token
-
-            # Extract thinking and response
-            thinking, response = self.extract_thinking_and_response(partial_response)
-
-            # Show thinking phase while it's being generated
-            if thinking and thinking != current_thinking:
-                current_thinking = thinking
-                display_text = f"🧠 **Medical Reasoning in Progress...**\n\n<details>\n<summary>🔍 Click to see thinking process</summary>\n\n*{current_thinking}*\n\n</details>"
-                new_history = history + [[message, display_text]]
-                yield "", new_history
-                time.sleep(0.1)  # Smooth streaming
+        # Generate the full response
+        generated_ids = self.model.generate(**generation_kwargs)[0]
+
+        # Decode only the newly generated tokens; skip_special_tokens drops the EOS
+        full_response = self.tokenizer.decode(generated_ids[input_ids.shape[-1]:], skip_special_tokens=True)

-            # Show clinical response as it's generated
-            if response and response != current_response:
-                current_response = response
+        # Extract thinking and clinical summary
+        thinking, response = self.extract_thinking_and_response(full_response)

-                final_display = f"""🧠 **Medical Reasoning Process**
-<details>
-<summary>🔍 Click to view detailed thinking process</summary>
-*{current_thinking}*
-</details>
----
-🩺 **Clinical Summary**
-{current_response}"""
+        # Format the final display
+        final_display = ""
+        if thinking:
+            final_display += f"""🧠 **Medical Reasoning Process**
+<details>
+<summary>🔍 Click to view detailed thinking process</summary>
+*{thinking}*
+</details>
+---
+"""
+
+        final_display += f"""🩺 **Clinical Summary**
+{response}"""

-                new_history = history + [[message, final_display]]
-                yield "", new_history
+        new_history = history + [[message, final_display]]
+        return "", new_history


 # Initialize the medical chat model
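The replacement path is easier to follow outside the diff. Below is a minimal end-to-end sketch of what `medical_chat` now does: apply the chat template, generate once, then decode only the completion. The example messages are hypothetical. Two details matter: `return_dict=True` is needed so `apply_chat_template` returns an object exposing `.input_ids` and `.attention_mask`, and slicing the output by the prompt length together with `skip_special_tokens=True` cleanly removes both the echoed prompt and the trailing EOS token.

```python
# Minimal sketch of the non-streaming generation path; model and parameter
# names follow the diff, the prompt content is a hypothetical example.
import torch
from transformers import AutoTokenizer, Mistral3ForConditionalGeneration

MODEL_NAME = "yasserrmd/SinaReason-Magistral-2509"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = Mistral3ForConditionalGeneration.from_pretrained(MODEL_NAME, dtype=torch.bfloat16).to(DEVICE)

messages = [
    {"role": "system", "content": "You are SinaReason..."},  # abbreviated system prompt
    {"role": "user", "content": "55-year-old with crushing chest pain and diaphoresis."},  # hypothetical
]

# return_dict=True makes the result a BatchEncoding with .input_ids / .attention_mask;
# with return_tensors="pt" alone it would be a bare tensor.
tokenized = tokenizer.apply_chat_template(
    messages, return_tensors="pt", return_dict=True, add_generation_prompt=True
)
input_ids = tokenized.input_ids.to(DEVICE)
attention_mask = tokenized.attention_mask.to(DEVICE)

generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=1024,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.1,
)[0]

# generate() returns prompt + completion, so slice off the prompt length.
full_response = tokenizer.decode(generated_ids[input_ids.shape[-1]:], skip_special_tokens=True)
print(full_response)
```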
@@ -167,8 +143,7 @@ medical_chat_model = SinaReasonMedicalChat()

 def respond(message, history, max_tokens, temperature, top_p):
     """Gradio response function for medical reasoning"""
-    for response in medical_chat_model.medical_chat_stream(message, history, max_tokens, temperature, top_p):
-        yield response
+    return medical_chat_model.medical_chat(message, history, max_tokens, temperature, top_p)

 # Custom CSS for medical interface
 css = """
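One of the new comments concedes that splitting the display string on "🩺 **Clinical Summary**" is fragile and that "a more robust solution might store the full generated text." A sketch of that idea follows; the `raw_history` store and the `build_messages` helper are hypothetical, not part of this commit:

```python
# Sketch: keep the raw model text alongside the formatted display string, so
# history can be rebuilt without parsing markdown. All names here are assumed.
from typing import List, Tuple

raw_history: List[Tuple[str, str]] = []  # (user_msg, raw_model_text)

def build_messages(system_prompt: str, raw_history: List[Tuple[str, str]], message: str) -> list:
    """Rebuild the chat from raw model turns instead of the display markdown."""
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, raw_assistant_msg in raw_history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": raw_assistant_msg})
    messages.append({"role": "user", "content": message})
    return messages

# After each generation, record the raw text once:
#     raw_history.append((message, full_response))
# instead of re-deriving it later with
#     assistant_msg.split("🩺 **Clinical Summary**")[-1].strip()
```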
 