yasserrmd committed
Commit 8128d07 · verified · 1 Parent(s): dec6090

Update app.py

Files changed (1):
  1. app.py +17 -27
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-#from transformers import AutoTokenizer, Mistral3ForConditionalGeneration
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import re
 import os
 from typing import List, Tuple
@@ -36,23 +36,23 @@ class SinaReasonMedicalChat:
         # The PixtralProcessor requires an image argument, even if it's None.
         # This is a mandatory part of the call signature.
         self.dummy_image = None
-        #self.load_model()
+        self.load_model()

     def load_model(self):
         """Load the SinaReason medical model and tokenizer using Unsloth"""
         try:
-            from unsloth import FastLanguageModel
             print(f"Loading medical model with Unsloth: {MODEL_NAME}")
             print("cuda" if torch.cuda.is_available() else "cpu")

-            # Use FastLanguageModel from Unsloth to load the model and tokenizer
-            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-                model_name=MODEL_NAME,
-                dtype=torch.bfloat16,
-                load_in_4bit=True,  # Or False if you have enough VRAM for 16-bit
-                device_map="cuda",
+            self.model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                torch_dtype=torch.bfloat16,  # Use bfloat16 for modern GPUs
+                device_map="auto",  # Automatically map to the available GPU
             )

+            # Load the standard tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
             print("SinaReason medical model loaded successfully with Unsloth!")

         except Exception as e:
@@ -76,12 +76,13 @@ class SinaReasonMedicalChat:
     def medical_chat(self, message: str, history: List[List[str]], max_tokens: int = 1024,
                      temperature: float = 0.7, top_p: float = 0.95) -> Tuple[str, List[List[str]]]:
         """Generate medical reasoning responses using the Unsloth model."""
-        # No need for model.to(DEVICE), Unsloth's device_map handles it.
-        self.load_model()
-        self.model.eval()
+
         if not message.strip():
             return "", history

+        self.model.to("cuda")
+        self.model.eval()
+
         # Apply the chat template with the medical system prompt
         messages = [{"role": "system", "content": MEDICAL_SYSTEM_PROMPT}]
         for user_msg, assistant_msg in history:
@@ -90,24 +91,16 @@ class SinaReasonMedicalChat:
             messages.append({"role": "assistant", "content": raw_assistant_msg})
         messages.append({"role": "user", "content": message})

-        # Format the prompt using the chat template
         formatted_prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
+            messages, tokenize=False, add_generation_prompt=True,
         )

-        # Tokenize the input, correctly passing images=None
-        inputs = self.tokenizer(
-            text=formatted_prompt,
-            images=self.dummy_image,
-            return_tensors="pt"
-        ).to(self.model.device)
+        # THE HACK IS GONE: Standard tokenization without any 'images' argument.
+        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.model.device)

-        # Generation parameters
+        # THE HACK IS GONE: Standard generation call.
         generation_kwargs = {
             **inputs,
-            "images": self.dummy_image,  # This MUST be passed to model.generate
             "max_new_tokens": max_tokens,
             "temperature": temperature,
             "top_p": top_p,
@@ -115,10 +108,7 @@ class SinaReasonMedicalChat:
             "pad_token_id": self.tokenizer.eos_token_id,
         }

-        # Generate the full response
         output = self.model.generate(**generation_kwargs)[0]
-
-        # Decode only the newly generated tokens
         full_response = self.tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True)

         # Extract thinking and clinical summary
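For reference, the plain-transformers flow this commit switches to looks roughly like the sketch below. It is a minimal, self-contained illustration rather than the exact app.py code: MODEL_NAME is a placeholder for the checkpoint id the app defines, and the system/user messages are invented for the example.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "org/model-id"  # placeholder; app.py defines its own MODEL_NAME

# Load once at startup; device_map="auto" lets Accelerate place the weights
# on the available GPU, so no manual .to("cuda") is needed afterwards.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.eval()

messages = [
    {"role": "system", "content": "You are a careful clinical reasoning assistant."},  # illustrative prompt
    {"role": "user", "content": "Outline a differential for acute chest pain."},
]

# Render the chat template to a string, then tokenize it -- no images argument.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )[0]

# Slice off the prompt tokens so only the newly generated text is decoded.
response = tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(response)

One caveat worth noting: because from_pretrained is called with device_map="auto", the model is already dispatched to the GPU, so the sketch omits the explicit self.model.to("cuda") that the new medical_chat adds; calling .to() on an Accelerate-dispatched model is unnecessary and may be rejected.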