alijkdkar committed on
Commit c2ddb8a · verified · 1 Parent(s): d706476

Update app.py

Files changed (1): app.py +101 -13
app.py CHANGED
@@ -4,7 +4,91 @@ from huggingface_hub import InferenceClient
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("google/medgemma-27b-text-it")
+import os
+import sys
+
+google_colab = "google.colab" in sys.modules and not os.environ.get("VERTEX_PRODUCT")
+
+if google_colab:
+    # Use secret if running in Google Colab
+    from google.colab import userdata
+    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
+else:
+    # Store Hugging Face data under `/content` if running in Colab Enterprise
+    if os.environ.get("VERTEX_PRODUCT") == "COLAB_ENTERPRISE":
+        os.environ["HF_HOME"] = "/content/hf"
+    # Authenticate with Hugging Face
+    from huggingface_hub import get_token
+    if get_token() is None:
+        from huggingface_hub import notebook_login
+        notebook_login()
+
+
+from transformers import BitsAndBytesConfig
+import torch
+
+model_variant = "27b-text-it"  # @param ["4b-it", "27b-it", "27b-text-it"]
+model_id = f"google/medgemma-{model_variant}"
+
+use_quantization = True  # @param {type: "boolean"}
+
+# @markdown Set `is_thinking` to `True` to turn on thinking mode. **Note:** Thinking is supported for the 27B variants only.
+is_thinking = False  # @param {type: "boolean"}
+
+# If running a 27B variant in Google Colab, check if the runtime satisfies
+# memory requirements
+if "27b" in model_variant and google_colab:
+    if not ("A100" in torch.cuda.get_device_name(0) and use_quantization):
+        raise ValueError(
+            "Runtime has insufficient memory to run a 27B variant. "
+            "Please select an A100 GPU and use 4-bit quantization."
+        )
+
+model_kwargs = dict(
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+if use_quantization:
+    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+
+
+from transformers import pipeline
+
+if "text" in model_variant:
+    pipe = pipeline("text-generation", model=model_id, model_kwargs=model_kwargs)
+else:
+    pipe = pipeline("image-text-to-text", model=model_id, model_kwargs=model_kwargs)
+
+pipe.model.generation_config.do_sample = False
+
+
+if "text" in model_variant:
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+else:
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+    model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
+    processor = AutoProcessor.from_pretrained(model_id)
+
+
+role_instruction = "You are an expert radiologist."
+if "27b" in model_variant and is_thinking:
+    system_instruction = f"SYSTEM INSTRUCTION: think silently if needed. {role_instruction}"
+    max_new_tokens = 1300
+else:
+    system_instruction = role_instruction
+    max_new_tokens = 300
+
+
 
 
 def respond(
@@ -15,7 +99,19 @@ def respond(
     temperature,
     top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]
+    messages = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": system_instruction}]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {"type": "text", "text": message}
+            ]
+        }
+    ]
 
     for val in history:
         if val[0]:
@@ -27,17 +123,9 @@ def respond(
 
     response = ""
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
+    output = pipe(text=messages, max_new_tokens=max_new_tokens)
+
+    response = output[0]["generated_text"][-1]["content"]
+    yield response
 
 
 """
 