import gradio as gr
"""
Gradio chat demo for MedGemma. The model is loaded locally with `transformers`;
`huggingface_hub` is used only for Hugging Face authentication.
"""
import os
import sys
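# Detect the runtime: standard Colab reads the HF_TOKEN secret via `userdata`;
# otherwise, Hugging Face data is stored under /content on Colab Enterprise and an
# interactive notebook_login() is used if no cached token is found.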
google_colab = "google.colab" in sys.modules and not os.environ.get("VERTEX_PRODUCT")
if google_colab:
    # Use secret if running in Google Colab
    from google.colab import userdata
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
else:
    # Store Hugging Face data under `/content` if running in Colab Enterprise
    if os.environ.get("VERTEX_PRODUCT") == "COLAB_ENTERPRISE":
        os.environ["HF_HOME"] = "/content/hf"
    # Authenticate with Hugging Face
    from huggingface_hub import get_token
    if get_token() is None:
        from huggingface_hub import notebook_login
        notebook_login()
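# Model configuration via Colab form fields: choose the MedGemma variant, decide
# whether to quantize to 4-bit with bitsandbytes, and optionally enable thinking.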
from transformers import BitsAndBytesConfig
import torch
model_variant = "4b-it" # @param ["4b-it", "27b-it", "27b-text-it"]
model_id = f"google/medgemma-{model_variant}"
use_quantization = True # @param {type: "boolean"}
# @markdown Set `is_thinking` to `True` to turn on thinking mode. **Note:** Thinking is supported for the 27B variants only.
is_thinking = False # @param {type: "boolean"}
# If running a 27B variant in Google Colab, check if the runtime satisfies
# memory requirements
if "27b" in model_variant and google_colab:
if not ("A100" in torch.cuda.get_device_name(0) and use_quantization):
raise ValueError(
"Runtime has insufficient memory to run a 27B variant. "
"Please select an A100 GPU and use 4-bit quantization."
)
model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
if use_quantization:
    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
from transformers import pipeline
if "text" in model_variant:
pipe = pipeline("text-generation", model=model_id, model_kwargs=model_kwargs)
else:
pipe = pipeline("image-text-to-text", model=model_id, model_kwargs=model_kwargs)
pipe.model.generation_config.do_sample = False
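# Prompting defaults: a radiologist role; when thinking mode is on (27B only),
# the model is told to think silently and gets a larger new-token budget.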
role_instruction = "You are an expert radiologist."
if "27b" in model_variant and is_thinking:
system_instruction = f"SYSTEM INSTRUCTION: think silently if needed. {role_instruction}"
max_new_tokens = 1300
else:
system_instruction = role_instruction
max_new_tokens = 300
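# ChatInterface callback: rebuild the full conversation on every turn and return
# the model's reply to the latest user message.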
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build the conversation in the chat format expected by the pipeline.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        }
    ]
    # Replay previous turns so the model sees the full conversation.
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append(
                {"role": "user", "content": [{"type": "text", "text": user_turn}]}
            )
        if assistant_turn:
            messages.append(
                {"role": "assistant", "content": [{"type": "text", "text": assistant_turn}]}
            )
    messages.append({"role": "user", "content": [{"type": "text", "text": message}]})
    # `temperature` and `top_p` are exposed in the UI but have no effect here:
    # greedy decoding is enforced via `do_sample = False` above.
    output = pipe(text=messages, max_new_tokens=max_tokens)
    return output[0]["generated_text"][-1]["content"]
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=system_instruction, label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=max_new_tokens, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
demo.launch()