Update app.py
app.py
CHANGED
@@ -4,7 +4,91 @@ from huggingface_hub import InferenceClient
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+import os
+import sys
+
+google_colab = "google.colab" in sys.modules and not os.environ.get("VERTEX_PRODUCT")
+
+if google_colab:
+    # Use secret if running in Google Colab
+    from google.colab import userdata
+    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
+else:
+    # Store Hugging Face data under `/content` if running in Colab Enterprise
+    if os.environ.get("VERTEX_PRODUCT") == "COLAB_ENTERPRISE":
+        os.environ["HF_HOME"] = "/content/hf"
+    # Authenticate with Hugging Face
+    from huggingface_hub import get_token
+    if get_token() is None:
+        from huggingface_hub import notebook_login
+        notebook_login()
+
+
+
+
+from transformers import BitsAndBytesConfig
+import torch
+
+model_variant = "27b-text-it"  # @param ["4b-it", "27b-it", "27b-text-it"]
+model_id = f"google/medgemma-{model_variant}"
+
+use_quantization = True  # @param {type: "boolean"}
+
+# @markdown Set `is_thinking` to `True` to turn on thinking mode. **Note:** Thinking is supported for the 27B variants only.
+is_thinking = False  # @param {type: "boolean"}
+
+# If running a 27B variant in Google Colab, check if the runtime satisfies
+# memory requirements
+if "27b" in model_variant and google_colab:
+    if not ("A100" in torch.cuda.get_device_name(0) and use_quantization):
+        raise ValueError(
+            "Runtime has insufficient memory to run a 27B variant. "
+            "Please select an A100 GPU and use 4-bit quantization."
+        )
+
+model_kwargs = dict(
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+if use_quantization:
+    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+
+
+
+from transformers import pipeline
+
+if "text" in model_variant:
+    pipe = pipeline("text-generation", model=model_id, model_kwargs=model_kwargs)
+else:
+    pipe = pipeline("image-text-to-text", model=model_id, model_kwargs=model_kwargs)
+
+pipe.model.generation_config.do_sample = False
+
+
+if "text" in model_variant:
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+else:
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+    model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
+    processor = AutoProcessor.from_pretrained(model_id)
+
+
+role_instruction = "You are an expert radiologist."
+if "27b" in model_variant and is_thinking:
+    system_instruction = f"SYSTEM INSTRUCTION: think silently if needed. {role_instruction}"
+    max_new_tokens = 1300
+else:
+    system_instruction = role_instruction
+    max_new_tokens = 300
+
+
+
+
+
+
 
 
 def respond(
@@ -15,7 +99,19 @@ def respond(
     temperature,
     top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]
+    messages = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": system_instruction}]
+        },
+        {
+            "role": "user",
+            "content": [
+                # Forward the user's current chat message to the model
+                {"type": "text", "text": message}
+            ]
+        }
+    ]
 
     for val in history:
         if val[0]:
@@ -27,17 +123,9 @@ def respond(
 
     response = ""
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
+    output = pipe(text=messages, max_new_tokens=max_new_tokens)
+    response = output[0]["generated_text"][-1]["content"]
+    yield response
 
 
 """
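
A note on the new generation call: when a transformers text-generation pipeline receives a list of chat messages instead of a plain string, it applies the model's chat template and returns the whole conversation with the assistant's reply appended as the final message, which is what `output[0]["generated_text"][-1]["content"]` unpacks. A minimal sketch of that round trip, assuming the text-only variant from the diff (the prompt text is illustrative):

# Chat-format call into the text-generation pipeline built in the diff above.
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are an expert radiologist."}]},
    {"role": "user", "content": [{"type": "text", "text": "Describe the key findings to look for on a chest X-ray."}]},
]

output = pipe(messages, max_new_tokens=300)

# The pipeline echoes the conversation and appends the assistant turn last.
reply = output[0]["generated_text"][-1]["content"]
print(reply)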
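
One behavioral change worth flagging: the old `client.chat_completion(..., stream=True)` loop yielded the reply token by token, while the new body yields once with the full text, so the chat UI no longer streams incrementally. If streaming is wanted back, transformers' `TextIteratorStreamer` can provide it with the same local model. The sketch below is not the committed code; it assumes the text-only variant (so `pipe.tokenizer` exists) and that MedGemma's chat template accepts the typed-content messages used above:

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(messages, max_new_tokens):
    # Render the chat through the model's own template.
    input_ids = pipe.tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(pipe.model.device)
    streamer = TextIteratorStreamer(
        pipe.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so run it on a worker thread and drain the streamer here.
    Thread(
        target=pipe.model.generate,
        kwargs=dict(input_ids=input_ids, max_new_tokens=max_new_tokens, streamer=streamer),
    ).start()
    response = ""
    for chunk in streamer:
        response += chunk
        yield response  # same incremental contract as the old chat_completion loop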
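
For context, this app.py began as the stock Gradio ChatInterface Space template, and the wiring below the edited region is unchanged and therefore not shown in the diff. A reference wiring consistent with respond()'s signature, assuming the stock template tail:

import gradio as gr

# Stock ChatInterface wiring (slider defaults as in the original template).
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Note that after this commit respond() no longer reads system_message, max_tokens, temperature, or top_p; generation is governed by the module-level system_instruction and max_new_tokens instead, so those UI controls become inert unless a follow-up change threads them through to pipe().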