"""Serve a Hugging Face causal language model on CPU behind a Gradio UI.

The model id is read from the ``MODEL_NAME`` environment variable and is
loaded lazily on the first non-empty generation request.  The Gradio
interface is mounted at the root path of a FastAPI application exposed as
``app``.
"""

import os
import threading

import gradio as gr
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face model id; override with the MODEL_NAME environment variable.
MODEL = os.environ.get("MODEL_NAME", "google/gemma-4-E2B")

# Lazily initialised by load_model(); both stay None until a load succeeds.
tokenizer = None
generator = None

# Serialises load_model() so concurrent first requests load the model once.
_model_lock = threading.Lock()


def load_model() -> None:
    """Load the tokenizer and text-generation pipeline for ``MODEL``.

    Does nothing when both module globals are already populated.  The
    globals are only assigned after every component has loaded, so a failed
    load leaves them as ``None`` and a later call retries from scratch.

    Raises:
        Exception: whatever ``transformers`` raises when the tokenizer,
            model, or pipeline cannot be created.
    """
    global tokenizer, generator
    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if tokenizer is not None and generator is not None:
            return

        # SECURITY NOTE(review): trust_remote_code=True lets the model
        # repository execute its own Python code, and MODEL comes from the
        # environment.  Only point MODEL_NAME at repositories you trust.
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            MODEL, trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL,
            device_map={"": "cpu"},
            torch_dtype="float32",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        # The model is already placed on the CPU through device_map, so no
        # `device` argument is passed here: the transformers docs advise
        # against combining `device` with `device_map`, and some releases
        # reject the combination outright.
        loaded_generator = pipeline(
            "text-generation", model=model, tokenizer=loaded_tokenizer
        )

        # Publish only once everything is ready.
        tokenizer = loaded_tokenizer
        generator = loaded_generator


def generate(prompt: str) -> str:
    """Generate a completion for ``prompt`` with sampling disabled.

    Args:
        prompt: Text entered by the user.

    Returns:
        The ``generated_text`` field of the first pipeline result, or an
        empty string when ``prompt`` is ``None`` or only whitespace (the
        model is not loaded or invoked in that case).
    """
    # Nothing to complete: skip the expensive model load and inference.
    if prompt is None or not prompt.strip():
        return ""

    if generator is None:
        load_model()

    out = generator(prompt, max_new_tokens=64, do_sample=False)
    return out[0]["generated_text"]


demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=4, label="Prompt"),
    outputs="text",
    title="Gemma-4-E2B (CPU)",
)

# ASGI entry point: the Gradio UI is served from the FastAPI app's root path.
app = FastAPI()
app = gr.mount_gradio_app(app, demo, path="/")