import os
from threading import Thread

import torch
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextIteratorStreamer

text_generator = None
is_hugging_face = True  # True when running on a Hugging Face ZeroGPU Space

# model_id = "google/gemma-2-9b-it"  # too big for this Space
model_id = "google/gemma-2-2b-it"

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# device = "auto"  # alternative: torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda"
# dtype = torch.bfloat16
dtype = torch.float16

if not huggingface_token:
    print("HUGGINGFACE_TOKEN is not set; add it as a secret if the model requires authentication")
    # raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

print(model_id, device, dtype)
histories = []
model = None

if not is_hugging_face:
    # Running outside ZeroGPU: load the model once at startup.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer,
        torch_dtype=dtype, device_map=device,
    )  # pipeline() handles device placement itself; no .to(device) is needed

    if next(model.parameters()).is_cuda:
        print("The model is on a GPU")
    else:
        print("The model is on a CPU")

    if text_generator.device.type == "cuda":
        print("The pipeline is using a GPU")
    else:
        print("The pipeline is using a CPU")

print("initialized")


@spaces.GPU(duration=60)
def generate_text(messages):
    global model
    if is_hugging_face:  # ZeroGPU requires the model to be (re)loaded inside the GPU-decorated call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        # device_map already places the model on the GPU; a separate model.to(device) is unnecessary

    # Render the chat history into a single prompt and tokenize it.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # TextIteratorStreamer yields decoded text chunks as they are produced,
    # so generate() must run in a background thread while we consume the stream here.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        pad_token_id=tokenizer.eos_token_id,
        max_length=2048,
        do_sample=False,  # greedy decoding; temperature/top_p only take effect when sampling
        repetition_penalty=1.25,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_output = ""
    for new_text in streamer:
        generated_output += new_text
        yield generated_output
    thread.join()


def call_generate_text(message, history):
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        for text in generate_text(messages):
            yield text
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
        yield ""


demo = gr.ChatInterface(call_generate_text, type="messages")

if __name__ == "__main__":
    demo.launch(share=True)
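
# --- Optional local smoke test (a sketch, not part of the Space itself). ---
# This drives call_generate_text() directly with an empty history, bypassing
# the Gradio UI; handy for verifying that streaming yields accumulated text
# before deploying. The prompt string is just an illustrative placeholder.
# Uncomment to use:
#
# def smoke_test():
#     last = ""
#     for partial in call_generate_text("Hello, who are you?", []):
#         last = partial  # each yield is the full output generated so far
#     print(last)
#
# smoke_test()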