import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_name = "Guavacoderepo/gclm-3b-pidgin"

# Get token from Hugging Face Secrets
hf_token = os.environ.get("HF_TOKEN")

# FP4 quantization config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",  # explicit; fp4 is also the default
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # allows CPU offload for large model
)

# `use_auth_token` is deprecated; pass the token via `token` instead
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Load model with BitsAndBytesConfig ONLY
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    token=hf_token,
)

# Create pipeline once (outside the request function!).
# The model was already dispatched by device_map="auto" above, so the
# pipeline must not be given a device/device_map again.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


def chat_fn(user_input):
    # Generate response
    output = pipe(
        user_input,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    return output[0]["generated_text"]


iface = gr.Interface(fn=chat_fn, inputs="text", outputs="text", title="GCLM-3B-Pidgin Chat")
iface.launch()
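
# --- Optional: querying the running app programmatically ---
# A minimal sketch, assuming the app is running locally on Gradio's default
# port (7860); "/predict" is the default endpoint name that gr.Interface
# exposes. The example prompt is illustrative only.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860")
# result = client.predict("How you dey?", api_name="/predict")
# print(result)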