import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_name = "Guavacoderepo/gclm-3b-pidgin"

# Get the access token from Hugging Face Spaces secrets (exposed as an environment variable)
hf_token = os.environ.get("HF_TOKEN")

# 4-bit (FP4) quantization config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",  # explicit; fp4 is the bitsandbytes default
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants
    llm_int8_enable_fp32_cpu_offload=True,  # allow offloading overflow layers to CPU in fp32
)

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Load the model once at startup with the 4-bit quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # let accelerate place layers across GPU/CPU
    token=hf_token,
)
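
# Note: bitsandbytes 4-bit loading generally requires a CUDA GPU. A minimal
# fallback sketch (an assumption, not part of the original Space) for
# CPU-only hardware would load the model unquantized instead:
#
#   if not torch.cuda.is_available():
#       model = AutoModelForCausalLM.from_pretrained(
#           model_name, torch_dtype=torch.float32, token=hf_token
#       )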

# Build the pipeline once at import time, not inside the request handler.
# The model already carries its device placement from from_pretrained,
# so device_map is not passed again here.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def chat_fn(user_input):
    # Generate a reply; return_full_text=False drops the echoed prompt
    output = pipe(user_input, max_new_tokens=150, do_sample=True,
                  temperature=0.7, top_p=0.9, return_full_text=False)
    return output[0]["generated_text"]

iface = gr.Interface(fn=chat_fn, inputs="text", outputs="text", title="GCLM-3B-Pidgin Chat")
iface.launch()