import torch
import gradio as gr
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

# Load MamaBot-Llama with 4-bit quantization so it fits on a single consumer GPU.
model_id = "HelpMumHQ/MamaBot-Llama"
print("Loading MamaBot with 4-bit quantization...")

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Llama tokenizers ship without a pad token; reuse EOS so padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization config: weights are stored in 4 bits, while matmuls
# are computed in float16 for accuracy.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",                        # place layers across available GPU(s)/CPU
    quantization_config=quantization_config,
    torch_dtype=torch.float16,                # dtype for the non-quantized modules
    low_cpu_mem_usage=True,                   # stream weights instead of a full copy in RAM
)

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
print("✅ Model loaded!")


def infer(prompt):
    """Generate a response for a single maternal-health prompt."""
    output = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,
        return_full_text=False,  # return only the completion, not the echoed prompt
    )
    return output[0]["generated_text"]


# Gradio interface (for interactive testing; launching it also exposes an HTTP API).
gr.Interface(
    fn=infer,
    inputs="text",
    outputs="text",
    title="MamaBot-Llama Inference API",
    description="Enter a maternal health prompt (e.g., 'Explain NT 2.8mm at 12 weeks').",
).launch()
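# --- Optional client-side usage (a sketch; not part of the app itself) ---
# Once launch() is running, the same infer() function can be called over HTTP.
# Kept as comments because launch() blocks, so live code below it would not run.
# Assumes the default local URL http://127.0.0.1:7860 and the gradio_client
# package (pip install gradio_client); a gr.Interface exposes its function
# under the default api_name "/predict".
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860/")
# result = client.predict("Explain NT 2.8mm at 12 weeks", api_name="/predict")
# print(result)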