# randd/app.py
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
# Load model with proper 4-bit quantization
model_id = "HelpMumHQ/MamaBot-Llama"
print("Loading MamaBot with 4-bit quantization...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Proper 4-bit configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # run compute in fp16 instead of fp32
)
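# Optional tweak (an assumption, not part of the original app): NF4 quantization
# with double quantization usually gives better 4-bit quality for Llama-family
# models at essentially the same memory footprint:
#
#   quantization_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_use_double_quant=True,
#       bnb_4bit_compute_dtype=torch.float16,
#   )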
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # let accelerate place layers on available GPU(s)/CPU
    quantization_config=quantization_config,
    torch_dtype=torch.float16,  # dtype for the non-quantized modules
    low_cpu_mem_usage=True,
)
# Llama-style tokenizers often define no pad token; fall back to EOS
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Build the pipeline around the already-loaded model and tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
print("✅ Model loaded!")
def infer(prompt):
    # Generate a response; return_full_text=False strips the echoed prompt
    # so callers get only the model's answer
    output = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,
        return_full_text=False,
    )[0]["generated_text"]
    return output
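
# Quick local smoke test (left commented out so the Space boots straight into
# Gradio; the prompt below is just an illustrative example):
# print(infer("What foods are rich in folic acid during pregnancy?"))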
# Gradio interface (for manual testing; also exposes the function as an API)
gr.Interface(
    fn=infer,
    inputs="text",
    outputs="text",
    title="MamaBot-Llama Inference API",
    description="Enter a maternal health prompt (e.g., 'Explain NT 2.8mm at 12 weeks').",
).launch()
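
# Calling the exposed API from another machine: a minimal sketch using the
# official gradio_client package ("Hameedshk/randd" is an assumed Space id,
# substitute your own; "/predict" is the default endpoint for gr.Interface):
#
#   from gradio_client import Client
#   client = Client("Hameedshk/randd")  # hypothetical Space id
#   result = client.predict("Explain NT 2.8mm at 12 weeks", api_name="/predict")
#   print(result)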