| | import gradio as gr |
| | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig |
| | import torch |
| |
|
| | |
# Hugging Face model ID for the maternal-health chat model.
model_id = "HelpMumHQ/MamaBot-Llama"

print("Loading MamaBot with 4-bit quantization...")

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit weight quantization; computation runs in fp16 so the model fits in
# limited VRAM without falling back to slow fp32 kernels.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE: torch_dtype is intentionally NOT passed here — the compute dtype is
# already fixed by bnb_4bit_compute_dtype above, so passing both was redundant
# (and torch_dtype is deprecated in recent transformers releases in favor of
# the quantization config / `dtype`).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # shard layers across available devices
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,     # stream weights instead of a full CPU copy
)

# Llama tokenizers ship without a pad token; reuse EOS so padded generation
# (and pad_token_id below in infer) works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Shared text-generation pipeline used by infer() below.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

print("✅ Model loaded!")
| |
|
def infer(prompt):
    """Generate a maternal-health answer for *prompt*.

    Parameters
    ----------
    prompt : str
        Free-text question from the Gradio textbox.

    Returns
    -------
    str
        Only the newly generated continuation. The original code used the
        pipeline default ``return_full_text=True``, which echoed the user's
        prompt back as a prefix of every answer; ``return_full_text=False``
        fixes that.
    """
    # Guard against empty/whitespace-only submissions from the UI.
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    output = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,            # sampled decoding for varied answers
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,
        return_full_text=False,    # fix: do not echo the prompt back
    )[0]["generated_text"]
    return output.strip()
| |
|
| | |
# Build the web UI and start serving it as soon as the script runs.
demo = gr.Interface(
    fn=infer,
    inputs="text",
    outputs="text",
    title="MamaBot-Llama Inference API",
    description="Enter a maternal health prompt (e.g., 'Explain NT 2.8mm at 12 weeks').",
)
demo.launch()