# Abhijeet
# Add initial implementation of MahaMarathi-7B CPU Inference API with Gradio interface
# b432075
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hub repository that provides both the tokenizer and the model weights.
model_id = "marathi-llm/MahaMarathi-7B-v24.01-Base"

# The tokenizer is fetched from the same repository as the weights.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loading options: pin the model to the CPU, store weights as bfloat16, and
# ask the loader to keep CPU memory usage low while loading.
_cpu_load_options = {
    "low_cpu_mem_usage": True,
    "torch_dtype": torch.bfloat16,
    "device_map": "cpu",
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_cpu_load_options)
def generate_text(prompt, max_new_tokens):
    """Generate text from ``prompt`` using the module-level tokenizer and model.

    Args:
        prompt: Input text handed to the tokenizer.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Coerced to ``int`` because slider widgets and JSON API clients
            may deliver the value as a float (for example ``20.0``).

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.

    Raises:
        TypeError, ValueError: If ``max_new_tokens`` cannot be converted
            to an integer.
    """
    # Normalise the token budget before it reaches ``generate``.
    token_budget = int(max_new_tokens)

    inputs = tokenizer(prompt, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=token_budget,
            # The EOS id doubles as the padding id for generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Input widgets, in the same order as the parameters of generate_text.
_prompt_box = gr.Textbox(lines=5, label="Input Prompt")
_token_slider = gr.Slider(
    minimum=1,
    maximum=100,
    value=20,
    step=1,
    label="Max New Tokens",
)

# Single text output for the decoded generation.
_result_box = gr.Textbox(label="Generated Text")

# Wrapping the function in an Interface gives it a web UI and an API.
iface = gr.Interface(
    fn=generate_text,
    inputs=[_prompt_box, _token_slider],
    outputs=_result_box,
    title="MahaMarathi-7B CPU Inference API",
)

# Start the Gradio server.
iface.launch()