# Scraped page header (not part of the program): Spaces status "Sleeping";
# file size: 1,170 bytes; revision b432075.
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint identifier shared by the tokenizer and the model below.
model_id = "marathi-llm/MahaMarathi-7B-v24.01-Base"

# The tokenizer is fetched from the same checkpoint as the weights.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Options for loading the weights: map everything to the CPU, store the
# parameters as bfloat16, and enable the loader's low-CPU-memory mode.
_cpu_load_options = dict(
    device_map="cpu",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **_cpu_load_options)
def generate_text(prompt, max_new_tokens):
    """Run the module-level model on *prompt* and return the decoded text.

    Args:
        prompt: Text handed to the module-level ``tokenizer``.
        max_new_tokens: Forwarded unchanged to ``model.generate`` as the
            ``max_new_tokens`` argument.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
    """
    # NOTE(review): for a causal LM the returned sequence presumably begins
    # with the prompt tokens, so the result would include the prompt - confirm.
    encoded_prompt = tokenizer(prompt, return_tensors="pt")

    # The EOS id doubles as the padding id for generation.
    eos_id = tokenizer.eos_token_id

    # Gradients are not needed for inference.
    with torch.no_grad():
        sequences = model.generate(
            **encoded_prompt,
            max_new_tokens=max_new_tokens,
            pad_token_id=eos_id,
        )

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
# Gradio automatically builds an API around this function.
# The two input components are listed in the same order as the parameters of
# generate_text(prompt, max_new_tokens); the single output textbox receives
# the string it returns.
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Input Prompt"),
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Max New Tokens"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="MahaMarathi-7B CPU Inference API",
)

# Launch the app. The call stays at module level, as in the original script;
# the stray trailing "|" that followed it was a syntax error and is removed.
iface.launch()