"""Gradio app exposing CPU text generation with the MahaMarathi-7B base model."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "marathi-llm/MahaMarathi-7B-v24.01-Base"
# Original lowercase name kept as an alias for anything that still reads it.
model_id = MODEL_ID

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the model on CPU only; bfloat16 weights and low_cpu_mem_usage are
# requested to keep the memory footprint down while loading.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)


def generate_text(prompt: str, max_new_tokens: int) -> str:
    """Generate a continuation for *prompt* with the loaded model.

    Args:
        prompt: Text to feed to the model. ``None`` or a blank string
            short-circuits to an empty result without running the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Coerced with ``int()`` because UI sliders and JSON API clients
            may deliver the value as a float (e.g. ``20.0``).

    Returns:
        The decoded first output sequence with special tokens stripped.
        NOTE(review): for causal LMs this normally includes the prompt text
        followed by the continuation -- confirm if only new text is wanted.
    """
    # Nothing to condition on: skip the (slow) CPU generation entirely.
    if prompt is None or not prompt.strip():
        return ""

    token_budget = int(max_new_tokens)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Inference only: disable autograd bookkeeping.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=token_budget,
            # Reuse EOS as the padding token id for generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Gradio automatically builds an API around this function.
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Input Prompt"),
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Max New Tokens"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="MahaMarathi-7B CPU Inference API",
)

# Launched at module level, as in the original script.
iface.launch()