"""Gradio demo that serves code completions from Replit Code v1.5 3B.

The model is loaded once at import time with 4-bit quantization and is
exposed through a single-textbox Gradio interface.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 1. Setup for low memory (Free tier friendly)
model_id = "replit/replit-code-v1_5-3b"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Generation settings used for every request.
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.2

# 2. Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map="auto",
)


def generate(prompt: str) -> str:
    """Generate a continuation for *prompt* with the loaded model.

    Args:
        prompt: Source code or free-form text to be continued.

    Returns:
        The decoded first output sequence with special tokens removed.
        An empty string is returned when *prompt* is empty or whitespace,
        because there are no tokens for the model to continue from.
    """
    if not prompt or not prompt.strip():
        return ""

    # ``device_map="auto"`` decides where the weights live, so follow the
    # model's device instead of assuming a fixed "cuda" target.
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMPERATURE,
    )
    # Drop special tokens from the displayed text and keep the tokenizer
    # from "cleaning up" spaces, since whitespace is significant in code.
    return tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


# 3. Create the UI
demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=5, label="Input Code/Prompt"),
    outputs=gr.Code(label="Generated Code"),
    title="Replit Code 3B Demo",
)

demo.launch()