"""Gradio chat Space serving the NanoBit-300M causal LM.

Hugging Face Spaces requires the app to bind to 0.0.0.0:7860 so the
proxy can reach it; the low-RAM loader keeps the ~1.4 GB checkpoint
within a free Space's memory budget.
"""
import sys

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# BUG FIX: from_pretrained() takes a Hub repo id ("owner/name"), not a
# full URL — the URL form fails repo resolution.
MODEL_ID = "imsuprtwo2/NanoBit-300M"

print("Starting MASA Boot Sequence...")
sys.stdout.flush()  # Spaces buffers stdout; flush so boot logs appear promptly.

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# low_cpu_mem_usage is mandatory to load a ~1.4 GB checkpoint on a free
# (CPU-only, small-RAM) Space without spiking peak memory.
# SECURITY NOTE(review): trust_remote_code=True executes Python shipped
# inside the model repo — only keep it if this repo actually needs it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)


def chat(message, history):
    """Generate a single-turn reply to *message*.

    Args:
        message: The user's prompt text.
        history: Prior turns supplied by gr.ChatInterface — currently
            ignored (the model is prompted with *message* alone).

    Returns:
        The decoded continuation produced by the model, stripped of
        surrounding whitespace.
    """
    inputs = tokenizer(message, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)
    # BUG FIX: slice off the prompt *tokens* instead of str.replace(),
    # which deleted every occurrence of the prompt text inside the reply
    # and broke whenever detokenization didn't round-trip exactly.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


if __name__ == "__main__":
    # Spaces runs this file as a script; 0.0.0.0:7860 is the address the
    # Hugging Face proxy expects.
    demo = gr.ChatInterface(chat)
    demo.launch(server_name="0.0.0.0", server_port=7860)