import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel

# Base model and the fine-tuned LoRA adapter repository on the Hugging Face Hub
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T"
model_path = "rahul-shrivastav/medical-qna"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16,
    device_map="auto",
)

# Attach the adapter, then merge it into the base weights so inference
# runs on a single plain transformers model with no PEFT overhead.
# (The original `from_transformers=True` is not a valid PeftModel argument
# and has been removed.)
peft_model = PeftModel.from_pretrained(model, model_path, device_map="auto")
model = peft_model.merge_and_unload()


def formatted_prompt(question: str) -> str:
    # TinyLlama chat template: one user turn followed by an open assistant turn
    return f"<|user|>\n{question}\n<|assistant|>"


def generate_response(prompt):
    prompt = formatted_prompt(prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_config = GenerationConfig(
        do_sample=True,
        top_k=50,
        temperature=0.7,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id,
    )
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return {"response": text}


# API mode only: no custom UI, just the auto-generated endpoint
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(),
    outputs="json",
    flagging_mode="never",
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # Needed for Spaces to accept incoming requests
        server_port=7860,
    )
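
# A minimal client-side sketch for exercising the endpoint above, assuming
# the app is deployed as a Hugging Face Space and gradio_client is installed.
# The Space id used here is hypothetical; replace it with your actual
# deployment before running.
#
#   from gradio_client import Client
#
#   client = Client("rahul-shrivastav/medical-qna")  # hypothetical Space id
#   result = client.predict(
#       "What are the early symptoms of anemia?",
#       api_name="/predict",  # default endpoint name for a single gr.Interface
#   )
#   print(result)  # expected shape: {"response": "..."}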