"""Gradio chat front-end for a medical Q&A adapter served via the
Hugging Face serverless Inference API."""

import os

import gradio as gr
from huggingface_hub import InferenceClient

# 1. Set up the API client.
# The token comes from the HF_TOKEN secret; warn early if it is missing so
# failures aren't a mystery at request time (getenv silently returns None).
_HF_TOKEN = os.getenv("HF_TOKEN")
if not _HF_TOKEN:
    print("WARNING: HF_TOKEN is not set - API calls will fail with an auth error.")
client = InferenceClient(token=_HF_TOKEN)

# 2. Model ID (a fine-tuned adapter). The Inference API detects that this is
# an adapter and loads the corresponding base model automatically.
MODEL_ID = "shri171981/medical_chat_generative"


def ask_api(message, history):
    """Answer one chat turn by querying the Inference API.

    Args:
        message: The user's latest message (str).
        history: Prior chat turns supplied by gr.ChatInterface; unused here
            because each request is answered statelessly.

    Returns:
        The model's generated reply as a string, or a human-readable error
        message (never raises - gr.ChatInterface expects a string back).
    """
    # 3. Build the prompt. NOTE: this is the Alpaca instruction format
    # (### Instruction / ### Input / ### Response), matching how the adapter
    # was fine-tuned - it is NOT the Llama-3 chat template.
    system_prompt = "You are a helpful and empathetic medical doctor. Answer the patient's question based on the input provided."

    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{system_prompt}

### Input:
{message}

### Response:
"""

    try:
        # 4. Send the prompt to the serverless Inference API.
        response = client.text_generation(
            prompt,
            model=MODEL_ID,
            max_new_tokens=128,
            temperature=0.7,
            return_full_text=False,  # only the newly generated continuation
        )
        return response
    except Exception as e:
        # 5. Handle cold starts: a sleeping model returns HTTP 503 with a
        # message like "Model <id> is currently loading". Match loosely
        # (case-insensitive "loading", or the 503 status code) because the
        # exact wording varies between API versions.
        err = str(e)
        if "loading" in err.lower() or "503" in err:
            return "⚠️ The model is waking up (Cold Start). Please wait 30 seconds and try again!"
        return f"Error: {err}"


# 6. Build and launch the chat UI.
demo = gr.ChatInterface(
    fn=ask_api,
    title="🚑 HACK_DOC (API Powered)",
    description="Running on Hugging Face Serverless GPU via API.",
    examples=["I have a sharp pain in my chest.", "What is good for a fever?"],
)

if __name__ == "__main__":
    demo.launch()