import os

# Limit thread parallelism so the app fits on 2 CPU cores
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from fastapi import FastAPI, HTTPException
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr

# Load the Phi-1.5 Instruct model (1.3B) from Hugging Face
model_id = "rasyosef/Phi-1_5-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

app = FastAPI()


@app.get("/chat")
def chat(query: str):
    """
    REST API endpoint.

    Usage: GET /chat?query=<your question>
    Returns JSON of the form {"response": "..."}.
    """
    if not query:
        raise HTTPException(status_code=400, detail="Query parameter 'query' is required.")
    # Build the chat-format prompt expected by the model
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": query}
    ]
    result = pipe(
        messages,
        max_new_tokens=100,
        do_sample=False,
        return_full_text=False
    )
    answer = result[0]["generated_text"].strip()
    return {"response": answer}


# Define the Gradio UI (optional)
def gradio_chat(input_text):
    if not input_text:
        return ""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input_text}
    ]
    result = pipe(messages, max_new_tokens=100, do_sample=False, return_full_text=False)
    return result[0]["generated_text"].strip()


iface = gr.Interface(
    fn=gradio_chat,
    inputs=gr.Textbox(lines=2, placeholder="Type a message..."),
    outputs="text",
    title="Phi-1.5 Chatbot",
    description="Enter a message and press **Submit** to get a response."
)

# Mount the Gradio app at the root path; the /chat route defined above still takes precedence
app = gr.mount_gradio_app(app, iface, path="/")
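
# How to run this script (a minimal sketch; the filename "app.py" and port 7860
# are assumptions, adjust them to your setup):
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# The Gradio UI is then served at http://localhost:7860/ and the REST endpoint
# can be exercised with, for example:
#
#   curl "http://localhost:7860/chat?query=Hello"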