import os

# Must be set before transformers/torch are imported: cap the BLAS/OpenMP
# worker threads and silence the HuggingFace tokenizers fork-safety warning.
os.environ.update({
    "OMP_NUM_THREADS": "2",
    "MKL_NUM_THREADS": "2",
    "TOKENIZERS_PARALLELISM": "false",
})
| |
|
| | from fastapi import FastAPI, HTTPException |
| | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline |
| | import gradio as gr |
| |
|
| | |
# Download and instantiate the chat model once at module import; the single
# `pipe` object is shared by the REST endpoint and the Gradio callback below.
model_id = "rasyosef/Phi-1_5-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
| |
|
# ASGI application object; the Gradio UI is mounted onto it at the bottom of the file.
app = FastAPI()
| |
|
| | @app.get("/chat") |
| | def chat(query: str): |
| | """ |
| | REST API endpoint. Use: GET /chat?query=Your question |
| | Returns a JSON {"response": "..."}. |
| | """ |
| | if not query: |
| | raise HTTPException(status_code=400, detail="Query parameter 'query' is required.") |
| | |
| | messages = [ |
| | {"role": "system", "content": "You are a helpful assistant."}, |
| | {"role": "user", "content": query} |
| | ] |
| | result = pipe( |
| | messages, |
| | max_new_tokens=100, |
| | do_sample=False, |
| | return_full_text=False |
| | ) |
| | answer = result[0]["generated_text"].strip() |
| | return {"response": answer} |
| |
|
| | |
def gradio_chat(input_text):
    """Gradio callback: generate one assistant reply for *input_text*.

    Returns an empty string for empty/falsy input instead of invoking the model.
    """
    if not input_text:
        return ""

    chat_history = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input_text},
    ]
    # Same deterministic generation settings as the REST endpoint.
    outputs = pipe(
        chat_history,
        max_new_tokens=100,
        do_sample=False,
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()
| |
|
# Browser-facing Gradio UI wrapping the same generation callback.
iface = gr.Interface(
    fn=gradio_chat,
    title="Phi-1.5 Chatbot",
    description="Enter a message and press **Submit** to get a response.",
    inputs=gr.Textbox(lines=2, placeholder="Type a message..."),
    outputs="text",
)

# Serve the Gradio UI at the root path of the FastAPI app;
# mount_gradio_app returns the combined ASGI application.
app = gr.mount_gradio_app(app, iface, path="/")