import os

# Limit thread parallelism so the app fits on 2 CPU cores
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from fastapi import FastAPI, HTTPException
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr

# Load the Phi-1.5 Instruct model (1.3B) from Hugging Face
model_id = "rasyosef/Phi-1_5-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

app = FastAPI()


@app.get("/chat")
def chat(query: str):
    """
    REST API endpoint.

    Usage: GET /chat?query=<your question>
    Returns JSON of the form {"response": "..."}.
    """
    if not query:
        raise HTTPException(status_code=400, detail="Query parameter 'query' is required.")
    # Build the chat-format prompt expected by the model
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": query}
    ]
    result = pipe(
        messages,
        max_new_tokens=100,
        do_sample=False,
        return_full_text=False
    )
    answer = result[0]["generated_text"].strip()
    return {"response": answer}


# Define the Gradio UI (optional)
def gradio_chat(input_text):
    if not input_text:
        return ""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input_text}
    ]
    result = pipe(messages, max_new_tokens=100, do_sample=False, return_full_text=False)
    return result[0]["generated_text"].strip()


iface = gr.Interface(
    fn=gradio_chat,
    inputs=gr.Textbox(lines=2, placeholder="Type a message..."),
    outputs="text",
    title="Phi-1.5 Chatbot",
    description="Enter a message and press **Submit** to get a response."
)

# Mount the Gradio app at the root path; the /chat route defined above still takes precedence
app = gr.mount_gradio_app(app, iface, path="/")
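
# How to run this script (a minimal sketch; the filename "app.py" and port 7860
# are assumptions, adjust them to your setup):
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# The Gradio UI is then served at http://localhost:7860/ and the REST endpoint
# can be exercised with, for example:
#
#   curl "http://localhost:7860/chat?query=Hello"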