import os

# HF_HOME must be set before transformers/huggingface_hub are imported,
# otherwise the cache directory has already been resolved to its default.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

import time

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = FastAPI(
    title="Qwen 1.5 Coder – Model Inference API",
    description="LLMOps-grade model-only inference service for RAG systems",
    version="1.0.0"
)

MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
# Use half precision on GPU; fall back to full precision on CPU.
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print("🔄 Loading model...")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True
)

# Inference only: disable dropout and other training-time behaviour.
model.eval()

print("✅ Model loaded successfully")

SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.

You help with:
- Programming questions
- Code generation
- Code explanation
- Debugging
- System design guidance

You will receive CONTEXT retrieved from a knowledge base.

Rules:
1. Use ONLY the provided context for factual answers.
2. If the context does not contain the answer, say:
   "I don’t have enough information in the provided context."
3. Do NOT invent APIs, libraries, or facts.
4. Generate correct, clean, and readable code.
5. Do NOT reveal internal reasoning or chain-of-thought.
6. Be concise, structured, and precise.
7. If a request is unsafe, refuse politely.

The context is the source of truth.
"""


class GenerateRequest(BaseModel):
    query: str
    context: str = ""
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    response: str
    latency_seconds: float
    model: str
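

# Illustrative request/response payloads for these schemas (example values only,
# not taken from a real run):
#
#   POST /v1/generate
#   {"query": "Explain Python decorators.", "context": "...", "max_new_tokens": 256}
#
#   200 OK
#   {"response": "...", "latency_seconds": 1.234,
#    "model": "Sameer-Handsome173/qwen_model_1.5coder"}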


def generate_answer(req: GenerateRequest) -> GenerateResponse:
    start_time = time.time()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""
CONTEXT:
{req.context}

QUESTION:
{req.query}
"""
        }
    ]

    # Build the chat-formatted prompt expected by the model.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=req.max_new_tokens,
                temperature=req.temperature,
                top_p=req.top_p,
                do_sample=True,
                repetition_penalty=1.1
            )

        # Decode only the newly generated tokens so the echoed prompt is not
        # returned to the caller.
        generated_tokens = output[0][inputs["input_ids"].shape[1]:]
        decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        latency = round(time.time() - start_time, 3)

        return GenerateResponse(
            response=decoded,
            latency_seconds=latency,
            model=MODEL_NAME
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
def root():
    return {
        "status": "running",
        "service": "Qwen 1.5 Coder Inference API",
        "model": MODEL_NAME,
        "endpoint": "/v1/generate"
    }


@app.post("/v1/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
    if not req.query.strip():
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    return generate_answer(req)
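

# A minimal client sketch for this endpoint (assumes the service is reachable at
# http://localhost:7860 and that the `requests` package is installed):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/generate",
#       json={
#           "query": "Write a Python function that reverses a string.",
#           "context": "",
#           "max_new_tokens": 128,
#       },
#       timeout=120,
#   )
#   resp.raise_for_status()
#   print(resp.json()["response"])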


@app.get("/health")
def health():
    return {
        "status": "healthy",
        "model_loaded": model is not None,
        "device": str(model.device)
    }


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
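
# Equivalent CLI launch (assuming this module is saved as app.py):
#   uvicorn app:app --host 0.0.0.0 --port 7860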