import os

# Writable cache (HF Spaces free tier requirement).
# This must be set before transformers / huggingface_hub are imported,
# otherwise the default cache path has already been resolved and the setting is ignored.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

import time

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# ------------------ Basic App Config ------------------
app = FastAPI(
    title="Qwen 1.5 Coder – Model Inference API",
    description="LLMOps-grade model-only inference service for RAG systems",
    version="1.0.0",
)
# ------------------ Model Config ------------------
MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
print("🔄 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
print("✅ Model loaded successfully")
# ------------------ RAG-SAFE SYSTEM PROMPT ------------------
SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.
You help with:
- Programming questions
- Code generation
- Code explanation
- Debugging
- System design guidance
You will receive CONTEXT retrieved from a knowledge base.
Rules:
1. Use ONLY the provided context for factual answers.
2. If the context does not contain the answer, say:
"I don’t have enough information in the provided context."
3. Do NOT invent APIs, libraries, or facts.
4. Generate correct, clean, and readable code.
5. Do NOT reveal internal reasoning or chain-of-thought.
6. Be concise, structured, and precise.
7. If a request is unsafe, refuse politely.
The context is the source of truth.
"""
# ------------------ Request / Response Schema ------------------
class GenerateRequest(BaseModel):
    query: str
    context: str = ""
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    response: str
    latency_seconds: float
    model: str
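
# Illustrative request/response payloads for /v1/generate. The values below are
# made up for demonstration; the field names mirror the schemas above:
#
#   Request:
#     {
#       "query": "How do I reverse a string in Python?",
#       "context": "Python strings support slicing; s[::-1] returns a reversed copy.",
#       "max_new_tokens": 128,
#       "temperature": 0.2,
#       "top_p": 0.9
#     }
#
#   Response:
#     {
#       "response": "...generated answer...",
#       "latency_seconds": 1.234,
#       "model": "Sameer-Handsome173/qwen_model_1.5coder"
#     }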
# ------------------ Generation Logic ------------------
def generate_answer(req: GenerateRequest) -> GenerateResponse:
    start_time = time.time()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""
CONTEXT:
{req.context}
QUESTION:
{req.query}
""",
        },
    ]

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=req.max_new_tokens,
                temperature=req.temperature,
                top_p=req.top_p,
                do_sample=True,
                repetition_penalty=1.1,
            )
        # Extract the assistant message only by decoding just the newly generated
        # tokens, so the prompt (or a literal "assistant" in the output) is not mangled.
        generated_tokens = output[0][inputs["input_ids"].shape[1]:]
        decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        latency = round(time.time() - start_time, 3)

        return GenerateResponse(
            response=decoded,
            latency_seconds=latency,
            model=MODEL_NAME,
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# ------------------ API Endpoints ------------------
@app.get("/")
def root():
return {
"status": "running",
"service": "Qwen 1.5 Coder Inference API",
"model": MODEL_NAME,
"endpoint": "/v1/generate"
}
@app.post("/v1/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
if not req.query.strip():
raise HTTPException(status_code=400, detail="Query cannot be empty")
return generate_answer(req)
@app.get("/health")
def health():
return {
"status": "healthy",
"model_loaded": model is not None,
"device": str(model.device)
}
# ------------------ Local Run (Optional) ------------------
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
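
# Example client call against a locally running instance (illustrative values;
# assumes the `requests` package is installed and the default port 7860 used above):
#
#   import requests
#   payload = {
#       "query": "Explain Python list comprehensions",
#       "context": "A list comprehension builds a new list from an iterable in a single expression.",
#       "max_new_tokens": 128,
#   }
#   print(requests.post("http://localhost:7860/v1/generate", json=payload).json())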