import os
import time

# Writable cache location (HF Spaces free tier requirement).
# Must be set before importing transformers/huggingface_hub, which read
# HF_HOME at import time; setting it later has no effect on the cache path.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# ------------------ Basic App Config ------------------

app = FastAPI(
    title="Qwen 1.5 Coder – Model Inference API",
    description="LLMOps-grade model-only inference service for RAG systems",
    version="1.0.0"
)

# ------------------ Model Config ------------------

MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print("🔄 Loading model...")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True
)
model.eval()

print("✅ Model loaded successfully")

# ------------------ RAG-SAFE SYSTEM PROMPT ------------------

SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.

You help with:
- Programming questions
- Code generation
- Code explanation
- Debugging
- System design guidance

You will receive CONTEXT retrieved from a knowledge base.

Rules:
1. Use ONLY the provided context for factual answers.
2. If the context does not contain the answer, say:
   "I don’t have enough information in the provided context."
3. Do NOT invent APIs, libraries, or facts.
4. Generate correct, clean, and readable code.
5. Do NOT reveal internal reasoning or chain-of-thought.
6. Be concise, structured, and precise.
7. If a request is unsafe, refuse politely.

The context is the source of truth.
"""

# ------------------ Request / Response Schema ------------------

class GenerateRequest(BaseModel):
    query: str
    context: str = ""
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    response: str
    latency_seconds: float
    model: str

# ------------------ Generation Logic ------------------

def generate_answer(req: GenerateRequest) -> GenerateResponse:
    start_time = time.time()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""
CONTEXT:
{req.context}

QUESTION:
{req.query}
"""
        }
    ]

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=req.max_new_tokens,
                temperature=req.temperature,
                top_p=req.top_p,
                do_sample=True,
                repetition_penalty=1.1
            )

        decoded = tokenizer.decode(output[0], skip_special_tokens=True)

        # Extract assistant message only (Qwen-safe)
        if "assistant" in decoded.lower():
            decoded = decoded.split("assistant")[-1].strip()

        latency = round(time.time() - start_time, 3)

        return GenerateResponse(
            response=decoded,
            latency_seconds=latency,
            model=MODEL_NAME
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# ------------------ API Endpoints ------------------

@app.get("/")
def root():
    return {
        "status": "running",
        "service": "Qwen 1.5 Coder Inference API",
        "model": MODEL_NAME,
        "endpoint": "/v1/generate"
    }


@app.post("/v1/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
    if not req.query.strip():
        raise HTTPException(status_code=400, detail="Query cannot be empty")
    return generate_answer(req)


@app.get("/health")
def health():
    return {
        "status": "healthy",
        "model_loaded": model is not None,
        "device": str(model.device)
    }

# ------------------ Local Run (Optional) ------------------

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
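
# ------------------ Example Client (Illustrative) ------------------
# A minimal sketch of how a RAG orchestrator might call this service once it
# is running. It assumes the default host/port from the uvicorn call above
# and that the `requests` package is available; the payload values are purely
# hypothetical. Adjust the URL for your actual deployment (e.g. a Space URL).
#
#   import requests
#
#   payload = {
#       "query": "How do I reverse a list in Python?",
#       "context": "Retrieved docs: list.reverse() mutates in place; "
#                  "reversed() returns an iterator.",
#       "max_new_tokens": 128,
#   }
#   resp = requests.post("http://localhost:7860/v1/generate",
#                        json=payload, timeout=120)
#   resp.raise_for_status()
#   data = resp.json()
#   print(data["response"], data["latency_seconds"])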