import os

# Writable cache (HF Spaces free tier requirement).
# This must be set before transformers / huggingface_hub are imported,
# otherwise the default cache path has already been resolved and the setting is ignored.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

import time

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# ------------------ Basic App Config ------------------
app = FastAPI(
    title="Qwen 1.5 Coder – Model Inference API",
    description="LLMOps-grade model-only inference service for RAG systems",
    version="1.0.0",
)
# ------------------ Model Config ------------------
MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
print("🔄 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
print("✅ Model loaded successfully")
# ------------------ RAG-SAFE SYSTEM PROMPT ------------------
SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.
You help with:
- Programming questions
- Code generation
- Code explanation
- Debugging
- System design guidance
You will receive CONTEXT retrieved from a knowledge base.
Rules:
1. Use ONLY the provided context for factual answers.
2. If the context does not contain the answer, say:
"I don’t have enough information in the provided context."
3. Do NOT invent APIs, libraries, or facts.
4. Generate correct, clean, and readable code.
5. Do NOT reveal internal reasoning or chain-of-thought.
6. Be concise, structured, and precise.
7. If a request is unsafe, refuse politely.
The context is the source of truth.
"""
# ------------------ Request / Response Schema ------------------
class GenerateRequest(BaseModel):
    query: str
    context: str = ""
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    response: str
    latency_seconds: float
    model: str
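
# Illustrative request/response payloads for /v1/generate. The values below are
# made up for demonstration; the field names mirror the schemas above:
#
#   Request:
#     {
#       "query": "How do I reverse a string in Python?",
#       "context": "Python strings support slicing; s[::-1] returns a reversed copy.",
#       "max_new_tokens": 128,
#       "temperature": 0.2,
#       "top_p": 0.9
#     }
#
#   Response:
#     {
#       "response": "...generated answer...",
#       "latency_seconds": 1.234,
#       "model": "Sameer-Handsome173/qwen_model_1.5coder"
#     }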
# ------------------ Generation Logic ------------------
def generate_answer(req: GenerateRequest) -> GenerateResponse:
    start_time = time.time()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""
CONTEXT:
{req.context}
QUESTION:
{req.query}
""",
        },
    ]

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=req.max_new_tokens,
                temperature=req.temperature,
                top_p=req.top_p,
                do_sample=True,
                repetition_penalty=1.1,
            )
        # Extract the assistant message only by decoding just the newly generated
        # tokens, so the prompt (or a literal "assistant" in the output) is not mangled.
        generated_tokens = output[0][inputs["input_ids"].shape[1]:]
        decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        latency = round(time.time() - start_time, 3)

        return GenerateResponse(
            response=decoded,
            latency_seconds=latency,
            model=MODEL_NAME,
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# ------------------ API Endpoints ------------------
@app.get("/")
def root():
return {
"status": "running",
"service": "Qwen 1.5 Coder Inference API",
"model": MODEL_NAME,
"endpoint": "/v1/generate"
}
@app.post("/v1/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
if not req.query.strip():
raise HTTPException(status_code=400, detail="Query cannot be empty")
return generate_answer(req)
@app.get("/health")
def health():
return {
"status": "healthy",
"model_loaded": model is not None,
"device": str(model.device)
}
# ------------------ Local Run (Optional) ------------------
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
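
# Example client call against a locally running instance (illustrative values;
# assumes the `requests` package is installed and the default port 7860 used above):
#
#   import requests
#   payload = {
#       "query": "Explain Python list comprehensions",
#       "context": "A list comprehension builds a new list from an iterable in a single expression.",
#       "max_new_tokens": 128,
#   }
#   print(requests.post("http://localhost:7860/v1/generate", json=payload).json())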