Sameer-Handsome173 committed
Commit 6a41fe2 · verified · 1 Parent(s): 4af1816

Create app.py

Files changed (1)
  1. app.py +173 -0
app.py ADDED
@@ -0,0 +1,173 @@
+ import os
+ import time
+
+ # Writable cache (HF Spaces free tier requirement).
+ # Set HF_HOME before importing transformers so the Hub cache actually
+ # lands in this writable path (the default is computed at import time).
+ os.environ["HF_HOME"] = "/tmp/huggingface_cache"
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ # ------------------ Basic App Config ------------------
+
+ app = FastAPI(
+     title="Qwen 1.5 Coder – Model Inference API",
+     description="LLMOps-grade model-only inference service for RAG systems",
+     version="1.0.0"
+ )
+
+ # ------------------ Model Config ------------------
+
+ MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
+ DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ print("🔄 Loading model...")
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     MODEL_NAME,
+     trust_remote_code=True
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     torch_dtype=DTYPE,
+     device_map="auto",
+     trust_remote_code=True
+ )
+
+ model.eval()
+
+ print("✅ Model loaded successfully")
+
+ # ------------------ RAG-SAFE SYSTEM PROMPT ------------------
+
+ SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.
+
+ You help with:
+ - Programming questions
+ - Code generation
+ - Code explanation
+ - Debugging
+ - System design guidance
+
+ You will receive CONTEXT retrieved from a knowledge base.
+
+ Rules:
+ 1. Use ONLY the provided context for factual answers.
+ 2. If the context does not contain the answer, say:
+    "I don’t have enough information in the provided context."
+ 3. Do NOT invent APIs, libraries, or facts.
+ 4. Generate correct, clean, and readable code.
+ 5. Do NOT reveal internal reasoning or chain-of-thought.
+ 6. Be concise, structured, and precise.
+ 7. If a request is unsafe, refuse politely.
+
+ The context is the source of truth.
+ """
+
+ # ------------------ Request / Response Schema ------------------
+
+ class GenerateRequest(BaseModel):
+     query: str
+     context: str = ""
+     max_new_tokens: int = 256
+     temperature: float = 0.7
+     top_p: float = 0.9
+
+
+ class GenerateResponse(BaseModel):
+     response: str
+     latency_seconds: float
+     model: str
+
+
+ # ------------------ Generation Logic ------------------
+
+ def generate_answer(req: GenerateRequest) -> GenerateResponse:
+     start_time = time.time()
+
+     messages = [
+         {"role": "system", "content": SYSTEM_PROMPT},
+         {
+             "role": "user",
+             "content": f"""
+ CONTEXT:
+ {req.context}
+
+ QUESTION:
+ {req.query}
+ """
+         }
+     ]
+
+     prompt_text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
+
+     try:
+         with torch.no_grad():
+             output = model.generate(
+                 **inputs,
+                 max_new_tokens=req.max_new_tokens,
+                 temperature=req.temperature,
+                 top_p=req.top_p,
+                 do_sample=True,
+                 repetition_penalty=1.1
+             )
+
+         # Decode only the newly generated tokens (everything after the prompt),
+         # so the system/user prompt is not echoed back in the response.
+         generated = output[0][inputs["input_ids"].shape[1]:]
+         decoded = tokenizer.decode(generated, skip_special_tokens=True).strip()
+
+         latency = round(time.time() - start_time, 3)
+
+         return GenerateResponse(
+             response=decoded,
+             latency_seconds=latency,
+             model=MODEL_NAME
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # ------------------ API Endpoints ------------------
+
+ @app.get("/")
+ def root():
+     return {
+         "status": "running",
+         "service": "Qwen 1.5 Coder Inference API",
+         "model": MODEL_NAME,
+         "endpoint": "/v1/generate"
+     }
+
+
+ @app.post("/v1/generate", response_model=GenerateResponse)
+ def generate(req: GenerateRequest):
+     if not req.query.strip():
+         raise HTTPException(status_code=400, detail="Query cannot be empty")
+
+     return generate_answer(req)
+
+
+ @app.get("/health")
+ def health():
+     return {
+         "status": "healthy",
+         "model_loaded": model is not None,
+         "device": str(model.device)
+     }
+
+
+ # ------------------ Local Run (Optional) ------------------
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
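
For reference, the service added above can be exercised with a short client script. This is a minimal sketch, not part of the commit: it assumes the app is running locally on port 7860 (the port used above), that the requests package is installed, and the example payload values are purely illustrative.

# client_example.py — hypothetical client for the /v1/generate endpoint in app.py.
# Assumes the FastAPI service above is running at http://localhost:7860.
import requests

BASE_URL = "http://localhost:7860"  # replace with the deployed Space URL if needed

payload = {
    "query": "Write a Python function that reverses a string.",
    "context": "",          # optional retrieved context for RAG-style calls
    "max_new_tokens": 128,
    "temperature": 0.7,
    "top_p": 0.9,
}

# Check the health endpoint, then send a generation request.
print(requests.get(f"{BASE_URL}/health", timeout=30).json())

resp = requests.post(f"{BASE_URL}/v1/generate", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print("Latency (s):", data["latency_seconds"])
print(data["response"])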