Sameer-Handsome173 committed
Commit 6a41fe2 · verified · 1 Parent(s): 4af1816

Create app.py

Files changed (1)
  1. app.py +173 -0
app.py ADDED
@@ -0,0 +1,173 @@
+ import os
+ import time
+
+ # Writable cache (HF Spaces free tier requirement).
+ # Set HF_HOME before importing transformers so the Hub cache actually
+ # lands in this writable path (the default is computed at import time).
+ os.environ["HF_HOME"] = "/tmp/huggingface_cache"
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ # ------------------ Basic App Config ------------------
+
+ app = FastAPI(
+     title="Qwen 1.5 Coder – Model Inference API",
+     description="LLMOps-grade model-only inference service for RAG systems",
+     version="1.0.0"
+ )
+
+ # ------------------ Model Config ------------------
+
+ MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
+ DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ print("🔄 Loading model...")
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     MODEL_NAME,
+     trust_remote_code=True
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     torch_dtype=DTYPE,
+     device_map="auto",
+     trust_remote_code=True
+ )
+
+ model.eval()
+
+ print("✅ Model loaded successfully")
+
+ # ------------------ RAG-SAFE SYSTEM PROMPT ------------------
+
+ SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.
+
+ You help with:
+ - Programming questions
+ - Code generation
+ - Code explanation
+ - Debugging
+ - System design guidance
+
+ You will receive CONTEXT retrieved from a knowledge base.
+
+ Rules:
+ 1. Use ONLY the provided context for factual answers.
+ 2. If the context does not contain the answer, say:
+    "I don’t have enough information in the provided context."
+ 3. Do NOT invent APIs, libraries, or facts.
+ 4. Generate correct, clean, and readable code.
+ 5. Do NOT reveal internal reasoning or chain-of-thought.
+ 6. Be concise, structured, and precise.
+ 7. If a request is unsafe, refuse politely.
+
+ The context is the source of truth.
+ """
+
+ # ------------------ Request / Response Schema ------------------
+
+ class GenerateRequest(BaseModel):
+     query: str
+     context: str = ""
+     max_new_tokens: int = 256
+     temperature: float = 0.7
+     top_p: float = 0.9
+
+
+ class GenerateResponse(BaseModel):
+     response: str
+     latency_seconds: float
+     model: str
+
+
+ # ------------------ Generation Logic ------------------
+
+ def generate_answer(req: GenerateRequest) -> GenerateResponse:
+     start_time = time.time()
+
+     messages = [
+         {"role": "system", "content": SYSTEM_PROMPT},
+         {
+             "role": "user",
+             "content": f"""
+ CONTEXT:
+ {req.context}
+
+ QUESTION:
+ {req.query}
+ """
+         }
+     ]
+
+     prompt_text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
+
+     try:
+         with torch.no_grad():
+             output = model.generate(
+                 **inputs,
+                 max_new_tokens=req.max_new_tokens,
+                 temperature=req.temperature,
+                 top_p=req.top_p,
+                 do_sample=True,
+                 repetition_penalty=1.1
+             )
+
+         # Decode only the newly generated tokens (everything after the prompt),
+         # so the system/user prompt is not echoed back in the response.
+         generated = output[0][inputs["input_ids"].shape[1]:]
+         decoded = tokenizer.decode(generated, skip_special_tokens=True).strip()
+
+         latency = round(time.time() - start_time, 3)
+
+         return GenerateResponse(
+             response=decoded,
+             latency_seconds=latency,
+             model=MODEL_NAME
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ # ------------------ API Endpoints ------------------
+
+ @app.get("/")
+ def root():
+     return {
+         "status": "running",
+         "service": "Qwen 1.5 Coder Inference API",
+         "model": MODEL_NAME,
+         "endpoint": "/v1/generate"
+     }
+
+
+ @app.post("/v1/generate", response_model=GenerateResponse)
+ def generate(req: GenerateRequest):
+     if not req.query.strip():
+         raise HTTPException(status_code=400, detail="Query cannot be empty")
+
+     return generate_answer(req)
+
+
+ @app.get("/health")
+ def health():
+     return {
+         "status": "healthy",
+         "model_loaded": model is not None,
+         "device": str(model.device)
+     }
+
+
+ # ------------------ Local Run (Optional) ------------------
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
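
For reference, the service added above can be exercised with a short client script. This is a minimal sketch, not part of the commit: it assumes the app is running locally on port 7860 (the port used above), that the requests package is installed, and the example payload values are purely illustrative.

# client_example.py — hypothetical client for the /v1/generate endpoint in app.py.
# Assumes the FastAPI service above is running at http://localhost:7860.
import requests

BASE_URL = "http://localhost:7860"  # replace with the deployed Space URL if needed

payload = {
    "query": "Write a Python function that reverses a string.",
    "context": "",          # optional retrieved context for RAG-style calls
    "max_new_tokens": 128,
    "temperature": 0.7,
    "top_p": 0.9,
}

# Check the health endpoint, then send a generation request.
print(requests.get(f"{BASE_URL}/health", timeout=30).json())

resp = requests.post(f"{BASE_URL}/v1/generate", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print("Latency (s):", data["latency_seconds"])
print(data["response"])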