ProfessorCEO committed on
Commit
a0650cd
·
verified ·
1 Parent(s): 8613805

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -22
app.py CHANGED
@@ -1,22 +1,45 @@
1
  from fastapi import FastAPI
 
 
2
  from pydantic import BaseModel
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
4
- import torch
5
  from typing import List, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- app = FastAPI()
 
8
 
9
- model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
10
- tokenizer = AutoTokenizer.from_pretrained(model_name)
11
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
 
 
 
 
 
 
12
 
 
13
  class CodeRequest(BaseModel):
14
  code: str
15
  language: str = "python"
16
  max_tokens: int = 128
17
 
18
  class ChatMessage(BaseModel):
19
- role: str
20
  content: str
21
 
22
  class ChatRequest(BaseModel):
@@ -24,33 +47,134 @@ class ChatRequest(BaseModel):
24
  system: Optional[str] = ""
25
  max_tokens: int = 1024
26
 
 
 
 
 
 
 
 
 
 
 
 
27
  @app.get("/")
28
  def root():
29
- return {"status": "DevOS AI is running"}
 
 
 
 
30
 
 
 
 
 
 
 
31
  @app.post("/complete")
32
  def complete_code(request: CodeRequest):
33
  prompt = f"Continue the following {request.language} code:\n{request.code}"
34
- inputs = tokenizer(prompt, return_tensors="pt")
 
 
 
 
 
 
 
35
  with torch.no_grad():
36
- outputs = model.generate(**inputs, max_new_tokens=request.max_tokens,
37
- temperature=0.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
 
 
 
 
 
 
 
38
  generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
39
- return {"suggestion": generated[len(prompt):].strip()}
 
 
40
 
 
 
41
  @app.post("/chat")
42
  def chat(request: ChatRequest):
43
- # Build conversation prompt
44
- prompt = request.system + "\n\n" if request.system else ""
45
- for msg in request.messages[-8:]: # last 8 messages for context
46
- role = "User" if msg.role == "user" else "DevOS AI"
47
- prompt += f"{role}: {msg.content}\n"
48
- prompt += "DevOS AI:"
 
 
49
 
50
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
51
  with torch.no_grad():
52
- outputs = model.generate(**inputs, max_new_tokens=request.max_tokens,
53
- temperature=0.4, do_sample=True, pad_token_id=tokenizer.eos_token_id)
 
 
 
 
 
 
 
 
54
  generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
55
  reply = generated[len(prompt):].strip()
56
- return {"reply": reply}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
+ from fastapi.responses import StreamingResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
  from pydantic import BaseModel
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 
6
  from typing import List, Optional
7
+ import torch
8
+ import asyncio
9
+ from threading import Thread
10
+
11
+ # ── APP SETUP ─────────────────────────────────────────
12
# Single application object; every route in this file registers on it.
app = FastAPI(title="DevOS AI", description="AI coding agent by Cool Shot System")

# Cross-origin access stays open to any site so browser front-ends hosted
# elsewhere can call the API. Credentials are disabled: a wildcard origin
# combined with credentialed requests is documented as an invalid CORS
# configuration, and no route visible in this file reads cookies or
# Authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
21
 
22
+ # ── MODEL LOADING ─────────────────────────────────────
23
# Hugging Face model id used for both the tokenizer and the weights.
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"

# Tokenizer and model are loaded once, at import time, and shared by every
# route as module-level globals.
print(f"Loading model: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,  # CPU-safe
    low_cpu_mem_usage=True,
)
model.eval()  # inference only: put the module in evaluation mode
print("Model loaded ✓")
34
 
35
# ── SCHEMAS ───────────────────────────────────────────
# Field notes are written as comments (not docstrings) so the generated
# API schema stays exactly as before.

# Request body of POST /complete.
class CodeRequest(BaseModel):
    # Source text the model is asked to continue.
    code: str
    # Language name interpolated into the completion prompt.
    language: str = "python"
    # Forwarded to generation as max_new_tokens.
    max_tokens: int = 128


# One turn of a chat conversation.
class ChatMessage(BaseModel):
    # "user" or "assistant"
    role: str
    # Text of the turn.
    content: str
44
 
45
  class ChatRequest(BaseModel):
 
47
  system: Optional[str] = ""
48
  max_tokens: int = 1024
49
 
50
+ # ── HELPERS ───────────────────────────────────────────
51
def build_prompt(
    messages: "List[ChatMessage]",
    system: Optional[str] = "",
    max_messages: int = 10,
) -> str:
    """Render a chat history as a plain-text prompt ending with the assistant cue.

    Args:
        messages: Conversation turns, oldest first. Each item needs ``role``
            and ``content`` attributes.
        system: Optional instruction text placed before the turns. ``None``,
            empty, or whitespace-only values are ignored.
        max_messages: How many of the most recent turns to keep. Values of
            zero or less keep no turns at all.

    Returns:
        The prompt text: optional system header, one ``"<speaker>: <text>"``
        line per kept turn, then a trailing ``"DevOS AI:"`` for the model to
        complete.
    """
    header = system.strip() if system else ""

    # Guard the slice: ``messages[-0:]`` would return the whole history.
    recent = messages[-max_messages:] if max_messages > 0 else []

    parts = [header + "\n\n"] if header else []
    for msg in recent:
        # Every role other than "user" is rendered as the assistant.
        speaker = "User" if msg.role == "user" else "DevOS AI"
        parts.append(f"{speaker}: {msg.content.strip()}\n")
    parts.append("DevOS AI:")
    return "".join(parts)
58
+
59
# ── ROUTES ────────────────────────────────────────────

# Service banner: a status string, the loaded model id, and the main
# endpoints. Documented with a comment rather than a docstring so the
# generated OpenAPI description is unchanged.
@app.get("/")
def root():
    banner = {"status": "DevOS AI is running"}
    banner["model"] = MODEL_NAME
    banner["endpoints"] = ["/complete", "/chat", "/stream"]
    return banner
68
 
69
# Probe endpoint: returns a constant payload and never touches the model.
@app.get("/health")
def health():
    payload = {"status": "ok"}
    return payload
72
+
73
+
74
# ── /complete — inline code completion ────────────────
# Builds a "continue this code" prompt from the request, samples up to
# request.max_tokens new tokens at temperature 0.2, and returns only the newly
# generated text as {"suggestion": ...}.
@app.post("/complete")
def complete_code(request: CodeRequest):
    prompt = f"Continue the following {request.language} code:\n{request.code}"

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    )
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,
            temperature=0.2,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens that follow the prompt. Cutting the decoded text
    # at len(prompt) characters is unreliable: when the prompt was truncated,
    # or decoding does not reproduce it character-for-character, that cut
    # lands in the wrong place and drops or corrupts the suggestion.
    new_tokens = outputs[0][prompt_len:]
    suggestion = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return {"suggestion": suggestion}
100
 
101
+
102
# ── /chat — full conversation, single response ─────────
# Renders the conversation with build_prompt, samples up to request.max_tokens
# new tokens at temperature 0.4, and returns the assistant turn as
# {"reply": ...}.
@app.post("/chat")
def chat(request: ChatRequest):
    prompt = build_prompt(request.messages, request.system)

    # NOTE(review): if the tokenizer truncates on the right (the usual
    # default), an over-long prompt loses its newest turns and the trailing
    # "DevOS AI:" cue — confirm the truncation side.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    )
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,
            temperature=0.4,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )

    # Decode only the tokens that follow the prompt. Cutting the decoded text
    # at len(prompt) characters is unreliable: when the prompt was truncated,
    # or decoding does not reproduce it character-for-character, that cut
    # lands in the wrong place and drops or corrupts the reply.
    new_tokens = outputs[0][prompt_len:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return {"reply": reply}
129
+
130
+
131
# ── /stream — streaming response (SSE) ────────────────
def _sse_event(text: str) -> str:
    # Frame text as one Server-Sent Event. Each line of the text gets its own
    # "data:" field, so chunks containing newlines (common in code) reach the
    # client intact instead of breaking the event framing.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return "".join(f"data: {line}\n" for line in lines) + "\n"


# Same prompt and sampling settings as /chat, but the reply is sent
# incrementally as SSE events and terminated by a literal "[DONE]" event.
@app.post("/stream")
async def stream_chat(request: ChatRequest):
    prompt = build_prompt(request.messages, request.system)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    )

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=request.max_tokens,
        temperature=0.4,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
        streamer=streamer,
    )

    # Run generation in a background thread so text can be consumed while it
    # is still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Deliberately a plain (sync) generator: iterating the streamer blocks
    # until the next chunk is ready, which would stall the event loop inside
    # an async generator. StreamingResponse iterates sync generators in a
    # worker thread instead.
    def token_generator():
        for token in streamer:
            if token:
                yield _sse_event(token)
        yield "data: [DONE]\n\n"

    return StreamingResponse(
        token_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
            "Connection": "keep-alive",
        },
    )