AjinkyaPagare committed on
Commit
3fbe92b
·
1 Parent(s): 7175181

feat: configure StarCoder2 3B model deployment configurations and ports

Browse files
Files changed (4) hide show
  1. app/config.py +3 -3
  2. app/main.py +8 -8
  3. download_model.py +3 -3
  4. setup.py +3 -3
app/config.py CHANGED
@@ -6,7 +6,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
6
 
7
  # Model Configuration
8
  MODEL_DIR = os.path.join(BASE_DIR, "models")
9
- MODEL_NAME = "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
10
  MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
11
 
12
  # llama.cpp Configuration
@@ -17,8 +17,8 @@ else:
17
  LLAMA_SERVER_PATH = os.environ.get("LLAMA_SERVER_PATH", os.path.join(BIN_DIR, "llama-server"))
18
 
19
  # Server Ports
20
- LLAMA_PORT = int(os.environ.get("LLAMA_PORT", 8081))
21
- FASTAPI_PORT = int(os.environ.get("FASTAPI_PORT", 8001))
22
  LLAMA_HOST = os.environ.get("LLAMA_HOST", "127.0.0.1")
23
 
24
  # Memory and CPU Performance Management
 
6
 
7
  # Model Configuration
8
  MODEL_DIR = os.path.join(BASE_DIR, "models")
9
+ MODEL_NAME = "starcoder2-3b-instruct.Q4_K_M.gguf"
10
  MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
11
 
12
  # llama.cpp Configuration
 
17
  LLAMA_SERVER_PATH = os.environ.get("LLAMA_SERVER_PATH", os.path.join(BIN_DIR, "llama-server"))
18
 
19
  # Server Ports
20
+ LLAMA_PORT = int(os.environ.get("LLAMA_PORT", 8084))
21
+ FASTAPI_PORT = int(os.environ.get("FASTAPI_PORT", 8004))
22
  LLAMA_HOST = os.environ.get("LLAMA_HOST", "127.0.0.1")
23
 
24
  # Memory and CPU Performance Management
app/main.py CHANGED
@@ -7,7 +7,7 @@ from fastapi import FastAPI, HTTPException, status, Request
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.responses import StreamingResponse, Response, HTMLResponse
9
 
10
- from app.config import MODEL_PATH, FASTAPI_PORT, CONTEXT_SIZE
11
  from app.schemas.chat import ChatRequest, AutocompleteRequest, StatusResponse
12
  from app.services.model_service import ModelService
13
  from app.services.FIM_service import FIMService
@@ -70,7 +70,7 @@ async def lifespan(app: FastAPI):
70
  # Create FastAPI App
71
  app = FastAPI(
72
  title="Local IDE Code Completion Engine",
73
- description="Optimized backend providing high-speed coding completions and chat using local Qwen 2.5 Coder.",
74
  version="1.0.0",
75
  lifespan=lifespan
76
  )
@@ -237,7 +237,7 @@ async def proxy_openai(path: str, request: Request):
237
  Directly proxies OpenAI-compatible requests to the underlying llama-server backend.
238
  Enables instant compatibility with standard VS Code / Cursor extensions.
239
  """
240
- url = f"http://127.0.0.1:8080/v1/{path}"
241
  body = await request.body()
242
  headers = dict(request.headers)
243
  headers.pop("host", None)
@@ -305,7 +305,7 @@ async def get_dashboard():
305
  <head>
306
  <meta charset="UTF-8">
307
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
308
- <title>Qwen 2.5 Coder | Interactive Engine Dashboard</title>
309
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;700&family=Outfit:wght@300;400;600;700&display=swap" rel="stylesheet">
310
  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
311
  <style>
@@ -658,8 +658,8 @@ async def get_dashboard():
658
  <body>
659
  <header>
660
  <div class="logo-section">
661
- <div class="logo-badge">Q</div>
662
- <div class="logo-title">Qwen 2.5 Coder</div>
663
  </div>
664
  <div class="status-pill">
665
  <div class="status-dot"></div>
@@ -720,7 +720,7 @@ async def get_dashboard():
720
  <div class="chat-messages" id="chat-box">
721
  <!-- Welcome/default message of the LLM is inside the scrollable flow -->
722
  <div class="message bot" style="max-width: 100%;">
723
- <div class="message-content">Hello! I am Qwen 2.5 Coder. Ask me any coding questions, request complete script generation, or let me explain complex logic!</div>
724
  <div class="message-meta" style="display: flex; align-items: center; gap: 0.75rem; margin-top: 0.5rem; font-size: 0.75rem; color: var(--text-muted); opacity: 0.8;">
725
  <span class="message-time">Welcome</span>
726
  <button class="copy-all-btn" onclick="copyEntireMessage(this)" style="background: none; border: none; color: var(--text-muted); cursor: pointer; display: flex; align-items: center; gap: 0.25rem; font-size: 0.75rem; transition: color 0.2s; padding: 0;">
@@ -732,7 +732,7 @@ async def get_dashboard():
732
  </div>
733
  <!-- Input row sits cleanly below the message box, aligning dynamically -->
734
  <div class="input-row">
735
- <input type="text" class="chat-input" id="user-input" placeholder="Ask Qwen-Coder something (e.g. Write a quicksort in Go)..." onkeydown="if(event.key === 'Enter') handleAction()">
736
  <button class="send-btn" id="action-btn" onclick="handleAction()">Send</button>
737
  </div>
738
  </div>
 
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from fastapi.responses import StreamingResponse, Response, HTMLResponse
9
 
10
+ from app.config import MODEL_PATH, FASTAPI_PORT, CONTEXT_SIZE, LLAMA_PORT
11
  from app.schemas.chat import ChatRequest, AutocompleteRequest, StatusResponse
12
  from app.services.model_service import ModelService
13
  from app.services.FIM_service import FIMService
 
70
  # Create FastAPI App
71
  app = FastAPI(
72
  title="Local IDE Code Completion Engine",
73
+ description="Optimized backend providing high-speed coding completions and chat using local StarCoder2 3B.",
74
  version="1.0.0",
75
  lifespan=lifespan
76
  )
 
237
  Directly proxies OpenAI-compatible requests to the underlying llama-server backend.
238
  Enables instant compatibility with standard VS Code / Cursor extensions.
239
  """
240
+ url = f"http://127.0.0.1:{LLAMA_PORT}/v1/{path}"
241
  body = await request.body()
242
  headers = dict(request.headers)
243
  headers.pop("host", None)
 
305
  <head>
306
  <meta charset="UTF-8">
307
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
308
+ <title>StarCoder2 3B | Interactive Engine Dashboard</title>
309
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;700&family=Outfit:wght@300;400;600;700&display=swap" rel="stylesheet">
310
  <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
311
  <style>
 
658
  <body>
659
  <header>
660
  <div class="logo-section">
661
+ <div class="logo-badge">S</div>
662
+ <div class="logo-title">StarCoder2 3B</div>
663
  </div>
664
  <div class="status-pill">
665
  <div class="status-dot"></div>
 
720
  <div class="chat-messages" id="chat-box">
721
  <!-- Welcome/default message of the LLM is inside the scrollable flow -->
722
  <div class="message bot" style="max-width: 100%;">
723
+ <div class="message-content">Hello! I am StarCoder2 3B. Ask me any coding questions, request complete script generation, or let me explain complex logic!</div>
724
  <div class="message-meta" style="display: flex; align-items: center; gap: 0.75rem; margin-top: 0.5rem; font-size: 0.75rem; color: var(--text-muted); opacity: 0.8;">
725
  <span class="message-time">Welcome</span>
726
  <button class="copy-all-btn" onclick="copyEntireMessage(this)" style="background: none; border: none; color: var(--text-muted); cursor: pointer; display: flex; align-items: center; gap: 0.25rem; font-size: 0.75rem; transition: color 0.2s; padding: 0;">
 
732
  </div>
733
  <!-- Input row sits cleanly below the message box, aligning dynamically -->
734
  <div class="input-row">
735
+ <input type="text" class="chat-input" id="user-input" placeholder="Ask StarCoder2 something (e.g. Write a quicksort in Go)..." onkeydown="if(event.key === 'Enter') handleAction()">
736
  <button class="send-btn" id="action-btn" onclick="handleAction()">Send</button>
737
  </div>
738
  </div>
download_model.py CHANGED
@@ -3,9 +3,9 @@ import sys
3
  import requests
4
 
5
  MODEL_DIR = "models"
6
- MODEL_NAME = "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
7
  MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
8
- MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
9
 
10
  def download_model():
11
  os.makedirs(MODEL_DIR, exist_ok=True)
@@ -13,7 +13,7 @@ def download_model():
13
  print(f"Model '{MODEL_NAME}' already exists. Skipping download.")
14
  return
15
 
16
- print(f"Initiating download for Qwen 2.5 Coder 7B GGUF...")
17
  response = requests.get(MODEL_URL, stream=True)
18
  response.raise_for_status()
19
 
 
3
  import requests
4
 
5
  MODEL_DIR = "models"
6
+ MODEL_NAME = "starcoder2-3b-instruct.Q4_K_M.gguf"
7
  MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
8
+ MODEL_URL = "https://huggingface.co/QuantFactory/starcoder2-3b-instruct-GGUF/resolve/main/starcoder2-3b-instruct.Q4_K_M.gguf"
9
 
10
  def download_model():
11
  os.makedirs(MODEL_DIR, exist_ok=True)
 
13
  print(f"Model '{MODEL_NAME}' already exists. Skipping download.")
14
  return
15
 
16
+ print(f"Initiating download for StarCoder2 3B GGUF...")
17
  response = requests.get(MODEL_URL, stream=True)
18
  response.raise_for_status()
19
 
setup.py CHANGED
@@ -8,9 +8,9 @@ import requests
8
  MODEL_DIR = "models"
9
  BIN_DIR = "bin"
10
 
11
- # Hugging Face URL for Qwen 2.5 Coder 1.5B Instruct Q4_K_M GGUF (approx 1.2 GB)
12
- MODEL_URL = "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
13
- MODEL_NAME = "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
14
  MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
15
 
16
  # llama.cpp stable release b4834 for Windows AVX2 (Supports almost all modern CPUs)
 
8
  MODEL_DIR = "models"
9
  BIN_DIR = "bin"
10
 
11
+ # Hugging Face URL for StarCoder2 3B Instruct Q4_K_M GGUF (approx 1.8 GB)
12
+ MODEL_URL = "https://huggingface.co/QuantFactory/starcoder2-3b-instruct-GGUF/resolve/main/starcoder2-3b-instruct.Q4_K_M.gguf"
13
+ MODEL_NAME = "starcoder2-3b-instruct.Q4_K_M.gguf"
14
  MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
15
 
16
  # llama.cpp stable release b4834 for Windows AVX2 (Supports almost all modern CPUs)