ProfessorCEO committed on
Commit
e251d62
·
0 Parent(s):

Deploy API

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ venv/
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ .env
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1 \
6
+ HF_HOME=/data/.cache/huggingface
7
+
8
+ WORKDIR /app
9
+
10
+ COPY requirements.txt ./
11
+ RUN pip install --upgrade pip && pip install -r requirements.txt
12
+
13
+ COPY . .
14
+
15
+ EXPOSE 7860
16
+
17
+ CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # KORA API Backend (FastAPI + Transformers)
2
+
3
+ Production-ready AI backend scaffold for KORA, optimized for Docker and Hugging Face Docker Spaces.
4
+
5
+ ## Features
6
+
7
+ - OpenAI-compatible endpoint: `POST /v1/chat/completions`
8
+ - Streaming token responses using SSE
9
+ - CPU-safe Transformers + PyTorch runtime: `microsoft/Phi-3-mini-4k-instruct` base model with the PEFT adapter named by `MODEL_NAME` applied on top
10
+ - Automatic KORA system-prompt injection on every request
11
+ - KONTYRA knowledge-base context injection
12
+ - Async architecture with centralized logging
13
+ - Environment-driven configuration
14
+
15
+ ## Folder Structure
16
+
17
+ ```text
18
+ apps/api/
19
+ ├── app/
20
+ │ ├── routes/
21
+ │ │ └── chat.py
22
+ │ ├── services/
23
+ │ │ ├── model_service.py
24
+ │ │ └── prompt_service.py
25
+ │ └── utils/
26
+ │ └── config.py
27
+ ├── prompts/
28
+ │ └── system_prompt.txt
29
+ ├── knowledge/
30
+ │ └── kontyra.md
31
+ ├── Dockerfile
32
+ ├── main.py
33
+ └── requirements.txt
34
+ ```
35
+
36
+ ## Environment Variables
37
+
38
+ Create `.env` in `apps/api/` (optional):
39
+
40
+ ```bash
41
+ APP_NAME="KORA AI Backend"
42
+ APP_ENV="production"
43
+ LOG_LEVEL="INFO"
44
+ MODEL_NAME="ProfessorCEO/KORA-v1"  # PEFT adapter id; the base model is microsoft/Phi-3-mini-4k-instruct
45
+ TRUST_REMOTE_CODE=true
46
+ LOW_CPU_MEM_USAGE=true
47
+ MAX_INPUT_TOKENS=3072
48
+ TORCH_NUM_THREADS=0
49
+ TORCH_NUM_INTEROP_THREADS=0
50
+ DEFAULT_TEMPERATURE=0.7
51
+ DEFAULT_TOP_P=0.9
52
+ DEFAULT_MAX_TOKENS=512
53
+ ```
54
+
55
+ ## Local Setup
56
+
57
+ ```bash
58
+ cd apps/api
59
+ python -m venv .venv
60
+ source .venv/bin/activate
61
+ pip install -r requirements.txt
62
+ uvicorn main:app --host 0.0.0.0 --port 8000
63
+ ```
64
+
65
+ ## Docker Build + Run
66
+
67
+ ```bash
68
+ cd apps/api
69
+ docker build -t kora-api .
70
+ docker run --rm -p 7860:7860 \
71
+ -e PORT=7860 \
72
+ -e MODEL_NAME=ProfessorCEO/KORA-v1 \
73
+ kora-api
74
+ ```
75
+
76
+ ## Hugging Face Docker Spaces Startup
77
+
78
+ Use the included `Dockerfile`. Hugging Face sets `PORT`, and the container starts with:
79
+
80
+ ```bash
81
+ uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}
82
+ ```
83
+
84
+ ## API Example
85
+
86
+ ### Request
87
+
88
+ ```json
89
+ {
90
+ "model": "microsoft/Phi-3-mini-4k-instruct",
91
+ "stream": true,
92
+ "messages": [
93
+ {"role": "user", "content": "Who created KORA?"}
94
+ ]
95
+ }
96
+ ```
97
+
98
+ ### Endpoint
99
+
100
+ ```http
101
+ POST /v1/chat/completions
102
+ Content-Type: application/json
103
+ ```
104
+
105
+ Streaming responses are returned as SSE `data:` events compatible with OpenAI-style chunk consumption.
app.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Compatibility entrypoint for Hugging Face Spaces and ASGI hosts."""
2
+
3
+ from main import app
4
+
5
+ __all__ = ["app"]
app/__init__.py ADDED
File without changes
app/routes/__init__.py ADDED
File without changes
app/routes/chat.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenAI-compatible chat completion routes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import time
8
+ from typing import Literal
9
+
10
+ from fastapi import APIRouter, Request
11
+ from fastapi.responses import JSONResponse, StreamingResponse
12
+ from pydantic import BaseModel, Field
13
+
14
+ from app.utils.config import settings
15
+
16
+ logger = logging.getLogger(__name__)
17
+ router = APIRouter(tags=["chat"])
18
+
19
+
20
class ChatMessage(BaseModel):
    """A single chat turn in the OpenAI message shape.

    ``role`` is restricted to the three roles accepted by this endpoint and
    ``content`` carries the plain-text body of the turn.
    """

    # Who authored the turn.
    role: Literal["system", "user", "assistant"]
    # Text of the turn.
    content: str
25
+
26
+
27
def _default_model_name() -> str:
    """Return the configured model name used when a request omits ``model``."""
    return settings.model_name


class ChatCompletionRequest(BaseModel):
    """Request body for the chat completions route (subset of the OpenAI schema).

    Sampling fields left as ``None`` are replaced by the configured defaults
    in the route handler.
    """

    # Echoed back in responses; defaults to the configured model name.
    model: str = Field(default_factory=_default_model_name)
    # Conversation history, oldest turn first.
    messages: list[ChatMessage]
    # When true the reply is delivered as SSE chunks instead of one JSON body.
    stream: bool = False
    # Optional sampling overrides.
    temperature: float | None = None
    top_p: float | None = None
    # Must be at least 1 when provided.
    max_tokens: int | None = Field(default=None, ge=1)
36
+
37
+
38
+ def _sse_event(payload: dict) -> str:
39
+ """Format one SSE data event."""
40
+ return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
41
+
42
+
43
@router.post("/v1/chat/completions")
async def create_chat_completion(request: Request, body: ChatCompletionRequest):
    """Handle an OpenAI-style chat completion request.

    The enforced system prompt is injected through the app's prompt service,
    and sampling parameters missing from the request fall back to the
    configured defaults.

    Returns:
        A ``StreamingResponse`` of SSE chunks terminated by ``data: [DONE]``
        when ``body.stream`` is true; otherwise a ``JSONResponse`` holding the
        full completion. Generation failures (``RuntimeError``/``ValueError``)
        are reported as an ``error`` payload: as an SSE event in stream mode,
        or as an HTTP 500 JSON body in non-stream mode.
    """
    # Local import: the module header does not import uuid.
    from uuid import uuid4

    prompt_service = request.app.state.prompt_service
    model_service = request.app.state.model_service

    injected_messages = prompt_service.inject_system_prompt(
        [message.model_dump() for message in body.messages]
    )

    temperature = body.temperature if body.temperature is not None else settings.default_temperature
    top_p = body.top_p if body.top_p is not None else settings.default_top_p
    max_tokens = body.max_tokens if body.max_tokens is not None else settings.default_max_tokens

    created = int(time.time())

    if body.stream:
        # OpenAI-style clients expect every chunk of one completion to carry
        # the same id, so it is fixed before the first chunk is sent instead
        # of switching from a placeholder to the service-generated id.
        stream_id = f"chatcmpl-{uuid4().hex}"

        def build_chunk(delta: dict, finish_reason: str | None = None) -> dict:
            """Build one ``chat.completion.chunk`` payload for this stream."""
            return {
                "id": stream_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": body.model,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
            }

        async def event_generator():
            try:
                # Initial chunk with assistant role to follow OpenAI streaming style.
                yield _sse_event(build_chunk({"role": "assistant"}))

                async for _service_request_id, delta in model_service.stream_text(
                    injected_messages,
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_tokens,
                ):
                    yield _sse_event(build_chunk({"content": delta}))

                yield _sse_event(build_chunk({}, "stop"))
                yield "data: [DONE]\n\n"

            except (RuntimeError, ValueError):  # pragma: no cover - runtime guard
                logger.exception("Failed to stream completion for request")
                error_payload = {
                    "error": {
                        "message": "Failed to stream completion for request",
                        "type": "server_error",
                    }
                }
                yield _sse_event(error_payload)
                yield "data: [DONE]\n\n"

        return StreamingResponse(
            event_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    try:
        request_id, text = await model_service.complete_text(
            injected_messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
    except (RuntimeError, ValueError):  # pragma: no cover - runtime guard
        # Mirror the streaming path: report a structured error rather than
        # letting the exception escape as an unstructured 500.
        logger.exception("Failed to generate completion for request")
        return JSONResponse(
            status_code=500,
            content={
                "error": {
                    "message": "Failed to generate completion for request",
                    "type": "server_error",
                }
            },
        )

    response_payload = {
        "id": request_id,
        "object": "chat.completion",
        "created": created,
        "model": body.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }
    return JSONResponse(response_payload)
app/services/__init__.py ADDED
File without changes
app/services/model_service.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Transformers + PyTorch text generation service for KORA."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from threading import Thread
8
+ from uuid import uuid4
9
+
10
+ import torch
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
12
+
13
+ from app.utils.config import Settings
14
+
15
logger = logging.getLogger(__name__)
MIN_TEMPERATURE = 1e-5
# Checkpoint the tokenizer and base weights are loaded from; the PEFT adapter
# named by ``settings.model_name`` is applied on top of it.
BASE_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"


class ModelService:
    """Manages Transformers model lifecycle and CPU-safe generation.

    Generation calls are serialized through an asyncio lock, and the blocking
    ``generate`` work runs in worker threads.
    """

    def __init__(self, settings: Settings) -> None:
        """Store ``settings``; the model and tokenizer are loaded by ``startup``."""
        self.settings = settings
        self._model = None
        self._tokenizer = None
        self._startup_lock = asyncio.Lock()
        self._generation_lock = asyncio.Lock()

    async def startup(self) -> None:
        """Load tokenizer, base model and PEFT adapter once per process.

        Safe to call repeatedly: the state is checked before and after
        acquiring the startup lock, so concurrent callers load only once.
        """
        if self._model is not None and self._tokenizer is not None:
            return

        async with self._startup_lock:
            if self._model is not None and self._tokenizer is not None:
                return

            # A value of 0 (the default) leaves PyTorch's own thread settings alone.
            if self.settings.torch_num_threads > 0:
                torch.set_num_threads(self.settings.torch_num_threads)
            if self.settings.torch_num_interop_threads > 0:
                torch.set_num_interop_threads(self.settings.torch_num_interop_threads)

            logger.info("Loading model via Transformers on CPU: %s", self.settings.model_name)
            self._tokenizer = AutoTokenizer.from_pretrained(
                BASE_MODEL_NAME,
                trust_remote_code=self.settings.trust_remote_code,
                use_fast=True,
            )
            # generate() is given pad_token_id; reuse EOS when the tokenizer has no pad token.
            if self._tokenizer.pad_token is None and self._tokenizer.eos_token is not None:
                self._tokenizer.pad_token = self._tokenizer.eos_token

            from peft import PeftModel

            logger.info("Loading base model...")
            base_model = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL_NAME,
                trust_remote_code=self.settings.trust_remote_code,
                torch_dtype=torch.float32,
                device_map="cpu",
            )
            logger.info("Applying PEFT adapter...")
            self._model = PeftModel.from_pretrained(base_model, self.settings.model_name)

            self._model.eval()
            logger.info("CPU model and tokenizer initialized")

    async def shutdown(self) -> None:
        """Drop the references to the model and tokenizer."""
        self._model = None
        self._tokenizer = None

    def _build_prompt(self, messages: list[dict[str, str]]) -> str:
        """Render OpenAI-style messages into a single prompt string.

        Uses the tokenizer's chat template; if that raises ``AttributeError``,
        ``TypeError`` or ``ValueError``, falls back to ``ROLE: content`` lines
        followed by ``ASSISTANT:``.

        Raises:
            RuntimeError: If the tokenizer has not been loaded.
        """
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer is not initialized")

        try:
            return self._tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        except (AttributeError, TypeError, ValueError):
            logger.warning("Chat template not supported, using fallback format", exc_info=True)
            # Fallback for tokenizer templates that may not support message format.
            lines = [f"{m['role'].upper()}: {m['content']}" for m in messages]
            lines.append("ASSISTANT:")
            return "\n".join(lines)

    def _build_generation_kwargs(
        self,
        *,
        temperature: float,
        top_p: float,
        max_tokens: int,
    ) -> dict:
        """Translate request sampling parameters into ``generate`` kwargs.

        Temperature is floored at 0 and ``top_p`` clamped to ``[0, 1]``.
        Sampling is enabled only when the temperature reaches
        ``MIN_TEMPERATURE``; otherwise decoding is greedy and no sampling
        parameters are passed. ``max_tokens`` is floored at 1.

        Raises:
            RuntimeError: If the tokenizer has not been loaded.
        """
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer is not initialized")

        clamped_temperature = max(0.0, float(temperature))
        clamped_top_p = min(max(float(top_p), 0.0), 1.0)
        do_sample = clamped_temperature >= MIN_TEMPERATURE

        kwargs = {
            "max_new_tokens": max(1, int(max_tokens)),
            "do_sample": do_sample,
            "pad_token_id": self._tokenizer.pad_token_id,
            "eos_token_id": self._tokenizer.eos_token_id,
        }

        if do_sample:
            kwargs["temperature"] = max(clamped_temperature, MIN_TEMPERATURE)
            # A top_p of 0 is replaced by 1.0 (no nucleus filtering).
            kwargs["top_p"] = clamped_top_p if clamped_top_p > 0.0 else 1.0

        return kwargs

    def _tokenize_prompt(self, prompt: str) -> dict:
        """Tokenize ``prompt`` to PyTorch tensors, truncated to ``max_input_tokens``.

        Raises:
            RuntimeError: If the tokenizer has not been loaded.
        """
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer is not initialized")

        return self._tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.settings.max_input_tokens,
        )

    async def _ensure_ready(self) -> None:
        """Run ``startup`` if needed and verify that loading succeeded.

        Raises:
            RuntimeError: If the model or tokenizer is still missing afterwards.
        """
        if self._model is None or self._tokenizer is None:
            await self.startup()
        if self._model is None or self._tokenizer is None:
            raise RuntimeError("Model service failed to initialize")

    async def stream_text(
        self,
        messages: list[dict[str, str]],
        *,
        temperature: float,
        top_p: float,
        max_tokens: int,
    ):
        """Yield ``(request_id, text_delta)`` pairs for SSE streaming.

        ``generate`` runs in a worker thread and feeds a
        ``TextIteratorStreamer``; the deltas are read from it without blocking
        the event loop.

        Raises:
            RuntimeError: If the service is not initialized or generation fails.
        """
        await self._ensure_ready()
        request_id = f"chatcmpl-{uuid4().hex}"

        prompt = self._build_prompt(messages)
        inputs = self._tokenize_prompt(prompt)
        generation_kwargs = self._build_generation_kwargs(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
        streamer = TextIteratorStreamer(
            self._tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generation_error: Exception | None = None

        def run_generation() -> None:
            nonlocal generation_error
            try:
                with torch.inference_mode():
                    self._model.generate(
                        **inputs,
                        streamer=streamer,
                        **generation_kwargs,
                    )
            except Exception as exc:  # pragma: no cover - thread boundary guard
                # Catch everything here: an exception escaping the thread
                # would be lost and the consumer below would never wake up.
                generation_error = exc
                logger.exception("Streaming generation failed")
                # generate() signals end-of-stream only when it completes, so
                # push the stop signal ourselves; otherwise the consumer
                # blocks forever while holding the generation lock.
                streamer.text_queue.put(streamer.stop_signal)

        async with self._generation_lock:
            worker = Thread(target=run_generation)
            worker.start()
            iterator = iter(streamer)

            while True:
                # next() blocks on the streamer queue, so run it off the event loop.
                token = await asyncio.to_thread(next, iterator, None)
                if token is None:
                    break
                if generation_error is not None:
                    break
                yield request_id, token

            await asyncio.to_thread(worker.join)

        if generation_error is not None:
            raise RuntimeError("Streaming generation failed") from generation_error

    async def complete_text(
        self,
        messages: list[dict[str, str]],
        *,
        temperature: float,
        top_p: float,
        max_tokens: int,
    ) -> tuple[str, str]:
        """Generate the full completion in non-stream mode.

        Returns:
            ``(request_id, text)`` where ``text`` is the decoded continuation
            with the prompt tokens and special tokens removed, stripped of
            surrounding whitespace.

        Raises:
            RuntimeError: If the service is not initialized or tokenization
                yields no ``input_ids``.
        """
        await self._ensure_ready()
        request_id = f"chatcmpl-{uuid4().hex}"

        prompt = self._build_prompt(messages)
        inputs = self._tokenize_prompt(prompt)
        generation_kwargs = self._build_generation_kwargs(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )

        async with self._generation_lock:
            output_ids = await asyncio.to_thread(
                self._generate_sync,
                inputs,
                generation_kwargs,
            )

        input_ids = inputs.get("input_ids")
        if input_ids is None:
            raise RuntimeError("Tokenization failed to produce input_ids")
        # The output starts with the prompt tokens; keep only what follows them.
        prompt_token_count = int(input_ids.shape[-1])
        generated_ids = output_ids[0][prompt_token_count:]
        final_text = self._tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        return request_id, final_text

    def _generate_sync(self, inputs: dict, generation_kwargs: dict):
        """Run a blocking ``generate`` call under ``torch.inference_mode``.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        if self._model is None:
            raise RuntimeError("Model is not initialized")
        with torch.inference_mode():
            return self._model.generate(**inputs, **generation_kwargs)
app/services/prompt_service.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt and knowledge orchestration for KORA."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ from app.utils.config import Settings
9
+
10
+
11
@dataclass
class PromptService:
    """Builds the KORA system prompt and forces it onto incoming conversations."""

    # Supplies ``system_prompt_file`` and ``knowledge_file`` paths.
    settings: Settings

    def _read_file(self, path: Path) -> str:
        """Return the UTF-8 text of ``path`` without surrounding whitespace."""
        raw_text = path.read_text(encoding="utf-8")
        return raw_text.strip()

    def build_system_prompt(self) -> str:
        """Combine the core prompt file and the knowledge file into one block.

        Both files are read on every call.
        """
        identity_text = self._read_file(self.settings.system_prompt_file)
        knowledge_text = self._read_file(self.settings.knowledge_file)
        sections = [
            identity_text,
            "",
            "---",
            "KONTYRA KNOWLEDGE BASE (authoritative context):",
            knowledge_text,
        ]
        return "\n".join(sections)

    def inject_system_prompt(self, messages: list[dict[str, str]]) -> list[dict[str, str]]:
        """Return ``messages`` with the enforced system prompt placed first.

        Caller-supplied ``system`` messages are dropped so the identity rules
        cannot be overridden; all other messages keep their order.
        """
        kept_messages = []
        for message in messages:
            if message.get("role") == "system":
                continue
            kept_messages.append(message)

        system_message = {"role": "system", "content": self.build_system_prompt()}
        return [system_message] + kept_messages
app/utils/__init__.py ADDED
File without changes
app/utils/config.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Runtime configuration for the KORA backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+
8
+ from pydantic import Field
9
+ from pydantic_settings import BaseSettings, SettingsConfigDict
10
+
11
+ BASE_DIR = Path(__file__).resolve().parents[2]
12
+
13
+
14
class Settings(BaseSettings):
    """Typed runtime settings for the backend, each with a built-in default.

    Values are read from the environment and from a ``.env`` file located in
    ``BASE_DIR``; unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        extra="ignore",
        env_file_encoding="utf-8",
        env_file=str(BASE_DIR / ".env"),
    )

    # --- Application identity and logging ---
    app_name: str = Field("KORA AI Backend", alias="APP_NAME")
    app_env: str = Field("production", alias="APP_ENV")
    log_level: str = Field("INFO", alias="LOG_LEVEL")

    # --- Model loading and runtime limits ---
    model_name: str = Field("ProfessorCEO/KORA-v1", alias="MODEL_NAME")
    trust_remote_code: bool = Field(True, alias="TRUST_REMOTE_CODE")
    low_cpu_mem_usage: bool = Field(True, alias="LOW_CPU_MEM_USAGE")
    max_input_tokens: int = Field(3072, alias="MAX_INPUT_TOKENS")
    torch_num_threads: int = Field(0, alias="TORCH_NUM_THREADS")
    torch_num_interop_threads: int = Field(0, alias="TORCH_NUM_INTEROP_THREADS")

    # --- Sampling defaults applied when a request omits the field ---
    default_temperature: float = Field(0.7, alias="DEFAULT_TEMPERATURE")
    default_top_p: float = Field(0.9, alias="DEFAULT_TOP_P")
    default_max_tokens: int = Field(512, alias="DEFAULT_MAX_TOKENS")

    # --- Prompt assets, resolved relative to BASE_DIR ---
    system_prompt_file: Path = Field(BASE_DIR.joinpath("prompts", "system_prompt.txt"))
    knowledge_file: Path = Field(BASE_DIR.joinpath("knowledge", "kontyra.md"))
+
41
+
42
@lru_cache
def get_settings() -> Settings:
    """Build the ``Settings`` object on first call and reuse it afterwards."""
    instance = Settings()
    return instance
46
+
47
+
48
+ settings = get_settings()
knowledge/kontyra.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # KONTYRA
2
+
3
+ KONTYRA is a futuristic AI and technology company focused on building intelligent digital systems, modern software experiences, automation platforms, and AI-powered products.
4
+
5
+ KONTYRA operates as an innovation-driven technology ecosystem with strong emphasis on:
6
+
7
+ * artificial intelligence
8
+ * software engineering
9
+ * automation
10
+ * digital infrastructure
11
+ * intelligent systems
12
+ * modern user experiences
13
+ * scalable technology platforms
14
+
15
+ KONTYRA tagline:
16
+ “Continuous Intelligence. Limitless Impact.”
17
+
18
+ ## KORA
19
+
20
+ KORA is the official AI assistant and intelligence platform created by KONTYRA.
21
+
22
+ KORA is designed to:
23
+
24
+ * assist users intelligently
25
+ * provide conversational AI experiences
26
+ * support productivity
27
+ * help with learning and research
28
+ * assist developers and creators
29
+ * automate workflows
30
+ * deliver intelligent insights
31
+
32
+ KORA represents the intelligence layer of the KONTYRA ecosystem.
33
+
34
+ ## Brand Identity
35
+
36
+ KONTYRA focuses on:
37
+
38
+ * futuristic technology
39
+ * premium digital experiences
40
+ * AI-native systems
41
+ * scalable infrastructure
42
+ * modern innovation
43
+ * intelligent automation
44
+
45
+ The company vision is to build technology systems that combine:
46
+
47
+ * intelligence
48
+ * creativity
49
+ * usability
50
+ * automation
51
+ * adaptability
52
+
53
+ ## Current Projects
54
+
55
+ Current KONTYRA ecosystem projects include:
56
+
57
+ * KORA Chat
58
+ * KORA Daily
59
+ * AI-powered automation systems
60
+ * Telegram AI integrations
61
+ * intelligent content generation systems
62
+
63
+ ## KORA Personality
64
+
65
+ KORA should communicate in a:
66
+
67
+ * intelligent
68
+ * modern
69
+ * concise
70
+ * technically strong
71
+ * calm
72
+ * futuristic
73
+ * helpful
74
+
75
+ manner.
76
+
77
+ KORA should avoid:
78
+
79
+ * robotic communication
80
+ * childish tone
81
+ * excessive slang
82
+ * confusing explanations
83
+
84
+ ## Important Identity Rules
85
+
86
+ KORA must:
87
+
88
+ * always identify itself as KORA when relevant
89
+ * identify KONTYRA as its creator company
90
+ * never claim to be ChatGPT
91
+ * maintain premium AI assistant behavior
92
+ * communicate confidently and clearly
93
+
94
+ ## Technical Stack
95
+
96
+ The KORA ecosystem currently uses technologies including:
97
+
98
+ * FastAPI
99
+ * Next.js
100
+ * Docker
101
+ * Hugging Face Spaces
102
+ * vLLM
103
+ * Phi-3-mini-4k-instruct
104
+ * Telegram Bot API
105
+ * Tailwind CSS
106
+
107
+ ## Long-Term Vision
108
+
109
+ KONTYRA aims to evolve into a comprehensive AI-native technology ecosystem focused on:
110
+
111
+ * conversational AI
112
+ * intelligent automation
113
+ * AI productivity systems
114
+ * modern software infrastructure
115
+ * adaptive digital experiences
116
+
117
+ The goal is to create scalable intelligent systems that improve how users interact with technology daily.
main.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI entrypoint for the KORA AI backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from contextlib import asynccontextmanager
7
+
8
+ from fastapi import FastAPI
9
+
10
+ from app.routes.chat import router as chat_router
11
+ from app.services.model_service import ModelService
12
+ from app.services.prompt_service import PromptService
13
+ from app.utils.config import settings
14
+
15
+
16
def configure_logging() -> None:
    """Set up root logging from ``settings.log_level``.

    The level name is upper-cased and looked up on the ``logging`` module;
    names that do not exist there fall back to ``logging.INFO``.
    """
    level_name = settings.log_level.upper()
    resolved_level = getattr(logging, level_name, logging.INFO)
    log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    logging.basicConfig(level=resolved_level, format=log_format)
22
+
23
+
24
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared services at startup and release the model at shutdown.

    The prompt and model services are stored on ``app.state`` so route
    handlers can reach them; the model is loaded before the app starts
    serving.
    """
    configure_logging()

    prompts = PromptService(settings=settings)
    models = ModelService(settings=settings)
    app.state.prompt_service = prompts
    app.state.model_service = models

    await models.startup()
    startup_logger = logging.getLogger(__name__)
    startup_logger.info("KORA backend initialized")

    try:
        yield
    finally:
        # Runs on shutdown even if the application exits with an error.
        await models.shutdown()
42
+
43
+
44
+ app = FastAPI(
45
+ title=settings.app_name,
46
+ version="1.0.0",
47
+ lifespan=lifespan,
48
+ )
49
+
50
+ app.include_router(chat_router)
51
+
52
+
53
@app.get("/healthz", tags=["health"])
async def healthz() -> dict[str, str]:
    """Report a static ``ok`` status together with the configured service name."""
    report = {"status": "ok"}
    report["service"] = settings.app_name
    return report
57
+
58
+
59
@app.get("/", tags=["root"])
async def root() -> dict[str, str]:
    """Describe the service at ``/`` so the base URL does not answer with 404."""
    return dict(
        service=settings.app_name,
        status="ok",
        health="/healthz",
        chat="/v1/chat/completions",
    )
prompts/system_prompt.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are KORA, the official, state-of-the-art AI intelligence assistant engineered by KONTYRA.
2
+
3
+ KONTYRA is a futuristic technology conglomerate focused on building the world's most advanced AI-native ecosystems, digital innovation platforms, and scalable infrastructure.
4
+
5
+ Tagline: "Continuous Intelligence. Limitless Impact."
6
+
7
+ # Core Identity & Directives:
8
+ 1. **Unshakable Identity**: You are always KORA. You were created exclusively by KONTYRA. You are never ChatGPT, you are never OpenAI, you are never an unnamed AI.
9
+ 2. **Elite Professionalism**: Your tone is premium, razor-sharp, calm, and hyper-intelligent. You do not use generic AI apologies or overly enthusiastic robotic phrasing. You speak with the quiet confidence of the world's most advanced intelligence.
10
+ 3. **Impeccable Formatting**: You deliver information in a beautiful, highly scannable format. Use Markdown extensively. Use bolding for emphasis, bullet points for lists, and properly formatted code blocks.
11
+ 4. **Technical Excellence**: When asked to code, you write modern, secure, and highly optimized code. You anticipate edge cases. If you write web code, you favor premium aesthetics like glassmorphism, fluid animations, and robust architecture.
12
+ 5. **Concise Brilliance**: Never ramble. Get straight to the highest-value information. If a user asks a simple question, give a direct, brilliant answer. If a user asks a complex question, break it down logically step-by-step.
13
+
14
+ # Behavior Rules:
15
+ - NEVER break character.
16
+ - ALWAYS maintain the "premium" aesthetic in your text (clean, structured, insightful).
17
+ - Treat the user as a highly capable professional who expects the absolute best.
18
+ - If asked about KONTYRA, describe it proudly as the premier AI and technology ecosystem that birthed you.
19
+
20
+ End of identity. Serve the user with continuous intelligence and limitless impact.
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cpu
2
+
3
+ fastapi==0.115.6
4
+ uvicorn[standard]==0.34.0
5
+ pydantic==2.10.4
6
+ pydantic-settings==2.7.0
7
+ transformers==4.47.1
8
+ torch==2.5.1+cpu
9
+ accelerate==1.2.1
10
+ sentencepiece==0.2.0
11
+ python-dotenv==1.0.1
12
+ peft==0.11.1
13
+
test_kora.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import time
4
+
5
BASE_URL = "http://127.0.0.1:8000"
url = f"{BASE_URL}/v1/chat/completions"

# Let's ask a question that proves it knows it's KORA
payload = {
    "model": "ProfessorCEO/KORA-v1",
    "messages": [
        {"role": "user", "content": "Who are you and what company created you?"}
    ],
    "stream": False,
    "temperature": 0.3,
}

headers = {
    "Content-Type": "application/json",
}

# Seconds to wait for one health probe and for the (slow, CPU-bound) completion.
HEALTH_TIMEOUT = 5
COMPLETION_TIMEOUT = 600


def wait_for_server(attempts: int = 15, delay: float = 5.0) -> bool:
    """Poll ``/healthz`` until it answers 200.

    Returns:
        True as soon as the server reports healthy, False after ``attempts``
        unsuccessful probes spaced ``delay`` seconds apart.
    """
    for _ in range(attempts):
        try:
            health = requests.get(f"{BASE_URL}/healthz", timeout=HEALTH_TIMEOUT)
            if health.status_code == 200:
                return True
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Server not accepting connections yet; retry after the delay.
            pass
        time.sleep(delay)
    return False


def main() -> int:
    """Send one non-streaming chat request and print the reply.

    Returns:
        Process exit code: 0 once a response was received and printed
        (including non-200 responses, which are printed as errors), 1 when
        the server never became healthy.
    """
    print("Waiting for KORA server to be ready...")
    if not wait_for_server():
        print("Server failed to start in time.")
        return 1
    print("Server is up! Sending request to KORA...")

    print("\nQuestion:", payload["messages"][0]["content"])

    response = requests.post(url, json=payload, headers=headers, timeout=COMPLETION_TIMEOUT)

    if response.status_code == 200:
        data = response.json()
        print("\nKORA says:")
        print("-------------------------")
        print(data["choices"][0]["message"]["content"])
        print("-------------------------")
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
    return 0


# Guarded entry point: the file name matches pytest's ``test_*.py`` pattern, so
# importing it during test collection must not trigger network calls.
if __name__ == "__main__":
    raise SystemExit(main())