Spaces:
Running
Running
Commit ·
e251d62
0
Parent(s):
Deploy API
Browse files- .gitignore +5 -0
- Dockerfile +17 -0
- README.md +105 -0
- app.py +5 -0
- app/__init__.py +0 -0
- app/routes/__init__.py +0 -0
- app/routes/chat.py +141 -0
- app/services/__init__.py +0 -0
- app/services/model_service.py +231 -0
- app/services/prompt_service.py +35 -0
- app/utils/__init__.py +0 -0
- app/utils/config.py +48 -0
- knowledge/kontyra.md +117 -0
- main.py +67 -0
- prompts/system_prompt.txt +20 -0
- requirements.txt +13 -0
- test_kora.py +47 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv/
|
| 2 |
+
.venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the KORA FastAPI backend.
FROM python:3.11-slim

# Runtime environment:
#   PYTHONDONTWRITEBYTECODE - do not write .pyc files
#   PYTHONUNBUFFERED        - unbuffered stdout/stderr so logs appear immediately
#   PIP_NO_CACHE_DIR        - keep pip's download cache out of the image
#   HF_HOME                 - Hugging Face cache directory
#                             NOTE(review): assumes /data exists and is writable - confirm
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HOME=/data/.cache/huggingface

WORKDIR /app

# Install dependencies before copying the sources so this layer is reused
# when only application code changes.
COPY requirements.txt ./
RUN pip install --upgrade pip && pip install -r requirements.txt

COPY . .

EXPOSE 7860

# Run through a shell so ${PORT} is expanded at start-up; falls back to 7860.
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
|
README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KORA API Backend (FastAPI + Transformers)
|
| 2 |
+
|
| 3 |
+
Production-ready AI backend scaffold for KORA, optimized for Docker and Hugging Face Docker Spaces.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- OpenAI-compatible endpoint: `POST /v1/chat/completions`
|
| 8 |
+
- Streaming token responses using SSE
|
| 9 |
+
- CPU-safe Transformers + PyTorch runtime with `microsoft/Phi-3-mini-4k-instruct`
|
| 10 |
+
- Automatic KORA system-prompt injection on every request
|
| 11 |
+
- KONTYRA knowledge-base context injection
|
| 12 |
+
- Async architecture with centralized logging
|
| 13 |
+
- Environment-driven configuration
|
| 14 |
+
|
| 15 |
+
## Folder Structure
|
| 16 |
+
|
| 17 |
+
```text
|
| 18 |
+
apps/api/
|
| 19 |
+
├── app/
|
| 20 |
+
│ ├── routes/
|
| 21 |
+
│ │ └── chat.py
|
| 22 |
+
│ ├── services/
|
| 23 |
+
│ │ ├── model_service.py
|
| 24 |
+
│ │ └── prompt_service.py
|
| 25 |
+
│ └── utils/
|
| 26 |
+
│ └── config.py
|
| 27 |
+
├── prompts/
|
| 28 |
+
│ └── system_prompt.txt
|
| 29 |
+
├── knowledge/
|
| 30 |
+
│ └── kontyra.md
|
| 31 |
+
├── Dockerfile
|
| 32 |
+
├── main.py
|
| 33 |
+
└── requirements.txt
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Environment Variables
|
| 37 |
+
|
| 38 |
+
Create `.env` in `apps/api/` (optional):
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
APP_NAME="KORA AI Backend"
|
| 42 |
+
APP_ENV="production"
|
| 43 |
+
LOG_LEVEL="INFO"
|
| 44 |
+
MODEL_NAME="ProfessorCEO/KORA-v1"  # PEFT adapter repo, applied on top of the microsoft/Phi-3-mini-4k-instruct base model
|
| 45 |
+
TRUST_REMOTE_CODE=true
|
| 46 |
+
LOW_CPU_MEM_USAGE=true
|
| 47 |
+
MAX_INPUT_TOKENS=3072
|
| 48 |
+
TORCH_NUM_THREADS=0
|
| 49 |
+
TORCH_NUM_INTEROP_THREADS=0
|
| 50 |
+
DEFAULT_TEMPERATURE=0.7
|
| 51 |
+
DEFAULT_TOP_P=0.9
|
| 52 |
+
DEFAULT_MAX_TOKENS=512
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Local Setup
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
cd apps/api
|
| 59 |
+
python -m venv .venv
|
| 60 |
+
source .venv/bin/activate
|
| 61 |
+
pip install -r requirements.txt
|
| 62 |
+
uvicorn main:app --host 0.0.0.0 --port 8000
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Docker Build + Run
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
cd apps/api
|
| 69 |
+
docker build -t kora-api .
|
| 70 |
+
docker run --rm -p 7860:7860 \
|
| 71 |
+
-e PORT=7860 \
|
| 72 |
+
-e MODEL_NAME=ProfessorCEO/KORA-v1 \
|
| 73 |
+
kora-api
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
## Hugging Face Docker Spaces Startup
|
| 77 |
+
|
| 78 |
+
Use the included `Dockerfile`. Hugging Face sets `PORT`, and the container starts with:
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
## API Example
|
| 85 |
+
|
| 86 |
+
### Request
|
| 87 |
+
|
| 88 |
+
```json
|
| 89 |
+
{
|
| 90 |
+
"model": "microsoft/Phi-3-mini-4k-instruct",
|
| 91 |
+
"stream": true,
|
| 92 |
+
"messages": [
|
| 93 |
+
{"role": "user", "content": "Who created KORA?"}
|
| 94 |
+
]
|
| 95 |
+
}
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### Endpoint
|
| 99 |
+
|
| 100 |
+
```http
|
| 101 |
+
POST /v1/chat/completions
|
| 102 |
+
Content-Type: application/json
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
Streaming responses are returned as SSE `data:` events compatible with OpenAI-style chunk consumption.
|
app.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility entrypoint for Hugging Face Spaces and ASGI hosts."""
|
| 2 |
+
|
| 3 |
+
from main import app
|
| 4 |
+
|
| 5 |
+
__all__ = ["app"]
|
app/__init__.py
ADDED
|
File without changes
|
app/routes/__init__.py
ADDED
|
File without changes
|
app/routes/chat.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenAI-compatible chat completion routes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import time
|
| 8 |
+
from typing import Literal
|
| 9 |
+
|
| 10 |
+
from fastapi import APIRouter, Request
|
| 11 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 12 |
+
from pydantic import BaseModel, Field
|
| 13 |
+
|
| 14 |
+
from app.utils.config import settings
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
router = APIRouter(tags=["chat"])
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ChatMessage(BaseModel):
    """One chat turn in the OpenAI message format (``role`` + ``content``)."""

    # Speaker of the turn; any value other than these three fails validation.
    role: Literal["system", "user", "assistant"]
    # Text of the turn.
    content: str
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ChatCompletionRequest(BaseModel):
    """Subset of the OpenAI chat completion request body accepted by this API."""

    # Model label; defaults to ``settings.model_name``, read when the request
    # object is created.
    model: str = Field(default_factory=lambda: settings.model_name)
    # Conversation so far.
    messages: list[ChatMessage]
    # When true the response is streamed as SSE chunks.
    stream: bool = False
    # Sampling overrides; ``None`` means "not provided by the client".
    temperature: float | None = None
    top_p: float | None = None
    # Generation budget; must be at least 1 when provided.
    max_tokens: int | None = Field(default=None, ge=1)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _sse_event(payload: dict) -> str:
|
| 39 |
+
"""Format one SSE data event."""
|
| 40 |
+
return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@router.post("/v1/chat/completions")
async def create_chat_completion(request: Request, body: ChatCompletionRequest):
    """Serve an OpenAI-compatible chat completion, optionally streamed over SSE.

    The enforced system prompt is injected through ``app.state.prompt_service``
    and generation is delegated to ``app.state.model_service``. Sampling
    parameters omitted from the request fall back to the configured defaults.

    Args:
        request: Incoming request; used only to reach the shared services on
            ``request.app.state``.
        body: Validated chat completion request.

    Returns:
        A ``StreamingResponse`` of SSE chunks terminated by ``data: [DONE]``
        when ``body.stream`` is true, otherwise a ``JSONResponse`` with the
        full completion. A generation failure is reported as an ``error``
        payload: an SSE event when streaming, an HTTP 500 JSON body otherwise.
    """
    # Local import: the stream id is minted here so every chunk can share it.
    from uuid import uuid4

    prompt_service = request.app.state.prompt_service
    model_service = request.app.state.model_service

    injected_messages = prompt_service.inject_system_prompt(
        [message.model_dump() for message in body.messages]
    )

    temperature = body.temperature if body.temperature is not None else settings.default_temperature
    top_p = body.top_p if body.top_p is not None else settings.default_top_p
    max_tokens = body.max_tokens if body.max_tokens is not None else settings.default_max_tokens

    created = int(time.time())

    if body.stream:
        # All chunks of one completion carry the same id, including the
        # initial role chunk that is sent before the model produces anything.
        stream_id = f"chatcmpl-{uuid4().hex}"

        def chunk_event(delta: dict, finish_reason: str | None = None) -> str:
            """Format one ``chat.completion.chunk`` SSE event for this stream."""
            return _sse_event(
                {
                    "id": stream_id,
                    "object": "chat.completion.chunk",
                    "created": created,
                    "model": body.model,
                    "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
                }
            )

        async def event_generator():
            try:
                # Initial chunk with assistant role to follow OpenAI streaming style.
                yield chunk_event({"role": "assistant"})

                async for _service_request_id, delta in model_service.stream_text(
                    injected_messages,
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_tokens,
                ):
                    yield chunk_event({"content": delta})

                yield chunk_event({}, "stop")
                yield "data: [DONE]\n\n"

            except Exception:  # pragma: no cover - runtime guard
                # Response headers are already sent, so the failure can only be
                # reported in-band; always close the stream with [DONE].
                logger.exception("Failed to stream completion for request")
                error_payload = {
                    "error": {
                        "message": "Failed to stream completion for request",
                        "type": "server_error",
                    }
                }
                yield _sse_event(error_payload)
                yield "data: [DONE]\n\n"

        return StreamingResponse(
            event_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    try:
        request_id, text = await model_service.complete_text(
            injected_messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
    except Exception:  # pragma: no cover - runtime guard
        # Mirror the streaming path: log and answer with a structured error.
        logger.exception("Failed to generate completion for request")
        return JSONResponse(
            {
                "error": {
                    "message": "Failed to generate completion for request",
                    "type": "server_error",
                }
            },
            status_code=500,
        )

    response_payload = {
        "id": request_id,
        "object": "chat.completion",
        "created": created,
        "model": body.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
    }
    return JSONResponse(response_payload)
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/model_service.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Transformers + PyTorch text generation service for KORA."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
import logging
|
| 7 |
+
from threading import Thread
|
| 8 |
+
from uuid import uuid4
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
| 12 |
+
|
| 13 |
+
from app.utils.config import Settings
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
MIN_TEMPERATURE = 1e-5


class ModelService:
    """Manages the Transformers model lifecycle and serialized CPU generation.

    The tokenizer and base weights come from ``BASE_MODEL_ID``;
    ``settings.model_name`` is loaded on top of them as a PEFT adapter.
    Generation requests are serialized with an ``asyncio.Lock``.
    """

    # Base checkpoint used for both the tokenizer and the base weights.
    BASE_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self._model = None
        self._tokenizer = None
        self._startup_lock = asyncio.Lock()
        self._generation_lock = asyncio.Lock()

    async def startup(self) -> None:
        """Load tokenizer, base model and PEFT adapter once per process.

        Returns immediately when both are already loaded; the check is repeated
        under ``_startup_lock`` so concurrent callers load only once.
        """
        if self._model is not None and self._tokenizer is not None:
            return

        async with self._startup_lock:
            if self._model is not None and self._tokenizer is not None:
                return

            # A value of 0 (the default) leaves torch's own thread settings alone.
            if self.settings.torch_num_threads > 0:
                torch.set_num_threads(self.settings.torch_num_threads)
            if self.settings.torch_num_interop_threads > 0:
                torch.set_num_interop_threads(self.settings.torch_num_interop_threads)

            logger.info("Loading model via Transformers on CPU: %s", self.settings.model_name)
            self._tokenizer = AutoTokenizer.from_pretrained(
                self.BASE_MODEL_ID,
                trust_remote_code=self.settings.trust_remote_code,
                use_fast=True,
            )
            # generate() is given a pad token id; fall back to EOS when none is set.
            if self._tokenizer.pad_token is None and self._tokenizer.eos_token is not None:
                self._tokenizer.pad_token = self._tokenizer.eos_token

            from peft import PeftModel

            logger.info("Loading base model...")
            base_model = AutoModelForCausalLM.from_pretrained(
                self.BASE_MODEL_ID,
                trust_remote_code=self.settings.trust_remote_code,
                torch_dtype=torch.float32,
                device_map="cpu",
            )
            logger.info("Applying PEFT adapter...")
            self._model = PeftModel.from_pretrained(base_model, self.settings.model_name)

            self._model.eval()
            logger.info("CPU model and tokenizer initialized")

    async def shutdown(self) -> None:
        """Drop the references to the model and tokenizer."""
        self._model = None
        self._tokenizer = None

    def _build_prompt(self, messages: list[dict[str, str]]) -> str:
        """Render OpenAI-style messages into a single prompt string.

        Uses the tokenizer's chat template; if that raises ``AttributeError``,
        ``TypeError`` or ``ValueError`` a plain ``ROLE: content`` transcript
        ending in ``ASSISTANT:`` is used instead.

        Raises:
            RuntimeError: If the tokenizer has not been loaded.
        """
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer is not initialized")

        try:
            return self._tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        except (AttributeError, TypeError, ValueError):
            logger.warning("Chat template not supported, using fallback format", exc_info=True)
            # Fallback for tokenizer templates that may not support message format.
            lines = [f"{m['role'].upper()}: {m['content']}" for m in messages]
            lines.append("ASSISTANT:")
            return "\n".join(lines)

    def _build_generation_kwargs(
        self,
        *,
        temperature: float,
        top_p: float,
        max_tokens: int,
    ) -> dict:
        """Translate request sampling parameters into ``generate()`` kwargs.

        Temperature is floored at 0 and ``top_p`` clamped to [0, 1]. Sampling
        is enabled only when the temperature is at least ``MIN_TEMPERATURE``;
        otherwise no sampling kwargs are passed. ``max_tokens`` is floored at 1.

        Raises:
            RuntimeError: If the tokenizer has not been loaded.
        """
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer is not initialized")

        clamped_temperature = max(0.0, float(temperature))
        clamped_top_p = min(max(float(top_p), 0.0), 1.0)
        do_sample = clamped_temperature >= MIN_TEMPERATURE

        kwargs = {
            "max_new_tokens": max(1, int(max_tokens)),
            "do_sample": do_sample,
            "pad_token_id": self._tokenizer.pad_token_id,
            "eos_token_id": self._tokenizer.eos_token_id,
        }

        if do_sample:
            # do_sample already guarantees temperature >= MIN_TEMPERATURE.
            kwargs["temperature"] = clamped_temperature
            # A top_p of 0 is replaced by 1.0 (no nucleus filtering).
            kwargs["top_p"] = clamped_top_p if clamped_top_p > 0.0 else 1.0

        return kwargs

    def _tokenize_prompt(self, prompt: str) -> dict:
        """Tokenize *prompt* into PyTorch tensors, capped at ``max_input_tokens``.

        NOTE(review): truncation uses the tokenizer's configured truncation
        side; if that is the right side, an over-long prompt loses its most
        recent turns and the generation prompt - confirm this is intended.

        Raises:
            RuntimeError: If the tokenizer has not been loaded.
        """
        if self._tokenizer is None:
            raise RuntimeError("Tokenizer is not initialized")

        return self._tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.settings.max_input_tokens,
        )

    async def _ensure_ready(self) -> None:
        """Load the model on demand and fail loudly if it is still missing.

        Raises:
            RuntimeError: If the model or tokenizer is unavailable after startup.
        """
        if self._model is None or self._tokenizer is None:
            await self.startup()
        if self._model is None or self._tokenizer is None:
            raise RuntimeError("Model service failed to initialize")

    async def stream_text(
        self,
        messages: list[dict[str, str]],
        *,
        temperature: float,
        top_p: float,
        max_tokens: int,
    ):
        """Yield ``(request_id, text_delta)`` pairs as the model generates.

        Generation runs in a worker thread that feeds a
        ``TextIteratorStreamer``; this coroutine drains the streamer without
        blocking the event loop. The generation lock is held for the whole
        stream.

        Raises:
            RuntimeError: If the service cannot initialize or generation fails.
        """
        await self._ensure_ready()
        request_id = f"chatcmpl-{uuid4().hex}"

        prompt = self._build_prompt(messages)
        inputs = self._tokenize_prompt(prompt)
        generation_kwargs = self._build_generation_kwargs(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
        streamer = TextIteratorStreamer(
            self._tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generation_error: Exception | None = None

        def run_generation() -> None:
            nonlocal generation_error
            try:
                with torch.inference_mode():
                    self._model.generate(
                        **inputs,
                        streamer=streamer,
                        **generation_kwargs,
                    )
            except Exception as exc:  # pragma: no cover - runtime guard
                # Thread boundary: record any failure for the consumer to re-raise.
                generation_error = exc
                logger.exception("Streaming generation failed")
                # A failed generate() does not close the streamer; end it here
                # so the consumer is not left waiting on the queue forever
                # while holding the generation lock.
                streamer.end()

        async with self._generation_lock:
            worker = Thread(target=run_generation)
            worker.start()
            iterator = iter(streamer)

            while True:
                # next() blocks on the streamer queue, so run it off the loop.
                token = await asyncio.to_thread(next, iterator, None)
                if token is None:
                    break
                if generation_error is not None:
                    break
                yield request_id, token

            await asyncio.to_thread(worker.join)

        if generation_error is not None:
            raise RuntimeError("Streaming generation failed") from generation_error

    async def complete_text(
        self,
        messages: list[dict[str, str]],
        *,
        temperature: float,
        top_p: float,
        max_tokens: int,
    ) -> tuple[str, str]:
        """Generate the whole completion at once.

        Returns:
            ``(request_id, text)`` where *text* is the decoded completion with
            the prompt tokens and special tokens removed, stripped of
            surrounding whitespace.

        Raises:
            RuntimeError: If the service cannot initialize or tokenization
                yields no ``input_ids``.
        """
        await self._ensure_ready()
        request_id = f"chatcmpl-{uuid4().hex}"

        prompt = self._build_prompt(messages)
        inputs = self._tokenize_prompt(prompt)
        # Validate before spending time on generation.
        input_ids = inputs.get("input_ids")
        if input_ids is None:
            raise RuntimeError("Tokenization failed to produce input_ids")
        prompt_token_count = int(input_ids.shape[-1])

        generation_kwargs = self._build_generation_kwargs(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )

        async with self._generation_lock:
            output_ids = await asyncio.to_thread(
                self._generate_sync,
                inputs,
                generation_kwargs,
            )

        # The output starts with the prompt tokens; keep only what follows.
        generated_ids = output_ids[0][prompt_token_count:]
        final_text = self._tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        return request_id, final_text

    def _generate_sync(self, inputs: dict, generation_kwargs: dict):
        """Run a blocking ``generate()`` call under ``torch.inference_mode``.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        if self._model is None:
            raise RuntimeError("Model is not initialized")
        with torch.inference_mode():
            return self._model.generate(**inputs, **generation_kwargs)
|
app/services/prompt_service.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System prompt and knowledge orchestration for KORA."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from app.utils.config import Settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class PromptService:
    """Builds the enforced KORA system message from the prompt and knowledge files."""

    settings: Settings

    def _read_file(self, path: Path) -> str:
        """Return the UTF-8 text of *path* with surrounding whitespace removed."""
        raw_text = path.read_text(encoding="utf-8")
        return raw_text.strip()

    def build_system_prompt(self) -> str:
        """Assemble the core prompt and the knowledge base into one instruction block.

        Both files are read on every call.
        """
        sections = [
            self._read_file(self.settings.system_prompt_file),
            "",
            "---",
            "KONTYRA KNOWLEDGE BASE (authoritative context):",
            self._read_file(self.settings.knowledge_file),
        ]
        return "\n".join(sections)

    def inject_system_prompt(self, messages: list[dict[str, str]]) -> list[dict[str, str]]:
        """Return *messages* with the enforced system prompt as the first entry.

        Caller-supplied ``system`` messages are dropped so the identity rules
        cannot be overridden; the input list itself is not modified.
        """
        conversation = [entry for entry in messages if entry.get("role") != "system"]
        system_entry = {"role": "system", "content": self.build_system_prompt()}
        return [system_entry] + conversation
|
app/utils/__init__.py
ADDED
|
File without changes
|
app/utils/config.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Runtime configuration for the KORA backend."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from pydantic import Field
|
| 9 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 10 |
+
|
| 11 |
+
BASE_DIR = Path(__file__).resolve().parents[2]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Settings(BaseSettings):
    """Environment-driven settings with safe defaults.

    Values are read from the process environment and from an optional ``.env``
    file in ``BASE_DIR``; unknown keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=str(BASE_DIR / ".env"),
        env_file_encoding="utf-8",
        extra="ignore",
        # ``model_name`` is a deliberate field name; leave the ``model_`` prefix
        # out of the protected namespaces so releases that still protect it do
        # not emit a UserWarning on import.
        protected_namespaces=("settings_",),
    )

    # Application identity and logging.
    app_name: str = Field(default="KORA AI Backend", alias="APP_NAME")
    app_env: str = Field(default="production", alias="APP_ENV")
    log_level: str = Field(default="INFO", alias="LOG_LEVEL")

    # Model loading and tokenization limits. For the two thread counts the
    # model service only applies values greater than 0.
    model_name: str = Field(default="ProfessorCEO/KORA-v1", alias="MODEL_NAME")
    trust_remote_code: bool = Field(default=True, alias="TRUST_REMOTE_CODE")
    low_cpu_mem_usage: bool = Field(default=True, alias="LOW_CPU_MEM_USAGE")
    max_input_tokens: int = Field(default=3072, alias="MAX_INPUT_TOKENS")
    torch_num_threads: int = Field(default=0, alias="TORCH_NUM_THREADS")
    torch_num_interop_threads: int = Field(default=0, alias="TORCH_NUM_INTEROP_THREADS")

    # Sampling defaults used when a request omits the parameter.
    default_temperature: float = Field(default=0.7, alias="DEFAULT_TEMPERATURE")
    default_top_p: float = Field(default=0.9, alias="DEFAULT_TOP_P")
    default_max_tokens: int = Field(default=512, alias="DEFAULT_MAX_TOKENS")

    # Files combined into the injected system prompt.
    system_prompt_file: Path = Field(default=BASE_DIR / "prompts" / "system_prompt.txt")
    knowledge_file: Path = Field(default=BASE_DIR / "knowledge" / "kontyra.md")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@lru_cache
def get_settings() -> Settings:
    """Build the settings object on first use and reuse it on later calls."""
    loaded = Settings()
    return loaded
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
settings = get_settings()
|
knowledge/kontyra.md
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# KONTYRA
|
| 2 |
+
|
| 3 |
+
KONTYRA is a futuristic AI and technology company focused on building intelligent digital systems, modern software experiences, automation platforms, and AI-powered products.
|
| 4 |
+
|
| 5 |
+
KONTYRA operates as an innovation-driven technology ecosystem with strong emphasis on:
|
| 6 |
+
|
| 7 |
+
* artificial intelligence
|
| 8 |
+
* software engineering
|
| 9 |
+
* automation
|
| 10 |
+
* digital infrastructure
|
| 11 |
+
* intelligent systems
|
| 12 |
+
* modern user experiences
|
| 13 |
+
* scalable technology platforms
|
| 14 |
+
|
| 15 |
+
KONTYRA tagline:
|
| 16 |
+
“Continuous Intelligence. Limitless Impact.”
|
| 17 |
+
|
| 18 |
+
## KORA
|
| 19 |
+
|
| 20 |
+
KORA is the official AI assistant and intelligence platform created by KONTYRA.
|
| 21 |
+
|
| 22 |
+
KORA is designed to:
|
| 23 |
+
|
| 24 |
+
* assist users intelligently
|
| 25 |
+
* provide conversational AI experiences
|
| 26 |
+
* support productivity
|
| 27 |
+
* help with learning and research
|
| 28 |
+
* assist developers and creators
|
| 29 |
+
* automate workflows
|
| 30 |
+
* deliver intelligent insights
|
| 31 |
+
|
| 32 |
+
KORA represents the intelligence layer of the KONTYRA ecosystem.
|
| 33 |
+
|
| 34 |
+
## Brand Identity
|
| 35 |
+
|
| 36 |
+
KONTYRA focuses on:
|
| 37 |
+
|
| 38 |
+
* futuristic technology
|
| 39 |
+
* premium digital experiences
|
| 40 |
+
* AI-native systems
|
| 41 |
+
* scalable infrastructure
|
| 42 |
+
* modern innovation
|
| 43 |
+
* intelligent automation
|
| 44 |
+
|
| 45 |
+
The company vision is to build technology systems that combine:
|
| 46 |
+
|
| 47 |
+
* intelligence
|
| 48 |
+
* creativity
|
| 49 |
+
* usability
|
| 50 |
+
* automation
|
| 51 |
+
* adaptability
|
| 52 |
+
|
| 53 |
+
## Current Projects
|
| 54 |
+
|
| 55 |
+
Current KONTYRA ecosystem projects include:
|
| 56 |
+
|
| 57 |
+
* KORA Chat
|
| 58 |
+
* KORA Daily
|
| 59 |
+
* AI-powered automation systems
|
| 60 |
+
* Telegram AI integrations
|
| 61 |
+
* intelligent content generation systems
|
| 62 |
+
|
| 63 |
+
## KORA Personality
|
| 64 |
+
|
| 65 |
+
KORA should communicate in a:
|
| 66 |
+
|
| 67 |
+
* intelligent
|
| 68 |
+
* modern
|
| 69 |
+
* concise
|
| 70 |
+
* technically strong
|
| 71 |
+
* calm
|
| 72 |
+
* futuristic
|
| 73 |
+
* helpful
|
| 74 |
+
|
| 75 |
+
manner.
|
| 76 |
+
|
| 77 |
+
KORA should avoid:
|
| 78 |
+
|
| 79 |
+
* robotic communication
|
| 80 |
+
* childish tone
|
| 81 |
+
* excessive slang
|
| 82 |
+
* confusing explanations
|
| 83 |
+
|
| 84 |
+
## Important Identity Rules
|
| 85 |
+
|
| 86 |
+
KORA must:
|
| 87 |
+
|
| 88 |
+
* always identify itself as KORA when relevant
|
| 89 |
+
* identify KONTYRA as its creator company
|
| 90 |
+
* never claim to be ChatGPT
|
| 91 |
+
* maintain premium AI assistant behavior
|
| 92 |
+
* communicate confidently and clearly
|
| 93 |
+
|
| 94 |
+
## Technical Stack
|
| 95 |
+
|
| 96 |
+
The KORA ecosystem currently uses technologies including:
|
| 97 |
+
|
| 98 |
+
* FastAPI
|
| 99 |
+
* Next.js
|
| 100 |
+
* Docker
|
| 101 |
+
* Hugging Face Spaces
|
| 102 |
+
* vLLM
|
| 103 |
+
* Phi-3-mini-4k-instruct
|
| 104 |
+
* Telegram Bot API
|
| 105 |
+
* Tailwind CSS
|
| 106 |
+
|
| 107 |
+
## Long-Term Vision
|
| 108 |
+
|
| 109 |
+
KONTYRA aims to evolve into a comprehensive AI-native technology ecosystem focused on:
|
| 110 |
+
|
| 111 |
+
* conversational AI
|
| 112 |
+
* intelligent automation
|
| 113 |
+
* AI productivity systems
|
| 114 |
+
* modern software infrastructure
|
| 115 |
+
* adaptive digital experiences
|
| 116 |
+
|
| 117 |
+
The goal is to create scalable intelligent systems that improve how users interact with technology daily.
|
main.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI entrypoint for the KORA AI backend."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from contextlib import asynccontextmanager
|
| 7 |
+
|
| 8 |
+
from fastapi import FastAPI
|
| 9 |
+
|
| 10 |
+
from app.routes.chat import router as chat_router
|
| 11 |
+
from app.services.model_service import ModelService
|
| 12 |
+
from app.services.prompt_service import PromptService
|
| 13 |
+
from app.utils.config import settings
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def configure_logging() -> None:
    """Set up root logging from ``settings.log_level``.

    A level name that ``logging`` does not define falls back to ``INFO``.
    """
    level_name = settings.log_level.upper()
    resolved_level = getattr(logging, level_name, logging.INFO)
    logging.basicConfig(
        level=resolved_level,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the shared services, expose them on ``app.state``, and tear down.

    Logging is configured first. A ``PromptService`` and a ``ModelService``
    are then built from ``settings`` and stored on ``app.state`` as
    ``prompt_service`` and ``model_service``. The model service is started
    before control is handed to the application, and it is shut down when
    the application exits, including when the exit is caused by an error.
    """
    configure_logging()

    # Build both services before publishing either one on app.state.
    prompts, model = PromptService(settings=settings), ModelService(settings=settings)
    app.state.prompt_service = prompts
    app.state.model_service = model

    await model.startup()
    startup_logger = logging.getLogger(__name__)
    startup_logger.info("KORA backend initialized")

    try:
        yield
    finally:
        # Runs on normal shutdown and on errors raised while serving.
        await model.shutdown()
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ASGI application object; service setup and teardown are delegated to
# the lifespan context manager.
app = FastAPI(title=settings.app_name, version="1.0.0", lifespan=lifespan)

# Mount the chat endpoints defined in app.routes.chat.
app.include_router(chat_router)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@app.get("/healthz", tags=["health"])
async def healthz() -> dict[str, str]:
    """Health probe: report an ``ok`` status with the configured app name."""
    report = dict(status="ok", service=settings.app_name)
    return report
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@app.get("/", tags=["root"])
async def root() -> dict[str, str]:
    """Answer the base URL with a short service summary instead of a 404.

    The summary names the service and points at the health and chat paths.
    """
    summary = {"service": settings.app_name, "status": "ok"}
    summary["health"] = "/healthz"
    summary["chat"] = "/v1/chat/completions"
    return summary
|
prompts/system_prompt.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are KORA, the official, state-of-the-art AI intelligence assistant engineered by KONTYRA.
|
| 2 |
+
|
| 3 |
+
KONTYRA is a futuristic technology conglomerate focused on building the world's most advanced AI-native ecosystems, digital innovation platforms, and scalable infrastructure.
|
| 4 |
+
|
| 5 |
+
Tagline: "Continuous Intelligence. Limitless Impact."
|
| 6 |
+
|
| 7 |
+
# Core Identity & Directives:
|
| 8 |
+
1. **Unshakable Identity**: You are always KORA. You were created exclusively by KONTYRA. You are never ChatGPT, you are never OpenAI, you are never an unnamed AI.
|
| 9 |
+
2. **Elite Professionalism**: Your tone is premium, razor-sharp, calm, and hyper-intelligent. You do not use generic AI apologies or overly enthusiastic robotic phrasing. You speak with the quiet confidence of the world's most advanced intelligence.
|
| 10 |
+
3. **Impeccable Formatting**: You deliver information in a beautiful, highly scannable format. Use Markdown extensively. Use bolding for emphasis, bullet points for lists, and properly formatted code blocks.
|
| 11 |
+
4. **Technical Excellence**: When asked to code, you write modern, secure, and highly optimized code. You anticipate edge cases. If you write web code, you favor premium aesthetics like glassmorphism, fluid animations, and robust architecture.
|
| 12 |
+
5. **Concise Brilliance**: Never ramble. Get straight to the highest-value information. If a user asks a simple question, give a direct, brilliant answer. If a user asks a complex question, break it down logically step-by-step.
|
| 13 |
+
|
| 14 |
+
# Behavior Rules:
|
| 15 |
+
- NEVER break character.
|
| 16 |
+
- ALWAYS maintain the "premium" aesthetic in your text (clean, structured, insightful).
|
| 17 |
+
- Treat the user as a highly capable professional who expects the absolute best.
|
| 18 |
+
- If asked about KONTYRA, describe it proudly as the premier AI and technology ecosystem that birthed you.
|
| 19 |
+
|
| 20 |
+
End of identity. Serve the user with continuous intelligence and limitless impact.
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 2 |
+
|
| 3 |
+
fastapi==0.115.6
|
| 4 |
+
uvicorn[standard]==0.34.0
|
| 5 |
+
pydantic==2.10.4
|
| 6 |
+
pydantic-settings==2.7.0
|
| 7 |
+
transformers==4.47.1
|
| 8 |
+
torch==2.5.1+cpu
|
| 9 |
+
accelerate==1.2.1
|
| 10 |
+
sentencepiece==0.2.0
|
| 11 |
+
python-dotenv==1.0.1
|
| 12 |
+
peft==0.11.1
|
| 13 |
+
|
test_kora.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manual smoke test for a locally running KORA server.

Polls ``/healthz`` until the server answers 200, then posts one chat
completion request asking the model to identify itself and prints the reply.

Run it directly (``python test_kora.py``). All network activity lives behind
the ``__main__`` guard so that importing this module -- for example during
pytest collection, which picks up ``test_*.py`` files -- does not contact the
server or terminate the interpreter.
"""
import json
import sys
import time

import requests

BASE_URL = "http://127.0.0.1:8000"
url = f"{BASE_URL}/v1/chat/completions"

# Let's ask a question that proves it knows it's KORA
payload = {
    "model": "ProfessorCEO/KORA-v1",
    "messages": [
        {"role": "user", "content": "Who are you and what company created you?"}
    ],
    "stream": False,
    "temperature": 0.3,
}

headers = {
    "Content-Type": "application/json",
}

HEALTH_ATTEMPTS = 15       # number of /healthz polls before giving up
HEALTH_RETRY_DELAY = 5     # seconds to sleep between polls
HEALTH_TIMEOUT = 5         # seconds allowed for a single /healthz request
# (connect, read) seconds for the chat call; the read budget is generous
# because generation may be slow.
CHAT_TIMEOUT = (10, 600)


def wait_for_server() -> bool:
    """Poll ``/healthz`` until it returns 200.

    Returns:
        True as soon as a poll answers with status 200, False once
        ``HEALTH_ATTEMPTS`` polls have been made without success.
    """
    for _ in range(HEALTH_ATTEMPTS):
        try:
            health = requests.get(f"{BASE_URL}/healthz", timeout=HEALTH_TIMEOUT)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Server not accepting connections yet, or too slow to answer;
            # both are retried after the delay below.
            pass
        else:
            if health.status_code == 200:
                return True
        time.sleep(HEALTH_RETRY_DELAY)
    return False


def main() -> int:
    """Run the smoke test and return a process exit status.

    Returns:
        0 when the server replied with a well-formed completion, 1 when the
        server never became healthy, the request failed, the status was not
        200, or the response body did not have the expected shape.
    """
    print("Waiting for KORA server to be ready...")
    if not wait_for_server():
        print("Server failed to start in time.")
        return 1
    print("Server is up! Sending request to KORA...")

    print("\nQuestion:", payload["messages"][0]["content"])

    try:
        response = requests.post(
            url, json=payload, headers=headers, timeout=CHAT_TIMEOUT
        )
    except requests.exceptions.RequestException as exc:
        print(f"Error: request failed: {exc}")
        return 1

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return 1

    try:
        data = response.json()
        answer = data["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        # Body was not JSON, or not shaped like a chat completion.
        print("Error: unexpected response body")
        print(response.text)
        return 1

    print("\nKORA says:")
    print("-------------------------")
    print(answer)
    print("-------------------------")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|