from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# =========================================================
# MODEL (ULTRA FAST FREE-TIER)
# =========================================================

REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # 4-bit medium quant from the GGUF repo

print("[SYSTEM] Downloading TinyLlama...")

MODEL_PATH = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME
)

print("[SYSTEM] Initializing model...")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=512,           # CRITICAL for speed: small context window
    n_batch=512,         # batch size; values above n_ctx are wasted
    n_threads=2,         # exactly matches the HF CPU Basic tier (2 vCPUs)
    use_mmap=True,
    use_mlock=False,
    verbose=False
)

print("[SYSTEM] TinyLlama READY")

# =========================================================
# FASTAPI
# =========================================================

app = FastAPI(title="Apex Free Engine")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# =========================================================
# REQUEST MODEL
# =========================================================

class AnalysisRequest(BaseModel):
    context: str
    query: str

# =========================================================
# ROUTES
# =========================================================

@app.get("/")
def health():
    return {
        "status": "online",
        "engine": "Apex",
        "model": "TinyLlama-1.1B",
        "tier": "HF Free"
    }

@app.post("/analyze")
def analyze(req: AnalysisRequest):
    try:
        # Super short but smart prompt: keeps CPU prompt-eval time low
        prompt = f"""<|system|>
You are Apex.
Answer briefly, logically, and to the point.
Use only the provided context.
If the context is insufficient, say so.
Language: Russian.
</s>
<|user|>
Context:
{req.context}

Question:
{req.query}
</s>
<|assistant|>
"""

        output = llm(
            prompt,
            max_tokens=60,        # hard cap: anything longer is too slow on the free tier
            temperature=0.1,      # minimal creativity, stick to the context
            top_p=0.8,
            stop=["</s>"],
            echo=False
        )

        answer = output["choices"][0]["text"].strip()

        return {
            "result": answer,
            "model": "TinyLlama-1.1B"
        }

    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=str(e)
        )
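
# =========================================================
# USAGE (illustrative sketch)
# =========================================================
# A minimal example of running and calling the service. This
# assumes the file is saved as app.py and served on port 7860
# (the default port for HF Spaces); the URL is hypothetical:
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"context": "Apex is a demo QA engine.", "query": "What is Apex?"}'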