"""Minimal FastAPI service exposing a llama.cpp model over HTTP.

The GGUF file named in each request is fetched from the Hugging Face Hub
and kept loaded in a module-level slot; a request naming a different
repo/file replaces the loaded model.
"""

import gc
import os
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive CORS policy -- confirm this is intended before exposing
# the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Currently loaded model and its "<repo_id>/<filename>" identifier.
# `model` is None (and `current_id` empty) while nothing is loaded.
model: Optional[Llama] = None
current_id = ""


class ChatRequest(BaseModel):
    """Request body accepted by ``POST /chat``."""

    # Hugging Face Hub repository and file that identify the model to use.
    repo_id: str
    filename: str
    # User message and the system instruction placed before it in the prompt.
    prompt: str
    system_prompt: str = "You are a helpful assistant."
    # Generation settings forwarded to ``Llama.create_completion``.
    max_tokens: int = 512
    temperature: float = 0.7


@app.get("/")
async def health():
    """Root endpoint: opening the base URL in a browser should show this."""
    return {
        "status": "online",
        "message": "API is running. Use POST /chat to interact.",
    }


def _ensure_model(repo_id: str, filename: str) -> Llama:
    """Return the model for ``repo_id/filename``, loading it if necessary.

    Updates the module-level ``model`` / ``current_id`` pair. Any exception
    from the download or from model construction propagates to the caller.
    """
    global model, current_id

    wanted_id = f"{repo_id}/{filename}"
    if model is not None and current_id == wanted_id:
        return model

    # Fetch the new file first so that a failed download leaves the
    # previously loaded model (if any) intact and usable.
    path = hf_hub_download(repo_id=repo_id, filename=filename)

    if model is not None:
        # Rebind instead of `del model`: deleting the global name would make
        # every later `model is None` check raise NameError if the load
        # below fails. Resetting `current_id` keeps the pair consistent.
        model = None
        current_id = ""
        gc.collect()

    model = Llama(
        model_path=path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,
        n_gpu_layers=0,
        verbose=False,
    )
    current_id = wanted_id
    return model


@app.post("/chat")
async def chat(request: ChatRequest):
    """Generate a completion for ``request.prompt``.

    Loads (or swaps in) the requested model, builds a plain
    ``System/User/Assistant`` prompt and returns the stripped completion
    text together with the identifier of the model that produced it.

    NOTE: the download and inference calls are synchronous and are not
    awaited, so they occupy the event loop for the duration of the request.

    Raises:
        HTTPException: status 500 carrying the error text when downloading,
            loading or generation fails.
    """
    try:
        llm = _ensure_model(request.repo_id, request.filename)

        full_prompt = (
            f"System: {request.system_prompt}\n"
            f"User: {request.prompt}\n"
            "Assistant:"
        )
        output = llm.create_completion(
            prompt=full_prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            # Stop when the model starts writing the next turn. The empty
            # stop string previously listed here is a substring of any
            # output and would cut the completion down to nothing.
            stop=["User:", "System:"],
        )
        return {
            "response": output["choices"][0]["text"].strip(),
            "model": current_id,
        }
    except Exception as e:
        # Top-level boundary: surface the failure to the client as a 500.
        raise HTTPException(status_code=500, detail=str(e)) from e


if __name__ == "__main__":
    import uvicorn

    # Hugging Face always uses port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)