VietCat committed on
Commit 0afcb9c · 1 Parent(s): 1becc27

init project

Files changed (5)
  1. .gitignore +2 -0
  2. Dockerfile +27 -0
  3. app/main.py +162 -0
  4. app/model_loader.py +13 -0
  5. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ # files
+ *.DS_Store
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.10-slim
+
+ # Install the tools needed to build llama-cpp-python
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     git \
+     wget \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Upgrade pip
+ RUN pip install --upgrade pip
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Download the GGUF model from the Hugging Face Hub (example: TheBloke's repos)
+ # Change the URL and file name if you use a different repo
+ RUN mkdir -p models && \
+     wget -O models/gemma34b.gguf https://huggingface.co/Mungert/gemma-3-4b-it-gguf/resolve/main/google_gemma-3-4b-it-q4_k_l.gguf
+
+ COPY ./app ./app
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/main.py ADDED
@@ -0,0 +1,162 @@
+ from fastapi import FastAPI, Request
+ from pydantic import BaseModel
+ import logging
+ import time
+ import asyncio
+ import os
+
+ from app.model_loader import load_model
+
+ app = FastAPI()
+ llm = None  # Initialized at startup
+
+ class PromptRequest(BaseModel):
+     prompt: str
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+
+ def format_prompt_as_chat(user_prompt: str) -> str:
+     messages = [
+         {
+             "role": "system",
+             "content": "Bạn là trợ lý đáng tin cậy, luôn trả lời ngắn gọn, và chính xác.",
+         },
+         {"role": "user", "content": user_prompt.strip()},
+     ]
+     formatted = (
+         "<|system|>\n" + messages[0]["content"] + "</s>\n"
+         "<|user|>\n" + messages[1]["content"] + "</s>\n"
+         "<|assistant|>\n"
+     )
+     return formatted
+
+ def format_prompt_as_user_prompt(user_prompt: str) -> str:
+     messages = [
+         {"role": "user", "content": user_prompt.strip()},
+     ]
+     formatted = (
+         "<|user|>\n" + messages[0]["content"] + "</s>\n"
+     )
+     return formatted
+
+ def format_prompt_as_pure_prompt(user_prompt: str) -> str:
+     messages = [
+         {"role": "user", "content": user_prompt.strip()},
+     ]
+     formatted = (
+         "" + messages[0]["content"] + "\n"
+     )
+     return formatted
+
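A caveat on the formatters above: the <|system|>/<|user|>/</s> markers are a Zephyr/ChatML-style template, while the file baked into the image is a Gemma 3 instruct build, which was trained with <start_of_turn>/<end_of_turn> turn markers (the Vietnamese system prompt translates to "You are a trustworthy assistant; always answer concisely and accurately."). A minimal sketch of a Gemma-style formatter, to be verified against the model card before relying on it:

    def format_prompt_as_gemma_chat(user_prompt: str) -> str:
        # Gemma-style turn markers; confirm the exact template on the model card.
        return (
            "<start_of_turn>user\n" + user_prompt.strip() + "<end_of_turn>\n"
            "<start_of_turn>model\n"
        )

With this template the natural stop sequence is "<end_of_turn>" rather than "</s>".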
+ @app.on_event("startup")
+ async def startup_event():
+     global llm
+     model_path = "models/gemma34b.gguf"
+
+     # Wait for the model file if it is not there yet (up to 60 seconds)
+     timeout = 60
+     waited = 0
+     while not os.path.exists(model_path) and waited < timeout:
+         logging.info(f"Waiting for the model to appear at {model_path}...")
+         await asyncio.sleep(2)
+         waited += 2
+
+     if not os.path.exists(model_path):
+         raise FileNotFoundError(f"Model not found after {timeout} seconds: {model_path}")
+
+     # Load the model in a separate thread so the event loop is not blocked
+     llm = await asyncio.to_thread(load_model)
+     logging.info("✅ Model loaded successfully.")
+
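@app.on_event("startup") still works but is deprecated in recent FastAPI releases, and requirements.txt leaves fastapi unpinned, so a fresh build will pull a version that warns about it. A minimal sketch of the equivalent lifespan handler, assuming the same load_model helper:

    import asyncio
    from contextlib import asynccontextmanager

    from fastapi import FastAPI

    from app.model_loader import load_model

    llm = None

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Runs once before the app starts serving; replaces the startup hook.
        global llm
        llm = await asyncio.to_thread(load_model)
        yield
        # Shutdown cleanup would go here.

    app = FastAPI(lifespan=lifespan)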
+ @app.post("/chat")
+ async def chat(request: Request, prompt: PromptRequest):
+     start_time = time.time()
+     logging.info(f"📩 Received request from {request.client.host} at {time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+     formatted_prompt = format_prompt_as_chat(prompt.prompt)
+
+     output = await asyncio.to_thread(
+         llm,
+         formatted_prompt,
+         max_tokens=256,
+         temperature=0.7,
+         top_k=50,
+         top_p=0.95,
+         stop=["</s>"]
+     )
+
+     end_time = time.time()
+     duration = end_time - start_time
+     logging.info(f"✅ Finished in {duration:.2f} seconds.")
+
+     return {"response": output["choices"][0]["text"].strip()}
+
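One caution that applies to this handler and the three below: each request runs the shared llm callable on a worker thread via asyncio.to_thread, so overlapping requests can call into llama.cpp concurrently, and a single Llama instance is generally not safe for that. A minimal sketch of serializing inference behind an asyncio.Lock (run_inference is a hypothetical helper, not part of this commit):

    inference_lock = asyncio.Lock()

    async def run_inference(formatted_prompt: str) -> str:
        # Only one request at a time reaches the model.
        async with inference_lock:
            output = await asyncio.to_thread(
                llm,
                formatted_prompt,
                max_tokens=256,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                stop=["</s>"],
            )
        return output["choices"][0]["text"].strip()

This would also deduplicate the identical sampling block repeated in the handlers below.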
+ @app.post("/userchat")
+ async def userchat(request: Request, prompt: PromptRequest):
+     start_time = time.time()
+     logging.info(f"📩 Received request from {request.client.host} at {time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+     formatted_prompt = format_prompt_as_user_prompt(prompt.prompt)
+
+     output = await asyncio.to_thread(
+         llm,
+         formatted_prompt,
+         max_tokens=256,
+         temperature=0.7,
+         top_k=50,
+         top_p=0.95,
+         stop=["</s>"]
+     )
+
+     end_time = time.time()
+     duration = end_time - start_time
+     logging.info(f"✅ Finished in {duration:.2f} seconds.")
+
+     return {"response": output["choices"][0]["text"].strip()}
+
+ @app.post("/purechat")
+ async def purechat(request: Request, prompt: PromptRequest):
+     start_time = time.time()
+     logging.info(f"📩 Received request from {request.client.host} at {time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+     formatted_prompt = format_prompt_as_pure_prompt(prompt.prompt)
+
+     output = await asyncio.to_thread(
+         llm,
+         formatted_prompt,
+         max_tokens=256,
+         temperature=0.7,
+         top_k=50,
+         top_p=0.95,
+         stop=["</s>"]
+     )
+
+     end_time = time.time()
+     duration = end_time - start_time
+     logging.info(f"✅ Finished in {duration:.2f} seconds.")
+
+     return {"response": output["choices"][0]["text"].strip()}
+
+ @app.get("/")
+ async def get():
+     start_time = time.time()
+     logging.info(f"📩 Received GET request at {time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+     formatted_prompt = format_prompt_as_user_prompt("Bạn tên là gì?")
+
+     output = await asyncio.to_thread(
+         llm,
+         formatted_prompt,
+         max_tokens=256,
+         temperature=0.7,
+         top_k=50,
+         top_p=0.95,
+         stop=["</s>"]
+     )
+
+     end_time = time.time()
+     duration = end_time - start_time
+     logging.info(f"✅ Finished in {duration:.2f} seconds.")
+
+     return {"response": output["choices"][0]["text"].strip()}
app/model_loader.py ADDED
@@ -0,0 +1,13 @@
+ from llama_cpp import Llama
+
+ llm = None  # Will be initialized on first use
+
+ def load_model():
+     global llm
+     if llm is None:
+         llm = Llama(
+             model_path="models/gemma34b.gguf",
+             n_ctx=2048,
+             n_threads=4,
+         )
+     return llm
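For debugging outside the API, the loader can be exercised directly. A minimal sketch, assuming the GGUF file has already been downloaded to models/gemma34b.gguf as in the Dockerfile:

    from app.model_loader import load_model

    # First call loads the model; later calls return the cached instance.
    llm = load_model()
    output = llm("Xin chào!", max_tokens=64, stop=["</s>"])  # "Hello!"
    print(output["choices"][0]["text"].strip())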
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ fastapi
+ uvicorn
+ llama-cpp-python==0.2.24