Tim Luka Horstmann committed
Commit · 58d2235 · 1 Parent(s): dc475e9
increased batch size again

Browse files:
- app.py +30 -27
- requirements.txt +2 -1
app.py CHANGED

@@ -1,5 +1,3 @@
-# app.py
-
 from datetime import datetime
 import json
 import time
@@ -13,6 +11,7 @@ from huggingface_hub import login, hf_hub_download
 import logging
 import os
 import faiss
+import asyncio
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -20,6 +19,9 @@ logger = logging.getLogger(__name__)
 
 app = FastAPI()
 
+# Global lock for model access
+model_lock = asyncio.Lock()
+
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
@@ -29,11 +31,11 @@ login(token=hf_token)
 
 # Models Configuration
 sentence_transformer_model = "all-MiniLM-L6-v2"
-#
+# Using the 8B model with Q4_K_M quantization
 repo_id = "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
-filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
+filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
 
-# Define FAQs
+# Define FAQs
 faqs = [
     {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
     {"question": "Where do you live?", "answer": "I live in Paris, France."},
@@ -45,7 +47,7 @@ faqs = [
 ]
 
 try:
-    # Load CV embeddings and build FAISS index
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
@@ -74,12 +76,12 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-    #
+    # Use n_batch=256 for lower first-token latency on CPU
    generator = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=2,
-        n_batch=
+        n_batch=256,  # Reduced from 512 to improve streaming responsiveness
        n_gpu_layers=0,
        verbose=True,
    )
@@ -104,7 +106,7 @@ def retrieve_context(query, top_k=2):
 with open("cv_text.txt", "r", encoding="utf-8") as f:
     full_cv_text = f.read()
 
-def stream_response(query):
+async def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
     first_token_logged = False
@@ -139,21 +141,22 @@ def stream_response(query):
         {"role": "user", "content": query}
     ]
 
-    #
-
-
-
-
-
-
-
-
-
-
-    if
-
-
-
+    # Acquire lock to ensure exclusive model access
+    async with model_lock:
+        for chunk in generator.create_chat_completion(
+            messages=messages,
+            max_tokens=512,
+            stream=True,
+            temperature=0.3,
+            top_p=0.7,
+            repeat_penalty=1.2
+        ):
+            token = chunk['choices'][0]['delta'].get('content', '')
+            if token:
+                if not first_token_logged:
+                    logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                    first_token_logged = True
+                yield f"data: {token}\n\n"
     yield "data: [DONE]\n\n"
 
 class QueryRequest(BaseModel):
@@ -181,10 +184,10 @@ async def model_info():
         "faiss_index_dim": cv_embeddings.shape[1],
     }
 
-# Use a smaller warm-up query to prime the model without extensive delay.
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
     dummy_query = "Hello"
-
-
+    async for _ in stream_response(dummy_query):
+        pass
+    logger.info("Model warm-up completed.")
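Taken together, the app.py changes serialize model access behind a global asyncio.Lock and turn stream_response into an async generator that frames each token as a server-sent event. The route that consumes this generator sits outside the hunks shown above; the following is a minimal sketch of how such a generator is typically wired into FastAPI, where the /query route name and the request shape are assumptions rather than code from this commit:

import asyncio

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()
model_lock = asyncio.Lock()

class QueryRequest(BaseModel):
    query: str

async def stream_response(query: str):
    # Stand-in for the llama-cpp-python loop in the commit:
    # one SSE frame per generated token.
    async with model_lock:  # only one request talks to the model at a time
        for token in ["Hello", " from", " a", " sketch"]:
            yield f"data: {token}\n\n"
    yield "data: [DONE]\n\n"

@app.post("/query")  # hypothetical route name; the real endpoint is outside the diff
async def query_endpoint(request: QueryRequest):
    return StreamingResponse(
        stream_response(request.query),
        media_type="text/event-stream",
    )

One caveat: create_chat_completion(stream=True) returns a synchronous iterator, so even inside an async def each chunk is produced while the event loop is blocked. The lock prevents interleaved llama.cpp calls, but it does not make generation non-blocking; yielding control between tokens (for example via await asyncio.sleep(0)) is a common mitigation, though this commit does not do that.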
requirements.txt CHANGED

@@ -5,4 +5,5 @@ torch==2.4.1
 numpy==1.26.4
 llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
-faiss-cpu==1.8.0
+faiss-cpu==1.8.0
+asyncio