Tim Luka Horstmann committed
Commit · 58d2235 · 1 Parent(s): dc475e9
increased batch size again

Browse files:
- app.py +30 -27
- requirements.txt +2 -1
app.py CHANGED

@@ -1,5 +1,3 @@
-# app.py
-
 from datetime import datetime
 import json
 import time
@@ -13,6 +11,7 @@ from huggingface_hub import login, hf_hub_download
 import logging
 import os
 import faiss
+import asyncio
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -20,6 +19,9 @@ logger = logging.getLogger(__name__)
 
 app = FastAPI()
 
+# Global lock for model access
+model_lock = asyncio.Lock()
+
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
@@ -29,11 +31,11 @@ login(token=hf_token)
 
 # Models Configuration
 sentence_transformer_model = "all-MiniLM-L6-v2"
-#
+# Using the 8B model with Q4_K_M quantization
 repo_id = "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
-filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
+filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
 
-# Define FAQs
+# Define FAQs
 faqs = [
     {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
     {"question": "Where do you live?", "answer": "I live in Paris, France."},
@@ -45,7 +47,7 @@ faqs = [
 ]
 
 try:
-    # Load CV embeddings and build FAISS index
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
@@ -74,12 +76,12 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-    #
+    # Use n_batch=256 for lower first-token latency on CPU
    generator = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=2,
-        n_batch=
+        n_batch=256,  # Reduced from 512 to improve streaming responsiveness
        n_gpu_layers=0,
        verbose=True,
    )
@@ -104,7 +106,7 @@ def retrieve_context(query, top_k=2):
 with open("cv_text.txt", "r", encoding="utf-8") as f:
     full_cv_text = f.read()
 
-def stream_response(query):
+async def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
     first_token_logged = False
@@ -139,21 +141,22 @@ def stream_response(query):
         {"role": "user", "content": query}
     ]
 
-    #
-
-
-
-
-
-
-
-
-
-
-    if
-
-
-
+    # Acquire lock to ensure exclusive model access
+    async with model_lock:
+        for chunk in generator.create_chat_completion(
+            messages=messages,
+            max_tokens=512,
+            stream=True,
+            temperature=0.3,
+            top_p=0.7,
+            repeat_penalty=1.2
+        ):
+            token = chunk['choices'][0]['delta'].get('content', '')
+            if token:
+                if not first_token_logged:
+                    logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                    first_token_logged = True
+                yield f"data: {token}\n\n"
     yield "data: [DONE]\n\n"
 
 class QueryRequest(BaseModel):
@@ -181,10 +184,10 @@ async def model_info():
         "faiss_index_dim": cv_embeddings.shape[1],
     }
 
-# Use a smaller warm-up query to prime the model without extensive delay.
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
     dummy_query = "Hello"
-
-
+    async for _ in stream_response(dummy_query):
+        pass
+    logger.info("Model warm-up completed.")
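Taken together, the app.py changes serialize model access behind a global asyncio.Lock and turn stream_response into an async generator that frames each token as a server-sent event. The route that consumes this generator sits outside the hunks shown above; the following is a minimal sketch of how such a generator is typically wired into FastAPI, where the /query route name and the request shape are assumptions rather than code from this commit:

import asyncio

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()
model_lock = asyncio.Lock()

class QueryRequest(BaseModel):
    query: str

async def stream_response(query: str):
    # Stand-in for the llama-cpp-python loop in the commit:
    # one SSE frame per generated token.
    async with model_lock:  # only one request talks to the model at a time
        for token in ["Hello", " from", " a", " sketch"]:
            yield f"data: {token}\n\n"
    yield "data: [DONE]\n\n"

@app.post("/query")  # hypothetical route name; the real endpoint is outside the diff
async def query_endpoint(request: QueryRequest):
    return StreamingResponse(
        stream_response(request.query),
        media_type="text/event-stream",
    )

One caveat: create_chat_completion(stream=True) returns a synchronous iterator, so even inside an async def each chunk is produced while the event loop is blocked. The lock prevents interleaved llama.cpp calls, but it does not make generation non-blocking; yielding control between tokens (for example via await asyncio.sleep(0)) is a common mitigation, though this commit does not do that.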
requirements.txt CHANGED

@@ -5,4 +5,5 @@ torch==2.4.1
 numpy==1.26.4
 llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
-faiss-cpu==1.8.0
+faiss-cpu==1.8.0
+asyncio