Spaces:

Luka512
/

website

Running

App Files Community

Tim Luka Horstmann committed on Apr 11, 2025

Commit

6f6e59d

1 Parent(s): 58d2235

Updated to use history

Browse files

Files changed (1) hide show

app.py +41 -41

app.py CHANGED Viewed

@@ -31,7 +31,6 @@ login(token=hf_token)
 # Models Configuration
 sentence_transformer_model = "all-MiniLM-L6-v2"
-# Using the 8B model with Q4_K_M quantization
 repo_id = "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
 filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
@@ -68,7 +67,7 @@ try:
     faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
     faiss.normalize_L2(faq_embeddings)
-    # Load the 8B Cogito model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
         repo_id=repo_id,
@@ -76,13 +75,13 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-    # Use n_batch=256 for lower first-token latency on CPU
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048,
         n_threads=2,
-        n_batch=256,  # Reduced from 512 to improve streaming responsiveness
         n_gpu_layers=0,
         verbose=True,
     )
     logger.info(f"{filename} model loaded")
@@ -106,42 +105,42 @@ def retrieve_context(query, top_k=2):
 with open("cv_text.txt", "r", encoding="utf-8") as f:
     full_cv_text = f.read()
-async def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
     first_token_logged = False
     current_date = datetime.now().strftime("%Y-%m-%d")
-    # FAQ check first (keep this as it's fast)
-    # query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
-    # query_embedding = query_embedding.reshape(1, -1)
-    # faiss.normalize_L2(query_embedding)
-    # similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
-    # max_sim = np.max(similarities)
-    # if max_sim > 0.9:
-    #     idx = np.argmax(similarities)
-    #     yield f"data: {faqs[idx]['answer']}\n\n"
-    #     yield "data: [DONE]\n\n"
-    #     return
-    # Use full CV instead of retrieved chunks
-    messages = [
-        {
-            "role": "system",
-            "content": (
-                "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
-                "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
-                "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
-                "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
-                f"Today’s date is {current_date}. "
-                f"CV: {full_cv_text}"
-            )
-        },
-        {"role": "user", "content": query}
-    ]
-    # Acquire lock to ensure exclusive model access
     async with model_lock:
         for chunk in generator.create_chat_completion(
             messages=messages,
@@ -160,14 +159,14 @@ async def stream_response(query):
     yield "data: [DONE]\n\n"
 class QueryRequest(BaseModel):
-    data: list
 @app.post("/api/predict")
 async def predict(request: QueryRequest):
-    if not request.data or not isinstance(request.data, list) or len(request.data) < 1:
-        raise HTTPException(status_code=400, detail="Invalid input: 'data' must be a non-empty list")
-    query = request.data[0]
-    return StreamingResponse(stream_response(query), media_type="text/event-stream")
 @app.get("/health")
 async def health_check():
@@ -188,6 +187,7 @@ async def model_info():
 async def warm_up_model():
     logger.info("Warming up the model...")
     dummy_query = "Hello"
-    async for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up completed.")

 # Models Configuration
 sentence_transformer_model = "all-MiniLM-L6-v2"
 repo_id = "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
 filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
     faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
     faiss.normalize_L2(faq_embeddings)
+    # Load the 8B Cogito model with optimized parameters
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
         repo_id=repo_id,
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
     generator = Llama(
         model_path=model_path,
+        n_ctx=3072,
         n_threads=2,
+        n_batch=128,
         n_gpu_layers=0,
+        f16_kv=True,
         verbose=True,
     )
     logger.info(f"{filename} model loaded")
 with open("cv_text.txt", "r", encoding="utf-8") as f:
     full_cv_text = f.read()
+async def stream_response(query, history):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
     first_token_logged = False
     current_date = datetime.now().strftime("%Y-%m-%d")
+    system_prompt = (
+        "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
+        "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
+        "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
+        "and say 'I don't have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
+        f"Today’s date is {current_date}. "
+        f"CV: {full_cv_text}"
+    )
+    # Combine system prompt, history, and current query
+    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": query}]
+    # Estimate token counts and truncate history if necessary
+    system_tokens = len(generator.tokenize(system_prompt))
+    query_tokens = len(generator.tokenize(query))
+    history_tokens = [len(generator.tokenize(msg["content"])) for msg in history]
+    total_tokens = system_tokens + query_tokens + sum(history_tokens) + len(history) * 10 + 10  # Rough estimate for formatting
+    max_allowed_tokens = generator.n_ctx - 512 - 100  # max_tokens=512, safety_margin=100
+    while total_tokens > max_allowed_tokens and history:
+        removed_msg = history.pop(0)
+        removed_tokens = len(generator.tokenize(removed_msg["content"]))
+        total_tokens -= (removed_tokens + 10)
+    # Reconstruct messages after possible truncation
+    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": query}]
+    # Generate response with lock
     async with model_lock:
         for chunk in generator.create_chat_completion(
             messages=messages,
     yield "data: [DONE]\n\n"
 class QueryRequest(BaseModel):
+    query: str
+    history: list[dict]
 @app.post("/api/predict")
 async def predict(request: QueryRequest):
+    query = request.query
+    history = request.history
+    return StreamingResponse(stream_response(query, history), media_type="text/event-stream")
 @app.get("/health")
 async def health_check():
 async def warm_up_model():
     logger.info("Warming up the model...")
     dummy_query = "Hello"
+    dummy_history = []
+    async for _ in stream_response(dummy_query, dummy_history):
         pass
     logger.info("Model warm-up completed.")