Spaces:

Luka512
/

website

Running

App Files Community

Tim Luka Horstmann committed on Apr 9, 2025

Commit

8583b57

1 Parent(s): 655702e

Better streaming and fewer hallucinations.

Browse files

Files changed (1) hide show

app.py +21 -10

app.py CHANGED Viewed

@@ -74,10 +74,10 @@ try:
     )
     generator = Llama(
         model_path=model_path,
-        n_ctx=1024,  # Adjust if 128k is supported and memory allows; start with 1024
         n_threads=2,
         n_batch=512,
-        n_gpu_layers=0,  # No GPU on free tier
         verbose=True,
     )
     logger.info(f"{filename} model loaded")
@@ -100,7 +100,7 @@ def retrieve_context(query, top_k=2):
 def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
-    first_token_logged = False  # Flag to log first token time only once
     # FAQ check first
     query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
@@ -114,10 +114,18 @@ def stream_response(query):
         yield "data: [DONE]\n\n"
         return
-    yield "data: I'm thinking...\n\n"
     context = retrieve_context(query, top_k=2)
     messages = [
-        {"role": "system", "content": f"You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response. CV: {context}"},
         {"role": "user", "content": query}
     ]
@@ -126,22 +134,25 @@ def stream_response(query):
         messages=messages,
         max_tokens=512,
         stream=True,
-        temperature=0.5,
-        top_p=0.9,
         repeat_penalty=1.2
     ):
         text = chunk['choices'][0]['delta'].get('content', '')
         if text:
             buffer += text
-            if not first_token_logged and time.time() - start_time > 0:  # Log first token once
                 logger.info(f"First token time: {time.time() - start_time:.2f}s")
                 first_token_logged = True
-            if buffer.endswith(" ") or buffer.endswith(".") or buffer.endswith("!"):
                 yield f"data: {buffer}\n\n"
                 buffer = ""
-    if buffer:  # Flush any remaining buffer
         yield f"data: {buffer}\n\n"
     yield "data: [DONE]\n\n"
 class QueryRequest(BaseModel):
     data: list

     )
     generator = Llama(
         model_path=model_path,
+        n_ctx=1024,
         n_threads=2,
         n_batch=512,
+        n_gpu_layers=0,
         verbose=True,
     )
     logger.info(f"{filename} model loaded")
 def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
+    first_token_logged = False
     # FAQ check first
     query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
         yield "data: [DONE]\n\n"
         return
     context = retrieve_context(query, top_k=2)
     messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
+                "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
+                "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
+                "and say 'I don’t have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
+                f"CV: {context}"
+            )
+        },
         {"role": "user", "content": query}
     ]
         messages=messages,
         max_tokens=512,
         stream=True,
+        temperature=0.3,
+        top_p=0.7,
         repeat_penalty=1.2
     ):
         text = chunk['choices'][0]['delta'].get('content', '')
         if text:
             buffer += text
+            if not first_token_logged and time.time() - start_time > 0:
                 logger.info(f"First token time: {time.time() - start_time:.2f}s")
                 first_token_logged = True
+            # Yield on every token or small chunk for live streaming
+            if len(buffer) >= 1:  # Yield per character or small chunk
                 yield f"data: {buffer}\n\n"
                 buffer = ""
+    if buffer:
         yield f"data: {buffer}\n\n"
     yield "data: [DONE]\n\n"
 class QueryRequest(BaseModel):
     data: list