Tim Luka Horstmann
committed on
Commit
·
8583b57
1
Parent(s):
655702e
Better streaming and fewer hallucinations.
Browse files
app.py
CHANGED
|
@@ -74,10 +74,10 @@ try:
|
|
| 74 |
)
|
| 75 |
generator = Llama(
|
| 76 |
model_path=model_path,
|
| 77 |
-
n_ctx=1024,
|
| 78 |
n_threads=2,
|
| 79 |
n_batch=512,
|
| 80 |
-
n_gpu_layers=0,
|
| 81 |
verbose=True,
|
| 82 |
)
|
| 83 |
logger.info(f"{filename} model loaded")
|
|
@@ -100,7 +100,7 @@ def retrieve_context(query, top_k=2):
|
|
| 100 |
def stream_response(query):
|
| 101 |
logger.info(f"Processing query: {query}")
|
| 102 |
start_time = time.time()
|
| 103 |
-
first_token_logged = False
|
| 104 |
|
| 105 |
# FAQ check first
|
| 106 |
query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
|
|
@@ -114,10 +114,18 @@ def stream_response(query):
|
|
| 114 |
yield "data: [DONE]\n\n"
|
| 115 |
return
|
| 116 |
|
| 117 |
-
yield "data: I'm thinking...\n\n"
|
| 118 |
context = retrieve_context(query, top_k=2)
|
| 119 |
messages = [
|
| 120 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
{"role": "user", "content": query}
|
| 122 |
]
|
| 123 |
|
|
@@ -126,22 +134,25 @@ def stream_response(query):
|
|
| 126 |
messages=messages,
|
| 127 |
max_tokens=512,
|
| 128 |
stream=True,
|
| 129 |
-
temperature=0.
|
| 130 |
-
top_p=0.
|
| 131 |
repeat_penalty=1.2
|
| 132 |
):
|
| 133 |
text = chunk['choices'][0]['delta'].get('content', '')
|
| 134 |
if text:
|
| 135 |
buffer += text
|
| 136 |
-
if not first_token_logged and time.time() - start_time > 0:
|
| 137 |
logger.info(f"First token time: {time.time() - start_time:.2f}s")
|
| 138 |
first_token_logged = True
|
| 139 |
-
|
|
|
|
| 140 |
yield f"data: {buffer}\n\n"
|
| 141 |
buffer = ""
|
| 142 |
-
if buffer:
|
| 143 |
yield f"data: {buffer}\n\n"
|
| 144 |
yield "data: [DONE]\n\n"
|
|
|
|
|
|
|
| 145 |
class QueryRequest(BaseModel):
|
| 146 |
data: list
|
| 147 |
|
|
|
|
| 74 |
)
|
| 75 |
generator = Llama(
|
| 76 |
model_path=model_path,
|
| 77 |
+
n_ctx=1024,
|
| 78 |
n_threads=2,
|
| 79 |
n_batch=512,
|
| 80 |
+
n_gpu_layers=0,
|
| 81 |
verbose=True,
|
| 82 |
)
|
| 83 |
logger.info(f"{filename} model loaded")
|
|
|
|
| 100 |
def stream_response(query):
|
| 101 |
logger.info(f"Processing query: {query}")
|
| 102 |
start_time = time.time()
|
| 103 |
+
first_token_logged = False
|
| 104 |
|
| 105 |
# FAQ check first
|
| 106 |
query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
|
|
|
|
| 114 |
yield "data: [DONE]\n\n"
|
| 115 |
return
|
| 116 |
|
|
|
|
| 117 |
context = retrieve_context(query, top_k=2)
|
| 118 |
messages = [
|
| 119 |
+
{
|
| 120 |
+
"role": "system",
|
| 121 |
+
"content": (
|
| 122 |
+
"You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
|
| 123 |
+
"For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
|
| 124 |
+
"For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
|
| 125 |
+
"and say 'I don’t have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
|
| 126 |
+
f"CV: {context}"
|
| 127 |
+
)
|
| 128 |
+
},
|
| 129 |
{"role": "user", "content": query}
|
| 130 |
]
|
| 131 |
|
|
|
|
| 134 |
messages=messages,
|
| 135 |
max_tokens=512,
|
| 136 |
stream=True,
|
| 137 |
+
temperature=0.3,
|
| 138 |
+
top_p=0.7,
|
| 139 |
repeat_penalty=1.2
|
| 140 |
):
|
| 141 |
text = chunk['choices'][0]['delta'].get('content', '')
|
| 142 |
if text:
|
| 143 |
buffer += text
|
| 144 |
+
if not first_token_logged and time.time() - start_time > 0:
|
| 145 |
logger.info(f"First token time: {time.time() - start_time:.2f}s")
|
| 146 |
first_token_logged = True
|
| 147 |
+
# Yield on every token or small chunk for live streaming
|
| 148 |
+
if len(buffer) >= 1: # Yield per character or small chunk
|
| 149 |
yield f"data: {buffer}\n\n"
|
| 150 |
buffer = ""
|
| 151 |
+
if buffer:
|
| 152 |
yield f"data: {buffer}\n\n"
|
| 153 |
yield "data: [DONE]\n\n"
|
| 154 |
+
|
| 155 |
+
|
| 156 |
class QueryRequest(BaseModel):
|
| 157 |
data: list
|
| 158 |
|