Tim Luka Horstmann committed on
Commit
8583b57
·
1 Parent(s): 655702e

Better streaming and fewer hallucinations.

Browse files
Files changed (1) hide show
  1. app.py +21 -10
app.py CHANGED
@@ -74,10 +74,10 @@ try:
74
  )
75
  generator = Llama(
76
  model_path=model_path,
77
- n_ctx=1024, # Adjust if 128k is supported and memory allows; start with 1024
78
  n_threads=2,
79
  n_batch=512,
80
- n_gpu_layers=0, # No GPU on free tier
81
  verbose=True,
82
  )
83
  logger.info(f"{filename} model loaded")
@@ -100,7 +100,7 @@ def retrieve_context(query, top_k=2):
100
  def stream_response(query):
101
  logger.info(f"Processing query: {query}")
102
  start_time = time.time()
103
- first_token_logged = False # Flag to log first token time only once
104
 
105
  # FAQ check first
106
  query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
@@ -114,10 +114,18 @@ def stream_response(query):
114
  yield "data: [DONE]\n\n"
115
  return
116
 
117
- yield "data: I'm thinking...\n\n"
118
  context = retrieve_context(query, top_k=2)
119
  messages = [
120
- {"role": "system", "content": f"You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response. CV: {context}"},
 
 
 
 
 
 
 
 
 
121
  {"role": "user", "content": query}
122
  ]
123
 
@@ -126,22 +134,25 @@ def stream_response(query):
126
  messages=messages,
127
  max_tokens=512,
128
  stream=True,
129
- temperature=0.5,
130
- top_p=0.9,
131
  repeat_penalty=1.2
132
  ):
133
  text = chunk['choices'][0]['delta'].get('content', '')
134
  if text:
135
  buffer += text
136
- if not first_token_logged and time.time() - start_time > 0: # Log first token once
137
  logger.info(f"First token time: {time.time() - start_time:.2f}s")
138
  first_token_logged = True
139
- if buffer.endswith(" ") or buffer.endswith(".") or buffer.endswith("!"):
 
140
  yield f"data: {buffer}\n\n"
141
  buffer = ""
142
- if buffer: # Flush any remaining buffer
143
  yield f"data: {buffer}\n\n"
144
  yield "data: [DONE]\n\n"
 
 
145
  class QueryRequest(BaseModel):
146
  data: list
147
 
 
74
  )
75
  generator = Llama(
76
  model_path=model_path,
77
+ n_ctx=1024,
78
  n_threads=2,
79
  n_batch=512,
80
+ n_gpu_layers=0,
81
  verbose=True,
82
  )
83
  logger.info(f"{filename} model loaded")
 
100
  def stream_response(query):
101
  logger.info(f"Processing query: {query}")
102
  start_time = time.time()
103
+ first_token_logged = False
104
 
105
  # FAQ check first
106
  query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
 
114
  yield "data: [DONE]\n\n"
115
  return
116
 
 
117
  context = retrieve_context(query, top_k=2)
118
  messages = [
119
+ {
120
+ "role": "system",
121
+ "content": (
122
+ "You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. "
123
+ "For questions about your CV, base your answer *exclusively* on the provided CV information below and do not add any details not explicitly stated. "
124
+ "For casual questions not covered by the CV, respond naturally but limit answers to general truths about yourself (e.g., your current location is Paris, France, or your field is AI) "
125
+ "and say 'I don’t have specific details to share about that' if pressed for specifics beyond the CV or FAQs. Do not invent facts, experiences, or opinions not supported by the CV or FAQs. "
126
+ f"CV: {context}"
127
+ )
128
+ },
129
  {"role": "user", "content": query}
130
  ]
131
 
 
134
  messages=messages,
135
  max_tokens=512,
136
  stream=True,
137
+ temperature=0.3,
138
+ top_p=0.7,
139
  repeat_penalty=1.2
140
  ):
141
  text = chunk['choices'][0]['delta'].get('content', '')
142
  if text:
143
  buffer += text
144
+ if not first_token_logged and time.time() - start_time > 0:
145
  logger.info(f"First token time: {time.time() - start_time:.2f}s")
146
  first_token_logged = True
147
+ # Yield on every token or small chunk for live streaming
148
+ if len(buffer) >= 1: # Yield per character or small chunk
149
  yield f"data: {buffer}\n\n"
150
  buffer = ""
151
+ if buffer:
152
  yield f"data: {buffer}\n\n"
153
  yield "data: [DONE]\n\n"
154
+
155
+
156
  class QueryRequest(BaseModel):
157
  data: list
158