Spaces:
Sleeping
Sleeping
Add conversation memory: history-aware LLM + context-aware retrieval
Browse files
app.py
CHANGED
|
@@ -148,9 +148,20 @@ def respond_stream(message: str, history: list, philosopher: str, llm_label: str
|
|
| 148 |
yield history + [{"role": "assistant", "content": err}], "", gr.update(), gr.update()
|
| 149 |
return
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# — Retrieval (fast, happens before streaming) —
|
| 152 |
t0 = time.perf_counter()
|
| 153 |
-
docs, scores = retrieve_docs(
|
| 154 |
retrieve_time = time.perf_counter() - t0
|
| 155 |
context_str = "\n\n".join(d.page_content for d in docs)
|
| 156 |
|
|
@@ -174,7 +185,7 @@ def respond_stream(message: str, history: list, philosopher: str, llm_label: str
|
|
| 174 |
t1 = time.perf_counter()
|
| 175 |
full_response = ""
|
| 176 |
try:
|
| 177 |
-
for text_chunk in stream_llm(provider, model_id, context_str, message):
|
| 178 |
full_response += text_chunk
|
| 179 |
history[-1]["content"] = _format_think_blocks(full_response)
|
| 180 |
yield history, "", gr.update(value=chunks_md), gr.update()
|
|
|
|
| 148 |
yield history + [{"role": "assistant", "content": err}], "", gr.update(), gr.update()
|
| 149 |
return
|
| 150 |
|
| 151 |
+
# — Build retrieval query —
|
| 152 |
+
# For short follow-ups ("bahas lebih lanjut" [Indonesian: "discuss further"], "elaborate", etc.) that lack
|
| 153 |
+
# standalone meaning, prepend the last user message so retrieval has context.
|
| 154 |
+
retrieval_query = message
|
| 155 |
+
if len(message.split()) <= 8 and history:
|
| 156 |
+
last_user = next(
|
| 157 |
+
(t["content"] for t in reversed(history) if t["role"] == "user"), ""
|
| 158 |
+
)
|
| 159 |
+
if last_user:
|
| 160 |
+
retrieval_query = f"{last_user} {message}"
|
| 161 |
+
|
| 162 |
# — Retrieval (fast, happens before streaming) —
|
| 163 |
t0 = time.perf_counter()
|
| 164 |
+
docs, scores = retrieve_docs(retrieval_query, philosopher)
|
| 165 |
retrieve_time = time.perf_counter() - t0
|
| 166 |
context_str = "\n\n".join(d.page_content for d in docs)
|
| 167 |
|
|
|
|
| 185 |
t1 = time.perf_counter()
|
| 186 |
full_response = ""
|
| 187 |
try:
|
| 188 |
+
for text_chunk in stream_llm(provider, model_id, context_str, message, history=history[:-2]):
|
| 189 |
full_response += text_chunk
|
| 190 |
history[-1]["content"] = _format_think_blocks(full_response)
|
| 191 |
yield history, "", gr.update(value=chunks_md), gr.update()
|