Optimize GGUF loading and reduce LLM latency
src/rag_core.py  CHANGED  (+2 -2)
@@ -36,7 +36,7 @@ DB_DIR = Path("db/faiss_code_edu_by_article")
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
 TOP_K_FETCH = 30        # number of candidate docs retrieved
-TOP_K_FINAL =
+TOP_K_FINAL = 3         # max number sent to the LLM
 SCORE_THRESHOLD = 1.10  # to be tuned (see the score printout)
 MAX_CHARS_PER_DOC = 800
 SNIPPET_CHARS = 260
@@ -81,7 +81,7 @@ def llm_generate(prompt: str) -> str:
     out = llm.create_chat_completion(
         messages=[{"role": "user", "content": prompt}],
         temperature=0.1,
-        max_tokens=
+        max_tokens=120,
     )
     return out["choices"][0]["message"]["content"].strip()
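For context, the constants in the first hunk are the retrieval knobs of the RAG pipeline: TOP_K_FETCH candidates come back from the FAISS index, SCORE_THRESHOLD and TOP_K_FINAL trim that list, and MAX_CHARS_PER_DOC caps each snippet, so lowering TOP_K_FINAL directly shrinks the prompt the LLM has to process. The sketch below is an assumption about how rag_core.py combines them, not code from this commit; filter_candidates and the (text, score) input shape are hypothetical names.

# Hypothetical sketch, not part of this diff: how the retrieval constants
# above are typically combined. filter_candidates and the (text, score)
# input shape are illustrative assumptions.
TOP_K_FETCH = 30          # candidates fetched from FAISS
TOP_K_FINAL = 3           # max docs forwarded to the LLM
SCORE_THRESHOLD = 1.10    # distance cutoff: lower = more similar
MAX_CHARS_PER_DOC = 800   # cap on each snippet's length

def filter_candidates(results: list[tuple[str, float]]) -> list[str]:
    """Keep close-enough candidates, cap their count and length.

    `results` is assumed to be (text, score) pairs sorted by ascending
    distance, as returned by a FAISS similarity search with scores.
    """
    kept: list[str] = []
    for text, score in results:
        if score > SCORE_THRESHOLD:   # too far from the query, skip
            continue
        kept.append(text[:MAX_CHARS_PER_DOC])
        if len(kept) >= TOP_K_FINAL:  # shorter prompt -> lower latency
            break
    return kept

# Example: only the two closest snippets survive the threshold.
print(filter_candidates([("doc A", 0.42), ("doc B", 0.95), ("doc C", 1.30)]))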
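On the second hunk: create_chat_completion is the llama-cpp-python chat API, so the latency gain comes from capping max_tokens at 120, which bounds how long each answer can take to generate. The GGUF loading mentioned in the commit title is not part of this +2 -2 diff; as a rough, assumed sketch (model path, context size and thread counts are placeholders, not the Space's actual values), a llama-cpp-python GGUF setup around llm_generate usually looks like this:

# Assumed sketch, not from this commit: typical llama-cpp-python setup behind
# llm_generate(). Model path, n_ctx and thread counts are placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="models/model.gguf",  # placeholder, not the Space's real file
    n_ctx=2048,        # smaller context windows load and prefill faster
    n_threads=4,       # CPU threads used for inference
    n_gpu_layers=0,    # 0 = CPU-only; raise to offload layers if a GPU exists
    verbose=False,
)

def llm_generate(prompt: str) -> str:
    out = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        max_tokens=120,  # hard cap on generated tokens -> bounded latency
    )
    return out["choices"][0]["message"]["content"].strip()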