Oleksii Obolonskyi committed on
Commit
45772d2
·
1 Parent(s): 2c5e1f2

Refactor HF model selection and token limits

Browse files
Files changed (2) hide show
  1. README.md +7 -5
  2. app.py +155 -106
README.md CHANGED
@@ -54,7 +54,8 @@ Set these environment variables (local dev or Hugging Face Spaces secrets):
54
 
55
  ```bash
56
  export HF_TOKEN=hf_your_token_here
57
- export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
 
58
  export RAG_HF_PROVIDER=hf-inference
59
  export RAG_LLM_BACKEND=hf
60
  ```
@@ -104,12 +105,13 @@ export RAG_ARTICLE_MANIFEST_PATH=data/normalized/manifest_articles.json
104
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
105
  export HF_TOKEN=hf_your_token_here
106
  export RAG_HF_PROVIDER=hf-inference
107
- export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
 
108
  export RAG_LLM_BACKEND=hf
109
- export RAG_HF_API_URL=https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-7B-Instruct-1M
110
  export RAG_MAX_CONTEXT_TOKENS=6000
111
- export RAG_MAX_CHUNKS=6
112
  export RAG_MAX_GENERATION_TOKENS=512
 
113
  export RAG_OUT_DIR=data/normalized
114
  export RAG_ARTICLE_SOURCES=sources_articles.json
115
  ```
@@ -119,7 +121,7 @@ export RAG_ARTICLE_SOURCES=sources_articles.json
119
  1. Create a new Space (Streamlit SDK) and push this repo.
120
  2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
121
  3. In Space Settings → Variables, set `RAG_HF_MODEL`, `RAG_LLM_BACKEND=hf`, and `RAG_HF_PROVIDER`.
122
- 4. Optional: set `RAG_HF_API_URL` for display/debug if you use a custom endpoint.
123
 
124
  ## Common maintenance tasks
125
 
 
54
 
55
  ```bash
56
  export HF_TOKEN=hf_your_token_here
57
+ export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
58
+ export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
59
  export RAG_HF_PROVIDER=hf-inference
60
  export RAG_LLM_BACKEND=hf
61
  ```
 
105
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
106
  export HF_TOKEN=hf_your_token_here
107
  export RAG_HF_PROVIDER=hf-inference
108
+ export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
109
+ export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
110
  export RAG_LLM_BACKEND=hf
 
111
  export RAG_MAX_CONTEXT_TOKENS=6000
112
+ export RAG_INJECT_MAX_CHUNKS=6
113
  export RAG_MAX_GENERATION_TOKENS=512
114
+ export RAG_RETRIEVE_TOPK_MULT=2
115
  export RAG_OUT_DIR=data/normalized
116
  export RAG_ARTICLE_SOURCES=sources_articles.json
117
  ```
 
121
  1. Create a new Space (Streamlit SDK) and push this repo.
122
  2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
123
  3. In Space Settings → Variables, set `RAG_HF_MODEL`, `RAG_LLM_BACKEND=hf`, and `RAG_HF_PROVIDER`.
124
+ 4. Optional Variables: `RAG_HF_MODEL_FALLBACKS` (comma-separated fallback model IDs), `RAG_INJECT_MAX_CHUNKS`, and `RAG_RETRIEVE_TOPK_MULT`.
125
 
126
  ## Common maintenance tasks
127
 
app.py CHANGED
@@ -24,41 +24,82 @@ COMPANY_EMAIL = "o.obolonsky@proton.me"
24
  COMPANY_PHONE = "+380953555919"
25
  COMPANY_ABOUT = "AI Software development company ready to collaborate and make your ideas come true"
26
 
27
- BOOK_CHUNKS_PATH = os.environ.get("RAG_BOOK_CHUNKS_PATH", "data/normalized/chunks_books.jsonl")
28
- ARTICLE_CHUNKS_PATH = os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl")
29
- BOOK_MANIFEST_PATH = os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json")
30
- ARTICLE_MANIFEST_PATH = os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json")
31
- BOOK_INDEX_PATH = os.environ.get("RAG_BOOK_INDEX_PATH", "data/normalized/index_books.faiss")
32
- ARTICLE_INDEX_PATH = os.environ.get("RAG_ARTICLE_INDEX_PATH", "data/normalized/index_articles.faiss")
33
- EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
36
- HF_MODEL = os.getenv("RAG_HF_MODEL", "Qwen/Qwen2.5-7B-Instruct-1M").strip()
37
  HF_PROVIDER = os.getenv("RAG_HF_PROVIDER", "hf-inference").strip() or "hf-inference"
38
- HF_API_URL = os.getenv("RAG_HF_API_URL", "").strip()
39
- if not HF_API_URL:
40
- HF_API_URL = f"https://router.huggingface.co/hf-inference/models/{HF_MODEL}"
 
 
 
 
 
41
 
42
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
43
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
44
 
45
- MAX_CONTEXT_TOKENS = int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000"))
46
- MAX_CHUNKS = int(os.getenv("RAG_MAX_CHUNKS", "6"))
47
- MAX_GENERATION_TOKENS = int(os.getenv("RAG_MAX_GENERATION_TOKENS", "512"))
48
-
49
  REPO_OWNER = "16bitSega"
50
  REPO_NAME = "RAG_project"
51
 
52
  GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
53
- # Retrieval mix: book-first + article nuance.
54
- BOOK_K = 8
55
- ARTICLE_K = 4
56
- PER_DOC_CAP = 3
57
- OVERLAP_FILTER = True
58
-
59
- # Enhanced answer mix: heavier retrieval for deeper answers.
60
- ENHANCED_BOOK_K = 14
61
- ENHANCED_ARTICLE_K = 7
62
 
63
  AVOID_PHRASES = [
64
  "The article discusses",
@@ -415,7 +456,7 @@ def build_limited_context(
415
  used = 0
416
  seen_sections = set()
417
  for _, c in hits:
418
- if used >= MAX_CHUNKS:
419
  break
420
  t = normalize_display_text(c.text)
421
  if len(t) > max_chars_per_chunk:
@@ -442,7 +483,7 @@ def build_limited_context(
442
  {
443
  "context_tokens": tok,
444
  "used_chunks": used,
445
- "max_chunks": MAX_CHUNKS,
446
  "max_context_tokens": MAX_CONTEXT_TOKENS,
447
  },
448
  )
@@ -487,8 +528,8 @@ def retrieve_books_and_articles(
487
  book_k: int,
488
  article_k: int,
489
  ) -> Tuple[List[Tuple[float, Chunk]], List[Tuple[float, Chunk]]]:
490
- oversample_book = book_k * 2
491
- oversample_article = article_k * 2
492
  book_hits = retrieve(query, embedder, book_index, book_chunks, k=oversample_book)
493
  article_hits = retrieve(query, embedder, article_index, article_chunks, k=oversample_article)
494
  book_hits = refine_hits(book_hits, query)
@@ -558,7 +599,7 @@ def answer_question(
558
  "generation_tokens": MAX_GENERATION_TOKENS,
559
  "total_tokens": total_est,
560
  "chunks_used": ctx_stats["used_chunks"],
561
- "chunks_cap": MAX_CHUNKS,
562
  "context_cap": MAX_CONTEXT_TOKENS,
563
  }
564
  answer, err = llm_chat(prompt)
@@ -577,38 +618,71 @@ def system_message() -> str:
577
  "Keep answers concise. Cite sources using the provided citation tags exactly."
578
  )
579
 
580
- def build_hf_prompt(user_prompt: str, model_id: str) -> str:
581
- system_msg = system_message()
582
- if "llama-3" in model_id.lower():
583
- return (
584
- "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
585
- f"{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
586
- f"{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
587
- )
588
- return f"System: {system_msg}\nUser: {user_prompt}\nAssistant:"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
- @st.cache_resource(show_spinner=False)
591
- def get_hf_client() -> InferenceClient:
592
- return InferenceClient(model=HF_MODEL, provider=HF_PROVIDER, token=HF_TOKEN)
593
-
594
- def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
595
- if not HF_TOKEN:
596
- return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
597
- client = get_hf_client()
598
- messages = [
599
- {"role": "system", "content": "You are a helpful assistant. Follow instructions and cite sources if provided."},
600
- {"role": "user", "content": prompt},
601
- ]
602
- try:
603
- resp = client.chat.completions.create(
604
- model=HF_MODEL,
605
- messages=messages,
606
- max_tokens=MAX_GENERATION_TOKENS,
607
- temperature=0.2,
608
- )
609
- text = (resp.choices[0].message.content or "").strip()
610
- return text, None
611
- except Exception:
612
  try:
613
  out = client.text_generation(
614
  prompt,
@@ -618,35 +692,12 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
618
  return_full_text=False,
619
  )
620
  return (out or "").strip(), None
621
- except Exception as e2:
622
- err_msg = str(e2)
 
623
  err_low = err_msg.lower()
624
- should_retry_provider = (
625
- HF_PROVIDER != "hf-inference"
626
- and any(k in err_low for k in ["model_not_found", "does not exist", "invalid_request_error", "404"])
627
- )
628
- if should_retry_provider:
629
- try:
630
- retry_client = InferenceClient(
631
- model=HF_MODEL,
632
- provider="hf-inference",
633
- token=HF_TOKEN,
634
- )
635
- out = retry_client.text_generation(
636
- prompt,
637
- max_new_tokens=MAX_GENERATION_TOKENS,
638
- temperature=0.2,
639
- do_sample=True,
640
- return_full_text=False,
641
- )
642
- return (out or "").strip(), None
643
- except Exception as retry_err:
644
- err_msg = str(retry_err)
645
- hint = (
646
- f"HF model: {HF_MODEL}; provider: {HF_PROVIDER}. "
647
- "Choose a provider that serves this model or deploy an Inference Endpoint "
648
- "and set RAG_HF_API_URL to that endpoint URL."
649
- )
650
  return "", f"{err_msg} ({hint})"
651
 
652
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
@@ -678,19 +729,14 @@ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Op
678
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
679
 
680
  if backend == "hf":
681
- return hf_chat(prompt, timeout=timeout)
682
  if backend == "ollama":
683
- return ollama_chat(prompt, timeout=timeout)
684
  if is_running_on_spaces():
685
- return hf_chat(prompt, timeout=timeout)
686
  if (HF_TOKEN or "").strip():
687
- return hf_chat(prompt, timeout=timeout)
688
- return ollama_chat(prompt, timeout=timeout)
689
-
690
- def is_running_on_spaces() -> bool:
691
- if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
692
- return True
693
- return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
694
 
695
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
696
  if not GITHUB_TOKEN:
@@ -758,11 +804,14 @@ with st.sidebar:
758
  st.session_state["open_ticket_ui"] = True
759
  st.write("")
760
  st.subheader("LLM")
761
- st.markdown(f"- Backend: `{os.getenv('RAG_LLM_BACKEND', 'auto')}`")
762
- st.markdown(f"- HF model: `{HF_MODEL}`")
763
- st.markdown(f"- Provider: `{HF_PROVIDER}`")
764
- st.markdown(f"- URL (display): `{HF_API_URL}`")
765
- st.markdown(f"- HF token set: `{bool(HF_TOKEN)}`")
 
 
 
766
  st.write("")
767
  st.subheader("Embedding model (retrieval)")
768
  st.code(EMBED_MODEL)
@@ -915,8 +964,8 @@ def run_enhance(question: str, enhanced_key: str):
915
 
916
  def run_regen():
917
  gen_prompt = (
918
- "Generate exactly 3 concise user questions about MCP and AI agents orchestration. "
919
- "Return each question on its own line without extra text."
920
  )
921
  prompt_tokens = estimate_tokens(gen_prompt)
922
  st.session_state["token_stats"] = {
@@ -925,7 +974,7 @@ def run_regen():
925
  "generation_tokens": MAX_GENERATION_TOKENS,
926
  "total_tokens": prompt_tokens + MAX_GENERATION_TOKENS,
927
  "chunks_used": 0,
928
- "chunks_cap": MAX_CHUNKS,
929
  "context_cap": MAX_CONTEXT_TOKENS,
930
  }
931
  text, err = llm_chat(gen_prompt)
 
24
  COMPANY_PHONE = "+380953555919"
25
  COMPANY_ABOUT = "AI Software development company ready to collaborate and make your ideas come true"
26
 
27
+ @dataclass
28
+ class AppConfig:
29
+ book_chunks_path: str
30
+ article_chunks_path: str
31
+ book_manifest_path: str
32
+ article_manifest_path: str
33
+ book_index_path: str
34
+ article_index_path: str
35
+ embed_model: str
36
+ max_context_tokens: int
37
+ inject_max_chunks: int
38
+ max_generation_tokens: int
39
+ book_k: int
40
+ article_k: int
41
+ enhanced_book_k: int
42
+ enhanced_article_k: int
43
+ per_doc_cap: int
44
+ overlap_filter: bool
45
+ retrieve_topk_mult: int
46
+
47
+ CONFIG = AppConfig(
48
+ book_chunks_path=os.environ.get("RAG_BOOK_CHUNKS_PATH", "data/normalized/chunks_books.jsonl"),
49
+ article_chunks_path=os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl"),
50
+ book_manifest_path=os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json"),
51
+ article_manifest_path=os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json"),
52
+ book_index_path=os.environ.get("RAG_BOOK_INDEX_PATH", "data/normalized/index_books.faiss"),
53
+ article_index_path=os.environ.get("RAG_ARTICLE_INDEX_PATH", "data/normalized/index_articles.faiss"),
54
+ embed_model=os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
55
+ max_context_tokens=int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000")),
56
+ inject_max_chunks=int(os.getenv("RAG_INJECT_MAX_CHUNKS", os.getenv("RAG_MAX_CHUNKS", "6"))),
57
+ max_generation_tokens=int(os.getenv("RAG_MAX_GENERATION_TOKENS", "512")),
58
+ book_k=8,
59
+ article_k=4,
60
+ enhanced_book_k=14,
61
+ enhanced_article_k=7,
62
+ per_doc_cap=3,
63
+ overlap_filter=True,
64
+ retrieve_topk_mult=int(os.getenv("RAG_RETRIEVE_TOPK_MULT", "2")),
65
+ )
66
+
67
+ BOOK_CHUNKS_PATH = CONFIG.book_chunks_path
68
+ ARTICLE_CHUNKS_PATH = CONFIG.article_chunks_path
69
+ BOOK_MANIFEST_PATH = CONFIG.book_manifest_path
70
+ ARTICLE_MANIFEST_PATH = CONFIG.article_manifest_path
71
+ BOOK_INDEX_PATH = CONFIG.book_index_path
72
+ ARTICLE_INDEX_PATH = CONFIG.article_index_path
73
+ EMBED_MODEL = CONFIG.embed_model
74
+ MAX_CONTEXT_TOKENS = CONFIG.max_context_tokens
75
+ INJECT_MAX_CHUNKS = CONFIG.inject_max_chunks
76
+ MAX_GENERATION_TOKENS = CONFIG.max_generation_tokens
77
+ BOOK_K = CONFIG.book_k
78
+ ARTICLE_K = CONFIG.article_k
79
+ ENHANCED_BOOK_K = CONFIG.enhanced_book_k
80
+ ENHANCED_ARTICLE_K = CONFIG.enhanced_article_k
81
+ PER_DOC_CAP = CONFIG.per_doc_cap
82
+ OVERLAP_FILTER = CONFIG.overlap_filter
83
+ RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
84
 
85
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
86
  HF_PROVIDER = os.getenv("RAG_HF_PROVIDER", "hf-inference").strip() or "hf-inference"
87
+ HF_MODEL_PRIMARY = os.getenv("RAG_HF_MODEL", os.getenv("RAG_HF_MODEL_PRIMARY", "HuggingFaceTB/SmolLM3-3B")).strip()
88
+ HF_MODEL_FALLBACKS_RAW = os.getenv("RAG_HF_MODEL_FALLBACKS", "").strip()
89
+ HF_MODEL_FALLBACKS = (
90
+ [m.strip() for m in HF_MODEL_FALLBACKS_RAW.split(",") if m.strip()]
91
+ if HF_MODEL_FALLBACKS_RAW
92
+ else ["HuggingFaceTB/SmolLM3-3B", "HuggingFaceTB/SmolLM2-1.7B", "HuggingFaceTB/SmolLM2-360M"]
93
+ )
94
+ HF_MODEL_CANDIDATES = [HF_MODEL_PRIMARY] + [m for m in HF_MODEL_FALLBACKS if m != HF_MODEL_PRIMARY]
95
 
96
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
97
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
98
 
 
 
 
 
99
  REPO_OWNER = "16bitSega"
100
  REPO_NAME = "RAG_project"
101
 
102
  GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
 
 
 
 
 
 
 
 
 
103
 
104
  AVOID_PHRASES = [
105
  "The article discusses",
 
456
  used = 0
457
  seen_sections = set()
458
  for _, c in hits:
459
+ if used >= INJECT_MAX_CHUNKS:
460
  break
461
  t = normalize_display_text(c.text)
462
  if len(t) > max_chars_per_chunk:
 
483
  {
484
  "context_tokens": tok,
485
  "used_chunks": used,
486
+ "max_chunks": INJECT_MAX_CHUNKS,
487
  "max_context_tokens": MAX_CONTEXT_TOKENS,
488
  },
489
  )
 
528
  book_k: int,
529
  article_k: int,
530
  ) -> Tuple[List[Tuple[float, Chunk]], List[Tuple[float, Chunk]]]:
531
+ oversample_book = book_k * RETRIEVE_TOPK_MULT
532
+ oversample_article = article_k * RETRIEVE_TOPK_MULT
533
  book_hits = retrieve(query, embedder, book_index, book_chunks, k=oversample_book)
534
  article_hits = retrieve(query, embedder, article_index, article_chunks, k=oversample_article)
535
  book_hits = refine_hits(book_hits, query)
 
599
  "generation_tokens": MAX_GENERATION_TOKENS,
600
  "total_tokens": total_est,
601
  "chunks_used": ctx_stats["used_chunks"],
602
+ "chunks_cap": INJECT_MAX_CHUNKS,
603
  "context_cap": MAX_CONTEXT_TOKENS,
604
  }
605
  answer, err = llm_chat(prompt)
 
618
  "Keep answers concise. Cite sources using the provided citation tags exactly."
619
  )
620
 
621
+ def is_running_on_spaces() -> bool:
622
+ if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
623
+ return True
624
+ return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
625
+
626
+ def get_hf_client(model_id: str) -> InferenceClient:
627
+ return InferenceClient(model=model_id, provider=HF_PROVIDER, token=HF_TOKEN)
628
+
629
+ def select_active_hf_model() -> str:
630
+ if st.session_state.get("hf_active_model"):
631
+ return st.session_state["hf_active_model"]
632
+ last_err = ""
633
+ for model_id in HF_MODEL_CANDIDATES:
634
+ try:
635
+ client = get_hf_client(model_id)
636
+ client.text_generation(
637
+ "ping",
638
+ max_new_tokens=2,
639
+ temperature=0.0,
640
+ do_sample=False,
641
+ return_full_text=False,
642
+ )
643
+ st.session_state["hf_active_model"] = model_id
644
+ st.session_state.pop("hf_startup_error", None)
645
+ return model_id
646
+ except Exception as exc:
647
+ last_err = str(exc)
648
+ st.session_state["hf_active_model"] = HF_MODEL_PRIMARY
649
+ if last_err:
650
+ st.session_state["hf_startup_error"] = last_err
651
+ return HF_MODEL_PRIMARY
652
+
653
+ class LLMClient:
654
+ def __init__(self, backend: str) -> None:
655
+ self.backend = backend
656
+
657
+ def generate(self, prompt: str) -> Tuple[str, Optional[str]]:
658
+ if self.backend == "ollama":
659
+ return ollama_chat(prompt)
660
+ return self._hf_generate(prompt)
661
+
662
+ def _hf_generate(self, prompt: str) -> Tuple[str, Optional[str]]:
663
+ model_id = select_active_hf_model()
664
+ client = get_hf_client(model_id)
665
+ messages = [
666
+ {"role": "system", "content": system_message()},
667
+ {"role": "user", "content": prompt},
668
+ ]
669
+ try:
670
+ chat_api = getattr(getattr(client, "chat", None), "completions", None)
671
+ create_fn = getattr(chat_api, "create", None)
672
+ if create_fn:
673
+ resp = create_fn(
674
+ model=model_id,
675
+ messages=messages,
676
+ max_tokens=MAX_GENERATION_TOKENS,
677
+ temperature=0.2,
678
+ )
679
+ text = (resp.choices[0].message.content or "").strip()
680
+ return text, None
681
+ except Exception as exc:
682
+ chat_err = str(exc)
683
+ else:
684
+ chat_err = ""
685
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  try:
687
  out = client.text_generation(
688
  prompt,
 
692
  return_full_text=False,
693
  )
694
  return (out or "").strip(), None
695
+ except Exception as exc:
696
+ err_msg = str(exc) or chat_err
697
+ hint = f"HF model: {model_id}; provider: {HF_PROVIDER}."
698
  err_low = err_msg.lower()
699
+ if any(k in err_low for k in ["401", "403", "gated", "license", "not authorized", "forbidden"]):
700
+ hint += " This model is gated. Ensure HF_TOKEN has accepted the license."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
  return "", f"{err_msg} ({hint})"
702
 
703
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
 
729
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
730
 
731
  if backend == "hf":
732
+ return LLMClient("hf").generate(prompt)
733
  if backend == "ollama":
734
+ return LLMClient("ollama").generate(prompt)
735
  if is_running_on_spaces():
736
+ return LLMClient("hf").generate(prompt)
737
  if (HF_TOKEN or "").strip():
738
+ return LLMClient("hf").generate(prompt)
739
+ return LLMClient("ollama").generate(prompt)
 
 
 
 
 
740
 
741
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
742
  if not GITHUB_TOKEN:
 
804
  st.session_state["open_ticket_ui"] = True
805
  st.write("")
806
  st.subheader("LLM")
807
+ backend = os.getenv("RAG_LLM_BACKEND", "auto").strip().lower()
808
+ use_hf = backend == "hf" or (
809
+ backend == "auto" and (is_running_on_spaces() or (HF_TOKEN or "").strip())
810
+ )
811
+ active_model = select_active_hf_model() if use_hf else HF_MODEL_PRIMARY
812
+ st.markdown(f"- Active model: `{active_model}`")
813
+ if use_hf and st.session_state.get("hf_startup_error"):
814
+ st.warning("HF model not available; check token/provider/model list.")
815
  st.write("")
816
  st.subheader("Embedding model (retrieval)")
817
  st.code(EMBED_MODEL)
 
964
 
965
  def run_regen():
966
  gen_prompt = (
967
+ "Generate exactly 3 short, smart user questions for this app about AI agents, "
968
+ "orchestration, MCP, tool use, and RAG. One question per line. No numbering."
969
  )
970
  prompt_tokens = estimate_tokens(gen_prompt)
971
  st.session_state["token_stats"] = {
 
974
  "generation_tokens": MAX_GENERATION_TOKENS,
975
  "total_tokens": prompt_tokens + MAX_GENERATION_TOKENS,
976
  "chunks_used": 0,
977
+ "chunks_cap": INJECT_MAX_CHUNKS,
978
  "context_cap": MAX_CONTEXT_TOKENS,
979
  }
980
  text, err = llm_chat(gen_prompt)