Oleksii Obolonskyi committed on
Commit
2c5e1f2
·
1 Parent(s): 44720da

Add token-aware context limits

Browse files
Files changed (2) hide show
  1. README.md +3 -0
  2. app.py +91 -7
README.md CHANGED
@@ -107,6 +107,9 @@ export RAG_HF_PROVIDER=hf-inference
107
  export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
108
  export RAG_LLM_BACKEND=hf
109
  export RAG_HF_API_URL=https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-7B-Instruct-1M
 
 
 
110
  export RAG_OUT_DIR=data/normalized
111
  export RAG_ARTICLE_SOURCES=sources_articles.json
112
  ```
 
107
  export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
108
  export RAG_LLM_BACKEND=hf
109
  export RAG_HF_API_URL=https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-7B-Instruct-1M
110
+ export RAG_MAX_CONTEXT_TOKENS=6000
111
+ export RAG_MAX_CHUNKS=6
112
+ export RAG_MAX_GENERATION_TOKENS=512
113
  export RAG_OUT_DIR=data/normalized
114
  export RAG_ARTICLE_SOURCES=sources_articles.json
115
  ```
app.py CHANGED
@@ -42,6 +42,10 @@ if not HF_API_URL:
42
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
43
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
44
 
 
 
 
 
45
  REPO_OWNER = "16bitSega"
46
  REPO_NAME = "RAG_project"
47
 
@@ -134,6 +138,11 @@ def normalize_display_text(s: str) -> str:
134
  s = re.sub(r"\s+", " ", s).strip()
135
  return s
136
 
 
 
 
 
 
137
  def is_company_question(q: str) -> bool:
138
  q = (q or "").lower()
139
  patterns = [
@@ -395,6 +404,49 @@ def build_context(
395
  parts.append("ARTICLE EXCERPTS:\n" + "\n\n".join(article_parts))
396
  return "\n\n".join(parts)
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  def chunk_keyword_overlap(chunk: Chunk, terms: List[str]) -> int:
399
  if not terms:
400
  return 0
@@ -468,7 +520,7 @@ def answer_question(
468
  if not all_hits or not_found_by_terms(question, all_hits):
469
  return "Not found in dataset.", citations, False
470
 
471
- context = build_context(book_hits, article_hits, doc_index, citation_tags)
472
  avoid_text = "; ".join(AVOID_PHRASES)
473
  base_rules = (
474
  "You must answer using only the provided context.\n"
@@ -498,6 +550,17 @@ def answer_question(
498
  + format_rules
499
  + f"\nQuestion:\n{question}\n\nContext:\n{context}\n\nAnswer:"
500
  )
 
 
 
 
 
 
 
 
 
 
 
501
  answer, err = llm_chat(prompt)
502
  if err:
503
  st.error(err)
@@ -540,7 +603,7 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
540
  resp = client.chat.completions.create(
541
  model=HF_MODEL,
542
  messages=messages,
543
- max_tokens=512,
544
  temperature=0.2,
545
  )
546
  text = (resp.choices[0].message.content or "").strip()
@@ -549,7 +612,7 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
549
  try:
550
  out = client.text_generation(
551
  prompt,
552
- max_new_tokens=512,
553
  temperature=0.2,
554
  do_sample=True,
555
  return_full_text=False,
@@ -571,7 +634,7 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
571
  )
572
  out = retry_client.text_generation(
573
  prompt,
574
- max_new_tokens=512,
575
  temperature=0.2,
576
  do_sample=True,
577
  return_full_text=False,
@@ -595,7 +658,7 @@ def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str,
595
  {"role": "user", "content": prompt},
596
  ],
597
  "stream": False,
598
- "options": {"temperature": 0.2},
599
  }
600
  try:
601
  r = requests.post(url, json=payload, timeout=timeout)
@@ -706,8 +769,19 @@ with st.sidebar:
706
  st.write("")
707
  st.subheader("Retrieval settings")
708
  st.caption(f"book_k={BOOK_K}, article_k={ARTICLE_K}, per_doc_cap={PER_DOC_CAP}, overlap_filter={OVERLAP_FILTER}")
709
- st.subheader("Dataset stats")
710
- st.caption("Local dataset only")
 
 
 
 
 
 
 
 
 
 
 
711
  @st.cache_data(show_spinner=False)
712
  def load_dataset(path: str) -> List[Chunk]:
713
  return read_chunks_jsonl(path)
@@ -844,6 +918,16 @@ def run_regen():
844
  "Generate exactly 3 concise user questions about MCP and AI agents orchestration. "
845
  "Return each question on its own line without extra text."
846
  )
 
 
 
 
 
 
 
 
 
 
847
  text, err = llm_chat(gen_prompt)
848
  if err:
849
  st.error(err)
 
42
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
43
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
44
 
45
# Token-budget knobs, overridable via environment variables.
MAX_CONTEXT_TOKENS = int(os.environ.get("RAG_MAX_CONTEXT_TOKENS", "6000"))  # cap on estimated context tokens
MAX_CHUNKS = int(os.environ.get("RAG_MAX_CHUNKS", "6"))  # cap on retrieved chunks per answer
MAX_GENERATION_TOKENS = int(os.environ.get("RAG_MAX_GENERATION_TOKENS", "512"))  # max LLM output tokens
48
+
49
  REPO_OWNER = "16bitSega"
50
  REPO_NAME = "RAG_project"
51
 
 
138
  s = re.sub(r"\s+", " ", s).strip()
139
  return s
140
 
141
def estimate_tokens(text: str) -> int:
    """Roughly estimate the LLM token count of *text*.

    Uses the common ~4-characters-per-token heuristic; any non-empty
    string counts as at least one token, and falsy input counts as zero.
    """
    length = len(text) if text else 0
    return 0 if length == 0 else max(1, length // 4)
145
+
146
  def is_company_question(q: str) -> bool:
147
  q = (q or "").lower()
148
  patterns = [
 
404
  parts.append("ARTICLE EXCERPTS:\n" + "\n\n".join(article_parts))
405
  return "\n\n".join(parts)
406
 
407
def build_limited_context(
    hits: List[Tuple[float, Chunk]],
    doc_index: Dict[str, Dict],
    tags: Dict[str, str],
    max_chars_per_chunk: int = 1400,
) -> Tuple[str, Dict[str, int]]:
    """Assemble a token-budgeted context string from ranked retrieval hits.

    Walks *hits* in order, truncating each chunk's display text to
    *max_chars_per_chunk* characters (appending "..." when cut) and
    prefixing the first chunk of each source type with its section header
    ("ARTICLE EXCERPTS:" / "BOOK EXCERPTS:").  Stops once MAX_CHUNKS
    chunks have been used, or when adding the next chunk (header
    included) would push the estimate_tokens total past
    MAX_CONTEXT_TOKENS.

    Returns the "\\n\\n"-joined context plus a stats dict with keys
    context_tokens, used_chunks, max_chunks, max_context_tokens.
    """
    pieces: List[str] = []
    token_total = 0
    chunk_count = 0
    emitted_headers = set()

    for _, chunk in hits:
        if chunk_count >= MAX_CHUNKS:
            break

        body = normalize_display_text(chunk.text)
        if len(body) > max_chars_per_chunk:
            body = body[:max_chars_per_chunk] + "..."
        entry = f"{chunk_heading(chunk, doc_index, tags)}\n{body}"

        kind = infer_source_type(chunk.doc_id, doc_index.get(chunk.doc_id))
        header = "ARTICLE EXCERPTS:" if kind == "article" else "BOOK EXCERPTS:"
        needs_header = header not in emitted_headers

        # Cost the chunk together with its header so the budget check
        # reflects exactly what would be appended.
        candidate = f"{header}\n{entry}" if needs_header else entry
        cost = estimate_tokens(candidate)
        if token_total + cost > MAX_CONTEXT_TOKENS:
            break

        if needs_header:
            pieces.append(header)
        emitted_headers.add(header)
        pieces.append(entry)
        token_total += cost
        chunk_count += 1

    stats = {
        "context_tokens": token_total,
        "used_chunks": chunk_count,
        "max_chunks": MAX_CHUNKS,
        "max_context_tokens": MAX_CONTEXT_TOKENS,
    }
    return "\n\n".join(pieces), stats
449
+
450
  def chunk_keyword_overlap(chunk: Chunk, terms: List[str]) -> int:
451
  if not terms:
452
  return 0
 
520
  if not all_hits or not_found_by_terms(question, all_hits):
521
  return "Not found in dataset.", citations, False
522
 
523
+ context, ctx_stats = build_limited_context(all_hits, doc_index, citation_tags)
524
  avoid_text = "; ".join(AVOID_PHRASES)
525
  base_rules = (
526
  "You must answer using only the provided context.\n"
 
550
  + format_rules
551
  + f"\nQuestion:\n{question}\n\nContext:\n{context}\n\nAnswer:"
552
  )
553
+ prompt_tokens = estimate_tokens(prompt)
554
+ total_est = ctx_stats["context_tokens"] + prompt_tokens + MAX_GENERATION_TOKENS
555
+ st.session_state["token_stats"] = {
556
+ "context_tokens": ctx_stats["context_tokens"],
557
+ "prompt_tokens": prompt_tokens,
558
+ "generation_tokens": MAX_GENERATION_TOKENS,
559
+ "total_tokens": total_est,
560
+ "chunks_used": ctx_stats["used_chunks"],
561
+ "chunks_cap": MAX_CHUNKS,
562
+ "context_cap": MAX_CONTEXT_TOKENS,
563
+ }
564
  answer, err = llm_chat(prompt)
565
  if err:
566
  st.error(err)
 
603
  resp = client.chat.completions.create(
604
  model=HF_MODEL,
605
  messages=messages,
606
+ max_tokens=MAX_GENERATION_TOKENS,
607
  temperature=0.2,
608
  )
609
  text = (resp.choices[0].message.content or "").strip()
 
612
  try:
613
  out = client.text_generation(
614
  prompt,
615
+ max_new_tokens=MAX_GENERATION_TOKENS,
616
  temperature=0.2,
617
  do_sample=True,
618
  return_full_text=False,
 
634
  )
635
  out = retry_client.text_generation(
636
  prompt,
637
+ max_new_tokens=MAX_GENERATION_TOKENS,
638
  temperature=0.2,
639
  do_sample=True,
640
  return_full_text=False,
 
658
  {"role": "user", "content": prompt},
659
  ],
660
  "stream": False,
661
+ "options": {"temperature": 0.2, "num_predict": MAX_GENERATION_TOKENS},
662
  }
663
  try:
664
  r = requests.post(url, json=payload, timeout=timeout)
 
769
  st.write("")
770
  st.subheader("Retrieval settings")
771
  st.caption(f"book_k={BOOK_K}, article_k={ARTICLE_K}, per_doc_cap={PER_DOC_CAP}, overlap_filter={OVERLAP_FILTER}")
772
+ st.markdown("### Dataset Stats")
773
+ ts = st.session_state.get("token_stats")
774
+ if ts:
775
+ st.markdown("**Token Consumption (est.)**")
776
+ st.markdown(f"- Context tokens: `{ts['context_tokens']}` / `{ts['context_cap']}`")
777
+ st.markdown(f"- Chunks used: `{ts['chunks_used']}` / `{ts['chunks_cap']}`")
778
+ st.markdown(f"- Prompt tokens: `{ts['prompt_tokens']}`")
779
+ st.markdown(f"- Generation tokens (max): `{ts['generation_tokens']}`")
780
+ st.markdown(f"- **Total per request (est.):** `{ts['total_tokens']}`")
781
+ if ts["context_tokens"] >= int(0.9 * ts["context_cap"]):
782
+ st.warning("Context near token limit; answers may truncate.")
783
+ else:
784
+ st.markdown("_Ask a question to see token usage._")
785
  @st.cache_data(show_spinner=False)
786
  def load_dataset(path: str) -> List[Chunk]:
787
  return read_chunks_jsonl(path)
 
918
  "Generate exactly 3 concise user questions about MCP and AI agents orchestration. "
919
  "Return each question on its own line without extra text."
920
  )
921
+ prompt_tokens = estimate_tokens(gen_prompt)
922
+ st.session_state["token_stats"] = {
923
+ "context_tokens": 0,
924
+ "prompt_tokens": prompt_tokens,
925
+ "generation_tokens": MAX_GENERATION_TOKENS,
926
+ "total_tokens": prompt_tokens + MAX_GENERATION_TOKENS,
927
+ "chunks_used": 0,
928
+ "chunks_cap": MAX_CHUNKS,
929
+ "context_cap": MAX_CONTEXT_TOKENS,
930
+ }
931
  text, err = llm_chat(gen_prompt)
932
  if err:
933
  st.error(err)