Oleksii Obolonskyi committed · Commit 45772d2 · Parent(s): 2c5e1f2

Refactor HF model selection and token limits
README.md
CHANGED
@@ -54,7 +54,8 @@ Set these environment variables (local dev or Hugging Face Spaces secrets):
 
 ```bash
 export HF_TOKEN=hf_your_token_here
-export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
+export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
+export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
 export RAG_HF_PROVIDER=hf-inference
 export RAG_LLM_BACKEND=hf
 ```

@@ -104,12 +105,13 @@ export RAG_ARTICLE_MANIFEST_PATH=data/normalized/manifest_articles.json
 export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
 export HF_TOKEN=hf_your_token_here
 export RAG_HF_PROVIDER=hf-inference
-export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
+export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
+export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
 export RAG_LLM_BACKEND=hf
-export RAG_HF_API_URL=https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-7B-Instruct-1M
 export RAG_MAX_CONTEXT_TOKENS=6000
-export RAG_MAX_CHUNKS=6
+export RAG_INJECT_MAX_CHUNKS=6
 export RAG_MAX_GENERATION_TOKENS=512
+export RAG_RETRIEVE_TOPK_MULT=2
 export RAG_OUT_DIR=data/normalized
 export RAG_ARTICLE_SOURCES=sources_articles.json
 ```

@@ -119,7 +121,7 @@ export RAG_ARTICLE_SOURCES=sources_articles.json
 1. Create a new Space (Streamlit SDK) and push this repo.
 2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
 3. In Space Settings → Variables, set `RAG_HF_MODEL`, `RAG_LLM_BACKEND=hf`, and `RAG_HF_PROVIDER`.
-4. Optional: …
+4. Optional: `RAG_HF_MODEL_FALLBACKS`, `RAG_INJECT_MAX_CHUNKS`, and `RAG_RETRIEVE_TOPK_MULT`.
 
 ## Common maintenance tasks
 
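A minimal sketch of how a comma-separated fallback list like the new `RAG_HF_MODEL_FALLBACKS` resolves into an ordered, de-duplicated candidate list (it mirrors the parsing added to app.py below; the variable names in this snippet are illustrative, not the app's own):

```python
import os

# Mirrors the candidate-list construction this commit adds to app.py
# (illustrative names; the app keeps these in module-level constants).
primary = os.getenv("RAG_HF_MODEL", "HuggingFaceTB/SmolLM3-3B").strip()
raw = os.getenv("RAG_HF_MODEL_FALLBACKS", "").strip()
fallbacks = [m.strip() for m in raw.split(",") if m.strip()]
candidates = [primary] + [m for m in fallbacks if m != primary]
print(candidates)
# With the README values above:
# ['HuggingFaceTB/SmolLM3-3B', 'HuggingFaceTB/SmolLM2-1.7B', 'HuggingFaceTB/SmolLM2-360M']
```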
app.py
CHANGED
@@ -24,41 +24,82 @@ COMPANY_EMAIL = "o.obolonsky@proton.me"
 COMPANY_PHONE = "+380953555919"
 COMPANY_ABOUT = "AI Software development company ready to collaborate and make your ideas come true"
 
-… (7 removed lines not recovered)
+@dataclass
+class AppConfig:
+    book_chunks_path: str
+    article_chunks_path: str
+    book_manifest_path: str
+    article_manifest_path: str
+    book_index_path: str
+    article_index_path: str
+    embed_model: str
+    max_context_tokens: int
+    inject_max_chunks: int
+    max_generation_tokens: int
+    book_k: int
+    article_k: int
+    enhanced_book_k: int
+    enhanced_article_k: int
+    per_doc_cap: int
+    overlap_filter: bool
+    retrieve_topk_mult: int
+
+CONFIG = AppConfig(
+    book_chunks_path=os.environ.get("RAG_BOOK_CHUNKS_PATH", "data/normalized/chunks_books.jsonl"),
+    article_chunks_path=os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl"),
+    book_manifest_path=os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json"),
+    article_manifest_path=os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json"),
+    book_index_path=os.environ.get("RAG_BOOK_INDEX_PATH", "data/normalized/index_books.faiss"),
+    article_index_path=os.environ.get("RAG_ARTICLE_INDEX_PATH", "data/normalized/index_articles.faiss"),
+    embed_model=os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
+    max_context_tokens=int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000")),
+    inject_max_chunks=int(os.getenv("RAG_INJECT_MAX_CHUNKS", os.getenv("RAG_MAX_CHUNKS", "6"))),
+    max_generation_tokens=int(os.getenv("RAG_MAX_GENERATION_TOKENS", "512")),
+    book_k=8,
+    article_k=4,
+    enhanced_book_k=14,
+    enhanced_article_k=7,
+    per_doc_cap=3,
+    overlap_filter=True,
+    retrieve_topk_mult=int(os.getenv("RAG_RETRIEVE_TOPK_MULT", "2")),
+)
+
+BOOK_CHUNKS_PATH = CONFIG.book_chunks_path
+ARTICLE_CHUNKS_PATH = CONFIG.article_chunks_path
+BOOK_MANIFEST_PATH = CONFIG.book_manifest_path
+ARTICLE_MANIFEST_PATH = CONFIG.article_manifest_path
+BOOK_INDEX_PATH = CONFIG.book_index_path
+ARTICLE_INDEX_PATH = CONFIG.article_index_path
+EMBED_MODEL = CONFIG.embed_model
+MAX_CONTEXT_TOKENS = CONFIG.max_context_tokens
+INJECT_MAX_CHUNKS = CONFIG.inject_max_chunks
+MAX_GENERATION_TOKENS = CONFIG.max_generation_tokens
+BOOK_K = CONFIG.book_k
+ARTICLE_K = CONFIG.article_k
+ENHANCED_BOOK_K = CONFIG.enhanced_book_k
+ENHANCED_ARTICLE_K = CONFIG.enhanced_article_k
+PER_DOC_CAP = CONFIG.per_doc_cap
+OVERLAP_FILTER = CONFIG.overlap_filter
+RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
 
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
-HF_MODEL = os.getenv("RAG_HF_MODEL", "Qwen/Qwen2.5-7B-Instruct-1M").strip()
 HF_PROVIDER = os.getenv("RAG_HF_PROVIDER", "hf-inference").strip() or "hf-inference"
-… (3 removed lines not recovered)
+HF_MODEL_PRIMARY = os.getenv("RAG_HF_MODEL", os.getenv("RAG_HF_MODEL_PRIMARY", "HuggingFaceTB/SmolLM3-3B")).strip()
+HF_MODEL_FALLBACKS_RAW = os.getenv("RAG_HF_MODEL_FALLBACKS", "").strip()
+HF_MODEL_FALLBACKS = (
+    [m.strip() for m in HF_MODEL_FALLBACKS_RAW.split(",") if m.strip()]
+    if HF_MODEL_FALLBACKS_RAW
+    else ["HuggingFaceTB/SmolLM3-3B", "HuggingFaceTB/SmolLM2-1.7B", "HuggingFaceTB/SmolLM2-360M"]
+)
+HF_MODEL_CANDIDATES = [HF_MODEL_PRIMARY] + [m for m in HF_MODEL_FALLBACKS if m != HF_MODEL_PRIMARY]
 
 OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
 OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
 
-MAX_CONTEXT_TOKENS = int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000"))
-MAX_CHUNKS = int(os.getenv("RAG_MAX_CHUNKS", "6"))
-MAX_GENERATION_TOKENS = int(os.getenv("RAG_MAX_GENERATION_TOKENS", "512"))
-
 REPO_OWNER = "16bitSega"
 REPO_NAME = "RAG_project"
 
 GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
-# Retrieval mix: book-first + article nuance.
-BOOK_K = 8
-ARTICLE_K = 4
-PER_DOC_CAP = 3
-OVERLAP_FILTER = True
-
-# Enhanced answer mix: heavier retrieval for deeper answers.
-ENHANCED_BOOK_K = 14
-ENHANCED_ARTICLE_K = 7
 
 AVOID_PHRASES = [
     "The article discusses",

@@ -415,7 +456,7 @@ def build_limited_context(
     used = 0
     seen_sections = set()
     for _, c in hits:
-        if used >= MAX_CHUNKS:
+        if used >= INJECT_MAX_CHUNKS:
             break
         t = normalize_display_text(c.text)
         if len(t) > max_chars_per_chunk:

@@ -442,7 +483,7 @@ def build_limited_context(
         {
             "context_tokens": tok,
             "used_chunks": used,
-            "max_chunks": MAX_CHUNKS,
+            "max_chunks": INJECT_MAX_CHUNKS,
             "max_context_tokens": MAX_CONTEXT_TOKENS,
         },
     )

@@ -487,8 +528,8 @@ def retrieve_books_and_articles(
     book_k: int,
     article_k: int,
 ) -> Tuple[List[Tuple[float, Chunk]], List[Tuple[float, Chunk]]]:
-    oversample_book = book_k * …
-    oversample_article = article_k * …
+    oversample_book = book_k * RETRIEVE_TOPK_MULT
+    oversample_article = article_k * RETRIEVE_TOPK_MULT
     book_hits = retrieve(query, embedder, book_index, book_chunks, k=oversample_book)
     article_hits = retrieve(query, embedder, article_index, article_chunks, k=oversample_article)
     book_hits = refine_hits(book_hits, query)

@@ -558,7 +599,7 @@ def answer_question(
         "generation_tokens": MAX_GENERATION_TOKENS,
         "total_tokens": total_est,
         "chunks_used": ctx_stats["used_chunks"],
-        "chunks_cap": MAX_CHUNKS,
+        "chunks_cap": INJECT_MAX_CHUNKS,
         "context_cap": MAX_CONTEXT_TOKENS,
     }
     answer, err = llm_chat(prompt)

@@ -577,38 +618,71 @@ def system_message() -> str:
         "Keep answers concise. Cite sources using the provided citation tags exactly."
     )
 
-def … (9 removed lines not recovered)
+def is_running_on_spaces() -> bool:
+    if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
+        return True
+    return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
+
+def get_hf_client(model_id: str) -> InferenceClient:
+    return InferenceClient(model=model_id, provider=HF_PROVIDER, token=HF_TOKEN)
+
+def select_active_hf_model() -> str:
+    if st.session_state.get("hf_active_model"):
+        return st.session_state["hf_active_model"]
+    last_err = ""
+    for model_id in HF_MODEL_CANDIDATES:
+        try:
+            client = get_hf_client(model_id)
+            client.text_generation(
+                "ping",
+                max_new_tokens=2,
+                temperature=0.0,
+                do_sample=False,
+                return_full_text=False,
+            )
+            st.session_state["hf_active_model"] = model_id
+            st.session_state.pop("hf_startup_error", None)
+            return model_id
+        except Exception as exc:
+            last_err = str(exc)
+    st.session_state["hf_active_model"] = HF_MODEL_PRIMARY
+    if last_err:
+        st.session_state["hf_startup_error"] = last_err
+    return HF_MODEL_PRIMARY
+
+class LLMClient:
+    def __init__(self, backend: str) -> None:
+        self.backend = backend
+
+    def generate(self, prompt: str) -> Tuple[str, Optional[str]]:
+        if self.backend == "ollama":
+            return ollama_chat(prompt)
+        return self._hf_generate(prompt)
+
+    def _hf_generate(self, prompt: str) -> Tuple[str, Optional[str]]:
+        model_id = select_active_hf_model()
+        client = get_hf_client(model_id)
+        messages = [
+            {"role": "system", "content": system_message()},
+            {"role": "user", "content": prompt},
+        ]
+        try:
+            chat_api = getattr(getattr(client, "chat", None), "completions", None)
+            create_fn = getattr(chat_api, "create", None)
+            if create_fn:
+                resp = create_fn(
+                    model=model_id,
+                    messages=messages,
+                    max_tokens=MAX_GENERATION_TOKENS,
+                    temperature=0.2,
+                )
+                text = (resp.choices[0].message.content or "").strip()
+                return text, None
+        except Exception as exc:
+            chat_err = str(exc)
+        else:
+            chat_err = ""
 
-@st.cache_resource(show_spinner=False)
-def get_hf_client() -> InferenceClient:
-    return InferenceClient(model=HF_MODEL, provider=HF_PROVIDER, token=HF_TOKEN)
-
-def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
-    if not HF_TOKEN:
-        return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
-    client = get_hf_client()
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant. Follow instructions and cite sources if provided."},
-        {"role": "user", "content": prompt},
-    ]
-    try:
-        resp = client.chat.completions.create(
-            model=HF_MODEL,
-            messages=messages,
-            max_tokens=MAX_GENERATION_TOKENS,
-            temperature=0.2,
-        )
-        text = (resp.choices[0].message.content or "").strip()
-        return text, None
-    except Exception:
         try:
             out = client.text_generation(
                 prompt,

@@ -618,35 +692,12 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
                 return_full_text=False,
             )
             return (out or "").strip(), None
-        except Exception as …:
-            err_msg = str(…)
+        except Exception as exc:
+            err_msg = str(exc) or chat_err
+            hint = f"HF model: {model_id}; provider: {HF_PROVIDER}."
             err_low = err_msg.lower()
-            should_retry_provider = (
-                …
-                and any(k in err_low for k in ["model_not_found", "does not exist", "invalid_request_error", "404"])
-            )
-            if should_retry_provider:
-                try:
-                    retry_client = InferenceClient(
-                        model=HF_MODEL,
-                        provider="hf-inference",
-                        token=HF_TOKEN,
-                    )
-                    out = retry_client.text_generation(
-                        prompt,
-                        max_new_tokens=MAX_GENERATION_TOKENS,
-                        temperature=0.2,
-                        do_sample=True,
-                        return_full_text=False,
-                    )
-                    return (out or "").strip(), None
-                except Exception as retry_err:
-                    err_msg = str(retry_err)
-            hint = (
-                f"HF model: {HF_MODEL}; provider: {HF_PROVIDER}. "
-                "Choose a provider that serves this model or deploy an Inference Endpoint "
-                "and set RAG_HF_API_URL to that endpoint URL."
-            )
+            if any(k in err_low for k in ["401", "403", "gated", "license", "not authorized", "forbidden"]):
+                hint += " This model is gated. Ensure HF_TOKEN has accepted the license."
         return "", f"{err_msg} ({hint})"
 
 def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:

@@ -678,19 +729,14 @@ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Op
     backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
 
     if backend == "hf":
-        return …
+        return LLMClient("hf").generate(prompt)
     if backend == "ollama":
-        return …
+        return LLMClient("ollama").generate(prompt)
     if is_running_on_spaces():
-        return …
+        return LLMClient("hf").generate(prompt)
     if (HF_TOKEN or "").strip():
-        return …
-    return …
-
-def is_running_on_spaces() -> bool:
-    if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
-        return True
-    return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
+        return LLMClient("hf").generate(prompt)
+    return LLMClient("ollama").generate(prompt)
 
 def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
     if not GITHUB_TOKEN:

@@ -758,11 +804,14 @@ with st.sidebar:
         st.session_state["open_ticket_ui"] = True
     st.write("")
     st.subheader("LLM")
-    … (5 removed lines not recovered)
+    backend = os.getenv("RAG_LLM_BACKEND", "auto").strip().lower()
+    use_hf = backend == "hf" or (
+        backend == "auto" and (is_running_on_spaces() or (HF_TOKEN or "").strip())
+    )
+    active_model = select_active_hf_model() if use_hf else HF_MODEL_PRIMARY
+    st.markdown(f"- Active model: `{active_model}`")
+    if use_hf and st.session_state.get("hf_startup_error"):
+        st.warning("HF model not available; check token/provider/model list.")
     st.write("")
     st.subheader("Embedding model (retrieval)")
     st.code(EMBED_MODEL)

@@ -915,8 +964,8 @@ def run_enhance(question: str, enhanced_key: str):
 
 def run_regen():
     gen_prompt = (
-        "Generate exactly 3 …"
-        "…"
+        "Generate exactly 3 short, smart user questions for this app about AI agents, "
+        "orchestration, MCP, tool use, and RAG. One question per line. No numbering."
     )
     prompt_tokens = estimate_tokens(gen_prompt)
    st.session_state["token_stats"] = {

@@ -925,7 +974,7 @@ def run_regen():
         "generation_tokens": MAX_GENERATION_TOKENS,
         "total_tokens": prompt_tokens + MAX_GENERATION_TOKENS,
         "chunks_used": 0,
-        "chunks_cap": MAX_CHUNKS,
+        "chunks_cap": INJECT_MAX_CHUNKS,
         "context_cap": MAX_CONTEXT_TOKENS,
     }
     text, err = llm_chat(gen_prompt)
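Read in isolation, the model selection added above is a probe-and-cache loop: try each candidate with a tiny generation request and keep the first model that answers. A standalone sketch of that pattern, assuming `huggingface_hub` is installed and leaving out the Streamlit session-state caching the app uses (function and variable names here are illustrative, not part of the commit):

```python
from typing import List, Optional, Tuple

from huggingface_hub import InferenceClient


def pick_first_working_model(
    candidates: List[str],
    token: Optional[str],
    provider: str = "hf-inference",
) -> Tuple[Optional[str], Optional[str]]:
    """Return the first candidate that answers a tiny probe request, plus the last error seen."""
    last_err: Optional[str] = None
    for model_id in candidates:
        try:
            client = InferenceClient(model=model_id, provider=provider, token=token)
            # A 2-token generation is enough to confirm the model is actually served.
            client.text_generation("ping", max_new_tokens=2, do_sample=False, return_full_text=False)
            return model_id, None
        except Exception as exc:
            last_err = str(exc)  # remember why this candidate failed before trying the next one
    return None, last_err
```

Called with the `HF_MODEL_CANDIDATES` list from the diff, this would return `HuggingFaceTB/SmolLM3-3B` when it is reachable and otherwise fall through to the smaller SmolLM2 checkpoints.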