Oleksii Obolonskyi committed on
Commit
45772d2
·
1 Parent(s): 2c5e1f2

Refactor HF model selection and token limits

Browse files
Files changed (2) hide show
  1. README.md +7 -5
  2. app.py +155 -106
README.md CHANGED
@@ -54,7 +54,8 @@ Set these environment variables (local dev or Hugging Face Spaces secrets):
54
 
55
  ```bash
56
  export HF_TOKEN=hf_your_token_here
57
- export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
 
58
  export RAG_HF_PROVIDER=hf-inference
59
  export RAG_LLM_BACKEND=hf
60
  ```
@@ -104,12 +105,13 @@ export RAG_ARTICLE_MANIFEST_PATH=data/normalized/manifest_articles.json
104
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
105
  export HF_TOKEN=hf_your_token_here
106
  export RAG_HF_PROVIDER=hf-inference
107
- export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M
 
108
  export RAG_LLM_BACKEND=hf
109
- export RAG_HF_API_URL=https://router.huggingface.co/hf-inference/models/Qwen/Qwen2.5-7B-Instruct-1M
110
  export RAG_MAX_CONTEXT_TOKENS=6000
111
- export RAG_MAX_CHUNKS=6
112
  export RAG_MAX_GENERATION_TOKENS=512
 
113
  export RAG_OUT_DIR=data/normalized
114
  export RAG_ARTICLE_SOURCES=sources_articles.json
115
  ```
@@ -119,7 +121,7 @@ export RAG_ARTICLE_SOURCES=sources_articles.json
119
  1. Create a new Space (Streamlit SDK) and push this repo.
120
  2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
121
  3. In Space Settings → Variables, set `RAG_HF_MODEL`, `RAG_LLM_BACKEND=hf`, and `RAG_HF_PROVIDER`.
122
- 4. Optional: set `RAG_HF_API_URL` for display/debug if you use a custom endpoint.
123
 
124
  ## Common maintenance tasks
125
 
 
54
 
55
  ```bash
56
  export HF_TOKEN=hf_your_token_here
57
+ export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
58
+ export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
59
  export RAG_HF_PROVIDER=hf-inference
60
  export RAG_LLM_BACKEND=hf
61
  ```
 
105
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
106
  export HF_TOKEN=hf_your_token_here
107
  export RAG_HF_PROVIDER=hf-inference
108
+ export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
109
+ export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
110
  export RAG_LLM_BACKEND=hf
 
111
  export RAG_MAX_CONTEXT_TOKENS=6000
112
+ export RAG_INJECT_MAX_CHUNKS=6
113
  export RAG_MAX_GENERATION_TOKENS=512
114
+ export RAG_RETRIEVE_TOPK_MULT=2
115
  export RAG_OUT_DIR=data/normalized
116
  export RAG_ARTICLE_SOURCES=sources_articles.json
117
  ```
 
121
  1. Create a new Space (Streamlit SDK) and push this repo.
122
  2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
123
  3. In Space Settings → Variables, set `RAG_HF_MODEL`, `RAG_LLM_BACKEND=hf`, and `RAG_HF_PROVIDER`.
124
+ 4. Optional Variables: `RAG_HF_MODEL_FALLBACKS` (comma-separated fallback model IDs), `RAG_INJECT_MAX_CHUNKS`, and `RAG_RETRIEVE_TOPK_MULT`.
125
 
126
  ## Common maintenance tasks
127
 
app.py CHANGED
@@ -24,41 +24,82 @@ COMPANY_EMAIL = "o.obolonsky@proton.me"
24
  COMPANY_PHONE = "+380953555919"
25
  COMPANY_ABOUT = "AI Software development company ready to collaborate and make your ideas come true"
26
 
27
- BOOK_CHUNKS_PATH = os.environ.get("RAG_BOOK_CHUNKS_PATH", "data/normalized/chunks_books.jsonl")
28
- ARTICLE_CHUNKS_PATH = os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl")
29
- BOOK_MANIFEST_PATH = os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json")
30
- ARTICLE_MANIFEST_PATH = os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json")
31
- BOOK_INDEX_PATH = os.environ.get("RAG_BOOK_INDEX_PATH", "data/normalized/index_books.faiss")
32
- ARTICLE_INDEX_PATH = os.environ.get("RAG_ARTICLE_INDEX_PATH", "data/normalized/index_articles.faiss")
33
- EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
36
- HF_MODEL = os.getenv("RAG_HF_MODEL", "Qwen/Qwen2.5-7B-Instruct-1M").strip()
37
  HF_PROVIDER = os.getenv("RAG_HF_PROVIDER", "hf-inference").strip() or "hf-inference"
38
- HF_API_URL = os.getenv("RAG_HF_API_URL", "").strip()
39
- if not HF_API_URL:
40
- HF_API_URL = f"https://router.huggingface.co/hf-inference/models/{HF_MODEL}"
 
 
 
 
 
41
 
42
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
43
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
44
 
45
- MAX_CONTEXT_TOKENS = int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000"))
46
- MAX_CHUNKS = int(os.getenv("RAG_MAX_CHUNKS", "6"))
47
- MAX_GENERATION_TOKENS = int(os.getenv("RAG_MAX_GENERATION_TOKENS", "512"))
48
-
49
  REPO_OWNER = "16bitSega"
50
  REPO_NAME = "RAG_project"
51
 
52
  GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
53
- # Retrieval mix: book-first + article nuance.
54
- BOOK_K = 8
55
- ARTICLE_K = 4
56
- PER_DOC_CAP = 3
57
- OVERLAP_FILTER = True
58
-
59
- # Enhanced answer mix: heavier retrieval for deeper answers.
60
- ENHANCED_BOOK_K = 14
61
- ENHANCED_ARTICLE_K = 7
62
 
63
  AVOID_PHRASES = [
64
  "The article discusses",
@@ -415,7 +456,7 @@ def build_limited_context(
415
  used = 0
416
  seen_sections = set()
417
  for _, c in hits:
418
- if used >= MAX_CHUNKS:
419
  break
420
  t = normalize_display_text(c.text)
421
  if len(t) > max_chars_per_chunk:
@@ -442,7 +483,7 @@ def build_limited_context(
442
  {
443
  "context_tokens": tok,
444
  "used_chunks": used,
445
- "max_chunks": MAX_CHUNKS,
446
  "max_context_tokens": MAX_CONTEXT_TOKENS,
447
  },
448
  )
@@ -487,8 +528,8 @@ def retrieve_books_and_articles(
487
  book_k: int,
488
  article_k: int,
489
  ) -> Tuple[List[Tuple[float, Chunk]], List[Tuple[float, Chunk]]]:
490
- oversample_book = book_k * 2
491
- oversample_article = article_k * 2
492
  book_hits = retrieve(query, embedder, book_index, book_chunks, k=oversample_book)
493
  article_hits = retrieve(query, embedder, article_index, article_chunks, k=oversample_article)
494
  book_hits = refine_hits(book_hits, query)
@@ -558,7 +599,7 @@ def answer_question(
558
  "generation_tokens": MAX_GENERATION_TOKENS,
559
  "total_tokens": total_est,
560
  "chunks_used": ctx_stats["used_chunks"],
561
- "chunks_cap": MAX_CHUNKS,
562
  "context_cap": MAX_CONTEXT_TOKENS,
563
  }
564
  answer, err = llm_chat(prompt)
@@ -577,38 +618,71 @@ def system_message() -> str:
577
  "Keep answers concise. Cite sources using the provided citation tags exactly."
578
  )
579
 
580
- def build_hf_prompt(user_prompt: str, model_id: str) -> str:
581
- system_msg = system_message()
582
- if "llama-3" in model_id.lower():
583
- return (
584
- "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
585
- f"{system_msg}<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
586
- f"{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
587
- )
588
- return f"System: {system_msg}\nUser: {user_prompt}\nAssistant:"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
- @st.cache_resource(show_spinner=False)
591
- def get_hf_client() -> InferenceClient:
592
- return InferenceClient(model=HF_MODEL, provider=HF_PROVIDER, token=HF_TOKEN)
593
-
594
- def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
595
- if not HF_TOKEN:
596
- return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
597
- client = get_hf_client()
598
- messages = [
599
- {"role": "system", "content": "You are a helpful assistant. Follow instructions and cite sources if provided."},
600
- {"role": "user", "content": prompt},
601
- ]
602
- try:
603
- resp = client.chat.completions.create(
604
- model=HF_MODEL,
605
- messages=messages,
606
- max_tokens=MAX_GENERATION_TOKENS,
607
- temperature=0.2,
608
- )
609
- text = (resp.choices[0].message.content or "").strip()
610
- return text, None
611
- except Exception:
612
  try:
613
  out = client.text_generation(
614
  prompt,
@@ -618,35 +692,12 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
618
  return_full_text=False,
619
  )
620
  return (out or "").strip(), None
621
- except Exception as e2:
622
- err_msg = str(e2)
 
623
  err_low = err_msg.lower()
624
- should_retry_provider = (
625
- HF_PROVIDER != "hf-inference"
626
- and any(k in err_low for k in ["model_not_found", "does not exist", "invalid_request_error", "404"])
627
- )
628
- if should_retry_provider:
629
- try:
630
- retry_client = InferenceClient(
631
- model=HF_MODEL,
632
- provider="hf-inference",
633
- token=HF_TOKEN,
634
- )
635
- out = retry_client.text_generation(
636
- prompt,
637
- max_new_tokens=MAX_GENERATION_TOKENS,
638
- temperature=0.2,
639
- do_sample=True,
640
- return_full_text=False,
641
- )
642
- return (out or "").strip(), None
643
- except Exception as retry_err:
644
- err_msg = str(retry_err)
645
- hint = (
646
- f"HF model: {HF_MODEL}; provider: {HF_PROVIDER}. "
647
- "Choose a provider that serves this model or deploy an Inference Endpoint "
648
- "and set RAG_HF_API_URL to that endpoint URL."
649
- )
650
  return "", f"{err_msg} ({hint})"
651
 
652
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
@@ -678,19 +729,14 @@ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Op
678
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
679
 
680
  if backend == "hf":
681
- return hf_chat(prompt, timeout=timeout)
682
  if backend == "ollama":
683
- return ollama_chat(prompt, timeout=timeout)
684
  if is_running_on_spaces():
685
- return hf_chat(prompt, timeout=timeout)
686
  if (HF_TOKEN or "").strip():
687
- return hf_chat(prompt, timeout=timeout)
688
- return ollama_chat(prompt, timeout=timeout)
689
-
690
- def is_running_on_spaces() -> bool:
691
- if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
692
- return True
693
- return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
694
 
695
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
696
  if not GITHUB_TOKEN:
@@ -758,11 +804,14 @@ with st.sidebar:
758
  st.session_state["open_ticket_ui"] = True
759
  st.write("")
760
  st.subheader("LLM")
761
- st.markdown(f"- Backend: `{os.getenv('RAG_LLM_BACKEND', 'auto')}`")
762
- st.markdown(f"- HF model: `{HF_MODEL}`")
763
- st.markdown(f"- Provider: `{HF_PROVIDER}`")
764
- st.markdown(f"- URL (display): `{HF_API_URL}`")
765
- st.markdown(f"- HF token set: `{bool(HF_TOKEN)}`")
 
 
 
766
  st.write("")
767
  st.subheader("Embedding model (retrieval)")
768
  st.code(EMBED_MODEL)
@@ -915,8 +964,8 @@ def run_enhance(question: str, enhanced_key: str):
915
 
916
  def run_regen():
917
  gen_prompt = (
918
- "Generate exactly 3 concise user questions about MCP and AI agents orchestration. "
919
- "Return each question on its own line without extra text."
920
  )
921
  prompt_tokens = estimate_tokens(gen_prompt)
922
  st.session_state["token_stats"] = {
@@ -925,7 +974,7 @@ def run_regen():
925
  "generation_tokens": MAX_GENERATION_TOKENS,
926
  "total_tokens": prompt_tokens + MAX_GENERATION_TOKENS,
927
  "chunks_used": 0,
928
- "chunks_cap": MAX_CHUNKS,
929
  "context_cap": MAX_CONTEXT_TOKENS,
930
  }
931
  text, err = llm_chat(gen_prompt)
 
24
  COMPANY_PHONE = "+380953555919"
25
  COMPANY_ABOUT = "AI Software development company ready to collaborate and make your ideas come true"
26
 
27
+ @dataclass
28
+ class AppConfig:
29
+ book_chunks_path: str
30
+ article_chunks_path: str
31
+ book_manifest_path: str
32
+ article_manifest_path: str
33
+ book_index_path: str
34
+ article_index_path: str
35
+ embed_model: str
36
+ max_context_tokens: int
37
+ inject_max_chunks: int
38
+ max_generation_tokens: int
39
+ book_k: int
40
+ article_k: int
41
+ enhanced_book_k: int
42
+ enhanced_article_k: int
43
+ per_doc_cap: int
44
+ overlap_filter: bool
45
+ retrieve_topk_mult: int
46
+
47
+ CONFIG = AppConfig(
48
+ book_chunks_path=os.environ.get("RAG_BOOK_CHUNKS_PATH", "data/normalized/chunks_books.jsonl"),
49
+ article_chunks_path=os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl"),
50
+ book_manifest_path=os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json"),
51
+ article_manifest_path=os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json"),
52
+ book_index_path=os.environ.get("RAG_BOOK_INDEX_PATH", "data/normalized/index_books.faiss"),
53
+ article_index_path=os.environ.get("RAG_ARTICLE_INDEX_PATH", "data/normalized/index_articles.faiss"),
54
+ embed_model=os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
55
+ max_context_tokens=int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000")),
56
+ inject_max_chunks=int(os.getenv("RAG_INJECT_MAX_CHUNKS", os.getenv("RAG_MAX_CHUNKS", "6"))),
57
+ max_generation_tokens=int(os.getenv("RAG_MAX_GENERATION_TOKENS", "512")),
58
+ book_k=8,
59
+ article_k=4,
60
+ enhanced_book_k=14,
61
+ enhanced_article_k=7,
62
+ per_doc_cap=3,
63
+ overlap_filter=True,
64
+ retrieve_topk_mult=int(os.getenv("RAG_RETRIEVE_TOPK_MULT", "2")),
65
+ )
66
+
67
+ BOOK_CHUNKS_PATH = CONFIG.book_chunks_path
68
+ ARTICLE_CHUNKS_PATH = CONFIG.article_chunks_path
69
+ BOOK_MANIFEST_PATH = CONFIG.book_manifest_path
70
+ ARTICLE_MANIFEST_PATH = CONFIG.article_manifest_path
71
+ BOOK_INDEX_PATH = CONFIG.book_index_path
72
+ ARTICLE_INDEX_PATH = CONFIG.article_index_path
73
+ EMBED_MODEL = CONFIG.embed_model
74
+ MAX_CONTEXT_TOKENS = CONFIG.max_context_tokens
75
+ INJECT_MAX_CHUNKS = CONFIG.inject_max_chunks
76
+ MAX_GENERATION_TOKENS = CONFIG.max_generation_tokens
77
+ BOOK_K = CONFIG.book_k
78
+ ARTICLE_K = CONFIG.article_k
79
+ ENHANCED_BOOK_K = CONFIG.enhanced_book_k
80
+ ENHANCED_ARTICLE_K = CONFIG.enhanced_article_k
81
+ PER_DOC_CAP = CONFIG.per_doc_cap
82
+ OVERLAP_FILTER = CONFIG.overlap_filter
83
+ RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
84
 
85
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
86
  HF_PROVIDER = os.getenv("RAG_HF_PROVIDER", "hf-inference").strip() or "hf-inference"
87
+ HF_MODEL_PRIMARY = os.getenv("RAG_HF_MODEL", os.getenv("RAG_HF_MODEL_PRIMARY", "HuggingFaceTB/SmolLM3-3B")).strip()
88
+ HF_MODEL_FALLBACKS_RAW = os.getenv("RAG_HF_MODEL_FALLBACKS", "").strip()
89
+ HF_MODEL_FALLBACKS = (
90
+ [m.strip() for m in HF_MODEL_FALLBACKS_RAW.split(",") if m.strip()]
91
+ if HF_MODEL_FALLBACKS_RAW
92
+ else ["HuggingFaceTB/SmolLM3-3B", "HuggingFaceTB/SmolLM2-1.7B", "HuggingFaceTB/SmolLM2-360M"]
93
+ )
94
+ HF_MODEL_CANDIDATES = [HF_MODEL_PRIMARY] + [m for m in HF_MODEL_FALLBACKS if m != HF_MODEL_PRIMARY]
95
 
96
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
97
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
98
 
 
 
 
 
99
  REPO_OWNER = "16bitSega"
100
  REPO_NAME = "RAG_project"
101
 
102
  GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
 
 
 
 
 
 
 
 
 
103
 
104
  AVOID_PHRASES = [
105
  "The article discusses",
 
456
  used = 0
457
  seen_sections = set()
458
  for _, c in hits:
459
+ if used >= INJECT_MAX_CHUNKS:
460
  break
461
  t = normalize_display_text(c.text)
462
  if len(t) > max_chars_per_chunk:
 
483
  {
484
  "context_tokens": tok,
485
  "used_chunks": used,
486
+ "max_chunks": INJECT_MAX_CHUNKS,
487
  "max_context_tokens": MAX_CONTEXT_TOKENS,
488
  },
489
  )
 
528
  book_k: int,
529
  article_k: int,
530
  ) -> Tuple[List[Tuple[float, Chunk]], List[Tuple[float, Chunk]]]:
531
+ oversample_book = book_k * RETRIEVE_TOPK_MULT
532
+ oversample_article = article_k * RETRIEVE_TOPK_MULT
533
  book_hits = retrieve(query, embedder, book_index, book_chunks, k=oversample_book)
534
  article_hits = retrieve(query, embedder, article_index, article_chunks, k=oversample_article)
535
  book_hits = refine_hits(book_hits, query)
 
599
  "generation_tokens": MAX_GENERATION_TOKENS,
600
  "total_tokens": total_est,
601
  "chunks_used": ctx_stats["used_chunks"],
602
+ "chunks_cap": INJECT_MAX_CHUNKS,
603
  "context_cap": MAX_CONTEXT_TOKENS,
604
  }
605
  answer, err = llm_chat(prompt)
 
618
  "Keep answers concise. Cite sources using the provided citation tags exactly."
619
  )
620
 
621
+ def is_running_on_spaces() -> bool:
622
+ if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
623
+ return True
624
+ return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
625
+
626
+ def get_hf_client(model_id: str) -> InferenceClient:
627
+ return InferenceClient(model=model_id, provider=HF_PROVIDER, token=HF_TOKEN)
628
+
629
+ def select_active_hf_model() -> str:
630
+ if st.session_state.get("hf_active_model"):
631
+ return st.session_state["hf_active_model"]
632
+ last_err = ""
633
+ for model_id in HF_MODEL_CANDIDATES:
634
+ try:
635
+ client = get_hf_client(model_id)
636
+ client.text_generation(
637
+ "ping",
638
+ max_new_tokens=2,
639
+ temperature=0.0,
640
+ do_sample=False,
641
+ return_full_text=False,
642
+ )
643
+ st.session_state["hf_active_model"] = model_id
644
+ st.session_state.pop("hf_startup_error", None)
645
+ return model_id
646
+ except Exception as exc:
647
+ last_err = str(exc)
648
+ st.session_state["hf_active_model"] = HF_MODEL_PRIMARY
649
+ if last_err:
650
+ st.session_state["hf_startup_error"] = last_err
651
+ return HF_MODEL_PRIMARY
652
+
653
+ class LLMClient:
654
+ def __init__(self, backend: str) -> None:
655
+ self.backend = backend
656
+
657
+ def generate(self, prompt: str) -> Tuple[str, Optional[str]]:
658
+ if self.backend == "ollama":
659
+ return ollama_chat(prompt)
660
+ return self._hf_generate(prompt)
661
+
662
+ def _hf_generate(self, prompt: str) -> Tuple[str, Optional[str]]:
663
+ model_id = select_active_hf_model()
664
+ client = get_hf_client(model_id)
665
+ messages = [
666
+ {"role": "system", "content": system_message()},
667
+ {"role": "user", "content": prompt},
668
+ ]
669
+ try:
670
+ chat_api = getattr(getattr(client, "chat", None), "completions", None)
671
+ create_fn = getattr(chat_api, "create", None)
672
+ if create_fn:
673
+ resp = create_fn(
674
+ model=model_id,
675
+ messages=messages,
676
+ max_tokens=MAX_GENERATION_TOKENS,
677
+ temperature=0.2,
678
+ )
679
+ text = (resp.choices[0].message.content or "").strip()
680
+ return text, None
681
+ except Exception as exc:
682
+ chat_err = str(exc)
683
+ else:
684
+ chat_err = ""
685
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  try:
687
  out = client.text_generation(
688
  prompt,
 
692
  return_full_text=False,
693
  )
694
  return (out or "").strip(), None
695
+ except Exception as exc:
696
+ err_msg = str(exc) or chat_err
697
+ hint = f"HF model: {model_id}; provider: {HF_PROVIDER}."
698
  err_low = err_msg.lower()
699
+ if any(k in err_low for k in ["401", "403", "gated", "license", "not authorized", "forbidden"]):
700
+ hint += " This model is gated. Ensure HF_TOKEN has accepted the license."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
  return "", f"{err_msg} ({hint})"
702
 
703
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
 
729
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
730
 
731
  if backend == "hf":
732
+ return LLMClient("hf").generate(prompt)
733
  if backend == "ollama":
734
+ return LLMClient("ollama").generate(prompt)
735
  if is_running_on_spaces():
736
+ return LLMClient("hf").generate(prompt)
737
  if (HF_TOKEN or "").strip():
738
+ return LLMClient("hf").generate(prompt)
739
+ return LLMClient("ollama").generate(prompt)
 
 
 
 
 
740
 
741
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
742
  if not GITHUB_TOKEN:
 
804
  st.session_state["open_ticket_ui"] = True
805
  st.write("")
806
  st.subheader("LLM")
807
+ backend = os.getenv("RAG_LLM_BACKEND", "auto").strip().lower()
808
+ use_hf = backend == "hf" or (
809
+ backend == "auto" and (is_running_on_spaces() or (HF_TOKEN or "").strip())
810
+ )
811
+ active_model = select_active_hf_model() if use_hf else HF_MODEL_PRIMARY
812
+ st.markdown(f"- Active model: `{active_model}`")
813
+ if use_hf and st.session_state.get("hf_startup_error"):
814
+ st.warning("HF model not available; check token/provider/model list.")
815
  st.write("")
816
  st.subheader("Embedding model (retrieval)")
817
  st.code(EMBED_MODEL)
 
964
 
965
  def run_regen():
966
  gen_prompt = (
967
+ "Generate exactly 3 short, smart user questions for this app about AI agents, "
968
+ "orchestration, MCP, tool use, and RAG. One question per line. No numbering."
969
  )
970
  prompt_tokens = estimate_tokens(gen_prompt)
971
  st.session_state["token_stats"] = {
 
974
  "generation_tokens": MAX_GENERATION_TOKENS,
975
  "total_tokens": prompt_tokens + MAX_GENERATION_TOKENS,
976
  "chunks_used": 0,
977
+ "chunks_cap": INJECT_MAX_CHUNKS,
978
  "context_cap": MAX_CONTEXT_TOKENS,
979
  }
980
  text, err = llm_chat(gen_prompt)