Oleksii Obolonskyi committed on
Commit
6f19c35
·
1 Parent(s): 123d866

Persist FAISS indexes across restarts

Browse files
Files changed (2) hide show
  1. README.md +19 -16
  2. app.py +274 -102
README.md CHANGED
@@ -54,13 +54,12 @@ Set these environment variables (local dev or Hugging Face Spaces secrets):
54
 
55
  ```bash
56
  export HF_TOKEN=hf_your_token_here
57
- export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
58
- export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
59
- export RAG_HF_PROVIDER=hf-inference
60
- export RAG_LLM_BACKEND=hf
61
  ```
62
 
63
- Optional: set `RAG_HF_API_URL` for display/debug if you are using a custom endpoint.
64
 
65
  ### 3) Prepare sources
66
 
@@ -88,8 +87,8 @@ streamlit run app.py
88
  ```
89
 
90
  Open `http://localhost:8501`. On first run, the app builds FAISS indexes:
91
- - `data/normalized/index_books.faiss`
92
- - `data/normalized/index_articles.faiss`
93
 
94
  ## Configuration
95
 
@@ -98,16 +97,15 @@ You can override defaults via environment variables:
98
  ```bash
99
  export RAG_BOOK_CHUNKS_PATH=data/normalized/chunks_books.jsonl
100
  export RAG_ARTICLE_CHUNKS_PATH=data/normalized/chunks_articles.jsonl
101
- export RAG_BOOK_INDEX_PATH=data/normalized/index_books.faiss
102
- export RAG_ARTICLE_INDEX_PATH=data/normalized/index_articles.faiss
103
  export RAG_BOOK_MANIFEST_PATH=data/normalized/manifest_books.json
104
  export RAG_ARTICLE_MANIFEST_PATH=data/normalized/manifest_articles.json
105
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
106
  export HF_TOKEN=hf_your_token_here
107
- export RAG_HF_PROVIDER=hf-inference
108
- export RAG_HF_MODEL=HuggingFaceTB/SmolLM3-3B
109
- export RAG_HF_MODEL_FALLBACKS=HuggingFaceTB/SmolLM2-1.7B,HuggingFaceTB/SmolLM2-360M
110
- export RAG_LLM_BACKEND=hf
111
  export RAG_MAX_CONTEXT_TOKENS=6000
112
  export RAG_INJECT_MAX_CHUNKS=6
113
  export RAG_MAX_GENERATION_TOKENS=512
@@ -119,9 +117,14 @@ export RAG_ARTICLE_SOURCES=sources_articles.json
119
  ## Deploy to Hugging Face Spaces
120
 
121
  1. Create a new Space (Streamlit SDK) and push this repo.
122
- 2. In Space Settings Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
123
- 3. In Space Settings → Variables, set `RAG_HF_MODEL`, `RAG_LLM_BACKEND=hf`, and `RAG_HF_PROVIDER`.
124
- 4. Optional: `RAG_HF_MODEL_FALLBACKS`, `RAG_INJECT_MAX_CHUNKS`, and `RAG_RETRIEVE_TOPK_MULT`.
 
 
 
 
 
125
 
126
  ## Common maintenance tasks
127
 
 
54
 
55
  ```bash
56
  export HF_TOKEN=hf_your_token_here
57
+ export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M:featherless-ai
58
+ export RAG_HF_PROVIDER_SUFFIX=featherless-ai
59
+ export RAG_LLM_BACKEND=hf-router
 
60
  ```
61
 
62
+ Optional: set `RAG_HF_PROVIDER_SUFFIX` if your model id is missing the provider suffix. It is appended as `:<suffix>` only when `RAG_HF_MODEL` does not already contain a `:`.
63
 
64
  ### 3) Prepare sources
65
 
 
87
  ```
88
 
89
  Open `http://localhost:8501`. On first run, the app builds FAISS indexes:
90
+ - `data/cache/index_books.faiss` (local)
91
+ - `data/cache/index_articles.faiss` (local)
92
 
93
  ## Configuration
94
 
 
97
  ```bash
98
  export RAG_BOOK_CHUNKS_PATH=data/normalized/chunks_books.jsonl
99
  export RAG_ARTICLE_CHUNKS_PATH=data/normalized/chunks_articles.jsonl
100
+ export RAG_BOOK_INDEX_PATH=data/cache/index_books.faiss
101
+ export RAG_ARTICLE_INDEX_PATH=data/cache/index_articles.faiss
102
  export RAG_BOOK_MANIFEST_PATH=data/normalized/manifest_books.json
103
  export RAG_ARTICLE_MANIFEST_PATH=data/normalized/manifest_articles.json
104
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
105
  export HF_TOKEN=hf_your_token_here
106
+ export RAG_HF_MODEL=Qwen/Qwen2.5-7B-Instruct-1M:featherless-ai
107
+ export RAG_HF_PROVIDER_SUFFIX=featherless-ai
108
+ export RAG_LLM_BACKEND=hf-router
 
109
  export RAG_MAX_CONTEXT_TOKENS=6000
110
  export RAG_INJECT_MAX_CHUNKS=6
111
  export RAG_MAX_GENERATION_TOKENS=512
 
117
  ## Deploy to Hugging Face Spaces
118
 
119
  1. Create a new Space (Streamlit SDK) and push this repo.
120
+ 2. Enable Persistent Storage and set caches:
121
+ - `HF_HOME=/data/.huggingface`
122
+ - `SENTENCE_TRANSFORMERS_HOME=/data/.sentence-transformers`
123
+ 3. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
124
+ 4. In Space Settings → Variables, set `RAG_HF_MODEL` and `RAG_LLM_BACKEND=hf-router`.
125
+ 5. Optional: `RAG_HF_PROVIDER_SUFFIX`, `RAG_INJECT_MAX_CHUNKS`, and `RAG_RETRIEVE_TOPK_MULT`.
126
+
127
+ With persistent storage enabled, FAISS indexes are stored in `/data/rag_cache` and reused across restarts. They are rebuilt only when the normalized chunk/manifest files, the embedding model, or the index parameters change.
128
 
129
  ## Common maintenance tasks
130
 
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import re
3
  import json
 
4
  import html
5
  from dataclasses import dataclass
6
  from pathlib import Path
@@ -19,6 +20,16 @@ from sentence_transformers import SentenceTransformer
19
 
20
  load_dotenv(Path(__file__).resolve().parent / ".env", override=True)
21
 
 
 
 
 
 
 
 
 
 
 
22
  COMPANY_NAME = "O_O.inc"
23
  COMPANY_EMAIL = "o.obolonsky@proton.me"
24
  COMPANY_PHONE = "+380953555919"
@@ -49,8 +60,8 @@ CONFIG = AppConfig(
49
  article_chunks_path=os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl"),
50
  book_manifest_path=os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json"),
51
  article_manifest_path=os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json"),
52
- book_index_path=os.environ.get("RAG_BOOK_INDEX_PATH", "data/normalized/index_books.faiss"),
53
- article_index_path=os.environ.get("RAG_ARTICLE_INDEX_PATH", "data/normalized/index_articles.faiss"),
54
  embed_model=os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
55
  max_context_tokens=int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000")),
56
  inject_max_chunks=int(os.getenv("RAG_INJECT_MAX_CHUNKS", os.getenv("RAG_MAX_CHUNKS", "6"))),
@@ -70,6 +81,8 @@ BOOK_MANIFEST_PATH = CONFIG.book_manifest_path
70
  ARTICLE_MANIFEST_PATH = CONFIG.article_manifest_path
71
  BOOK_INDEX_PATH = CONFIG.book_index_path
72
  ARTICLE_INDEX_PATH = CONFIG.article_index_path
 
 
73
  EMBED_MODEL = CONFIG.embed_model
74
  MAX_CONTEXT_TOKENS = CONFIG.max_context_tokens
75
  INJECT_MAX_CHUNKS = CONFIG.inject_max_chunks
@@ -82,8 +95,10 @@ PER_DOC_CAP = CONFIG.per_doc_cap
82
  OVERLAP_FILTER = CONFIG.overlap_filter
83
  RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
84
 
 
 
 
85
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
86
- HF_MODEL = os.getenv("RAG_HF_MODEL", "Qwen/Qwen2.5-7B-Instruct-1M:featherless-ai").strip()
87
 
88
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
89
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
@@ -330,23 +345,99 @@ def build_faiss_index(vectors: np.ndarray) -> faiss.Index:
330
  index.add(vectors)
331
  return index
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  def load_or_build_index(
 
334
  chunks: List[Chunk],
335
  embedder: SentenceTransformer,
 
 
336
  index_path: str,
337
- source_path: Optional[str] = None,
338
- ) -> faiss.Index:
 
 
 
339
  p = Path(index_path)
340
- src = Path(source_path) if source_path else None
341
- if p.exists() and (not src or not src.exists() or p.stat().st_mtime >= src.stat().st_mtime):
342
- return faiss.read_index(str(p))
 
 
 
 
 
 
 
 
 
 
343
  texts = [c.text for c in chunks]
344
- vecs = embedder.encode(texts, batch_size=32, show_progress_bar=True, normalize_embeddings=True)
 
 
 
 
 
 
 
345
  vecs = np.asarray(vecs, dtype="float32")
346
  index = build_faiss_index(vecs)
347
  p.parent.mkdir(parents=True, exist_ok=True)
348
  faiss.write_index(index, str(p))
349
- return index
 
 
 
 
 
 
 
 
 
 
350
 
351
  def retrieve(query: str, embedder: SentenceTransformer, index: faiss.Index, chunks: List[Chunk], k: int = 8) -> List[Tuple[float, Chunk]]:
352
  qv = embedder.encode([query], normalize_embeddings=True)
@@ -594,9 +685,16 @@ def answer_question(
594
  "chunks_cap": INJECT_MAX_CHUNKS,
595
  "context_cap": MAX_CONTEXT_TOKENS,
596
  }
597
- answer, err = llm_chat(prompt)
 
 
598
  if err:
599
- st.error(err)
 
 
 
 
 
600
  return f"Model error: {err}", citations, False
601
  if not answer:
602
  st.error("Empty response from model")
@@ -610,6 +708,34 @@ def system_message() -> str:
610
  "Keep answers concise. Cite sources using the provided citation tags exactly."
611
  )
612
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613
  def is_running_on_spaces() -> bool:
614
  if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
615
  return True
@@ -617,28 +743,27 @@ def is_running_on_spaces() -> bool:
617
 
618
  @st.cache_resource(show_spinner=False)
619
  def get_hf_router_client() -> OpenAI:
620
- return OpenAI(
621
- base_url="https://router.huggingface.co/v1",
622
- api_key=HF_TOKEN,
623
- )
624
 
625
- def hf_chat(prompt: str) -> Tuple[str, Optional[str]]:
626
- if not HF_TOKEN:
627
- return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
628
  try:
629
  client = get_hf_router_client()
630
  completion = client.chat.completions.create(
631
- model=HF_MODEL,
632
  messages=[
633
- {"role": "system", "content": "You are a helpful assistant."},
634
  {"role": "user", "content": prompt},
635
  ],
636
  max_tokens=MAX_GENERATION_TOKENS,
637
  temperature=0.2,
638
  )
639
- return (completion.choices[0].message.content or "").strip(), None
640
  except Exception as e:
641
- return "", str(e)
642
 
643
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
644
  url = f"{OLLAMA_BASE_URL}/api/chat"
@@ -660,7 +785,7 @@ def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str,
660
  except Exception as e:
661
  return "", str(e)
662
 
663
- def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
664
  """
665
  Routes generation to HF if configured; otherwise falls back to Ollama.
666
  Prefer explicit env var if you want:
@@ -669,14 +794,16 @@ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Op
669
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
670
 
671
  if backend == "hf-router":
672
- return hf_chat(prompt)
673
  if backend == "ollama":
674
- return ollama_chat(prompt)
 
675
  if is_running_on_spaces():
676
- return hf_chat(prompt)
677
  if (HF_TOKEN or "").strip():
678
- return hf_chat(prompt)
679
- return ollama_chat(prompt)
 
680
 
681
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
682
  global _GITHUB_TOKEN_LOGGED
@@ -746,39 +873,6 @@ button[aria-label^="MCP •"]::before{content:"MCP";position:absolute;left:0.6re
746
 
747
  if "is_thinking" not in st.session_state:
748
  st.session_state["is_thinking"] = False
749
-
750
- with st.sidebar:
751
- st.markdown(f"**Company:** {COMPANY_NAME}")
752
- st.markdown(f"**Contact:** {COMPANY_EMAIL} · {COMPANY_PHONE}")
753
- st.caption(COMPANY_ABOUT)
754
- st.write("")
755
- st.subheader("Support")
756
- st.caption("If an answer is not found in the dataset, you can create a support ticket (GitHub issue).")
757
- st.session_state.setdefault("open_ticket_ui", False)
758
- if st.button("Open ticket form", use_container_width=True, disabled=st.session_state["is_thinking"]):
759
- st.session_state["open_ticket_ui"] = True
760
- st.write("")
761
- st.subheader("LLM")
762
- st.markdown(f"- Active model: `{HF_MODEL}`")
763
- st.write("")
764
- st.subheader("Embedding model (retrieval)")
765
- st.code(EMBED_MODEL)
766
- st.write("")
767
- st.subheader("Retrieval settings")
768
- st.caption(f"book_k={BOOK_K}, article_k={ARTICLE_K}, per_doc_cap={PER_DOC_CAP}, overlap_filter={OVERLAP_FILTER}")
769
- st.markdown("### Dataset Stats")
770
- ts = st.session_state.get("token_stats")
771
- if ts:
772
- st.markdown("**Token Consumption (est.)**")
773
- st.markdown(f"- Context tokens: `{ts['context_tokens']}` / `{ts['context_cap']}`")
774
- st.markdown(f"- Chunks used: `{ts['chunks_used']}` / `{ts['chunks_cap']}`")
775
- st.markdown(f"- Prompt tokens: `{ts['prompt_tokens']}`")
776
- st.markdown(f"- Generation tokens (max): `{ts['generation_tokens']}`")
777
- st.markdown(f"- **Total per request (est.):** `{ts['total_tokens']}`")
778
- if ts["context_tokens"] >= int(0.9 * ts["context_cap"]):
779
- st.warning("Context near token limit; answers may truncate.")
780
- else:
781
- st.markdown("_Ask a question to see token usage._")
782
  @st.cache_data(show_spinner=False)
783
  def load_dataset(path: str) -> List[Chunk]:
784
  return read_chunks_jsonl(path)
@@ -811,8 +905,115 @@ doc_index = merge_doc_indexes(book_doc_index, article_doc_index)
811
  book_stats = compute_stats(book_chunks, book_manifest, book_doc_index)
812
  article_stats = compute_stats(article_chunks, article_manifest, article_doc_index)
813
  embedder = load_embedder(EMBED_MODEL)
814
- book_index = load_or_build_index(book_chunks, embedder, BOOK_INDEX_PATH, BOOK_CHUNKS_PATH)
815
- article_index = load_or_build_index(article_chunks, embedder, ARTICLE_INDEX_PATH, ARTICLE_CHUNKS_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
 
817
  if "chat" not in st.session_state:
818
  st.session_state["chat"] = []
@@ -854,42 +1055,6 @@ def parse_generated_questions(text: str) -> List[str]:
854
  break
855
  return cleaned
856
 
857
- with st.sidebar:
858
- st.write("")
859
- st.markdown("**Books + MCP**")
860
- st.write(f"Chunk length: min {book_stats['length_min']}, median {book_stats['length_median']}, max {book_stats['length_max']}")
861
- st.write("")
862
- st.markdown("**Articles**")
863
- st.write(f"Chunk length: min {article_stats['length_min']}, median {article_stats['length_median']}, max {article_stats['length_max']}")
864
- st.write("")
865
- st.markdown("**By type (inferred)**")
866
- for k in ["book", "mcp", "article"]:
867
- total = 0
868
- if k in book_stats["type_counts"]:
869
- total += book_stats["type_counts"][k]
870
- if k in article_stats["type_counts"]:
871
- total += article_stats["type_counts"][k]
872
- if total:
873
- st.write(f"{k}: {total}")
874
- st.write("")
875
- st.session_state.setdefault("show_sources", False)
876
- st.markdown('<div class="stacked-control sources-btn">', unsafe_allow_html=True)
877
- if st.button("Sources (click to expand the list)", use_container_width=True, disabled=st.session_state["is_thinking"]):
878
- st.session_state["show_sources"] = not st.session_state["show_sources"]
879
- st.markdown("</div>", unsafe_allow_html=True)
880
- if st.session_state["show_sources"]:
881
- if book_stats["mcp_docs_count"]:
882
- mcp_line = f"MCP: {book_stats['mcp_docs_count']} docs"
883
- if book_stats["mcp_blocks_total"]:
884
- mcp_line += f", {book_stats['mcp_blocks_total']} blocks"
885
- st.write(mcp_line)
886
- for line in book_stats["sources_lines"]:
887
- st.write(line)
888
- if article_stats["sources_lines"]:
889
- st.write("")
890
- st.markdown("**Article sources**")
891
- for line in article_stats["sources_lines"]:
892
- st.write(line)
893
 
894
  def run_enhance(question: str, enhanced_key: str):
895
  if not question or not enhanced_key:
@@ -925,9 +1090,16 @@ def run_regen():
925
  "chunks_cap": INJECT_MAX_CHUNKS,
926
  "context_cap": MAX_CONTEXT_TOKENS,
927
  }
928
- text, err = llm_chat(gen_prompt)
 
 
929
  if err:
930
- st.error(err)
 
 
 
 
 
931
  st.warning(f"LLM request failed: {err}")
932
  return
933
  if not text:
 
1
  import os
2
  import re
3
  import json
4
+ import hashlib
5
  import html
6
  from dataclasses import dataclass
7
  from pathlib import Path
 
20
 
21
  load_dotenv(Path(__file__).resolve().parent / ".env", override=True)
22
 
23
def get_persist_dir() -> str:
    """Return the directory used for persisted caches, creating it if needed.

    ``/data/rag_cache`` is selected when ``/data`` is an existing directory
    that the process may write to; otherwise the relative path ``data/cache``
    is used. The chosen directory is created before it is returned.
    """
    data_root = "/data"
    data_root_usable = os.path.isdir(data_root) and os.access(data_root, os.W_OK)
    target = "/data/rag_cache" if data_root_usable else "data/cache"
    os.makedirs(target, exist_ok=True)
    return target
30
+
31
+ PERSIST_DIR = get_persist_dir()
32
+
33
  COMPANY_NAME = "O_O.inc"
34
  COMPANY_EMAIL = "o.obolonsky@proton.me"
35
  COMPANY_PHONE = "+380953555919"
 
60
  article_chunks_path=os.environ.get("RAG_ARTICLE_CHUNKS_PATH", "data/normalized/chunks_articles.jsonl"),
61
  book_manifest_path=os.environ.get("RAG_BOOK_MANIFEST_PATH", "data/normalized/manifest_books.json"),
62
  article_manifest_path=os.environ.get("RAG_ARTICLE_MANIFEST_PATH", "data/normalized/manifest_articles.json"),
63
+ book_index_path=os.environ.get("RAG_BOOK_INDEX_PATH", os.path.join(PERSIST_DIR, "index_books.faiss")),
64
+ article_index_path=os.environ.get("RAG_ARTICLE_INDEX_PATH", os.path.join(PERSIST_DIR, "index_articles.faiss")),
65
  embed_model=os.environ.get("RAG_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"),
66
  max_context_tokens=int(os.getenv("RAG_MAX_CONTEXT_TOKENS", "6000")),
67
  inject_max_chunks=int(os.getenv("RAG_INJECT_MAX_CHUNKS", os.getenv("RAG_MAX_CHUNKS", "6"))),
 
81
  ARTICLE_MANIFEST_PATH = CONFIG.article_manifest_path
82
  BOOK_INDEX_PATH = CONFIG.book_index_path
83
  ARTICLE_INDEX_PATH = CONFIG.article_index_path
84
+ BOOK_META_PATH = BOOK_INDEX_PATH + ".meta.json"
85
+ ARTICLE_META_PATH = ARTICLE_INDEX_PATH + ".meta.json"
86
  EMBED_MODEL = CONFIG.embed_model
87
  MAX_CONTEXT_TOKENS = CONFIG.max_context_tokens
88
  INJECT_MAX_CHUNKS = CONFIG.inject_max_chunks
 
95
  OVERLAP_FILTER = CONFIG.overlap_filter
96
  RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
97
 
98
+ HF_BASE_URL = "https://router.huggingface.co/v1"
99
+ HF_MODEL_RAW = os.getenv("RAG_HF_MODEL", "Qwen/Qwen2.5-7B-Instruct-1M").strip()
100
+ HF_MODEL_SUFFIX = os.getenv("RAG_HF_PROVIDER_SUFFIX", "").strip()
101
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
102
 
103
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
104
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
 
345
  index.add(vectors)
346
  return index
347
 
348
def file_fingerprint(path: str) -> Optional[str]:
    """Return a SHA-256 hex digest of the contents of ``path``.

    The digest depends only on the file's bytes. The modification time is
    deliberately not mixed in: a byte-identical file with a newer timestamp
    (for example after a fresh checkout) must keep the same fingerprint, or
    a persisted index would be rebuilt even though its input did not change.
    Every byte is hashed, so an edit anywhere in the file changes the result.

    Note that this reads the whole file on every call.

    Args:
        path: File to fingerprint.

    Returns:
        The hex digest, or ``None`` when the file is missing or cannot be
        opened or read.
    """
    digest = hashlib.sha256()
    try:
        with open(path, "rb") as f:
            # Stream in 1 MiB pieces so a large file is never held in memory whole.
            for block in iter(lambda: f.read(1024 * 1024), b""):
                digest.update(block)
    except OSError:
        # Covers a missing file as well as permission and read errors.
        return None
    return digest.hexdigest()
366
+
367
def compute_fingerprint(kind: str, embed_model: str, chunks_path: str, manifest_path: str, params: Dict) -> str:
    """Return a SHA-256 hex digest summarising the inputs of an index build.

    The digest is taken over a key-sorted JSON document holding ``kind``,
    ``embed_model``, the file fingerprints of ``chunks_path`` and
    ``manifest_path``, and ``params``. A change to any of them yields a
    different digest.
    """
    fields = dict(
        kind=kind,
        embed_model=embed_model,
        chunks_fp=file_fingerprint(chunks_path),
        manifest_fp=file_fingerprint(manifest_path),
        params=params,
    )
    # sort_keys keeps the serialisation independent of dict insertion order.
    serialized = json.dumps(fields, sort_keys=True)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()
377
+
378
def load_meta(path: str) -> Dict:
    """Load the index metadata JSON stored at ``path``.

    Args:
        path: Location of the metadata file.

    Returns:
        The parsed JSON object. An empty dict is returned when the file is
        missing or unreadable, is not valid JSON, or holds something other
        than a JSON object, so callers can always use ``.get`` on the result.
    """
    try:
        loaded = json.loads(Path(path).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # A list, string or number is valid JSON but is not usable as metadata.
    return loaded if isinstance(loaded, dict) else {}
385
+
386
def save_meta(path: str, meta: Dict) -> None:
    """Write ``meta`` to ``path`` as JSON, replacing any previous file atomically.

    The JSON is first written to ``<path>.tmp`` and then moved over ``path``
    with ``os.replace``, so a reader never observes a half-written file.

    Args:
        path: Destination of the metadata file; missing parent directories
            are created.
        meta: JSON-serialisable metadata.

    Raises:
        TypeError: If ``meta`` is not JSON-serialisable.
        OSError: If the file cannot be written or moved into place.
    """
    # Serialise before touching the filesystem so bad input leaves no temp file.
    serialized = json.dumps(meta, indent=2, sort_keys=True)
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    tmp = Path(f"{path}.tmp")
    try:
        tmp.write_text(serialized, encoding="utf-8")
        os.replace(tmp, target)
    except OSError:
        # Do not leave a stale temp file next to the index on failure.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
390
+
391
  def load_or_build_index(
392
+ kind: str,
393
  chunks: List[Chunk],
394
  embedder: SentenceTransformer,
395
+ chunks_path: str,
396
+ manifest_path: str,
397
  index_path: str,
398
+ meta_path: str,
399
+ *,
400
+ params: Optional[Dict] = None,
401
+ fingerprint: Optional[str] = None,
402
+ ) -> Tuple[faiss.Index, Dict]:
403
  p = Path(index_path)
404
+ if params is None:
405
+ params = {
406
+ "normalize_embeddings": True,
407
+ "dim": getattr(embedder, "get_sentence_embedding_dimension", lambda: None)(),
408
+ "engine": "faiss",
409
+ }
410
+ if fingerprint is None:
411
+ fingerprint = compute_fingerprint(kind, EMBED_MODEL, chunks_path, manifest_path, params)
412
+ if p.exists() and p.stat().st_size > 0 and Path(meta_path).exists():
413
+ meta = load_meta(meta_path)
414
+ if meta.get("fingerprint") == fingerprint:
415
+ return faiss.read_index(str(p)), meta
416
+
417
  texts = [c.text for c in chunks]
418
+ show_progress = os.getenv("RAG_SHOW_EMBED_PROGRESS", "0") == "1"
419
+ with st.spinner(f"Building {kind} retrieval index (first run or dataset changed)..."):
420
+ vecs = embedder.encode(
421
+ texts,
422
+ batch_size=32,
423
+ show_progress_bar=show_progress,
424
+ normalize_embeddings=True,
425
+ )
426
  vecs = np.asarray(vecs, dtype="float32")
427
  index = build_faiss_index(vecs)
428
  p.parent.mkdir(parents=True, exist_ok=True)
429
  faiss.write_index(index, str(p))
430
+ meta = {
431
+ "fingerprint": fingerprint,
432
+ "kind": kind,
433
+ "embed_model": EMBED_MODEL,
434
+ "chunks_path": chunks_path,
435
+ "manifest_path": manifest_path,
436
+ "params": params,
437
+ "built_at": datetime.now(timezone.utc).isoformat(),
438
+ }
439
+ save_meta(meta_path, meta)
440
+ return index, meta
441
 
442
  def retrieve(query: str, embedder: SentenceTransformer, index: faiss.Index, chunks: List[Chunk], k: int = 8) -> List[Tuple[float, Chunk]]:
443
  qv = embedder.encode([query], normalize_embeddings=True)
 
685
  "chunks_cap": INJECT_MAX_CHUNKS,
686
  "context_cap": MAX_CONTEXT_TOKENS,
687
  }
688
+ answer, err, meta = llm_chat(prompt)
689
+ if meta and meta.get("model"):
690
+ st.session_state["active_model"] = meta["model"]
691
  if err:
692
+ if is_model_not_supported(err):
693
+ render_model_recommendations()
694
+ with st.expander("Model error details"):
695
+ st.code(err)
696
+ else:
697
+ st.error(err)
698
  return f"Model error: {err}", citations, False
699
  if not answer:
700
  st.error("Empty response from model")
 
708
  "Keep answers concise. Cite sources using the provided citation tags exactly."
709
  )
710
 
711
def get_effective_hf_model() -> str:
    """Return the model id to send to the HF router.

    ``HF_MODEL_SUFFIX`` is appended as ``:<suffix>`` only when it is non-empty
    and ``HF_MODEL_RAW`` does not already contain a ``:``; in every other case
    ``HF_MODEL_RAW`` is returned unchanged.
    """
    needs_suffix = bool(HF_MODEL_SUFFIX) and ":" not in HF_MODEL_RAW
    if not needs_suffix:
        return HF_MODEL_RAW
    return f"{HF_MODEL_RAW}:{HF_MODEL_SUFFIX}"
715
+
716
+ RECOMMENDED_MODELS = [
717
+ "Qwen/Qwen2.5-7B-Instruct-1M:featherless-ai",
718
+ "Qwen/Qwen2.5-7B-Instruct:featherless-ai",
719
+ "mistralai/Mistral-7B-Instruct-v0.3",
720
+ "HuggingFaceTB/SmolLM3-3B",
721
+ "google/gemma-2-9b-it",
722
+ ]
723
+
724
def is_model_not_supported(err: str) -> bool:
    """Tell whether ``err`` reports a model that no enabled provider serves.

    The match is case-insensitive and looks for either known marker
    substring. A falsy ``err`` (empty string or ``None``) yields ``False``.
    """
    lowered = (err or "").lower()
    markers = (
        "model_not_supported",
        "not supported by any provider you have enabled",
    )
    return any(marker in lowered for marker in markers)
727
+
728
def render_model_recommendations() -> None:
    """Render guidance for an unsupported-model error in the Streamlit UI.

    Shows an error banner, a list of fix options, each entry of
    ``RECOMMENDED_MODELS`` as a code snippet, and a closing hint about the
    environment variables to set.
    """
    st.error("HF Router: model is not supported by your enabled providers.")
    guidance = (
        "**Fix options:**",
        "- Use the provider-suffixed model id shown on the model page (e.g. `...:featherless-ai`).",
        "- Or enable additional Inference Providers in your HF account settings.",
        "- Or switch to a model that is served by a provider you have enabled.",
        "**Try one of these model IDs:**",
    )
    for line in guidance:
        st.markdown(line)
    for model_id in RECOMMENDED_MODELS:
        st.code(model_id)
    st.markdown("Set `RAG_HF_MODEL` to one of the above, or set `RAG_HF_PROVIDER_SUFFIX=featherless-ai` for Qwen.")
738
+
739
  def is_running_on_spaces() -> bool:
740
  if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
741
  return True
 
743
 
744
  @st.cache_resource(show_spinner=False)
745
  def get_hf_router_client() -> OpenAI:
746
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
747
+ if not token:
748
+ raise RuntimeError("HF_TOKEN is not set. Add it as a Hugging Face Secret.")
749
+ return OpenAI(base_url=HF_BASE_URL, api_key=token)
750
 
751
def hf_router_chat(prompt: str) -> Tuple[str, Optional[str], Optional[Dict[str, str]]]:
    """Send ``prompt`` to the HF router and return ``(text, error, meta)``.

    ``meta`` always carries the model id that was requested. On success
    ``text`` is the stripped reply and ``error`` is ``None``; on any exception
    ``text`` is empty and ``error`` is the exception rendered as a string.
    """
    model_id = get_effective_hf_model()
    info = {"model": model_id}
    system_prompt = "You are a helpful assistant. Follow the instructions and use provided context only when required."
    try:
        response = get_hf_router_client().chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=MAX_GENERATION_TOKENS,
            temperature=0.2,
        )
        # A missing reply body is treated as an empty answer.
        reply = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        return "", str(exc), info
    return reply, None, info
767
 
768
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
769
  url = f"{OLLAMA_BASE_URL}/api/chat"
 
785
  except Exception as e:
786
  return "", str(e)
787
 
788
+ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str], Optional[Dict[str, str]]]:
789
  """
790
  Routes generation to HF if configured; otherwise falls back to Ollama.
791
  Prefer explicit env var if you want:
 
794
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
795
 
796
  if backend == "hf-router":
797
+ return hf_router_chat(prompt)
798
  if backend == "ollama":
799
+ text, err = ollama_chat(prompt)
800
+ return text, err, None
801
  if is_running_on_spaces():
802
+ return hf_router_chat(prompt)
803
  if (HF_TOKEN or "").strip():
804
+ return hf_router_chat(prompt)
805
+ text, err = ollama_chat(prompt)
806
+ return text, err, None
807
 
808
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
809
  global _GITHUB_TOKEN_LOGGED
 
873
 
874
  if "is_thinking" not in st.session_state:
875
  st.session_state["is_thinking"] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  @st.cache_data(show_spinner=False)
877
  def load_dataset(path: str) -> List[Chunk]:
878
  return read_chunks_jsonl(path)
 
905
  book_stats = compute_stats(book_chunks, book_manifest, book_doc_index)
906
  article_stats = compute_stats(article_chunks, article_manifest, article_doc_index)
907
  embedder = load_embedder(EMBED_MODEL)
908
+
909
@st.cache_resource(show_spinner=False)
def get_indexes(book_fp: str, article_fp: str) -> Tuple[faiss.Index, faiss.Index]:
    """Return the ``(book_index, article_index)`` pair for the given fingerprints.

    Each index is obtained through ``load_or_build_index`` using the
    module-level chunk lists, embedder and paths. The two fingerprints are
    the only arguments, so they are what ``st.cache_resource`` keys the
    cached result on.
    """
    dim_getter = getattr(embedder, "get_sentence_embedding_dimension", lambda: None)
    shared_params = {
        "normalize_embeddings": True,
        "dim": dim_getter(),
        "engine": "faiss",
    }
    books_result = load_or_build_index(
        "books",
        book_chunks,
        embedder,
        BOOK_CHUNKS_PATH,
        BOOK_MANIFEST_PATH,
        BOOK_INDEX_PATH,
        BOOK_META_PATH,
        params=shared_params,
        fingerprint=book_fp,
    )
    articles_result = load_or_build_index(
        "articles",
        article_chunks,
        embedder,
        ARTICLE_CHUNKS_PATH,
        ARTICLE_MANIFEST_PATH,
        ARTICLE_INDEX_PATH,
        ARTICLE_META_PATH,
        params=shared_params,
        fingerprint=article_fp,
    )
    # Only the index objects are needed here; the metadata dicts are dropped.
    return books_result[0], articles_result[0]
939
+
940
+ index_params = {
941
+ "normalize_embeddings": True,
942
+ "dim": getattr(embedder, "get_sentence_embedding_dimension", lambda: None)(),
943
+ "engine": "faiss",
944
+ }
945
+ book_fp = compute_fingerprint("books", EMBED_MODEL, BOOK_CHUNKS_PATH, BOOK_MANIFEST_PATH, index_params)
946
+ article_fp = compute_fingerprint("articles", EMBED_MODEL, ARTICLE_CHUNKS_PATH, ARTICLE_MANIFEST_PATH, index_params)
947
+ book_index, article_index = get_indexes(book_fp, article_fp)
948
+
949
+ with st.sidebar:
950
+ st.markdown(f"**Company:** {COMPANY_NAME}")
951
+ st.markdown(f"**Contact:** {COMPANY_EMAIL} · {COMPANY_PHONE}")
952
+ st.caption(COMPANY_ABOUT)
953
+ st.write("")
954
+ st.subheader("Support")
955
+ st.caption("If an answer is not found in the dataset, you can create a support ticket (GitHub issue).")
956
+ st.session_state.setdefault("open_ticket_ui", False)
957
+ if st.button("Open ticket form", use_container_width=True, disabled=st.session_state["is_thinking"]):
958
+ st.session_state["open_ticket_ui"] = True
959
+ st.write("")
960
+ st.subheader("LLM")
961
+ st.markdown(f"- Active model: `{st.session_state.get('active_model', get_effective_hf_model())}`")
962
+ st.write("")
963
+ st.subheader("Embedding model (retrieval)")
964
+ st.code(EMBED_MODEL)
965
+ st.write("")
966
+ st.subheader("Retrieval settings")
967
+ st.caption(f"book_k={BOOK_K}, article_k={ARTICLE_K}, per_doc_cap={PER_DOC_CAP}, overlap_filter={OVERLAP_FILTER}")
968
+ st.markdown("### Dataset Stats")
969
+ st.write("")
970
+ st.markdown("**Books + MCP**")
971
+ st.write(f"Chunk length: min {book_stats['length_min']}, median {book_stats['length_median']}, max {book_stats['length_max']}")
972
+ st.write("")
973
+ st.markdown("**Articles**")
974
+ st.write(f"Chunk length: min {article_stats['length_min']}, median {article_stats['length_median']}, max {article_stats['length_max']}")
975
+ st.write("")
976
+ st.markdown("**By type (inferred)**")
977
+ for k in ["book", "mcp", "article"]:
978
+ total = 0
979
+ if k in book_stats["type_counts"]:
980
+ total += book_stats["type_counts"][k]
981
+ if k in article_stats["type_counts"]:
982
+ total += article_stats["type_counts"][k]
983
+ if total:
984
+ st.write(f"{k}: {total}")
985
+ st.write("")
986
+ ts = st.session_state.get("token_stats")
987
+ if ts:
988
+ st.markdown("**Token Consumption (est.)**")
989
+ st.markdown(f"- Context tokens: `{ts['context_tokens']}` / `{ts['context_cap']}`")
990
+ st.markdown(f"- Chunks used: `{ts['chunks_used']}` / `{ts['chunks_cap']}`")
991
+ st.markdown(f"- Prompt tokens: `{ts['prompt_tokens']}`")
992
+ st.markdown(f"- Generation tokens (max): `{ts['generation_tokens']}`")
993
+ st.markdown(f"- **Total per request (est.):** `{ts['total_tokens']}`")
994
+ if ts["context_tokens"] >= int(0.9 * ts["context_cap"]):
995
+ st.warning("Context near token limit; answers may truncate.")
996
+ else:
997
+ st.markdown("_Ask a question to see token usage._")
998
+ st.write("")
999
+ st.session_state.setdefault("show_sources", False)
1000
+ st.markdown('<div class="stacked-control sources-btn">', unsafe_allow_html=True)
1001
+ if st.button("Sources (click to expand the list)", use_container_width=True, disabled=st.session_state["is_thinking"]):
1002
+ st.session_state["show_sources"] = not st.session_state["show_sources"]
1003
+ st.markdown("</div>", unsafe_allow_html=True)
1004
+ if st.session_state["show_sources"]:
1005
+ if book_stats["mcp_docs_count"]:
1006
+ mcp_line = f"MCP: {book_stats['mcp_docs_count']} docs"
1007
+ if book_stats["mcp_blocks_total"]:
1008
+ mcp_line += f", {book_stats['mcp_blocks_total']} blocks"
1009
+ st.write(mcp_line)
1010
+ for line in book_stats["sources_lines"]:
1011
+ st.write(line)
1012
+ if article_stats["sources_lines"]:
1013
+ st.write("")
1014
+ st.markdown("**Article sources**")
1015
+ for line in article_stats["sources_lines"]:
1016
+ st.write(line)
1017
 
1018
  if "chat" not in st.session_state:
1019
  st.session_state["chat"] = []
 
1055
  break
1056
  return cleaned
1057
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1058
 
1059
  def run_enhance(question: str, enhanced_key: str):
1060
  if not question or not enhanced_key:
 
1090
  "chunks_cap": INJECT_MAX_CHUNKS,
1091
  "context_cap": MAX_CONTEXT_TOKENS,
1092
  }
1093
+ text, err, meta = llm_chat(gen_prompt)
1094
+ if meta and meta.get("model"):
1095
+ st.session_state["active_model"] = meta["model"]
1096
  if err:
1097
+ if is_model_not_supported(err):
1098
+ render_model_recommendations()
1099
+ with st.expander("Model error details"):
1100
+ st.code(err)
1101
+ else:
1102
+ st.error(err)
1103
  st.warning(f"LLM request failed: {err}")
1104
  return
1105
  if not text: