robertolofaro committed on
Commit
7af5e1b
·
verified ·
1 Parent(s): 285322b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -314
app.py CHANGED
@@ -1,134 +1,62 @@
1
- """
2
- app.py – Article Q&A chatbot
3
- Runs on:
4
- • Hugging Face Spaces (CPU-only, default)
5
- • Local PC (CPU or CUDA GPU)
6
-
7
- Environment variables
8
- ---------------------
9
- HF_TOKEN HuggingFace token for private model repo (required on HF Space)
10
- LOCAL_MODE Set to "1" to force local-PC behaviour (optional; auto-detected via SPACE_ID)
11
- LOCAL_MODEL_PATH Absolute path to the .gguf file on disk (optional; skips HF hub download)
12
- GITHUB_TOKEN GitHub PAT for higher rate-limits (optional; works without it)
13
- N_THREADS Override CPU thread count (optional)
14
- """
15
-
16
  import gradio as gr
17
  from llama_cpp import Llama
18
  from huggingface_hub import hf_hub_download
19
  import os
20
  import pickle
21
- import requests
22
- from datetime import datetime, timedelta
23
  from langchain_huggingface import HuggingFaceEmbeddings
24
 
25
- # ====================== ENVIRONMENT DETECTION ======================
26
- # HuggingFace Spaces always set SPACE_ID; absent → we're running locally.
27
- IS_HF_SPACE = bool(os.environ.get("SPACE_ID"))
28
- IS_LOCAL = (not IS_HF_SPACE) or (os.environ.get("LOCAL_MODE", "0") == "1")
29
-
30
- def _detect_cuda() -> bool:
31
- """Return True only when a CUDA device is actually usable by llama-cpp."""
32
- if not IS_LOCAL:
33
- return False # HF free tier is CPU-only
34
- try:
35
- import torch
36
- return torch.cuda.is_available()
37
- except ImportError:
38
- pass
39
- # Fallback: check for libcuda without torch
40
- try:
41
- import ctypes
42
- ctypes.cdll.LoadLibrary("libcuda.so.1")
43
- return True
44
- except Exception:
45
- return False
46
-
47
- CUDA_AVAILABLE = _detect_cuda()
48
- # -1 → offload every layer to GPU; 0 → pure CPU
49
- N_GPU_LAYERS = -1 if CUDA_AVAILABLE else 0
50
- # Use all available cores locally; HF free tier: keep at 2 to avoid OOM
51
- N_THREADS = int(os.environ.get("N_THREADS", os.cpu_count() if IS_LOCAL else 2))
52
-
53
  # ====================== CONFIG ======================
54
- REPO_ID = "robertolofaro/articles-model"
55
- MODEL_FILENAME = "articles-Q4_K_M.gguf"
56
 
57
  BACKENDS = {
58
  "FAISS - RAG (HNSW)": "FAISS",
59
- "Qdrant - RAG": "Qdrant",
60
  }
61
 
62
- FAISS_PATH = "faiss_index_hnsw"
63
- QDRANT_PATH = "qdrant_db"
64
- QDRANT_COLLECTION = "articles"
65
 
66
- # MorningNews GitHub location
67
- GH_OWNER = "robertolofaro"
68
- GH_REPO = "supportmaterial"
69
- GH_NEWS_PATH = "MorningNewsAgentTest"
70
- GH_API_ROOT = "https://api.github.com"
71
- GH_RAW_ROOT = "https://raw.githubusercontent.com"
72
- NEWS_ACCEPTED_EXT = (".txt", ".md", ".json")
73
- NEWS_MAX_CHARS_FILE = 2000 # chars kept per file
74
- NEWS_MAX_CHARS_TOTAL = 3500 # total chars injected into prompt
75
- NEWS_CACHE_TTL = timedelta(hours=1)
76
-
77
- # Web search
78
- WEB_MAX_RESULTS = 5
79
- WEB_MAX_CHARS = 2500 # total chars from web injected into prompt
80
-
81
- # ====================== LOAD METADATA ======================
82
- def load_articles_list() -> list[str]:
83
  try:
84
  with open("metadata.pkl", "rb") as f:
85
  df = pickle.load(f)
86
- cats = sorted(df["article_category"].unique().tolist())
87
- return ["All categories"] + cats
88
- except Exception:
89
  return ["All categories"]
90
 
91
  ARTICLE_LIST = load_articles_list()
92
 
93
  # ====================== LOAD LLM ======================
94
- def _load_llm() -> Llama:
95
- local_path = os.environ.get("LOCAL_MODEL_PATH", "")
96
- if IS_LOCAL and local_path and os.path.isfile(local_path):
97
- model_path = local_path
98
- print(f"[LLM] Loading from local path: {model_path}")
99
- else:
100
- model_path = hf_hub_download(
101
- repo_id=REPO_ID,
102
- filename=MODEL_FILENAME,
103
- repo_type="model",
104
- token=os.environ.get("HF_TOKEN"),
105
- )
106
- print(f"[LLM] Downloaded from HF hub → {model_path}")
107
-
108
- print(f"[LLM] n_gpu_layers={N_GPU_LAYERS} n_threads={N_THREADS} cuda={CUDA_AVAILABLE}")
109
- return Llama(
110
- model_path=model_path,
111
- n_ctx=4096,
112
- n_threads=N_THREADS,
113
- n_batch=512,
114
- n_ubatch=512,
115
- n_gpu_layers=N_GPU_LAYERS,
116
- verbose=False,
117
- )
118
-
119
- llm = _load_llm()
120
-
121
- # ====================== RAG VECTORSTORE CACHE ======================
122
- _vectorstores: dict = {}
123
 
124
  def get_vectorstore(backend_name: str):
125
- if backend_name in _vectorstores:
126
- return _vectorstores[backend_name]
 
 
127
  try:
128
- embeddings = HuggingFaceEmbeddings(
129
- model_name="BAAI/bge-small-en-v1.5",
130
- encode_kwargs={"normalize_embeddings": True},
131
- )
132
  if backend_name == "FAISS":
133
  from langchain_community.vectorstores import FAISS
134
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
@@ -138,167 +66,45 @@ def get_vectorstore(backend_name: str):
138
  else:
139
  from langchain_community.vectorstores import FAISS
140
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
141
- _vectorstores[backend_name] = vs
 
142
  return vs
143
- except Exception as e:
144
- print(f"[RAG] Could not load vectorstore '{backend_name}': {e}")
145
  return None
146
 
147
- # ====================== MORNING NEWS FETCHER ======================
148
- _news_cache: dict = {"content": None, "fetched_at": None}
149
-
150
- def fetch_morning_news() -> str:
151
- """
152
- Fetch text/md/json files from the MorningNewsAgentTest directory on GitHub.
153
- Results are cached for NEWS_CACHE_TTL to avoid hammering the API.
154
- Works with or without a GITHUB_TOKEN (unauthenticated rate-limit: 60 req/hr).
155
- """
156
- global _news_cache
157
- now = datetime.utcnow()
158
-
159
- # Serve from cache if still fresh
160
- if _news_cache["content"] is not None and _news_cache["fetched_at"]:
161
- if now - _news_cache["fetched_at"] < NEWS_CACHE_TTL:
162
- print("[MorningNews] Serving from cache")
163
- return _news_cache["content"]
164
-
165
- headers = {"Accept": "application/vnd.github.v3+json"}
166
- gh_token = os.environ.get("GITHUB_TOKEN", "")
167
- if gh_token:
168
- headers["Authorization"] = f"token {gh_token}"
169
-
170
- try:
171
- # List files in the directory
172
- dir_url = f"{GH_API_ROOT}/repos/{GH_OWNER}/{GH_REPO}/contents/{GH_NEWS_PATH}"
173
- resp = requests.get(dir_url, headers=headers, timeout=10)
174
- resp.raise_for_status()
175
- entries = resp.json()
176
-
177
- # Sort by name descending so the most recent file (date-prefixed) comes first
178
- entries = sorted(
179
- [e for e in entries if e["type"] == "file"
180
- and e["name"].lower().endswith(NEWS_ACCEPTED_EXT)],
181
- key=lambda e: e["name"],
182
- reverse=True,
183
- )
184
-
185
- collected, total_chars = [], 0
186
- for entry in entries:
187
- if total_chars >= NEWS_MAX_CHARS_TOTAL:
188
- break
189
- raw_url = entry["download_url"]
190
- try:
191
- file_resp = requests.get(raw_url, headers=headers, timeout=10)
192
- file_resp.raise_for_status()
193
- snippet = file_resp.text[:NEWS_MAX_CHARS_FILE]
194
- collected.append(f"--- [{entry['name']}] ---\n{snippet}")
195
- total_chars += len(snippet)
196
- except Exception as fe:
197
- print(f"[MorningNews] Could not fetch {entry['name']}: {fe}")
198
-
199
- combined = "\n\n".join(collected)[:NEWS_MAX_CHARS_TOTAL]
200
- _news_cache = {"content": combined, "fetched_at": now}
201
- print(f"[MorningNews] Fetched {len(collected)} file(s), {len(combined)} chars")
202
- return combined
203
-
204
- except Exception as e:
205
- print(f"[MorningNews] Directory listing failed: {e}")
206
- # Return stale cache rather than nothing if available
207
- return _news_cache.get("content") or ""
208
-
209
- # ====================== SYSTEM PROMPTS ======================
210
- # Base prompt – articles only
211
- SYSTEM_PROMPT_BASE = """You are the reference expert for the articles contained in the training of this model, \
212
- all extracted from the website robertolofaro.com, and all focused on change.
213
- # Your Mission
214
- When a user asks a question, provide a structured response based ONLY on the articles in your training. \
215
- Do not provide general advice from outside these sources.
216
- # Response Format
217
- 1. Executive Summary: A 2-3 sentence overview answering the core query.
218
- 2. Guidelines & Hints: A markdown list of specific answers/guidelines/hints found in the source material.
219
- """
220
 
221
- # Extended prompt when extra sources are active
222
- SYSTEM_PROMPT_EXTENDED = """You are the reference expert for the articles contained in the training of this model, \
223
- all extracted from the website robertolofaro.com, and all focused on change. \
224
- You have also been provided with supplementary external context (morning news results).
225
- # Your Mission
226
- Provide a structured response that integrates all available information. \
227
- Clearly tag each insight with its source label so the reader can judge its provenance:
228
- [Articles] – insight from the trained article corpus
229
- [MorningNews] – insight from the morning news briefing
230
- # Response Format
231
  1. Executive Summary: A 2-3 sentence overview answering the core query.
232
- 2. Guidelines & Hints: A markdown list of tagged insights from the source material.
233
- 3. Additional Context (when MorningNews are present): \
234
- brief synthesis of external findings relevant to the query.
235
  """
236
 
237
- # ====================== CONTEXT BUDGET HELPER ======================
238
- # Rough token estimate: 1 token ≈ 4 chars for English text.
239
- # n_ctx=4096 → reserve ~800 for answer, ~400 for system+history → ~2900 chars for context.
240
- CONTEXT_BUDGET_CHARS = 2900
241
-
242
- def _trim_to_budget(parts: list[tuple[str, str]]) -> str:
243
- """
244
- parts = [(label, text), ...]
245
- Allocates the context budget proportionally across available sources,
246
- then returns a single assembled context string.
247
- """
248
- # First pass: measure totals
249
- totals = [(label, text) for label, text in parts if text.strip()]
250
- if not totals:
251
- return ""
252
- per_source = CONTEXT_BUDGET_CHARS // len(totals)
253
- sections = []
254
- for label, text in totals:
255
- trimmed = text[:per_source]
256
- sections.append(f"=== {label} ===\n{trimmed}")
257
- return "\n\n".join(sections)
258
 
259
  # ====================== GENERATION FUNCTION ======================
260
- def generate_response(
261
- message, history,
262
- rag_mode, article_filter,
263
- use_morning_news,
264
- max_tokens, temperature, top_p, repeat_penalty,
265
- ):
266
- has_extra = use_morning_news
267
- system_prompt = SYSTEM_PROMPT_EXTENDED if has_extra else SYSTEM_PROMPT_BASE
268
 
269
- full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
270
-
271
- # Keep the last 4 turns to limit context pressure
272
  for msg in history[-4:]:
273
  full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
274
 
275
- # ---- Gather context from all active sources ----
276
- context_parts: list[tuple[str, str]] = []
277
-
278
- # 1. RAG (vectorstore)
279
  backend = BACKENDS.get(rag_mode)
 
 
280
  if backend:
281
  vs = get_vectorstore(backend)
282
  if vs:
283
  try:
284
- filt = {"article_category": article_filter} if article_filter != "All categories" else None
285
- docs = vs.similarity_search(message, k=5, filter=filt)
286
- rag_text = "\n\n".join(
287
- f"[Cat: {d.metadata.get('article_category','N/A')}] {d.page_content[:700]}"
288
- for d in docs
289
- )
290
- context_parts.append(("ARTICLES CONTEXT", rag_text))
291
- except Exception as e:
292
- print(f"[RAG] similarity_search failed: {e}")
293
-
294
- # 2. Morning News
295
- if use_morning_news:
296
- news = fetch_morning_news()
297
- if news:
298
- context_parts.append(("MORNING NEWS BRIEFING", news))
299
-
300
- # ---- Assemble context within token budget ----
301
- context = _trim_to_budget(context_parts)
302
 
303
  if context:
304
  full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
@@ -307,99 +113,57 @@ def generate_response(
307
 
308
  full_prompt += "<|im_start|>assistant\n"
309
 
310
- # ---- Inference parameters ----
311
- max_tok = int(max_tokens) if max_tokens is not None else 900
312
- temp = float(temperature) if temperature is not None else 0.65
313
- tp = float(top_p) if top_p is not None else 0.9
314
- rep_pen = float(repeat_penalty) if repeat_penalty is not None else 1.1
315
 
316
- partial = ""
317
  for chunk in llm(
318
  full_prompt,
319
- max_tokens=max_tok,
320
- temperature=temp,
321
- top_p=tp,
322
- repeat_penalty=rep_pen,
323
  stop=["<|im_end|>", "<|im_start|>"],
324
  stream=True,
325
  ):
326
- partial += chunk["choices"][0]["text"]
327
- yield partial
328
-
329
- # ====================== RUNTIME STATUS BADGE ======================
330
- def _build_status() -> str:
331
- parts = []
332
- if IS_HF_SPACE and not IS_LOCAL:
333
- parts.append("☁️ HuggingFace Space · CPU-only")
334
- else:
335
- parts.append("🖥️ Local mode")
336
- parts.append("⚡ GPU (CUDA)" if CUDA_AVAILABLE else "🐢 CPU-only")
337
- parts.append(f"threads={N_THREADS}")
338
- return " | ".join(parts)
339
-
340
- STATUS_LINE = _build_status()
341
 
342
  # ====================== GRADIO INTERFACE ======================
343
  with gr.Blocks(title="Article Q&A model") as demo:
344
  gr.Markdown("# sourcing 350+ articles on change")
345
- gr.Markdown(
346
- "Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com — "
347
- "experimental on CPU-only, to test embedding methods (takes a few minutes, "
348
- "no selection for the category yet) — updated as of 2026-05-05"
349
- )
350
- gr.Markdown(f"**Runtime:** {STATUS_LINE}")
351
- gr.Markdown(
352
- "**NOTAM:** by querying this model you access the articles and metadata "
353
- "available on robertolofaro.com and GitHub. "
354
- "Answers reflect the article corpus only — do not treat them as advice specific to your context."
355
- )
356
- gr.Markdown(
357
- "If, after getting an answer, you want something more contextualised, "
358
- "contact a consultant (myself included)."
359
- )
360
 
361
  with gr.Row():
362
  rag_mode = gr.Radio(
363
  choices=list(BACKENDS.keys()),
364
  value="FAISS - RAG (HNSW)",
365
- label="Retrieval mode",
366
  )
367
  article_filter = gr.Dropdown(
368
  choices=ARTICLE_LIST,
369
  value="All categories",
370
- label="Focus on category",
371
- )
372
-
373
- with gr.Row():
374
- use_morning_news = gr.Checkbox(
375
- value=False,
376
- label="📰 Read MorningNews",
377
- info="Supplement with the latest Morning News briefing fetched from GitHub "
378
- f"(robertolofaro/supportmaterial · {GH_NEWS_PATH}). "
379
- "Results are cached for 1 hour.",
380
  )
381
 
382
  with gr.Accordion("Advanced Generation Parameters", open=False):
383
- max_tokens = gr.Slider(256, 2048, value=900, step=64, label="Max Tokens")
384
- temperature = gr.Slider(0.0, 1.0, value=0.65, step=0.05, label="Temperature")
385
- top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
386
- repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
387
 
388
  gr.ChatInterface(
389
  fn=generate_response,
390
- additional_inputs=[
391
- rag_mode, article_filter,
392
- use_morning_news,
393
- max_tokens, temperature, top_p, repeat_penalty,
394
- ],
395
- cache_examples=False, # prevents Gradio from running examples at startup
396
  examples=[
397
  ["What is the potential for Italy? /nothink"],
398
- ["What is the potential for Turin? /nothink"],
399
  ],
400
  )
401
 
402
  if __name__ == "__main__":
403
- # Local launch: share=False keeps it on localhost only.
404
- # Set share=True if you want a temporary public Gradio tunnel.
405
- demo.queue(default_concurrency_limit=1).launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
  import os
5
  import pickle
 
 
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # ====================== CONFIG ======================
9
+ repo_id = "robertolofaro/articles-model"
 
10
 
11
  BACKENDS = {
12
  "FAISS - RAG (HNSW)": "FAISS",
13
+ "Qdrant - RAG": "Qdrant"
14
  }
15
 
16
+ FAISS_PATH = "faiss_index_hnsw"
17
+ QDRANT_PATH = "qdrant_db"
18
+ QDRANT_COLLECTION = "articles"
19
 
20
+ # ====================== LOAD METADATA FOR ARTICLE LIST ======================
21
def load_articles_list():
    """Return the choices for the article-category selector.

    Reads ``metadata.pkl`` from the current working directory, collects the
    distinct values of its ``article_category`` column, sorts them and puts
    the ``"All categories"`` entry in front.

    Returns:
        ``["All categories"]`` followed by the sorted categories, or just
        ``["All categories"]`` when the file is missing, cannot be unpickled
        or does not have the expected column.
    """
    default_choices = ["All categories"]
    try:
        with open("metadata.pkl", "rb") as f:
            # NOTE(security): unpickling runs arbitrary code. This assumes
            # metadata.pkl is a trusted file deployed with the app; never
            # point it at user-supplied data.
            df = pickle.load(f)
        articles = sorted(df['article_category'].unique().tolist())
    except Exception as exc:
        # Best-effort on purpose: the app must still start without category
        # metadata. Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate, and the reason is reported instead of hidden.
        print(f"[Metadata] Could not load article categories: {exc}")
        return default_choices
    return default_choices + articles
29
 
30
  ARTICLE_LIST = load_articles_list()
31
 
32
  # ====================== LOAD LLM ======================
33
+ model_path = hf_hub_download(
34
+ repo_id=repo_id,
35
+ filename="articles-Q4_K_M.gguf",
36
+ repo_type="model",
37
+ token=os.environ.get("HF_TOKEN")
38
+ )
39
+
40
+ llm = Llama(
41
+ model_path=model_path,
42
+ n_ctx=4096,
43
+ n_threads=2,
44
+ n_batch=512,
45
+ n_ubatch=512,
46
+ verbose=False,
47
+ )
48
+
49
+ # ====================== RAG CACHE ======================
50
+ vectorstores = {}
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def get_vectorstore(backend_name: str):
53
+ if backend_name in vectorstores:
54
+ return vectorstores[backend_name]
55
+ # ... (same loading logic as before - Chroma, FAISS, Qdrant) ...
56
+ # I'll keep it short here for brevity, but same as previous version
57
  try:
58
+ embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5", encode_kwargs={'normalize_embeddings': True})
59
+
 
 
60
  if backend_name == "FAISS":
61
  from langchain_community.vectorstores import FAISS
62
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
 
66
  else:
67
  from langchain_community.vectorstores import FAISS
68
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
69
+
70
+ vectorstores[backend_name] = vs
71
  return vs
72
+ except:
 
73
  return None
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ # ====================== SYSTEM PROMPT ======================
77
+ SYSTEM_PROMPT = """You are the reference expert for the articles contained in the training of this model, all extracted from the website robertolofaro.com, and all focused on change.
78
+ #Your Mission:
79
+ When a user asks a question, your goal is to provide a structured response based ONLY on the articles provided in your training. Do not provide general advice from outside these sources.
80
+ # Response Format:
 
 
 
 
 
81
  1. Executive Summary: A 2-3 sentence overview answering the core query.
82
+ 2. Guidelines & Hints: A markdown list of specific "answers/guidelines/hints" found in the source material.
 
 
83
  """
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ====================== GENERATION FUNCTION ======================
87
+ def generate_response(message, history, rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty):
88
+ full_prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
 
 
 
 
 
 
89
 
 
 
 
90
  for msg in history[-4:]:
91
  full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
92
 
 
 
 
 
93
  backend = BACKENDS.get(rag_mode)
94
+ context = ""
95
+
96
  if backend:
97
  vs = get_vectorstore(backend)
98
  if vs:
99
  try:
100
+ filter_dict = {"article_category": article_filter} if article_filter != "All categories" else None
101
+ docs = vs.similarity_search(message, k=5, filter=filter_dict)
102
+ context = "\n\n".join([
103
+ f"[Category: {doc.metadata.get('article_category', 'N/A')}] {doc.page_content[:700]}"
104
+ for doc in docs
105
+ ])
106
+ except:
107
+ pass
 
 
 
 
 
 
 
 
 
 
108
 
109
  if context:
110
  full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
 
113
 
114
  full_prompt += "<|im_start|>assistant\n"
115
 
116
+ max_tokens_val = int(max_tokens) if max_tokens is not None else 900
117
+ temp_val = float(temperature) if temperature is not None else 0.65
118
+ top_p_val = float(top_p) if top_p is not None else 0.9
119
+ rep_penalty_val = float(repeat_penalty) if repeat_penalty is not None else 1.1
 
120
 
121
+ partial_text = ""
122
  for chunk in llm(
123
  full_prompt,
124
+ max_tokens=max_tokens_val,
125
+ temperature=temp_val,
126
+ top_p=top_p_val,
127
+ repeat_penalty=rep_penalty_val,
128
  stop=["<|im_end|>", "<|im_start|>"],
129
  stream=True,
130
  ):
131
+ token = chunk['choices'][0]['text']
132
+ partial_text += token
133
+ yield partial_text
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  # ====================== GRADIO INTERFACE ======================
136
  with gr.Blocks(title="Article Q&A model") as demo:
137
  gr.Markdown("# sourcing 350+ articles on change")
138
+ gr.Markdown("Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com - experimental on CPU-only, to test embedding methods (takes few minutes, no selection for the category yet) - updated as of 2026-05-05")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  with gr.Row():
141
  rag_mode = gr.Radio(
142
  choices=list(BACKENDS.keys()),
143
  value="FAISS - RAG (HNSW)",
144
+ label="Mode"
145
  )
146
  article_filter = gr.Dropdown(
147
  choices=ARTICLE_LIST,
148
  value="All categories",
149
+ label="Focus on category"
 
 
 
 
 
 
 
 
 
150
  )
151
 
152
  with gr.Accordion("Advanced Generation Parameters", open=False):
153
+ max_tokens = gr.Slider(256, 2048, value=900, step=64, label="Max Tokens")
154
+ temperature = gr.Slider(0.0, 1.0, value=0.65, step=0.05, label="Temperature")
155
+ top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
156
+ repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
157
 
158
  gr.ChatInterface(
159
  fn=generate_response,
160
+ additional_inputs=[rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty],
161
+ cache_examples=False, # <--- Stops Gradio from executing them at startup
 
 
 
 
162
  examples=[
163
  ["What is the potential for Italy? /nothink"],
164
+ ["What is the potential for Turin? /nothink"]
165
  ],
166
  )
167
 
168
  if __name__ == "__main__":
169
+ demo.queue(default_concurrency_limit=1).launch()