robertolofaro committed on
Commit
e26d588
·
verified ·
1 Parent(s): 777360c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +355 -81
app.py CHANGED
@@ -1,62 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
  import os
5
  import pickle
 
 
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # ====================== CONFIG ======================
9
- repo_id = "robertolofaro/articles-model"
 
10
 
11
  BACKENDS = {
12
  "FAISS - RAG (HNSW)": "FAISS",
13
- "Qdrant - RAG": "Qdrant"
14
  }
15
 
16
- FAISS_PATH = "faiss_index_hnsw"
17
- QDRANT_PATH = "qdrant_db"
18
- QDRANT_COLLECTION = "articles"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # ====================== LOAD METADATA FOR ARTICLE LIST ======================
21
- def load_articles_list():
22
  try:
23
  with open("metadata.pkl", "rb") as f:
24
  df = pickle.load(f)
25
- articles = sorted(df['article_category'].unique().tolist())
26
- return ["All categories"] + articles
27
- except:
28
  return ["All categories"]
29
 
30
  ARTICLE_LIST = load_articles_list()
31
 
32
  # ====================== LOAD LLM ======================
33
- model_path = hf_hub_download(
34
- repo_id=repo_id,
35
- filename="articles-Q4_K_M.gguf",
36
- repo_type="model",
37
- token=os.environ.get("HF_TOKEN")
38
- )
39
-
40
- llm = Llama(
41
- model_path=model_path,
42
- n_ctx=4096,
43
- n_threads=2,
44
- n_batch=512,
45
- n_ubatch=512,
46
- verbose=False,
47
- )
48
-
49
- # ====================== RAG CACHE ======================
50
- vectorstores = {}
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def get_vectorstore(backend_name: str):
53
- if backend_name in vectorstores:
54
- return vectorstores[backend_name]
55
- # ... (same loading logic as before - Chroma, FAISS, Qdrant) ...
56
- # I'll keep it short here for brevity, but same as previous version
57
  try:
58
- embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5", encode_kwargs={'normalize_embeddings': True})
59
-
 
 
60
  if backend_name == "FAISS":
61
  from langchain_community.vectorstores import FAISS
62
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
@@ -66,45 +138,202 @@ def get_vectorstore(backend_name: str):
66
  else:
67
  from langchain_community.vectorstores import FAISS
68
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
69
-
70
- vectorstores[backend_name] = vs
71
  return vs
72
- except:
 
73
  return None
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- # ====================== SYSTEM PROMPT ======================
77
- SYSTEM_PROMPT = """You are the reference expert for the articles contained in the training of this model, all extracted from the website robertolofaro.com, and all focused on change.
78
- #Your Mission:
79
- When a user asks a question, your goal is to provide a structured response based ONLY on the articles provided in your training. Do not provide general advice from outside these sources.
80
- # Response Format:
 
 
 
 
 
 
81
  1. Executive Summary: A 2-3 sentence overview answering the core query.
82
- 2. Guidelines & Hints: A markdown list of specific "answers/guidelines/hints" found in the source material.
 
 
83
  """
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ====================== GENERATION FUNCTION ======================
87
- def generate_response(message, history, rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty):
88
- full_prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
 
 
 
 
 
 
 
 
89
 
 
90
  for msg in history[-4:]:
91
  full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
92
 
93
- backend = BACKENDS.get(rag_mode)
94
- context = ""
95
 
 
 
96
  if backend:
97
  vs = get_vectorstore(backend)
98
  if vs:
99
  try:
100
- filter_dict = {"article_category": article_filter} if article_filter != "All categories" else None
101
- docs = vs.similarity_search(message, k=5, filter=filter_dict)
102
- context = "\n\n".join([
103
- f"[Category: {doc.metadata.get('article_category', 'N/A')}] {doc.page_content[:700]}"
104
- for doc in docs
105
- ])
106
- except:
107
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  if context:
110
  full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
@@ -113,60 +342,105 @@ def generate_response(message, history, rag_mode, article_filter, max_tokens, te
113
 
114
  full_prompt += "<|im_start|>assistant\n"
115
 
116
- max_tokens_val = int(max_tokens) if max_tokens is not None else 900
117
- temp_val = float(temperature) if temperature is not None else 0.65
118
- top_p_val = float(top_p) if top_p is not None else 0.9
119
- rep_penalty_val = float(repeat_penalty) if repeat_penalty is not None else 1.1
 
120
 
121
- partial_text = ""
122
  for chunk in llm(
123
  full_prompt,
124
- max_tokens=max_tokens_val,
125
- temperature=temp_val,
126
- top_p=top_p_val,
127
- repeat_penalty=rep_penalty_val,
128
  stop=["<|im_end|>", "<|im_start|>"],
129
  stream=True,
130
  ):
131
- token = chunk['choices'][0]['text']
132
- partial_text += token
133
- yield partial_text
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  # ====================== GRADIO INTERFACE ======================
136
  with gr.Blocks(title="Article Q&A model") as demo:
137
  gr.Markdown("# sourcing 350+ articles on change")
138
- gr.Markdown("Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com - experimental on CPU-only, to test embedding methods (takes few minutes, no selection for the category yet) - updated as of 2026-05-05")
139
- gr.Markdown("NOTAM: a fair warning- by querying this model, you will access the articles and metadata that you can find also on robertolofaro.com and GitHub.")
140
- gr.Markdown("Each article contains questions and answers, but only focused on the article- do not take any answer as advice, as your own context is not 'known' to the articles.")
141
- gr.Markdown("If, after getting the answer, you would like something more contextualized, contact some consultants (myself included).")
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  with gr.Row():
144
  rag_mode = gr.Radio(
145
  choices=list(BACKENDS.keys()),
146
  value="FAISS - RAG (HNSW)",
147
- label="Mode"
148
  )
149
  article_filter = gr.Dropdown(
150
  choices=ARTICLE_LIST,
151
  value="All categories",
152
- label="Focus on category"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  )
154
 
155
  with gr.Accordion("Advanced Generation Parameters", open=False):
156
- max_tokens = gr.Slider(256, 2048, value=900, step=64, label="Max Tokens")
157
- temperature = gr.Slider(0.0, 1.0, value=0.65, step=0.05, label="Temperature")
158
- top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
159
- repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
160
 
161
  gr.ChatInterface(
162
  fn=generate_response,
163
- additional_inputs=[rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty],
164
- cache_examples=False, # <--- Stops Gradio from executing them at startup
 
 
 
 
165
  examples=[
166
  ["What is the potential for Italy? /nothink"],
167
- ["What is the potential for Turin? /nothink"]
168
  ],
169
  )
170
 
171
  if __name__ == "__main__":
172
- demo.queue(default_concurrency_limit=1).launch()
 
 
 
1
+ """
2
+ app.py – Article Q&A chatbot
3
+ Runs on:
4
+ • Hugging Face Spaces (CPU-only, default)
5
+ • Local PC (CPU or CUDA GPU)
6
+
7
+ Environment variables
8
+ ---------------------
9
+ HF_TOKEN HuggingFace token for private model repo (required on HF Space)
10
+ LOCAL_MODE Set to "1" to force local-PC behaviour (optional; auto-detected via SPACE_ID)
11
+ LOCAL_MODEL_PATH Absolute path to the .gguf file on disk (optional; skips HF hub download)
12
+ GITHUB_TOKEN GitHub PAT for higher rate-limits (optional; works without it)
13
+ N_THREADS Override CPU thread count (optional)
14
+ """
15
+
16
  import gradio as gr
17
  from llama_cpp import Llama
18
  from huggingface_hub import hf_hub_download
19
  import os
20
  import pickle
21
+ import requests
22
+ from datetime import datetime, timedelta
23
  from langchain_huggingface import HuggingFaceEmbeddings
24
 
25
+ # ====================== ENVIRONMENT DETECTION ======================
26
+ # Hugging Face Spaces always sets SPACE_ID; if it is absent, we're running locally.
27
+ IS_HF_SPACE = bool(os.environ.get("SPACE_ID"))
28
+ IS_LOCAL = (not IS_HF_SPACE) or (os.environ.get("LOCAL_MODE", "0") == "1")
29
+
30
def _detect_cuda() -> bool:
    """Best-effort probe for a CUDA runtime on a local machine.

    Returns False immediately when not running in local mode. Otherwise the
    answer comes from ``torch.cuda.is_available()`` when torch can be
    imported; without torch, the function reports True if the CUDA driver
    library ``libcuda.so.1`` can be loaded and False if it cannot.

    NOTE(review): neither probe checks that llama-cpp itself was built with
    CUDA support; a True result only says a CUDA runtime appears present.
    """
    if not IS_LOCAL:
        return False  # HF free tier is CPU-only

    # Preferred probe: ask torch, if it is installed.
    try:
        import torch
        return torch.cuda.is_available()
    except ImportError:
        pass

    # No torch: see whether the CUDA driver library can be loaded at all.
    driver_found = False
    try:
        import ctypes
        ctypes.cdll.LoadLibrary("libcuda.so.1")
        driver_found = True
    except Exception:
        driver_found = False
    return driver_found
46
+
47
CUDA_AVAILABLE = _detect_cuda()
# -1 → offload every layer to GPU; 0 → pure CPU
N_GPU_LAYERS = -1 if CUDA_AVAILABLE else 0


def _resolve_n_threads() -> int:
    """Return the CPU thread count to hand to llama-cpp.

    The ``N_THREADS`` environment variable wins when it holds an integer.
    Otherwise the default is every available core in local mode and 2 on a
    HF Space (kept low to avoid OOM on the free tier).
    """
    # os.cpu_count() may return None when the count cannot be determined.
    default = (os.cpu_count() or 2) if IS_LOCAL else 2
    raw = os.environ.get("N_THREADS", "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        # A typo in the environment should not prevent the app from starting.
        print(f"[Config] Ignoring invalid N_THREADS={raw!r}; using {default}")
        return default


N_THREADS = _resolve_n_threads()
52
+
53
  # ====================== CONFIG ======================
54
+ REPO_ID = "robertolofaro/articles-model"
55
+ MODEL_FILENAME = "articles-Q4_K_M.gguf"
56
 
57
  BACKENDS = {
58
  "FAISS - RAG (HNSW)": "FAISS",
59
+ "Qdrant - RAG": "Qdrant",
60
  }
61
 
62
+ FAISS_PATH = "faiss_index_hnsw"
63
+ QDRANT_PATH = "qdrant_db"
64
+ QDRANT_COLLECTION = "articles"
65
+
66
+ # MorningNews GitHub location
67
+ GH_OWNER = "robertolofaro"
68
+ GH_REPO = "supportmaterial"
69
+ GH_NEWS_PATH = "MorningNewsAgentTest"
70
+ GH_API_ROOT = "https://api.github.com"
71
+ GH_RAW_ROOT = "https://raw.githubusercontent.com"
72
+ NEWS_ACCEPTED_EXT = (".txt", ".md", ".json")
73
+ NEWS_MAX_CHARS_FILE = 2000 # chars kept per file
74
+ NEWS_MAX_CHARS_TOTAL = 3500 # total chars injected into prompt
75
+ NEWS_CACHE_TTL = timedelta(hours=1)
76
+
77
+ # Web search
78
+ WEB_MAX_RESULTS = 5
79
+ WEB_MAX_CHARS = 2500 # total chars from web injected into prompt
80
 
81
+ # ====================== LOAD METADATA ======================
82
def load_articles_list(path: str = "metadata.pkl") -> list[str]:
    """Return the choices for the category dropdown.

    Args:
        path: Pickle file holding the article metadata. The unpickled object
            must support ``obj["article_category"]`` with ``dropna``/``unique``
            (presumably a pandas DataFrame — confirm against the file that is
            shipped with the app).

    Returns:
        ``["All categories"]`` followed by the sorted distinct categories, or
        just ``["All categories"]`` when the metadata cannot be read.
    """
    try:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a metadata file that was produced by a trusted build step.
        with open(path, "rb") as f:
            df = pickle.load(f)
        # Drop missing values first: a NaN mixed with strings makes sorted()
        # raise and would otherwise hide every category.
        cats = sorted(df["article_category"].dropna().unique().tolist())
    except Exception as e:
        # Best effort: the UI still works with the catch-all entry only.
        print(f"[Metadata] Could not load categories from '{path}': {e}")
        return ["All categories"]
    return ["All categories"] + cats
90
 
91
  ARTICLE_LIST = load_articles_list()
92
 
93
  # ====================== LOAD LLM ======================
94
def _load_llm() -> Llama:
    """Locate the GGUF model file and build the llama-cpp model object.

    In local mode, a ``LOCAL_MODEL_PATH`` environment variable that points to
    an existing file is used directly. In every other case the file
    ``MODEL_FILENAME`` is downloaded from ``REPO_ID`` on the Hugging Face hub,
    authenticating with ``HF_TOKEN`` when that variable is set.

    Returns:
        A ``Llama`` instance configured with the module-level thread and
        GPU-layer settings.
    """
    local_path = os.environ.get("LOCAL_MODEL_PATH", "")
    use_local = bool(IS_LOCAL and local_path and os.path.isfile(local_path))

    if local_path and not use_local:
        # Say so explicitly: otherwise a typo in the path silently turns into
        # a large download from the hub.
        reason = "file not found" if IS_LOCAL else "not running in local mode"
        print(f"[LLM] Ignoring LOCAL_MODEL_PATH={local_path!r} ({reason}); using HF hub")

    if use_local:
        model_path = local_path
        print(f"[LLM] Loading from local path: {model_path}")
    else:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=MODEL_FILENAME,
            repo_type="model",
            token=os.environ.get("HF_TOKEN"),
        )
        print(f"[LLM] Downloaded from HF hub → {model_path}")

    print(f"[LLM] n_gpu_layers={N_GPU_LAYERS} n_threads={N_THREADS} cuda={CUDA_AVAILABLE}")
    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=N_THREADS,
        n_batch=512,
        n_ubatch=512,
        n_gpu_layers=N_GPU_LAYERS,
        verbose=False,
    )
118
+
119
+ llm = _load_llm()
120
+
121
+ # ====================== RAG VECTORSTORE CACHE ======================
122
+ _vectorstores: dict = {}
123
 
124
  def get_vectorstore(backend_name: str):
125
+ if backend_name in _vectorstores:
126
+ return _vectorstores[backend_name]
 
 
127
  try:
128
+ embeddings = HuggingFaceEmbeddings(
129
+ model_name="BAAI/bge-small-en-v1.5",
130
+ encode_kwargs={"normalize_embeddings": True},
131
+ )
132
  if backend_name == "FAISS":
133
  from langchain_community.vectorstores import FAISS
134
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
 
138
  else:
139
  from langchain_community.vectorstores import FAISS
140
  vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
141
+ _vectorstores[backend_name] = vs
 
142
  return vs
143
+ except Exception as e:
144
+ print(f"[RAG] Could not load vectorstore '{backend_name}': {e}")
145
  return None
146
 
147
+ # ====================== MORNING NEWS FETCHER ======================
148
# Last successful fetch: the assembled text and the (UTC, aware) time it was built.
_news_cache: dict = {"content": None, "fetched_at": None}


def fetch_morning_news() -> str:
    """Fetch text/md/json files from the MorningNewsAgentTest directory on GitHub.

    Results are cached for NEWS_CACHE_TTL to avoid hammering the API.
    Works with or without a GITHUB_TOKEN (unauthenticated rate-limit: 60 req/hr).

    Returns:
        The newest files (by descending file name), each cut to
        NEWS_MAX_CHARS_FILE characters and the whole cut to
        NEWS_MAX_CHARS_TOTAL. When the listing fails, or no file can be
        downloaded, the previously cached text is returned (or "" if there
        is none) and the cache is left untouched.
    """
    global _news_cache
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    # Aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    now = datetime.now(timezone.utc)

    # Serve from cache if still fresh
    cached = _news_cache["content"]
    fetched_at = _news_cache["fetched_at"]
    if cached is not None and fetched_at and now - fetched_at < NEWS_CACHE_TTL:
        print("[MorningNews] Serving from cache")
        return cached

    headers = {"Accept": "application/vnd.github.v3+json"}
    gh_token = os.environ.get("GITHUB_TOKEN", "")
    if gh_token:
        headers["Authorization"] = f"token {gh_token}"

    try:
        # List files in the directory
        dir_url = f"{GH_API_ROOT}/repos/{GH_OWNER}/{GH_REPO}/contents/{GH_NEWS_PATH}"
        resp = requests.get(dir_url, headers=headers, timeout=10)
        resp.raise_for_status()
        # Sort by name descending so the most recent file (date-prefixed) comes first
        entries = sorted(
            (
                e for e in resp.json()
                if e["type"] == "file"
                and e["name"].lower().endswith(NEWS_ACCEPTED_EXT)
            ),
            key=lambda e: e["name"],
            reverse=True,
        )
    except Exception as e:
        print(f"[MorningNews] Directory listing failed: {e}")
        # Return stale cache rather than nothing if available
        return _news_cache.get("content") or ""

    collected, total_chars = [], 0
    for entry in entries:
        if total_chars >= NEWS_MAX_CHARS_TOTAL:
            break
        try:
            file_resp = requests.get(entry["download_url"], headers=headers, timeout=10)
            file_resp.raise_for_status()
        except Exception as fe:
            # One broken file must not prevent the others from being used.
            print(f"[MorningNews] Could not fetch {entry['name']}: {fe}")
            continue
        snippet = file_resp.text[:NEWS_MAX_CHARS_FILE]
        collected.append(f"--- [{entry['name']}] ---\n{snippet}")
        total_chars += len(snippet)

    if entries and not collected:
        # Every download failed: do not overwrite good content with an empty
        # string that would then be served for a whole TTL.
        print("[MorningNews] No file could be downloaded; keeping previous cache")
        return _news_cache.get("content") or ""

    combined = "\n\n".join(collected)[:NEWS_MAX_CHARS_TOTAL]
    _news_cache = {"content": combined, "fetched_at": now}
    print(f"[MorningNews] Fetched {len(collected)} file(s), {len(combined)} chars")
    return combined
208
+
209
+ # ====================== WEB SEARCH (DuckDuckGo) ======================
210
def search_web(query: str) -> str:
    """Search DuckDuckGo via duckduckgo-search and return a compact text block.

    Gracefully degrades to an empty string if the package is missing or
    the search fails (e.g. rate-limited on HF Spaces).

    Args:
        query: The search text, passed to DuckDuckGo unchanged.

    Returns:
        Up to WEB_MAX_RESULTS hits formatted as bullet entries (title, body
        cut to 400 characters, link), joined by blank lines and cut to
        WEB_MAX_CHARS characters in total.
    """
    try:
        from duckduckgo_search import DDGS
    except ImportError:
        print("[WebSearch] duckduckgo-search not installed – skipping")
        return ""

    try:
        results = []
        with DDGS() as ddgs:
            for hit in ddgs.text(query, max_results=WEB_MAX_RESULTS):
                # `or ""` also covers fields that are present but None, so a
                # single incomplete hit cannot discard the whole result set.
                title = (hit.get("title") or "").strip()
                body = (hit.get("body") or "").strip()[:400]
                href = hit.get("href") or ""
                results.append(f"• {title}\n {body}\n ({href})")
        combined = "\n\n".join(results)[:WEB_MAX_CHARS]
        print(f"[WebSearch] {len(results)} result(s) for: {query[:60]}")
        return combined
    except Exception as e:
        print(f"[WebSearch] Search failed: {e}")
        return ""
236
+
237
+ # ====================== SYSTEM PROMPTS ======================
238
+ # Base prompt – articles only
239
+ SYSTEM_PROMPT_BASE = """You are the reference expert for the articles contained in the training of this model, \
240
+ all extracted from the website robertolofaro.com, and all focused on change.
241
+ # Your Mission
242
+ When a user asks a question, provide a structured response based ONLY on the articles in your training. \
243
+ Do not provide general advice from outside these sources.
244
+ # Response Format
245
+ 1. Executive Summary: A 2-3 sentence overview answering the core query.
246
+ 2. Guidelines & Hints: A markdown list of specific answers/guidelines/hints found in the source material.
247
+ """
248
 
249
+ # Extended prompt – when extra sources are active
250
+ SYSTEM_PROMPT_EXTENDED = """You are the reference expert for the articles contained in the training of this model, \
251
+ all extracted from the website robertolofaro.com, and all focused on change. \
252
+ You have also been provided with supplementary external context (morning news and/or web results).
253
+ # Your Mission
254
+ Provide a structured response that integrates all available information. \
255
+ Clearly tag each insight with its source label so the reader can judge its provenance:
256
+ [Articles] – insight from the trained article corpus
257
+ [MorningNews] – insight from the morning news briefing
258
+ [Web] – insight from live web search results
259
+ # Response Format
260
  1. Executive Summary: A 2-3 sentence overview answering the core query.
261
+ 2. Guidelines & Hints: A markdown list of tagged insights from the source material.
262
+ 3. Additional Context (when MorningNews or Web results are present): \
263
+ brief synthesis of external findings relevant to the query.
264
  """
265
 
266
+ # ====================== CONTEXT BUDGET HELPER ======================
267
+ # Rough token estimate: 1 token β‰ˆ 4 chars for English text.
268
+ # n_ctx=4096 β†’ reserve ~800 for answer, ~400 for system+history β†’ ~2900 chars for context.
269
+ CONTEXT_BUDGET_CHARS = 2900
270
+
271
+ def _trim_to_budget(parts: list[tuple[str, str]]) -> str:
272
+ """
273
+ parts = [(label, text), ...]
274
+ Allocates the context budget proportionally across available sources,
275
+ then returns a single assembled context string.
276
+ """
277
+ # First pass: measure totals
278
+ totals = [(label, text) for label, text in parts if text.strip()]
279
+ if not totals:
280
+ return ""
281
+ per_source = CONTEXT_BUDGET_CHARS // len(totals)
282
+ sections = []
283
+ for label, text in totals:
284
+ trimmed = text[:per_source]
285
+ sections.append(f"=== {label} ===\n{trimmed}")
286
+ return "\n\n".join(sections)
287
 
288
  # ====================== GENERATION FUNCTION ======================
289
+ def generate_response(
290
+ message, history,
291
+ rag_mode, article_filter,
292
+ use_morning_news, use_web_search,
293
+ max_tokens, temperature, top_p, repeat_penalty,
294
+ ):
295
+ has_extra = use_morning_news or use_web_search
296
+ system_prompt = SYSTEM_PROMPT_EXTENDED if has_extra else SYSTEM_PROMPT_BASE
297
+
298
+ full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
299
 
300
+ # Keep the last 4 turns to limit context pressure
301
  for msg in history[-4:]:
302
  full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
303
 
304
+ # ---- Gather context from all active sources ----
305
+ context_parts: list[tuple[str, str]] = []
306
 
307
+ # 1. RAG (vectorstore)
308
+ backend = BACKENDS.get(rag_mode)
309
  if backend:
310
  vs = get_vectorstore(backend)
311
  if vs:
312
  try:
313
+ filt = {"article_category": article_filter} if article_filter != "All categories" else None
314
+ docs = vs.similarity_search(message, k=5, filter=filt)
315
+ rag_text = "\n\n".join(
316
+ f"[Cat: {d.metadata.get('article_category','N/A')}] {d.page_content[:700]}"
317
+ for d in docs
318
+ )
319
+ context_parts.append(("ARTICLES CONTEXT", rag_text))
320
+ except Exception as e:
321
+ print(f"[RAG] similarity_search failed: {e}")
322
+
323
+ # 2. Morning News
324
+ if use_morning_news:
325
+ news = fetch_morning_news()
326
+ if news:
327
+ context_parts.append(("MORNING NEWS BRIEFING", news))
328
+
329
+ # 3. Web search
330
+ if use_web_search:
331
+ web = search_web(message)
332
+ if web:
333
+ context_parts.append(("WEB SEARCH RESULTS", web))
334
+
335
+ # ---- Assemble context within token budget ----
336
+ context = _trim_to_budget(context_parts)
337
 
338
  if context:
339
  full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
 
342
 
343
  full_prompt += "<|im_start|>assistant\n"
344
 
345
+ # ---- Inference parameters ----
346
+ max_tok = int(max_tokens) if max_tokens is not None else 900
347
+ temp = float(temperature) if temperature is not None else 0.65
348
+ tp = float(top_p) if top_p is not None else 0.9
349
+ rep_pen = float(repeat_penalty) if repeat_penalty is not None else 1.1
350
 
351
+ partial = ""
352
  for chunk in llm(
353
  full_prompt,
354
+ max_tokens=max_tok,
355
+ temperature=temp,
356
+ top_p=tp,
357
+ repeat_penalty=rep_pen,
358
  stop=["<|im_end|>", "<|im_start|>"],
359
  stream=True,
360
  ):
361
+ partial += chunk["choices"][0]["text"]
362
+ yield partial
363
+
364
+ # ====================== RUNTIME STATUS BADGE ======================
365
def _build_status() -> str:
    """Build the one-line runtime summary shown in the page header.

    Returns:
        A single HF Space label when running on a Space without the local
        override; otherwise "Local mode", the GPU/CPU label and the thread
        count, joined by " | ".
    """
    parts = []
    if IS_HF_SPACE and not IS_LOCAL:
        parts.append("☁️ HuggingFace Space · CPU-only")
    else:
        parts.append("🖥️ Local mode")
        parts.append("⚡ GPU (CUDA)" if CUDA_AVAILABLE else "🐢 CPU-only")
        parts.append(f"threads={N_THREADS}")
    return " | ".join(parts)
374
+
375
+ STATUS_LINE = _build_status()
376
 
377
  # ====================== GRADIO INTERFACE ======================
378
  with gr.Blocks(title="Article Q&A model") as demo:
379
  gr.Markdown("# sourcing 350+ articles on change")
380
+ gr.Markdown(
381
+ "Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com β€” "
382
+ "experimental on CPU-only, to test embedding methods (takes a few minutes, "
383
+ "no selection for the category yet) β€” updated as of 2026-05-05"
384
+ )
385
+ gr.Markdown(f"**Runtime:** {STATUS_LINE}")
386
+ gr.Markdown(
387
+ "**NOTAM:** by querying this model you access the articles and metadata "
388
+ "available on robertolofaro.com and GitHub. "
389
+ "Answers reflect the article corpus only β€” do not treat them as personal advice."
390
+ )
391
+ gr.Markdown(
392
+ "If, after getting an answer, you want something more contextualised, "
393
+ "contact a consultant (myself included)."
394
+ )
395
 
396
  with gr.Row():
397
  rag_mode = gr.Radio(
398
  choices=list(BACKENDS.keys()),
399
  value="FAISS - RAG (HNSW)",
400
+ label="Retrieval mode",
401
  )
402
  article_filter = gr.Dropdown(
403
  choices=ARTICLE_LIST,
404
  value="All categories",
405
+ label="Focus on category",
406
+ )
407
+
408
+ with gr.Row():
409
+ use_morning_news = gr.Checkbox(
410
+ value=False,
411
+ label="πŸ“° Read MorningNews",
412
+ info="Supplement with the latest Morning News briefing fetched from GitHub "
413
+ f"(robertolofaro/supportmaterial Β· {GH_NEWS_PATH}). "
414
+ "Results are cached for 1 hour.",
415
+ )
416
+ use_web_search = gr.Checkbox(
417
+ value=False,
418
+ label="πŸ” Search Web (DuckDuckGo)",
419
+ info="Complement the answer with live web search results via DuckDuckGo. "
420
+ "Note: may be rate-limited on the free HF Space tier.",
421
  )
422
 
423
  with gr.Accordion("Advanced Generation Parameters", open=False):
424
+ max_tokens = gr.Slider(256, 2048, value=900, step=64, label="Max Tokens")
425
+ temperature = gr.Slider(0.0, 1.0, value=0.65, step=0.05, label="Temperature")
426
+ top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
427
+ repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
428
 
429
  gr.ChatInterface(
430
  fn=generate_response,
431
+ additional_inputs=[
432
+ rag_mode, article_filter,
433
+ use_morning_news, use_web_search,
434
+ max_tokens, temperature, top_p, repeat_penalty,
435
+ ],
436
+ cache_examples=False, # prevents Gradio from running examples at startup
437
  examples=[
438
  ["What is the potential for Italy? /nothink"],
439
+ ["What is the potential for Turin? /nothink"],
440
  ],
441
  )
442
 
443
  if __name__ == "__main__":
444
+ # Local launch: share=False keeps it on localhost only.
445
+ # Set share=True if you want a temporary public Gradio tunnel.
446
+ demo.queue(default_concurrency_limit=1).launch(share=False)