ZENLLC committed on
Commit
2230519
·
verified ·
1 Parent(s): d223e34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -28
app.py CHANGED
@@ -6,11 +6,24 @@ import requests
6
  import gradio as gr
7
  from openai import OpenAI
8
 
 
 
 
 
 
 
9
  # -------------------- CONFIG --------------------
10
 
11
  CHAT_MODEL = "gpt-5" # main chat model
12
- # Use a model your project actually has access to
13
- EMBED_MODEL = "text-embedding-3-small"
 
 
 
 
 
 
 
14
 
15
  DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
16
 
@@ -37,11 +50,11 @@ PRESET_CONFIGS = {
37
  "with youth, homeschool, and professional tracks and blockchain-verified credentials."
38
  ),
39
  },
40
- "Policy Explainer (external PDFs / links)": {
41
  "system": DEFAULT_SYSTEM_PROMPT
42
  + "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
43
- "urls": "",
44
- "text": "This preset is for uploading AI policy PDFs, legal texts, and governance reports.",
45
  },
46
  "Research Notebook / Personal RAG Sandbox": {
47
  "system": DEFAULT_SYSTEM_PROMPT
@@ -84,11 +97,40 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
84
  return dot / (norm_a * norm_b)
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  # -------------------- DATA SOURCE HELPERS --------------------
88
 
89
 
90
  def fetch_url_text(url: str) -> str:
91
- """Fetch text from a URL in a lightweight way."""
92
  try:
93
  resp = requests.get(url, timeout=12)
94
  resp.raise_for_status()
@@ -119,6 +161,73 @@ def read_file_text(path: str) -> str:
119
  return f"[Error reading file {os.path.basename(path)}: {e}]"
120
 
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  # -------------------- EMBEDDING / KB BUILD --------------------
123
 
124
 
@@ -131,6 +240,11 @@ def build_embeddings(
131
  return [], "⚠️ No documents to index."
132
 
133
  client = OpenAI(api_key=api_key)
 
 
 
 
 
134
  kb_chunks: List[Dict[str, Any]] = []
135
  total_chunks = 0
136
 
@@ -142,7 +256,7 @@ def build_embeddings(
142
  for idx, ch in enumerate(chunks):
143
  try:
144
  emb_resp = client.embeddings.create(
145
- model=EMBED_MODEL,
146
  input=ch,
147
  )
148
  emb = emb_resp.data[0].embedding
@@ -165,7 +279,10 @@ def build_embeddings(
165
  }
166
  )
167
 
168
- status = f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks."
 
 
 
169
  return kb_chunks, status
170
 
171
 
@@ -181,9 +298,14 @@ def retrieve_context(
181
  return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
182
 
183
  client = OpenAI(api_key=api_key)
 
 
 
 
 
184
  try:
185
  q_emb_resp = client.embeddings.create(
186
- model=EMBED_MODEL,
187
  input=query,
188
  )
189
  q_emb = q_emb_resp.data[0].embedding
@@ -216,7 +338,11 @@ def retrieve_context(
216
  )
217
 
218
  context = "\n\n---\n\n".join(context_parts)
219
- debug = f"📚 Retrieved {len(top)} chunks from KB (top_k={top_k}, threshold={similarity_threshold})."
 
 
 
 
220
  return context, debug
221
 
222
 
@@ -232,6 +358,15 @@ def save_api_key(api_key: str):
232
  return status, api_key
233
 
234
 
 
 
 
 
 
 
 
 
 
235
  def apply_preset(preset_name: str):
236
  cfg = PRESET_CONFIGS.get(preset_name) or PRESET_CONFIGS["None (manual setup)"]
237
  return cfg["system"], cfg["urls"], cfg["text"]
@@ -239,21 +374,43 @@ def apply_preset(preset_name: str):
239
 
240
  def build_knowledge_base(
241
  api_key: str,
 
242
  urls_text: str,
243
  raw_text: str,
244
  file_paths: Optional[List[str]],
245
  ):
 
 
 
 
 
 
 
246
  api_key = (api_key or "").strip()
247
  if not api_key:
248
  return "❌ Please save your OpenAI API key first.", []
249
 
 
 
250
  docs: List[Dict[str, Any]] = []
251
 
252
  # URLs
253
  urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
254
  for u in urls:
255
- txt = fetch_url_text(u)
256
- docs.append({"source": u, "text": txt})
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  # Raw text
259
  if raw_text and raw_text.strip():
@@ -353,19 +510,21 @@ def clear_chat():
353
 
354
  # -------------------- UI LAYOUT --------------------
355
 
356
- with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
357
  gr.Markdown(
358
  """
359
- # 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text
360
 
361
  1. Enter your **OpenAI API key** and click **Save**.
362
- 2. Add knowledge via **URLs**, **uploaded files**, and/or **raw text**.
363
- 3. Click **Build / Refresh Knowledge Base**.
364
- 4. Ask questions the bot will answer **only** from your knowledge and system instructions.
 
365
  """
366
  )
367
 
368
  api_key_state = gr.State("")
 
369
  kb_state = gr.State([])
370
  chat_state = gr.State([])
371
 
@@ -378,13 +537,21 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
378
  placeholder="sk-...",
379
  type="password",
380
  )
381
- save_api_btn = gr.Button("Save API Key", variant="primary")
382
- save_status = gr.Markdown("API key not set.")
 
 
 
 
 
 
 
 
383
 
384
  preset_dropdown = gr.Dropdown(
385
  label="Presets",
386
  choices=list(PRESET_CONFIGS.keys()),
387
- value="None (manual setup)",
388
  )
389
 
390
  system_box = gr.Textbox(
@@ -398,7 +565,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
398
  urls_box = gr.Textbox(
399
  label="Knowledge URLs (one per line)",
400
  lines=4,
401
- placeholder="https://example.com/docs\nhttps://zenai.world",
402
  )
403
 
404
  raw_text_box = gr.Textbox(
@@ -413,8 +580,8 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
413
  type="filepath",
414
  )
415
 
416
- build_kb_btn = gr.Button(
417
- "Build / Refresh Knowledge Base",
418
  variant="secondary",
419
  )
420
  kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
@@ -431,7 +598,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
431
  user_input = gr.Textbox(
432
  label="Ask a question",
433
  lines=3,
434
- placeholder="Ask about the content of your URLs, files, or pasted text...",
435
  )
436
 
437
  with gr.Row():
@@ -442,13 +609,20 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
442
  "ℹ️ Retrieval debug info will appear here after each answer."
443
  )
444
 
445
- # Wiring: save API key
446
  save_api_btn.click(
447
  fn=save_api_key,
448
  inputs=[api_key_box],
449
  outputs=[save_status, api_key_state],
450
  )
451
 
 
 
 
 
 
 
 
452
  # Wiring: presets
453
  preset_dropdown.change(
454
  fn=apply_preset,
@@ -456,10 +630,10 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
456
  outputs=[system_box, urls_box, raw_text_box],
457
  )
458
 
459
- # Wiring: build knowledge base
460
- build_kb_btn.click(
461
  fn=build_knowledge_base,
462
- inputs=[api_key_state, urls_box, raw_text_box, files_input],
463
  outputs=[kb_status_md, kb_state],
464
  )
465
 
 
6
  import gradio as gr
7
  from openai import OpenAI
8
 
9
+ # Firecrawl SDK (used for crawling URLs into markdown)
10
+ try:
11
+ from firecrawl import Firecrawl
12
+ except ImportError:
13
+ Firecrawl = None # we’ll handle this gracefully later
14
+
15
  # -------------------- CONFIG --------------------
16
 
17
  CHAT_MODEL = "gpt-5" # main chat model
18
+
19
+ # Candidate embedding models – we'll auto-select one your project has access to
20
+ EMBED_MODEL_CANDIDATES = [
21
+ "text-embedding-3-small",
22
+ "text-embedding-3-large",
23
+ "text-embedding-ada-002",
24
+ ]
25
+
26
+ SELECTED_EMBED_MODEL: Optional[str] = None # set at runtime once discovered
27
 
28
  DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
29
 
 
50
  "with youth, homeschool, and professional tracks and blockchain-verified credentials."
51
  ),
52
  },
53
+ "AI Policy & Governance Starter": {
54
  "system": DEFAULT_SYSTEM_PROMPT
55
  + "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
56
+ "urls": "https://oecd.ai/en/ai-principles",
57
+ "text": "Use this preset for high-level AI policy, governance, and principles exploration.",
58
  },
59
  "Research Notebook / Personal RAG Sandbox": {
60
  "system": DEFAULT_SYSTEM_PROMPT
 
97
  return dot / (norm_a * norm_b)
98
 
99
 
100
+ # -------------------- EMBEDDING MODEL SELECTION --------------------
101
+
102
+
103
def pick_embedding_model(client: OpenAI) -> str:
    """
    Discover a usable embedding model for this project.

    Each entry of EMBED_MODEL_CANDIDATES is probed in order with a minimal
    embeddings request. The first model whose probe succeeds is stored in the
    module-level SELECTED_EMBED_MODEL and returned; later calls return the
    cached name without probing again.

    Args:
        client: OpenAI client used to issue the probe requests.

    Returns:
        The name of the selected (or previously cached) embedding model.

    Raises:
        RuntimeError: if every candidate fails. The last underlying error is
            included in the message and attached as the exception's cause.
    """
    global SELECTED_EMBED_MODEL
    # NOTE: the cache is a single module-level value, so it is shared by every
    # caller in the process regardless of which client/API key is passed in.
    if SELECTED_EMBED_MODEL:
        return SELECTED_EMBED_MODEL

    last_error: Optional[Exception] = None
    for model_name in EMBED_MODEL_CANDIDATES:
        try:
            # Cheap sanity call; any failure just means "try the next one".
            client.embeddings.create(model=model_name, input="test")
        except Exception as e:
            last_error = e
        else:
            SELECTED_EMBED_MODEL = model_name
            return model_name

    # Chain the last failure so the original traceback is not lost.
    raise RuntimeError(
        "No usable embedding model found for this project. "
        f"Tried: {EMBED_MODEL_CANDIDATES}. Last error: {last_error}"
    ) from last_error
127
+
128
+
129
  # -------------------- DATA SOURCE HELPERS --------------------
130
 
131
 
132
  def fetch_url_text(url: str) -> str:
133
+ """Fallback: fetch text from a URL via simple HTTP."""
134
  try:
135
  resp = requests.get(url, timeout=12)
136
  resp.raise_for_status()
 
161
  return f"[Error reading file {os.path.basename(path)}: {e}]"
162
 
163
 
164
+ # -------------------- FIRECRAWL HELPERS --------------------
165
+
166
+
167
def extract_markdown_from_firecrawl_result(result: Any) -> str:
    """
    Collect all markdown text found in a Firecrawl crawl(...) result.

    The result is walked recursively and these shapes are understood:
    - an object with a non-empty string ``.markdown`` attribute
    - a dict with a 'markdown' string and/or a nested 'data' entry
    - a list/tuple containing any of these shapes
    - an object with a ``.data`` attribute containing any of these shapes

    Args:
        result: whatever the Firecrawl crawl call returned.

    Returns:
        All markdown pieces, in the order encountered, joined by blank
        lines. If no markdown is found, ``str(result)`` is returned as a
        last-resort fallback.
    """
    texts: List[str] = []

    def _collect(obj: Any) -> None:
        if obj is None:
            return

        # Document-like object with attribute markdown
        md = getattr(obj, "markdown", None)
        if isinstance(md, str) and md.strip():
            texts.append(md)
            return

        # Dict-shaped
        if isinstance(obj, dict):
            if isinstance(obj.get("markdown"), str):
                texts.append(obj["markdown"])
            _collect(obj.get("data"))
            return

        # Iterable (list/tuple of docs)
        if isinstance(obj, (list, tuple)):
            for item in obj:
                _collect(item)
            return

        # Wrapper object exposing its documents through `.data`. Without this
        # branch such results fell through to the str(result) fallback and
        # the object's repr was returned instead of the page content.
        _collect(getattr(obj, "data", None))

    _collect(result)
    if texts:
        return "\n\n".join(texts)
    # Fallback: string representation if nothing else worked
    return str(result)
207
+
208
+
209
def firecrawl_crawl_url(firecrawl_api_key: str, url: str) -> str:
    """
    Crawl `url` with Firecrawl and return the crawled pages' markdown as a
    single string.

    Failures are reported in-band rather than raised: a missing key, a missing
    Firecrawl SDK, or any exception during the crawl yields a string starting
    with "[Firecrawl error", which the caller can detect and fall back on.
    """
    key = (firecrawl_api_key or "").strip()

    if not key:
        return "[Firecrawl error: no Firecrawl API key provided.]"
    if Firecrawl is None:
        return "[Firecrawl error: firecrawl-py is not installed. Add it to requirements.txt.]"

    try:
        # Crawl the site; the page limit is kept modest for speed.
        crawl_result = Firecrawl(api_key=key).crawl(url=url, limit=50)
        return extract_markdown_from_firecrawl_result(crawl_result)
    except Exception as exc:
        return f"[Firecrawl error for {url}: {exc}]"
229
+
230
+
231
  # -------------------- EMBEDDING / KB BUILD --------------------
232
 
233
 
 
240
  return [], "⚠️ No documents to index."
241
 
242
  client = OpenAI(api_key=api_key)
243
+ try:
244
+ embed_model = pick_embedding_model(client)
245
+ except Exception as e:
246
+ return [], f"❌ Failed to select an embedding model: {e}"
247
+
248
  kb_chunks: List[Dict[str, Any]] = []
249
  total_chunks = 0
250
 
 
256
  for idx, ch in enumerate(chunks):
257
  try:
258
  emb_resp = client.embeddings.create(
259
+ model=embed_model,
260
  input=ch,
261
  )
262
  emb = emb_resp.data[0].embedding
 
279
  }
280
  )
281
 
282
+ status = (
283
+ f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks. "
284
+ f"Embedding model: `{SELECTED_EMBED_MODEL}`"
285
+ )
286
  return kb_chunks, status
287
 
288
 
 
298
  return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
299
 
300
  client = OpenAI(api_key=api_key)
301
+ try:
302
+ embed_model = pick_embedding_model(client)
303
+ except Exception as e:
304
+ return "", f"❌ Failed to select an embedding model: {e}"
305
+
306
  try:
307
  q_emb_resp = client.embeddings.create(
308
+ model=embed_model,
309
  input=query,
310
  )
311
  q_emb = q_emb_resp.data[0].embedding
 
338
  )
339
 
340
  context = "\n\n---\n\n".join(context_parts)
341
+ debug = (
342
+ f"📚 Retrieved {len(top)} chunks from KB "
343
+ f"(top_k={top_k}, threshold={similarity_threshold}). "
344
+ f"Embedding model: `{SELECTED_EMBED_MODEL}`"
345
+ )
346
  return context, debug
347
 
348
 
 
358
  return status, api_key
359
 
360
 
361
def save_firecrawl_key(fc_key: str):
    """
    Normalize a Firecrawl API key and build the status message for the UI.

    Returns a (status_markdown, key) tuple. A blank or missing key yields a
    warning and an empty key. Otherwise the status shows a masked preview
    (first 3 and last 4 characters, or "******" for keys shorter than 8
    characters) and the stripped key is returned.
    """
    cleaned = (fc_key or "").strip()
    if cleaned == "":
        return "⚠️ No Firecrawl API key provided.", ""

    if len(cleaned) < 8:
        preview = "******"
    else:
        preview = cleaned[:3] + "..." + cleaned[-4:]

    return f"✅ Firecrawl key saved for this session: `{preview}`", cleaned
368
+
369
+
370
  def apply_preset(preset_name: str):
371
  cfg = PRESET_CONFIGS.get(preset_name) or PRESET_CONFIGS["None (manual setup)"]
372
  return cfg["system"], cfg["urls"], cfg["text"]
 
374
 
375
  def build_knowledge_base(
376
  api_key: str,
377
+ firecrawl_api_key: str,
378
  urls_text: str,
379
  raw_text: str,
380
  file_paths: Optional[List[str]],
381
  ):
382
+ """
383
+ Build knowledge base using:
384
+ - Firecrawl for URLs (if Firecrawl key provided and SDK available)
385
+ - Fallback to simple HTTP fetch if Firecrawl not available
386
+ - Raw text
387
+ - Files
388
+ """
389
  api_key = (api_key or "").strip()
390
  if not api_key:
391
  return "❌ Please save your OpenAI API key first.", []
392
 
393
+ firecrawl_api_key = (firecrawl_api_key or "").strip()
394
+
395
  docs: List[Dict[str, Any]] = []
396
 
397
  # URLs
398
  urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
399
  for u in urls:
400
+ text_from_url = ""
401
+ if firecrawl_api_key:
402
+ # Try Firecrawl first
403
+ fc_text = firecrawl_crawl_url(firecrawl_api_key, u)
404
+ if not fc_text.startswith("[Firecrawl error"):
405
+ text_from_url = fc_text
406
+ else:
407
+ # Firecrawl failed; fallback to simple fetch
408
+ text_from_url = fetch_url_text(u)
409
+ else:
410
+ # No Firecrawl key → simple fetch
411
+ text_from_url = fetch_url_text(u)
412
+
413
+ docs.append({"source": u, "text": text_from_url})
414
 
415
  # Raw text
416
  if raw_text and raw_text.strip():
 
510
 
511
  # -------------------- UI LAYOUT --------------------
512
 
513
+ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl") as demo:
514
  gr.Markdown(
515
  """
516
+ # 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl
517
 
518
  1. Enter your **OpenAI API key** and click **Save**.
519
+ 2. (Optional) Enter your **Firecrawl API key** and save it.
520
+ 3. Add knowledge via **URLs** (e.g. `https://zenai.world`), **uploaded files**, and/or **raw text**.
521
+ 4. Click **Grab / Retrieve Knowledge (Firecrawl)** to crawl URLs + index everything.
522
+ 5. Ask questions — the bot will answer **only** from your knowledge and system instructions.
523
  """
524
  )
525
 
526
  api_key_state = gr.State("")
527
+ firecrawl_key_state = gr.State("")
528
  kb_state = gr.State([])
529
  chat_state = gr.State([])
530
 
 
537
  placeholder="sk-...",
538
  type="password",
539
  )
540
+ save_api_btn = gr.Button("Save OpenAI API Key", variant="primary")
541
+ save_status = gr.Markdown("OpenAI API key not set.")
542
+
543
+ firecrawl_key_box = gr.Textbox(
544
+ label="Firecrawl API Key (optional)",
545
+ placeholder="fc-...",
546
+ type="password",
547
+ )
548
+ save_firecrawl_btn = gr.Button("Save Firecrawl Key")
549
+ firecrawl_status = gr.Markdown("Firecrawl key not set (fallback to simple URL fetch).")
550
 
551
  preset_dropdown = gr.Dropdown(
552
  label="Presets",
553
  choices=list(PRESET_CONFIGS.keys()),
554
+ value="ZEN Sites Deep QA (zenai.world + AI Arena)",
555
  )
556
 
557
  system_box = gr.Textbox(
 
565
  urls_box = gr.Textbox(
566
  label="Knowledge URLs (one per line)",
567
  lines=4,
568
+ placeholder="https://zenai.world\nhttps://us.zenai.biz",
569
  )
570
 
571
  raw_text_box = gr.Textbox(
 
580
  type="filepath",
581
  )
582
 
583
+ grab_kb_btn = gr.Button(
584
+ "Grab / Retrieve Knowledge (Firecrawl + Embeddings)",
585
  variant="secondary",
586
  )
587
  kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
 
598
  user_input = gr.Textbox(
599
  label="Ask a question",
600
  lines=3,
601
+ placeholder="Ask about the content of zenai.world, AI Arena, or your uploaded docs...",
602
  )
603
 
604
  with gr.Row():
 
609
  "ℹ️ Retrieval debug info will appear here after each answer."
610
  )
611
 
612
+ # Wiring: save OpenAI API key
613
  save_api_btn.click(
614
  fn=save_api_key,
615
  inputs=[api_key_box],
616
  outputs=[save_status, api_key_state],
617
  )
618
 
619
+ # Wiring: save Firecrawl API key
620
+ save_firecrawl_btn.click(
621
+ fn=save_firecrawl_key,
622
+ inputs=[firecrawl_key_box],
623
+ outputs=[firecrawl_status, firecrawl_key_state],
624
+ )
625
+
626
  # Wiring: presets
627
  preset_dropdown.change(
628
  fn=apply_preset,
 
630
  outputs=[system_box, urls_box, raw_text_box],
631
  )
632
 
633
+ # Wiring: build knowledge base (Firecrawl + embeddings)
634
+ grab_kb_btn.click(
635
  fn=build_knowledge_base,
636
+ inputs=[api_key_state, firecrawl_key_state, urls_box, raw_text_box, files_input],
637
  outputs=[kb_status_md, kb_state],
638
  )
639