ZENLLC committed on
Commit
681132c
·
verified ·
1 Parent(s): 7999523

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -153
app.py CHANGED
@@ -1,30 +1,20 @@
1
  import os
2
- import math
3
  from typing import List, Dict, Any, Tuple, Optional
4
 
5
  import requests
6
  import gradio as gr
7
  from openai import OpenAI
8
 
9
- # Firecrawl SDK (used for crawling URLs into markdown)
10
  try:
11
  from firecrawl import Firecrawl
12
  except ImportError:
13
- Firecrawl = None # we’ll handle this gracefully later
14
 
15
  # -------------------- CONFIG --------------------
16
 
17
  CHAT_MODEL = "gpt-5" # main chat model
18
 
19
- # Candidate embedding models – we'll auto-select one your project has access to
20
- EMBED_MODEL_CANDIDATES = [
21
- "text-embedding-3-small",
22
- "text-embedding-3-large",
23
- "text-embedding-ada-002",
24
- ]
25
-
26
- SELECTED_EMBED_MODEL: Optional[str] = None # set at runtime once discovered
27
-
28
  DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
29
 
30
  Rules:
@@ -64,7 +54,7 @@ PRESET_CONFIGS = {
64
  },
65
  }
66
 
67
- # -------------------- TEXT / EMBEDDING HELPERS --------------------
68
 
69
 
70
  def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
@@ -85,45 +75,15 @@ def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str
85
  return chunks
86
 
87
 
88
- def cosine_similarity(a: List[float], b: List[float]) -> float:
89
- """Compute cosine similarity between two vectors."""
90
- if not a or not b:
91
- return 0.0
92
- dot = sum(x * y for x, y in zip(a, b))
93
- norm_a = math.sqrt(sum(x * x for x in a))
94
- norm_b = math.sqrt(sum(y * y for y in b))
95
- if norm_a == 0 or norm_b == 0:
96
- return 0.0
97
- return dot / (norm_a * norm_b)
98
-
99
-
100
- # -------------------- EMBEDDING MODEL SELECTION --------------------
101
-
102
-
103
- def pick_embedding_model(client: OpenAI) -> str:
104
- """
105
- Discover a usable embedding model for this project.
106
- Tries candidates in order and caches the first that works.
107
- """
108
- global SELECTED_EMBED_MODEL
109
- if SELECTED_EMBED_MODEL:
110
- return SELECTED_EMBED_MODEL
111
-
112
- last_error: Optional[Exception] = None
113
- for model_name in EMBED_MODEL_CANDIDATES:
114
- try:
115
- # cheap sanity call
116
- client.embeddings.create(model=model_name, input="test")
117
- SELECTED_EMBED_MODEL = model_name
118
- return model_name
119
- except Exception as e:
120
- last_error = e
121
- continue
122
-
123
- raise RuntimeError(
124
- f"No usable embedding model found for this project. "
125
- f"Tried: {EMBED_MODEL_CANDIDATES}. Last error: {last_error}"
126
- )
127
 
128
 
129
  # -------------------- DATA SOURCE HELPERS --------------------
@@ -166,10 +126,7 @@ def read_file_text(path: str) -> str:
166
 
167
  def extract_markdown_from_firecrawl_result(result: Any) -> str:
168
  """
169
- Firecrawl crawl(...) can return:
170
- - A list of Document-like objects with .markdown
171
- - An object with .data which is a list of Documents
172
- - Dict structures with 'markdown' or 'data'
173
  We try to collect all markdown text into one big string.
174
  """
175
  texts: List[str] = []
@@ -206,10 +163,10 @@ def extract_markdown_from_firecrawl_result(result: Any) -> str:
206
  return str(result)
207
 
208
 
209
- def firecrawl_crawl_url(firecrawl_api_key: str, url: str) -> str:
210
  """
211
- Use Firecrawl to crawl a URL and return concatenated markdown for all pages.
212
- If Firecrawl is not available or fails, return an error marker (caller can fallback).
213
  """
214
  firecrawl_api_key = (firecrawl_api_key or "").strip()
215
  if not firecrawl_api_key:
@@ -220,31 +177,22 @@ def firecrawl_crawl_url(firecrawl_api_key: str, url: str) -> str:
220
 
221
  try:
222
  fc = Firecrawl(api_key=firecrawl_api_key)
223
- # Crawl whole site; keep limit modest for speed
224
- docs = fc.crawl(url=url, limit=50)
225
- markdown = extract_markdown_from_firecrawl_result(docs)
226
  return markdown
227
  except Exception as e:
228
  return f"[Firecrawl error for {url}: {e}]"
229
 
230
 
231
- # -------------------- EMBEDDING / KB BUILD --------------------
232
-
233
 
234
- def build_embeddings(
235
- api_key: str,
236
- docs: List[Dict[str, Any]],
237
- ) -> Tuple[List[Dict[str, Any]], str]:
238
- """Embed all document chunks and return them as KB docs with embeddings."""
239
- if not docs:
240
- return [], "⚠️ No documents to index."
241
-
242
- client = OpenAI(api_key=api_key)
243
- try:
244
- embed_model = pick_embedding_model(client)
245
- except Exception as e:
246
- return [], f"❌ Failed to select an embedding model: {e}"
247
 
 
 
 
 
 
248
  kb_chunks: List[Dict[str, Any]] = []
249
  total_chunks = 0
250
 
@@ -254,80 +202,54 @@ def build_embeddings(
254
  chunks = chunk_text(text, max_chars=2000, overlap=200)
255
 
256
  for idx, ch in enumerate(chunks):
257
- try:
258
- emb_resp = client.embeddings.create(
259
- model=embed_model,
260
- input=ch,
261
- )
262
- emb = emb_resp.data[0].embedding
263
- kb_chunks.append(
264
- {
265
- "id": f"{source}_{idx}",
266
- "source": source,
267
- "text": ch,
268
- "embedding": emb,
269
- }
270
- )
271
- total_chunks += 1
272
- except Exception as e:
273
- kb_chunks.append(
274
- {
275
- "id": f"{source}_{idx}_error",
276
- "source": source,
277
- "text": f"[Error embedding chunk: {e}]",
278
- "embedding": [],
279
- }
280
- )
281
-
282
- status = (
283
- f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks. "
284
- f"Embedding model: `{SELECTED_EMBED_MODEL}`"
285
- )
286
  return kb_chunks, status
287
 
288
 
289
- def retrieve_context(
290
- api_key: str,
291
  kb: List[Dict[str, Any]],
292
  query: str,
293
  top_k: int = 5,
294
- similarity_threshold: float = 0.25,
295
  ) -> Tuple[str, str]:
296
- """Retrieve top-k relevant chunks from KB for the query."""
 
 
 
297
  if not kb:
298
  return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
299
 
300
- client = OpenAI(api_key=api_key)
301
- try:
302
- embed_model = pick_embedding_model(client)
303
- except Exception as e:
304
- return "", f"❌ Failed to select an embedding model: {e}"
305
 
306
- try:
307
- q_emb_resp = client.embeddings.create(
308
- model=embed_model,
309
- input=query,
310
- )
311
- q_emb = q_emb_resp.data[0].embedding
312
- except Exception as e:
313
- return "", f"⚠️ Error creating query embedding: {e}"
314
 
315
- scored: List[Tuple[float, Dict[str, Any]]] = []
316
  for d in kb:
317
- emb = d.get("embedding") or []
318
- if not emb:
319
  continue
320
- sim = cosine_similarity(q_emb, emb)
321
- scored.append((sim, d))
 
 
322
 
323
  if not scored:
324
- return "", "⚠️ No valid embeddings in KB; cannot retrieve context."
325
 
326
  scored.sort(key=lambda x: x[0], reverse=True)
327
- top = [d for (sim, d) in scored[:top_k] if sim >= similarity_threshold]
328
-
329
- if not top:
330
- return "", "ℹ️ No chunks passed the similarity threshold; answering from instructions only."
331
 
332
  context_parts = []
333
  for idx, d in enumerate(top, start=1):
@@ -338,11 +260,7 @@ def retrieve_context(
338
  )
339
 
340
  context = "\n\n---\n\n".join(context_parts)
341
- debug = (
342
- f"📚 Retrieved {len(top)} chunks from KB "
343
- f"(top_k={top_k}, threshold={similarity_threshold}). "
344
- f"Embedding model: `{SELECTED_EMBED_MODEL}`"
345
- )
346
  return context, debug
347
 
348
 
@@ -354,7 +272,7 @@ def save_api_key(api_key: str):
354
  if not api_key:
355
  return "❌ No API key provided.", ""
356
  masked = f"{api_key[:4]}...{api_key[-4:]}" if len(api_key) >= 8 else "******"
357
- status = f"✅ API key saved for this session: `{masked}`"
358
  return status, api_key
359
 
360
 
@@ -381,7 +299,7 @@ def build_knowledge_base(
381
  ):
382
  """
383
  Build knowledge base using:
384
- - Firecrawl for URLs (if Firecrawl key provided and SDK available)
385
  - Fallback to simple HTTP fetch if Firecrawl not available
386
  - Raw text
387
  - Files
@@ -399,8 +317,8 @@ def build_knowledge_base(
399
  for u in urls:
400
  text_from_url = ""
401
  if firecrawl_api_key:
402
- # Try Firecrawl first
403
- fc_text = firecrawl_crawl_url(firecrawl_api_key, u)
404
  if not fc_text.startswith("[Firecrawl error"):
405
  text_from_url = fc_text
406
  else:
@@ -428,7 +346,7 @@ def build_knowledge_base(
428
  if not docs:
429
  return "⚠️ No knowledge sources provided (URLs, text, or files).", []
430
 
431
- kb, status = build_embeddings(api_key, docs)
432
  return status, kb
433
 
434
 
@@ -452,8 +370,8 @@ def chat_with_rag(
452
  if not system_prompt:
453
  system_prompt = DEFAULT_SYSTEM_PROMPT
454
 
455
- # Retrieve context from KB
456
- context, debug_retrieval = retrieve_context(api_key, kb, user_message)
457
 
458
  client = OpenAI(api_key=api_key)
459
 
@@ -517,8 +435,8 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
517
 
518
  1. Enter your **OpenAI API key** and click **Save**.
519
  2. (Optional) Enter your **Firecrawl API key** and save it.
520
- 3. Add knowledge via **URLs** (e.g. `https://zenai.world`), **uploaded files**, and/or **raw text**.
521
- 4. Click **Grab / Retrieve Knowledge (Firecrawl)** to crawl URLs + index everything.
522
  5. Ask questions — the bot will answer **only** from your knowledge and system instructions.
523
  """
524
  )
@@ -528,6 +446,10 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
528
  kb_state = gr.State([])
529
  chat_state = gr.State([])
530
 
 
 
 
 
531
  with gr.Row():
532
  with gr.Column(scale=1):
533
  gr.Markdown("### 🔑 API & System")
@@ -546,18 +468,20 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
546
  type="password",
547
  )
548
  save_firecrawl_btn = gr.Button("Save Firecrawl Key")
549
- firecrawl_status = gr.Markdown("Firecrawl key not set (fallback to simple URL fetch).")
 
 
550
 
551
  preset_dropdown = gr.Dropdown(
552
  label="Presets",
553
  choices=list(PRESET_CONFIGS.keys()),
554
- value="ZEN Sites Deep QA (zenai.world + AI Arena)",
555
  )
556
 
557
  system_box = gr.Textbox(
558
  label="System Instructions",
559
  lines=8,
560
- value=DEFAULT_SYSTEM_PROMPT,
561
  )
562
 
563
  gr.Markdown("### 📚 Knowledge Sources")
@@ -565,12 +489,14 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
565
  urls_box = gr.Textbox(
566
  label="Knowledge URLs (one per line)",
567
  lines=4,
 
568
  placeholder="https://zenai.world\nhttps://us.zenai.biz",
569
  )
570
 
571
  raw_text_box = gr.Textbox(
572
  label="Additional Knowledge Text",
573
  lines=6,
 
574
  placeholder="Paste any notes, docs, or reference text here...",
575
  )
576
 
@@ -581,7 +507,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
581
  )
582
 
583
  grab_kb_btn = gr.Button(
584
- "Grab / Retrieve Knowledge (Firecrawl + Embeddings)",
585
  variant="secondary",
586
  )
587
  kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
@@ -598,7 +524,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
598
  user_input = gr.Textbox(
599
  label="Ask a question",
600
  lines=3,
601
- placeholder="Ask about the content of zenai.world, AI Arena, or your uploaded docs...",
602
  )
603
 
604
  with gr.Row():
@@ -630,7 +556,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl")
630
  outputs=[system_box, urls_box, raw_text_box],
631
  )
632
 
633
- # Wiring: build knowledge base (Firecrawl + embeddings)
634
  grab_kb_btn.click(
635
  fn=build_knowledge_base,
636
  inputs=[api_key_state, firecrawl_key_state, urls_box, raw_text_box, files_input],
 
1
  import os
 
2
  from typing import List, Dict, Any, Tuple, Optional
3
 
4
  import requests
5
  import gradio as gr
6
  from openai import OpenAI
7
 
8
+ # Firecrawl SDK (used for scraping URLs into markdown)
9
  try:
10
  from firecrawl import Firecrawl
11
  except ImportError:
12
+ Firecrawl = None # handled gracefully below
13
 
14
  # -------------------- CONFIG --------------------
15
 
16
  CHAT_MODEL = "gpt-5" # main chat model
17
 
 
 
 
 
 
 
 
 
 
18
  DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
19
 
20
  Rules:
 
54
  },
55
  }
56
 
57
+ # -------------------- TEXT HELPERS --------------------
58
 
59
 
60
  def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
 
75
  return chunks
76
 
77
 
78
def tokenize(text: str) -> List[str]:
    """Tokenize *text* for simple lexical retrieval.

    Lowercases the input, treats every non-alphanumeric character as a
    separator, and returns the resulting word tokens.

    Args:
        text: Arbitrary input string (query or document chunk).

    Returns:
        List of lowercase alphanumeric tokens; empty list when *text*
        contains no alphanumeric characters.
    """
    # Map every separator character to a space in one pass, then split.
    # No-argument str.split() never yields empty strings, so no extra
    # filtering step is required.
    normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    return normalized.split()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
  # -------------------- DATA SOURCE HELPERS --------------------
 
126
 
127
  def extract_markdown_from_firecrawl_result(result: Any) -> str:
128
  """
129
+ Firecrawl scrape(...) can return Document-like objects or dicts.
 
 
 
130
  We try to collect all markdown text into one big string.
131
  """
132
  texts: List[str] = []
 
163
  return str(result)
164
 
165
 
166
+ def firecrawl_scrape_url(firecrawl_api_key: str, url: str) -> str:
167
  """
168
+ Use Firecrawl to scrape a single URL and return markdown.
169
+ This is intentionally *not* a full crawl to keep it fast.
170
  """
171
  firecrawl_api_key = (firecrawl_api_key or "").strip()
172
  if not firecrawl_api_key:
 
177
 
178
  try:
179
  fc = Firecrawl(api_key=firecrawl_api_key)
180
+ # Fast single-page scrape markdown suitable for RAG
181
+ doc = fc.scrape(url, formats=["markdown"])
182
+ markdown = extract_markdown_from_firecrawl_result(doc)
183
  return markdown
184
  except Exception as e:
185
  return f"[Firecrawl error for {url}: {e}]"
186
 
187
 
188
+ # -------------------- LOCAL KB BUILD (NO OPENAI EMBEDDINGS) --------------------
 
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
+ def build_local_kb(docs: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], str]:
192
+ """
193
+ Build a local KB with lexical features only (no OpenAI embeddings).
194
+ Each KB entry: {id, source, text, tokens}
195
+ """
196
  kb_chunks: List[Dict[str, Any]] = []
197
  total_chunks = 0
198
 
 
202
  chunks = chunk_text(text, max_chars=2000, overlap=200)
203
 
204
  for idx, ch in enumerate(chunks):
205
+ tokens = tokenize(ch)
206
+ kb_chunks.append(
207
+ {
208
+ "id": f"{source}_{idx}",
209
+ "source": source,
210
+ "text": ch,
211
+ "tokens": tokens,
212
+ }
213
+ )
214
+ total_chunks += 1
215
+
216
+ status = f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks (lexical retrieval)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  return kb_chunks, status
218
 
219
 
220
+ def retrieve_context_local(
 
221
  kb: List[Dict[str, Any]],
222
  query: str,
223
  top_k: int = 5,
 
224
  ) -> Tuple[str, str]:
225
+ """
226
+ Retrieve top-k relevant chunks from KB for the query using simple lexical matching:
227
+ score = number of overlapping tokens between query and chunk.
228
+ """
229
  if not kb:
230
  return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
231
 
232
+ q_tokens = tokenize(query)
233
+ if not q_tokens:
234
+ return "", "ℹ️ Query has no meaningful tokens; answering from instructions only."
 
 
235
 
236
+ q_set = set(q_tokens)
 
 
 
 
 
 
 
237
 
238
+ scored: List[Tuple[int, Dict[str, Any]]] = []
239
  for d in kb:
240
+ tokens = d.get("tokens") or []
241
+ if not tokens:
242
  continue
243
+ t_set = set(tokens)
244
+ overlap = len(q_set & t_set)
245
+ if overlap > 0:
246
+ scored.append((overlap, d))
247
 
248
  if not scored:
249
+ return "", "ℹ️ No lexical overlap with knowledge base; answering from instructions only."
250
 
251
  scored.sort(key=lambda x: x[0], reverse=True)
252
+ top = [d for (score, d) in scored[:top_k]]
 
 
 
253
 
254
  context_parts = []
255
  for idx, d in enumerate(top, start=1):
 
260
  )
261
 
262
  context = "\n\n---\n\n".join(context_parts)
263
+ debug = f"📚 Retrieved {len(top)} chunks from KB via lexical retrieval (no embeddings)."
 
 
 
 
264
  return context, debug
265
 
266
 
 
272
  if not api_key:
273
  return "❌ No API key provided.", ""
274
  masked = f"{api_key[:4]}...{api_key[-4:]}" if len(api_key) >= 8 else "******"
275
+ status = f"✅ OpenAI key saved for this session: `{masked}`"
276
  return status, api_key
277
 
278
 
 
299
  ):
300
  """
301
  Build knowledge base using:
302
+ - Firecrawl scrape for URLs (if Firecrawl key provided and SDK available)
303
  - Fallback to simple HTTP fetch if Firecrawl not available
304
  - Raw text
305
  - Files
 
317
  for u in urls:
318
  text_from_url = ""
319
  if firecrawl_api_key:
320
+ # Try Firecrawl first (single-page scrape)
321
+ fc_text = firecrawl_scrape_url(firecrawl_api_key, u)
322
  if not fc_text.startswith("[Firecrawl error"):
323
  text_from_url = fc_text
324
  else:
 
346
  if not docs:
347
  return "⚠️ No knowledge sources provided (URLs, text, or files).", []
348
 
349
+ kb, status = build_local_kb(docs)
350
  return status, kb
351
 
352
 
 
370
  if not system_prompt:
371
  system_prompt = DEFAULT_SYSTEM_PROMPT
372
 
373
+ # Retrieve context from KB (local lexical retrieval)
374
+ context, debug_retrieval = retrieve_context_local(kb, user_message)
375
 
376
  client = OpenAI(api_key=api_key)
377
 
 
435
 
436
  1. Enter your **OpenAI API key** and click **Save**.
437
  2. (Optional) Enter your **Firecrawl API key** and save it.
438
+ 3. Choose a preset (e.g. **ZEN Sites Deep QA**) — this auto-loads URLs like `https://zenai.world`.
439
+ 4. Click **Grab / Retrieve Knowledge (Firecrawl + Lexical Index)** to scrape URLs + index everything.
440
  5. Ask questions — the bot will answer **only** from your knowledge and system instructions.
441
  """
442
  )
 
446
  kb_state = gr.State([])
447
  chat_state = gr.State([])
448
 
449
+ # default preset on load -> ZEN
450
+ default_preset_name = "ZEN Sites Deep QA (zenai.world + AI Arena)"
451
+ default_preset_cfg = PRESET_CONFIGS[default_preset_name]
452
+
453
  with gr.Row():
454
  with gr.Column(scale=1):
455
  gr.Markdown("### 🔑 API & System")
 
468
  type="password",
469
  )
470
  save_firecrawl_btn = gr.Button("Save Firecrawl Key")
471
+ firecrawl_status = gr.Markdown(
472
+ "Firecrawl key not set (will fall back to simple URL fetch)."
473
+ )
474
 
475
  preset_dropdown = gr.Dropdown(
476
  label="Presets",
477
  choices=list(PRESET_CONFIGS.keys()),
478
+ value=default_preset_name,
479
  )
480
 
481
  system_box = gr.Textbox(
482
  label="System Instructions",
483
  lines=8,
484
+ value=default_preset_cfg["system"],
485
  )
486
 
487
  gr.Markdown("### 📚 Knowledge Sources")
 
489
  urls_box = gr.Textbox(
490
  label="Knowledge URLs (one per line)",
491
  lines=4,
492
+ value=default_preset_cfg["urls"],
493
  placeholder="https://zenai.world\nhttps://us.zenai.biz",
494
  )
495
 
496
  raw_text_box = gr.Textbox(
497
  label="Additional Knowledge Text",
498
  lines=6,
499
+ value=default_preset_cfg["text"],
500
  placeholder="Paste any notes, docs, or reference text here...",
501
  )
502
 
 
507
  )
508
 
509
  grab_kb_btn = gr.Button(
510
+ "Grab / Retrieve Knowledge (Firecrawl + Lexical Index)",
511
  variant="secondary",
512
  )
513
  kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
 
524
  user_input = gr.Textbox(
525
  label="Ask a question",
526
  lines=3,
527
+ placeholder="Ask about zenai.world, AI Arena, or your uploaded docs...",
528
  )
529
 
530
  with gr.Row():
 
556
  outputs=[system_box, urls_box, raw_text_box],
557
  )
558
 
559
+ # Wiring: build knowledge base (Firecrawl + lexical index)
560
  grab_kb_btn.click(
561
  fn=build_knowledge_base,
562
  inputs=[api_key_state, firecrawl_key_state, urls_box, raw_text_box, files_input],