Update app.py
Browse files
app.py
CHANGED
|
@@ -6,11 +6,24 @@ import requests
|
|
| 6 |
import gradio as gr
|
| 7 |
from openai import OpenAI
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
# -------------------- CONFIG --------------------
|
| 10 |
|
| 11 |
CHAT_MODEL = "gpt-5" # main chat model
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
|
| 16 |
|
|
@@ -37,11 +50,11 @@ PRESET_CONFIGS = {
|
|
| 37 |
"with youth, homeschool, and professional tracks and blockchain-verified credentials."
|
| 38 |
),
|
| 39 |
},
|
| 40 |
-
"Policy
|
| 41 |
"system": DEFAULT_SYSTEM_PROMPT
|
| 42 |
+ "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
|
| 43 |
-
"urls": "",
|
| 44 |
-
"text": "
|
| 45 |
},
|
| 46 |
"Research Notebook / Personal RAG Sandbox": {
|
| 47 |
"system": DEFAULT_SYSTEM_PROMPT
|
|
@@ -84,11 +97,40 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
|
|
| 84 |
return dot / (norm_a * norm_b)
|
| 85 |
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
# -------------------- DATA SOURCE HELPERS --------------------
|
| 88 |
|
| 89 |
|
| 90 |
def fetch_url_text(url: str) -> str:
|
| 91 |
-
"""
|
| 92 |
try:
|
| 93 |
resp = requests.get(url, timeout=12)
|
| 94 |
resp.raise_for_status()
|
|
@@ -119,6 +161,73 @@ def read_file_text(path: str) -> str:
|
|
| 119 |
return f"[Error reading file {os.path.basename(path)}: {e}]"
|
| 120 |
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
# -------------------- EMBEDDING / KB BUILD --------------------
|
| 123 |
|
| 124 |
|
|
@@ -131,6 +240,11 @@ def build_embeddings(
|
|
| 131 |
return [], "⚠️ No documents to index."
|
| 132 |
|
| 133 |
client = OpenAI(api_key=api_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
kb_chunks: List[Dict[str, Any]] = []
|
| 135 |
total_chunks = 0
|
| 136 |
|
|
@@ -142,7 +256,7 @@ def build_embeddings(
|
|
| 142 |
for idx, ch in enumerate(chunks):
|
| 143 |
try:
|
| 144 |
emb_resp = client.embeddings.create(
|
| 145 |
-
model=
|
| 146 |
input=ch,
|
| 147 |
)
|
| 148 |
emb = emb_resp.data[0].embedding
|
|
@@ -165,7 +279,10 @@ def build_embeddings(
|
|
| 165 |
}
|
| 166 |
)
|
| 167 |
|
| 168 |
-
status =
|
|
|
|
|
|
|
|
|
|
| 169 |
return kb_chunks, status
|
| 170 |
|
| 171 |
|
|
@@ -181,9 +298,14 @@ def retrieve_context(
|
|
| 181 |
return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
|
| 182 |
|
| 183 |
client = OpenAI(api_key=api_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
try:
|
| 185 |
q_emb_resp = client.embeddings.create(
|
| 186 |
-
model=
|
| 187 |
input=query,
|
| 188 |
)
|
| 189 |
q_emb = q_emb_resp.data[0].embedding
|
|
@@ -216,7 +338,11 @@ def retrieve_context(
|
|
| 216 |
)
|
| 217 |
|
| 218 |
context = "\n\n---\n\n".join(context_parts)
|
| 219 |
-
debug =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
return context, debug
|
| 221 |
|
| 222 |
|
|
@@ -232,6 +358,15 @@ def save_api_key(api_key: str):
|
|
| 232 |
return status, api_key
|
| 233 |
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
def apply_preset(preset_name: str):
|
| 236 |
cfg = PRESET_CONFIGS.get(preset_name) or PRESET_CONFIGS["None (manual setup)"]
|
| 237 |
return cfg["system"], cfg["urls"], cfg["text"]
|
|
@@ -239,21 +374,43 @@ def apply_preset(preset_name: str):
|
|
| 239 |
|
| 240 |
def build_knowledge_base(
|
| 241 |
api_key: str,
|
|
|
|
| 242 |
urls_text: str,
|
| 243 |
raw_text: str,
|
| 244 |
file_paths: Optional[List[str]],
|
| 245 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
api_key = (api_key or "").strip()
|
| 247 |
if not api_key:
|
| 248 |
return "❌ Please save your OpenAI API key first.", []
|
| 249 |
|
|
|
|
|
|
|
| 250 |
docs: List[Dict[str, Any]] = []
|
| 251 |
|
| 252 |
# URLs
|
| 253 |
urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
|
| 254 |
for u in urls:
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
# Raw text
|
| 259 |
if raw_text and raw_text.strip():
|
|
@@ -353,19 +510,21 @@ def clear_chat():
|
|
| 353 |
|
| 354 |
# -------------------- UI LAYOUT --------------------
|
| 355 |
|
| 356 |
-
with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
| 357 |
gr.Markdown(
|
| 358 |
"""
|
| 359 |
-
# 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text
|
| 360 |
|
| 361 |
1. Enter your **OpenAI API key** and click **Save**.
|
| 362 |
-
2.
|
| 363 |
-
3.
|
| 364 |
-
4.
|
|
|
|
| 365 |
"""
|
| 366 |
)
|
| 367 |
|
| 368 |
api_key_state = gr.State("")
|
|
|
|
| 369 |
kb_state = gr.State([])
|
| 370 |
chat_state = gr.State([])
|
| 371 |
|
|
@@ -378,13 +537,21 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
|
| 378 |
placeholder="sk-...",
|
| 379 |
type="password",
|
| 380 |
)
|
| 381 |
-
save_api_btn = gr.Button("Save API Key", variant="primary")
|
| 382 |
-
save_status = gr.Markdown("API key not set.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
preset_dropdown = gr.Dropdown(
|
| 385 |
label="Presets",
|
| 386 |
choices=list(PRESET_CONFIGS.keys()),
|
| 387 |
-
value="
|
| 388 |
)
|
| 389 |
|
| 390 |
system_box = gr.Textbox(
|
|
@@ -398,7 +565,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
|
| 398 |
urls_box = gr.Textbox(
|
| 399 |
label="Knowledge URLs (one per line)",
|
| 400 |
lines=4,
|
| 401 |
-
placeholder="https://
|
| 402 |
)
|
| 403 |
|
| 404 |
raw_text_box = gr.Textbox(
|
|
@@ -413,8 +580,8 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
|
| 413 |
type="filepath",
|
| 414 |
)
|
| 415 |
|
| 416 |
-
|
| 417 |
-
"
|
| 418 |
variant="secondary",
|
| 419 |
)
|
| 420 |
kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
|
|
@@ -431,7 +598,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
|
| 431 |
user_input = gr.Textbox(
|
| 432 |
label="Ask a question",
|
| 433 |
lines=3,
|
| 434 |
-
placeholder="Ask about the content of
|
| 435 |
)
|
| 436 |
|
| 437 |
with gr.Row():
|
|
@@ -442,13 +609,20 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
|
| 442 |
"ℹ️ Retrieval debug info will appear here after each answer."
|
| 443 |
)
|
| 444 |
|
| 445 |
-
# Wiring: save API key
|
| 446 |
save_api_btn.click(
|
| 447 |
fn=save_api_key,
|
| 448 |
inputs=[api_key_box],
|
| 449 |
outputs=[save_status, api_key_state],
|
| 450 |
)
|
| 451 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
# Wiring: presets
|
| 453 |
preset_dropdown.change(
|
| 454 |
fn=apply_preset,
|
|
@@ -456,10 +630,10 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
|
|
| 456 |
outputs=[system_box, urls_box, raw_text_box],
|
| 457 |
)
|
| 458 |
|
| 459 |
-
# Wiring: build knowledge base
|
| 460 |
-
|
| 461 |
fn=build_knowledge_base,
|
| 462 |
-
inputs=[api_key_state, urls_box, raw_text_box, files_input],
|
| 463 |
outputs=[kb_status_md, kb_state],
|
| 464 |
)
|
| 465 |
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from openai import OpenAI
|
| 8 |
|
| 9 |
+
# Firecrawl SDK (used for crawling URLs into markdown)
|
| 10 |
+
try:
|
| 11 |
+
from firecrawl import Firecrawl
|
| 12 |
+
except ImportError:
|
| 13 |
+
Firecrawl = None # we’ll handle this gracefully later
|
| 14 |
+
|
| 15 |
# -------------------- CONFIG --------------------
|
| 16 |
|
| 17 |
CHAT_MODEL = "gpt-5" # main chat model
|
| 18 |
+
|
| 19 |
+
# Candidate embedding models – we'll auto-select one your project has access to
|
| 20 |
+
EMBED_MODEL_CANDIDATES = [
|
| 21 |
+
"text-embedding-3-small",
|
| 22 |
+
"text-embedding-3-large",
|
| 23 |
+
"text-embedding-ada-002",
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
SELECTED_EMBED_MODEL: Optional[str] = None # set at runtime once discovered
|
| 27 |
|
| 28 |
DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
|
| 29 |
|
|
|
|
| 50 |
"with youth, homeschool, and professional tracks and blockchain-verified credentials."
|
| 51 |
),
|
| 52 |
},
|
| 53 |
+
"AI Policy & Governance Starter": {
|
| 54 |
"system": DEFAULT_SYSTEM_PROMPT
|
| 55 |
+ "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
|
| 56 |
+
"urls": "https://oecd.ai/en/ai-principles",
|
| 57 |
+
"text": "Use this preset for high-level AI policy, governance, and principles exploration.",
|
| 58 |
},
|
| 59 |
"Research Notebook / Personal RAG Sandbox": {
|
| 60 |
"system": DEFAULT_SYSTEM_PROMPT
|
|
|
|
| 97 |
return dot / (norm_a * norm_b)
|
| 98 |
|
| 99 |
|
| 100 |
+
# -------------------- EMBEDDING MODEL SELECTION --------------------
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def pick_embedding_model(client: OpenAI) -> str:
    """
    Discover an embedding model this OpenAI project can actually use.

    Candidates from EMBED_MODEL_CANDIDATES are probed in order with a tiny
    embeddings request; the first that succeeds is cached in the module-global
    SELECTED_EMBED_MODEL so subsequent calls skip the probe entirely.

    Args:
        client: An initialized OpenAI client.

    Returns:
        The name of the first usable embedding model.

    Raises:
        RuntimeError: when none of the candidates is usable; the last probe
            failure is chained as the cause for easier debugging.
    """
    global SELECTED_EMBED_MODEL
    # Cached from a previous call — avoid re-probing (each probe is a
    # billable API request).
    if SELECTED_EMBED_MODEL:
        return SELECTED_EMBED_MODEL

    last_error: Optional[Exception] = None
    for model_name in EMBED_MODEL_CANDIDATES:
        try:
            # Cheap sanity call: a one-word embedding request. Any failure
            # (no access, model retired, bad key scope) moves on to the
            # next candidate.
            client.embeddings.create(model=model_name, input="test")
            SELECTED_EMBED_MODEL = model_name
            return model_name
        except Exception as e:
            last_error = e

    # Chain the final probe error so the traceback shows *why* access failed.
    raise RuntimeError(
        "No usable embedding model found for this project. "
        f"Tried: {EMBED_MODEL_CANDIDATES}. Last error: {last_error}"
    ) from last_error
|
| 127 |
+
|
| 128 |
+
|
| 129 |
# -------------------- DATA SOURCE HELPERS --------------------
|
| 130 |
|
| 131 |
|
| 132 |
def fetch_url_text(url: str) -> str:
|
| 133 |
+
"""Fallback: fetch text from a URL via simple HTTP."""
|
| 134 |
try:
|
| 135 |
resp = requests.get(url, timeout=12)
|
| 136 |
resp.raise_for_status()
|
|
|
|
| 161 |
return f"[Error reading file {os.path.basename(path)}: {e}]"
|
| 162 |
|
| 163 |
|
| 164 |
+
# -------------------- FIRECRAWL HELPERS --------------------
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def extract_markdown_from_firecrawl_result(result: Any) -> str:
    """
    Normalize the various shapes Firecrawl's crawl(...) may return into one
    markdown string.

    Recognized shapes:
      * a list/tuple of Document-like objects exposing a ``.markdown`` attribute
      * an object or dict carrying a ``data`` collection of such documents
      * plain dicts with a ``'markdown'`` key

    Every markdown fragment found is joined with blank lines; when nothing is
    recognized, ``str(result)`` is returned as a last resort.
    """
    collected: List[str] = []

    def _walk(node: Any) -> None:
        if node is None:
            return

        # Attribute-style document: accept a non-empty .markdown string
        # and stop descending into this node.
        attr_md = getattr(node, "markdown", None)
        if isinstance(attr_md, str) and attr_md.strip():
            collected.append(attr_md)
            return

        if isinstance(node, dict):
            # NOTE: the dict path accepts a 'markdown' value even when it is
            # an empty string — mirrors the original behavior exactly.
            md_val = node.get("markdown")
            if isinstance(md_val, str):
                collected.append(md_val)
            nested = node.get("data")
            if nested is not None:
                _walk(nested)
            return

        if isinstance(node, (list, tuple)):
            for element in node:
                _walk(element)

    _walk(result)
    return "\n\n".join(collected) if collected else str(result)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def firecrawl_crawl_url(firecrawl_api_key: str, url: str) -> str:
    """
    Crawl *url* with Firecrawl and return the concatenated markdown of all
    crawled pages.

    On any problem (missing key, SDK not installed, crawl failure) an error
    marker string starting with ``[Firecrawl error`` is returned so the caller
    can detect it and fall back to a plain HTTP fetch.
    """
    cleaned_key = (firecrawl_api_key or "").strip()
    if not cleaned_key:
        return "[Firecrawl error: no Firecrawl API key provided.]"

    # SDK import may have failed at module load time (see top-of-file try).
    if Firecrawl is None:
        return "[Firecrawl error: firecrawl-py is not installed. Add it to requirements.txt.]"

    try:
        crawler = Firecrawl(api_key=cleaned_key)
        # Crawl the whole site, capped at a modest page count for speed.
        crawl_result = crawler.crawl(url=url, limit=50)
        return extract_markdown_from_firecrawl_result(crawl_result)
    except Exception as e:
        return f"[Firecrawl error for {url}: {e}]"
|
| 229 |
+
|
| 230 |
+
|
| 231 |
# -------------------- EMBEDDING / KB BUILD --------------------
|
| 232 |
|
| 233 |
|
|
|
|
| 240 |
return [], "⚠️ No documents to index."
|
| 241 |
|
| 242 |
client = OpenAI(api_key=api_key)
|
| 243 |
+
try:
|
| 244 |
+
embed_model = pick_embedding_model(client)
|
| 245 |
+
except Exception as e:
|
| 246 |
+
return [], f"❌ Failed to select an embedding model: {e}"
|
| 247 |
+
|
| 248 |
kb_chunks: List[Dict[str, Any]] = []
|
| 249 |
total_chunks = 0
|
| 250 |
|
|
|
|
| 256 |
for idx, ch in enumerate(chunks):
|
| 257 |
try:
|
| 258 |
emb_resp = client.embeddings.create(
|
| 259 |
+
model=embed_model,
|
| 260 |
input=ch,
|
| 261 |
)
|
| 262 |
emb = emb_resp.data[0].embedding
|
|
|
|
| 279 |
}
|
| 280 |
)
|
| 281 |
|
| 282 |
+
status = (
|
| 283 |
+
f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks. "
|
| 284 |
+
f"Embedding model: `{SELECTED_EMBED_MODEL}`"
|
| 285 |
+
)
|
| 286 |
return kb_chunks, status
|
| 287 |
|
| 288 |
|
|
|
|
| 298 |
return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
|
| 299 |
|
| 300 |
client = OpenAI(api_key=api_key)
|
| 301 |
+
try:
|
| 302 |
+
embed_model = pick_embedding_model(client)
|
| 303 |
+
except Exception as e:
|
| 304 |
+
return "", f"❌ Failed to select an embedding model: {e}"
|
| 305 |
+
|
| 306 |
try:
|
| 307 |
q_emb_resp = client.embeddings.create(
|
| 308 |
+
model=embed_model,
|
| 309 |
input=query,
|
| 310 |
)
|
| 311 |
q_emb = q_emb_resp.data[0].embedding
|
|
|
|
| 338 |
)
|
| 339 |
|
| 340 |
context = "\n\n---\n\n".join(context_parts)
|
| 341 |
+
debug = (
|
| 342 |
+
f"📚 Retrieved {len(top)} chunks from KB "
|
| 343 |
+
f"(top_k={top_k}, threshold={similarity_threshold}). "
|
| 344 |
+
f"Embedding model: `{SELECTED_EMBED_MODEL}`"
|
| 345 |
+
)
|
| 346 |
return context, debug
|
| 347 |
|
| 348 |
|
|
|
|
| 358 |
return status, api_key
|
| 359 |
|
| 360 |
|
| 361 |
+
def save_firecrawl_key(fc_key: str):
    """
    Validate and store a Firecrawl API key for the current session.

    Returns a ``(status_markdown, key)`` tuple: the stripped key is echoed
    back (for keeping in gr.State) while the status message shows only a
    masked form so the secret never appears verbatim in the UI.
    """
    cleaned = (fc_key or "").strip()
    if not cleaned:
        return "⚠️ No Firecrawl API key provided.", ""

    # Short keys are fully masked; longer ones keep a recognizable
    # prefix/suffix so the user can tell which key was saved.
    if len(cleaned) >= 8:
        masked = f"{cleaned[:3]}...{cleaned[-4:]}"
    else:
        masked = "******"
    return f"✅ Firecrawl key saved for this session: `{masked}`", cleaned
|
| 368 |
+
|
| 369 |
+
|
| 370 |
def apply_preset(preset_name: str):
    """
    Resolve *preset_name* to its (system prompt, urls, seed text) values.

    Unknown names — or presets whose config is falsy — fall back to the
    "None (manual setup)" entry so the UI always receives usable values.
    """
    chosen = PRESET_CONFIGS.get(preset_name) or PRESET_CONFIGS["None (manual setup)"]
    return chosen["system"], chosen["urls"], chosen["text"]
|
|
|
|
| 374 |
|
| 375 |
def build_knowledge_base(
|
| 376 |
api_key: str,
|
| 377 |
+
firecrawl_api_key: str,
|
| 378 |
urls_text: str,
|
| 379 |
raw_text: str,
|
| 380 |
file_paths: Optional[List[str]],
|
| 381 |
):
|
| 382 |
+
"""
|
| 383 |
+
Build knowledge base using:
|
| 384 |
+
- Firecrawl for URLs (if Firecrawl key provided and SDK available)
|
| 385 |
+
- Fallback to simple HTTP fetch if Firecrawl not available
|
| 386 |
+
- Raw text
|
| 387 |
+
- Files
|
| 388 |
+
"""
|
| 389 |
api_key = (api_key or "").strip()
|
| 390 |
if not api_key:
|
| 391 |
return "❌ Please save your OpenAI API key first.", []
|
| 392 |
|
| 393 |
+
firecrawl_api_key = (firecrawl_api_key or "").strip()
|
| 394 |
+
|
| 395 |
docs: List[Dict[str, Any]] = []
|
| 396 |
|
| 397 |
# URLs
|
| 398 |
urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
|
| 399 |
for u in urls:
|
| 400 |
+
text_from_url = ""
|
| 401 |
+
if firecrawl_api_key:
|
| 402 |
+
# Try Firecrawl first
|
| 403 |
+
fc_text = firecrawl_crawl_url(firecrawl_api_key, u)
|
| 404 |
+
if not fc_text.startswith("[Firecrawl error"):
|
| 405 |
+
text_from_url = fc_text
|
| 406 |
+
else:
|
| 407 |
+
# Firecrawl failed; fallback to simple fetch
|
| 408 |
+
text_from_url = fetch_url_text(u)
|
| 409 |
+
else:
|
| 410 |
+
# No Firecrawl key → simple fetch
|
| 411 |
+
text_from_url = fetch_url_text(u)
|
| 412 |
+
|
| 413 |
+
docs.append({"source": u, "text": text_from_url})
|
| 414 |
|
| 415 |
# Raw text
|
| 416 |
if raw_text and raw_text.strip():
|
|
|
|
| 510 |
|
| 511 |
# -------------------- UI LAYOUT --------------------
|
| 512 |
|
| 513 |
+
with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl") as demo:
|
| 514 |
gr.Markdown(
|
| 515 |
"""
|
| 516 |
+
# 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl
|
| 517 |
|
| 518 |
1. Enter your **OpenAI API key** and click **Save**.
|
| 519 |
+
2. (Optional) Enter your **Firecrawl API key** and save it.
|
| 520 |
+
3. Add knowledge via **URLs** (e.g. `https://zenai.world`), **uploaded files**, and/or **raw text**.
|
| 521 |
+
4. Click **Grab / Retrieve Knowledge (Firecrawl)** to crawl URLs + index everything.
|
| 522 |
+
5. Ask questions — the bot will answer **only** from your knowledge and system instructions.
|
| 523 |
"""
|
| 524 |
)
|
| 525 |
|
| 526 |
api_key_state = gr.State("")
|
| 527 |
+
firecrawl_key_state = gr.State("")
|
| 528 |
kb_state = gr.State([])
|
| 529 |
chat_state = gr.State([])
|
| 530 |
|
|
|
|
| 537 |
placeholder="sk-...",
|
| 538 |
type="password",
|
| 539 |
)
|
| 540 |
+
save_api_btn = gr.Button("Save OpenAI API Key", variant="primary")
|
| 541 |
+
save_status = gr.Markdown("OpenAI API key not set.")
|
| 542 |
+
|
| 543 |
+
firecrawl_key_box = gr.Textbox(
|
| 544 |
+
label="Firecrawl API Key (optional)",
|
| 545 |
+
placeholder="fc-...",
|
| 546 |
+
type="password",
|
| 547 |
+
)
|
| 548 |
+
save_firecrawl_btn = gr.Button("Save Firecrawl Key")
|
| 549 |
+
firecrawl_status = gr.Markdown("Firecrawl key not set (fallback to simple URL fetch).")
|
| 550 |
|
| 551 |
preset_dropdown = gr.Dropdown(
|
| 552 |
label="Presets",
|
| 553 |
choices=list(PRESET_CONFIGS.keys()),
|
| 554 |
+
value="ZEN Sites Deep QA (zenai.world + AI Arena)",
|
| 555 |
)
|
| 556 |
|
| 557 |
system_box = gr.Textbox(
|
|
|
|
| 565 |
urls_box = gr.Textbox(
|
| 566 |
label="Knowledge URLs (one per line)",
|
| 567 |
lines=4,
|
| 568 |
+
placeholder="https://zenai.world\nhttps://us.zenai.biz",
|
| 569 |
)
|
| 570 |
|
| 571 |
raw_text_box = gr.Textbox(
|
|
|
|
| 580 |
type="filepath",
|
| 581 |
)
|
| 582 |
|
| 583 |
+
grab_kb_btn = gr.Button(
|
| 584 |
+
"Grab / Retrieve Knowledge (Firecrawl + Embeddings)",
|
| 585 |
variant="secondary",
|
| 586 |
)
|
| 587 |
kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
|
|
|
|
| 598 |
user_input = gr.Textbox(
|
| 599 |
label="Ask a question",
|
| 600 |
lines=3,
|
| 601 |
+
placeholder="Ask about the content of zenai.world, AI Arena, or your uploaded docs...",
|
| 602 |
)
|
| 603 |
|
| 604 |
with gr.Row():
|
|
|
|
| 609 |
"ℹ️ Retrieval debug info will appear here after each answer."
|
| 610 |
)
|
| 611 |
|
| 612 |
+
# Wiring: save OpenAI API key
|
| 613 |
save_api_btn.click(
|
| 614 |
fn=save_api_key,
|
| 615 |
inputs=[api_key_box],
|
| 616 |
outputs=[save_status, api_key_state],
|
| 617 |
)
|
| 618 |
|
| 619 |
+
# Wiring: save Firecrawl API key
|
| 620 |
+
save_firecrawl_btn.click(
|
| 621 |
+
fn=save_firecrawl_key,
|
| 622 |
+
inputs=[firecrawl_key_box],
|
| 623 |
+
outputs=[firecrawl_status, firecrawl_key_state],
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
# Wiring: presets
|
| 627 |
preset_dropdown.change(
|
| 628 |
fn=apply_preset,
|
|
|
|
| 630 |
outputs=[system_box, urls_box, raw_text_box],
|
| 631 |
)
|
| 632 |
|
| 633 |
+
# Wiring: build knowledge base (Firecrawl + embeddings)
|
| 634 |
+
grab_kb_btn.click(
|
| 635 |
fn=build_knowledge_base,
|
| 636 |
+
inputs=[api_key_state, firecrawl_key_state, urls_box, raw_text_box, files_input],
|
| 637 |
outputs=[kb_status_md, kb_state],
|
| 638 |
)
|
| 639 |
|