"""Gradio RAG chatbot: GPT-5 answers grounded in scraped URLs, uploaded files and pasted text.

Knowledge sources are chunked and indexed lexically (token overlap, no
embeddings); URLs are scraped with Firecrawl when a key is provided and
fetched over plain HTTP otherwise.
"""

import html
import os
import re
from typing import List, Dict, Any, Tuple, Optional

import requests
import gradio as gr
from openai import OpenAI

# Firecrawl SDK (used for scraping URLs into markdown)
try:
    from firecrawl import Firecrawl
except ImportError:
    Firecrawl = None  # handled gracefully below

# -------------------- CONFIG --------------------

# Model name passed as `model=` to client.responses.create() in chat_with_rag.
CHAT_MODEL = "gpt-5"  # main chat model

# Base grounding rules. chat_with_rag always places these first in the system
# message, and every preset below starts its own "system" text with them.
DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.

Rules:
- Answer ONLY using the provided knowledge base context and system instructions.
- If the answer is not clearly supported by the context, say "I don’t know based on the current knowledge base."
- Do not invent sources, statistics, or facts that are not present in the context.
- When applicable, cite which source you used (e.g., "According to the uploaded file" or "Based on zenai.world").
- Be clear, concise, and structured.
"""

# Presets selectable from the UI dropdown. Each entry maps a display name to:
#   "system": text loaded into the System Instructions box
#   "urls":   newline-separated URLs loaded into the Knowledge URLs box
#   "text":   text loaded into the Additional Knowledge Text box
# The "None (manual setup)" key is also the fallback used by apply_preset.
PRESET_CONFIGS = {
    "None (manual setup)": {
        "system": DEFAULT_SYSTEM_PROMPT,
        "urls": "",
        "text": "",
    },
    "ZEN Sites Deep QA (zenai.world + AI Arena)": {
        "system": DEFAULT_SYSTEM_PROMPT
        + "\n\nYou specialize in answering questions about ZEN AI’s mission, programs, AI Pioneer, and ZEN AI Arena.",
        "urls": "https://zenai.world\nhttps://us.zenai.biz",
        "text": (
            "ZEN AI is building the first global AI × Web3 literacy and automation movement, "
            "with youth, homeschool, and professional tracks and blockchain-verified credentials."
        ),
    },
    "AI Policy & Governance Starter": {
        "system": DEFAULT_SYSTEM_PROMPT
        + "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
        "urls": "https://oecd.ai/en/ai-principles",
        "text": "Use this preset for high-level AI policy, governance, and principles exploration.",
    },
    "Research Notebook / Personal RAG Sandbox": {
        "system": DEFAULT_SYSTEM_PROMPT
        + "\n\nYou help the user explore, connect, and synthesize insights from their personal notes and documents.",
        "urls": "",
        "text": "Use this as a sandbox for notebooks, transcripts, and long-form notes.",
    },
}

# -------------------- TEXT HELPERS --------------------


def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping character windows.

    The text is stripped first; each chunk holds at most ``max_chars``
    characters and consecutive chunks share ``overlap`` characters.

    Args:
        text: Text to split. ``None`` or blank text yields ``[]``.
        max_chars: Maximum chunk length; must be positive.
        overlap: Characters shared between neighbouring chunks. Values outside
            ``[0, max_chars - 1]`` are clamped into that range so every window
            advances by at least one character.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive (for non-empty text).
    """
    text = (text or "").strip()
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    # An overlap >= max_chars would leave the window start unchanged and loop
    # forever; a negative overlap would silently skip text between chunks.
    overlap = min(max(overlap, 0), max_chars - 1)
    step = max_chars - overlap

    chunks: List[str] = []
    length = len(text)
    start = 0
    while True:
        end = start + max_chars
        chunks.append(text[start:end])  # slicing clamps `end` to the text length
        if end >= length:
            break
        start += step
    return chunks


def tokenize(text: str) -> List[str]:
    """Lowercase *text* and return its alphanumeric runs as tokens.

    Every non-alphanumeric character acts as a separator, so punctuation and
    underscores never appear inside a token.
    """
    normalized = "".join(
        char if char.isalnum() else " " for char in text.lower()
    )
    # str.split() without arguments never yields empty strings.
    return normalized.split()


# -------------------- DATA SOURCE HELPERS --------------------


# Patterns used by _html_to_text; compiled once at import time.
_NON_CONTENT_BLOCK_RE = re.compile(
    r"<(script|style|noscript)\b[^>]*>.*?</\1\s*>",
    re.IGNORECASE | re.DOTALL,
)
_HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
_HTML_TAG_RE = re.compile(r"<[^>]+>")


def _html_to_text(markup: str) -> str:
    """Reduce an HTML document to its visible text (best effort, stdlib only)."""
    stripped = _HTML_COMMENT_RE.sub(" ", markup)
    # Drop script/style/noscript elements together with their contents.
    stripped = _NON_CONTENT_BLOCK_RE.sub(" ", stripped)
    # Drop the remaining tags but keep the text between them.
    stripped = _HTML_TAG_RE.sub(" ", stripped)
    stripped = html.unescape(stripped)
    # Collapse runs of whitespace and discard lines that became empty.
    lines = (" ".join(line.split()) for line in stripped.splitlines())
    return "\n".join(line for line in lines if line)


def fetch_url_text(url: str) -> str:
    """Fallback: fetch a URL over plain HTTP and return its visible text.

    Script, style and noscript elements are removed together with their
    contents, the remaining tags are stripped, and HTML entities are decoded.
    The page is no longer cut off at the first ``<script``/``<style``, which
    usually sits in ``<head>`` and used to discard the whole body.

    Args:
        url: Address to download (12 second timeout).

    Returns:
        The extracted text, or ``"[Error fetching <url>: <reason>]"`` when the
        request or the extraction fails; this function never raises.
    """
    try:
        resp = requests.get(url, timeout=12)
        resp.raise_for_status()
        return _html_to_text(resp.text)
    except Exception as e:  # boundary: callers expect an error string, not a raise
        return f"[Error fetching {url}: {e}]"


def read_file_text(path: str) -> str:
    """Return the contents of a supported text file, or a bracketed notice.

    Supported extensions (case-insensitive) are .txt, .md, .csv and .json.
    An empty path yields ``""``; unsupported types and read failures yield a
    bracketed message naming the file instead of raising.
    """
    if not path:
        return ""

    lowered = path.lower()
    supported = (".txt", ".md", ".csv", ".json")
    try:
        if not lowered.endswith(supported):
            return f"[Unsupported file type for RAG content: {os.path.basename(path)}]"
        # Undecodable bytes are dropped rather than failing the whole read.
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            contents = handle.read()
        return contents
    except Exception as exc:
        return f"[Error reading file {os.path.basename(path)}: {exc}]"


# -------------------- FIRECRAWL HELPERS --------------------


def extract_markdown_from_firecrawl_result(result: Any) -> str:
    """
    Gather every markdown string found in a Firecrawl scrape result.

    The result may be an object exposing a ``markdown`` attribute, a dict with
    a "markdown" key and/or nested "data", or a list/tuple of such values.
    Fragments are joined with blank lines in discovery order; if nothing is
    found, ``str(result)`` is returned instead.
    """

    def _walk(node: Any):
        # Yield markdown fragments depth-first, in document order.
        if node is None:
            return

        attr_md = getattr(node, "markdown", None)
        if isinstance(attr_md, str) and attr_md.strip():
            # Document-like object: its own markdown is all we take from it.
            yield attr_md
        elif isinstance(node, dict):
            own = node.get("markdown")
            if isinstance(own, str):
                yield own
            nested = node.get("data")
            if nested is not None:
                yield from _walk(nested)
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _walk(child)

    found: List[str] = list(_walk(result))
    return "\n\n".join(found) if found else str(result)


def firecrawl_scrape_url(firecrawl_api_key: str, url: str) -> str:
    """
    Scrape one URL with Firecrawl and return its markdown.

    Only a single page is scraped (no crawl). Every failure mode, whether a
    missing key, a missing SDK, or an exception raised while scraping, is
    reported as a string starting with "[Firecrawl error" rather than raised.
    """
    key = (firecrawl_api_key or "").strip()
    if key == "":
        return "[Firecrawl error: no Firecrawl API key provided.]"

    # The module-level import falls back to None when the SDK is absent.
    if Firecrawl is None:
        return "[Firecrawl error: firecrawl-py is not installed. Add it to requirements.txt.]"

    try:
        scraped = Firecrawl(api_key=key).scrape(url, formats=["markdown"])
        return extract_markdown_from_firecrawl_result(scraped)
    except Exception as exc:
        return f"[Firecrawl error for {url}: {exc}]"


# -------------------- LOCAL KB BUILD (NO OPENAI EMBEDDINGS) --------------------


def build_local_kb(docs: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], str]:
    """
    Turn documents into a lexical knowledge base (no OpenAI embeddings).

    Each document's "text" is split into 2000-character chunks with a
    200-character overlap. Every chunk becomes one entry of the form
    {id, source, text, tokens}, where id is "<source>_<chunk index>".
    Returns the entries together with a status line for the UI.
    """
    entries: List[Dict[str, Any]] = []

    for doc in docs:
        origin = doc.get("source", "unknown")
        body = doc.get("text", "")
        pieces = chunk_text(body, max_chars=2000, overlap=200)
        entries.extend(
            {
                "id": f"{origin}_{position}",
                "source": origin,
                "text": piece,
                "tokens": tokenize(piece),
            }
            for position, piece in enumerate(pieces)
        )

    status = f"✅ Knowledge base built with {len(docs)} documents and {len(entries)} chunks (lexical retrieval)."
    return entries, status


def retrieve_context_local(
    kb: List[Dict[str, Any]],
    query: str,
    top_k: int = 5,
) -> Tuple[str, str]:
    """
    Pick the ``top_k`` KB chunks that share the most distinct tokens with the query.

    Returns ``(context, debug)``: ``context`` is the selected chunks formatted
    with their source labels (empty when nothing matched) and ``debug`` is a
    short status line describing what happened.
    """
    if not kb:
        return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."

    query_terms = set(tokenize(query))
    if not query_terms:
        return "", "ℹ️ Query has no meaningful tokens; answering from instructions only."

    # Score each chunk by the number of distinct query tokens it contains.
    ranked: List[Tuple[int, Dict[str, Any]]] = []
    for entry in kb:
        shared = len(query_terms.intersection(entry.get("tokens") or []))
        if shared > 0:
            ranked.append((shared, entry))

    if not ranked:
        return "", "ℹ️ No lexical overlap with knowledge base; answering from instructions only."

    # Stable sort: chunks with equal scores keep their KB order.
    ordered = sorted(ranked, key=lambda pair: pair[0], reverse=True)
    best = [entry for _, entry in ordered[:top_k]]

    sections = [
        f"[Chunk {rank} | Source: {entry.get('source', 'unknown')}]\n{entry.get('text', '')}\n"
        for rank, entry in enumerate(best, start=1)
    ]

    debug = f"📚 Retrieved {len(best)} chunks from KB via lexical retrieval (no embeddings)."
    return "\n\n---\n\n".join(sections), debug


# -------------------- GRADIO CALLBACKS --------------------


def save_api_key(api_key: str):
    """Validate the OpenAI key and return ``(status_markdown, key_for_state)``.

    The status shows only a masked preview (first and last four characters);
    keys shorter than eight characters are masked completely.
    """
    key = (api_key or "").strip()
    if not key:
        return "❌ No API key provided.", ""

    if len(key) < 8:
        preview = "******"
    else:
        preview = f"{key[:4]}...{key[-4:]}"
    return f"✅ OpenAI key saved for this session: `{preview}`", key


def save_firecrawl_key(fc_key: str):
    """Validate the Firecrawl key and return ``(status_markdown, key_for_state)``.

    The status shows only a masked preview (first three and last four
    characters); keys shorter than eight characters are masked completely.
    """
    key = (fc_key or "").strip()
    if not key:
        return "⚠️ No Firecrawl API key provided.", ""

    if len(key) < 8:
        preview = "******"
    else:
        preview = f"{key[:3]}...{key[-4:]}"
    return f"✅ Firecrawl key saved for this session: `{preview}`", key


def apply_preset(preset_name: str):
    """Return ``(system, urls, text)`` for the chosen preset.

    Unknown preset names fall back to the "None (manual setup)" entry.
    """
    selected = PRESET_CONFIGS.get(preset_name)
    if not selected:
        selected = PRESET_CONFIGS["None (manual setup)"]
    return tuple(selected[field] for field in ("system", "urls", "text"))


def build_knowledge_base(
    api_key: str,
    firecrawl_api_key: str,
    urls_text: str,
    raw_text: str,
    file_paths: Optional[List[str]],
):
    """
    Collect knowledge sources and index them lexically.

    Sources:
    - URLs (one per line): Firecrawl single-page scrape when a Firecrawl key is
      given, falling back to a plain HTTP fetch if Firecrawl fails or returns
      nothing; plain HTTP fetch when no key is given.
    - Raw text pasted by the user.
    - Uploaded files.

    Sources that could not be loaded (fetch/read errors, unsupported file
    types, empty results) are NOT indexed: their bracketed error placeholders
    would otherwise be retrieved as if they were knowledge. They are listed in
    the returned status instead.

    Args:
        api_key: Saved OpenAI key. Indexing itself does not call OpenAI, but a
            saved key is required before a knowledge base is built.
        firecrawl_api_key: Optional Firecrawl key.
        urls_text: Newline-separated URLs.
        raw_text: Free-form knowledge text.
        file_paths: Paths of uploaded files, or None.

    Returns:
        ``(status_markdown, kb)`` where ``kb`` is the list of indexed chunks
        (empty on failure).
    """
    api_key = (api_key or "").strip()
    if not api_key:
        return "❌ Please save your OpenAI API key first.", []

    firecrawl_api_key = (firecrawl_api_key or "").strip()

    # Prefixes of the placeholder strings produced by fetch_url_text and
    # read_file_text when a source cannot be loaded.
    failure_markers = (
        "[Error fetching",
        "[Unsupported file type",
        "[Error reading file",
    )

    docs: List[Dict[str, Any]] = []
    skipped: List[str] = []

    def _add_loaded(source: str, text: str) -> None:
        # Index real content only; remember failed or empty sources for the status.
        if not text or not text.strip() or text.startswith(failure_markers):
            skipped.append(source)
        else:
            docs.append({"source": source, "text": text})

    # URLs
    urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
    for u in urls:
        text_from_url = ""
        if firecrawl_api_key:
            # Try Firecrawl first (single-page scrape)
            text_from_url = firecrawl_scrape_url(firecrawl_api_key, u)
        if not text_from_url.strip() or text_from_url.startswith("[Firecrawl error"):
            # No Firecrawl key, Firecrawl failed, or it returned nothing
            text_from_url = fetch_url_text(u)
        _add_loaded(u, text_from_url)

    # Raw text (user-supplied, so it is indexed as-is)
    if raw_text and raw_text.strip():
        docs.append({"source": "Raw Text Block", "text": raw_text})

    # Files
    for p in file_paths or []:
        if not p:
            continue
        _add_loaded(f"File: {os.path.basename(p)}", read_file_text(p))

    if not docs and not skipped:
        return "⚠️ No knowledge sources provided (URLs, text, or files).", []

    if not docs:
        return (
            "❌ None of the provided sources could be loaded: " + ", ".join(skipped),
            [],
        )

    kb, status = build_local_kb(docs)
    if skipped:
        status += (
            f"\n\n⚠️ Skipped {len(skipped)} source(s) that could not be loaded: "
            + ", ".join(skipped)
        )
    return status, kb


def extract_text_from_response(resp: Any) -> str:
    """
    Extract plain text from a Responses API result.

    Expected structure:
        resp.output (or resp.data) -> one item or a list of output items
        item.content               -> one part or a list of content parts
        part.text / part["text"]   -> the text of that part

    Items without content are skipped. Parts whose text is empty or
    whitespace-only are skipped as well (they used to be stringified into the
    answer). Parts with no text field at all are stringified as a last resort.

    Returns:
        The text parts joined with newlines and stripped; ``""`` for ``None``
        or for a response with an empty output list; ``str(resp)`` when the
        object exposes neither ``output`` nor ``data``.
    """
    if resp is None:
        return ""

    output = getattr(resp, "output", None)
    if not output:
        # Missing or empty output: use `data` when the object provides one.
        data = getattr(resp, "data", None)
        if data is not None:
            output = data
    if output is None:
        # Unknown shape: fall back to stringifying.
        return str(resp)

    if not isinstance(output, (list, tuple)):
        output = [output]

    texts: List[str] = []
    for item in output:
        content = getattr(item, "content", None)
        if content is None and isinstance(item, dict):
            content = item.get("content")
        if content is None:
            continue

        if not isinstance(content, (list, tuple)):
            content = [content]

        for part in content:
            # A part may be an object with .text or a dict with a "text" key.
            txt = getattr(part, "text", None)
            if txt is None and isinstance(part, dict):
                txt = part.get("text")

            if isinstance(txt, str):
                if txt.strip():
                    texts.append(txt)
                continue  # blank text part: contributes nothing

            # No usable text field: stringify as a last resort.
            texts.append(str(part))

    return "\n".join(texts).strip()


def chat_with_rag(
    user_message: str,
    api_key: str,
    kb: List[Dict[str, Any]],
    system_prompt: str,
    history_pairs: List[List[str]],
):
    """
    Answer one user message with lexical retrieval plus the Responses API.

    Args:
        user_message: The question typed by the user.
        api_key: Saved OpenAI API key.
        kb: Knowledge base entries produced by the KB builder (may be empty).
        system_prompt: Contents of the System Instructions box.
        history_pairs: Prior turns as [user_str, assistant_str] pairs; ``None``
            is treated as an empty history.

    Returns:
        ``(chatbot_history, state_history, debug_markdown)``. On validation
        failure the history is returned unchanged with an error message. API
        failures are reported as the assistant's answer instead of raising.
    """
    user_message = (user_message or "").strip()
    api_key = (api_key or "").strip()
    system_prompt = (system_prompt or "").strip()
    history_pairs = history_pairs or []  # tolerate a missing history

    if not user_message:
        return history_pairs, history_pairs, "❌ Please enter a question."

    if not api_key:
        return history_pairs, history_pairs, "❌ Please save your OpenAI API key first."

    # Retrieve context from KB (local lexical retrieval)
    context, debug_retrieval = retrieve_context_local(kb, user_message)

    # Build the system message. Preset instructions begin with the default
    # rules, so strip that prefix instead of sending the rules twice.
    base_prompt = DEFAULT_SYSTEM_PROMPT.strip()
    extra_instructions = system_prompt
    if extra_instructions.startswith(base_prompt):
        extra_instructions = extra_instructions[len(base_prompt):].strip()

    combined_system = base_prompt
    if extra_instructions:
        combined_system += (
            "\n\n---\n\nUser System Instructions:\n" + extra_instructions
        )

    input_messages: List[Dict[str, Any]] = [
        {
            "role": "system",
            "content": [{"type": "input_text", "text": combined_system}],
        }
    ]

    if context:
        context_block = (
            "You have access to the following knowledge base context.\n"
            "You MUST base your answer ONLY on this context and the system instructions.\n"
            "If the answer is not supported by the context, say you don’t know.\n\n"
            f"{context}"
        )
        input_messages.append(
            {
                "role": "system",
                "content": [{"type": "input_text", "text": context_block}],
            }
        )

    # Replay the last few turns; skip empty/None sides so no null text is sent.
    for past_user, past_assistant in history_pairs[-5:]:
        if past_user:
            input_messages.append(
                {
                    "role": "user",
                    "content": [{"type": "input_text", "text": past_user}],
                }
            )
        if past_assistant:
            input_messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": past_assistant}],
                }
            )

    # Current user message
    input_messages.append(
        {
            "role": "user",
            "content": [{"type": "input_text", "text": user_message}],
        }
    )

    # Call the model via the Responses API. Client construction is inside the
    # try so that configuration errors also surface as a chat message.
    try:
        client = OpenAI(api_key=api_key)
        resp = client.responses.create(
            model=CHAT_MODEL,
            input=input_messages,
            # no temperature, no token params -> avoid unsupported parameter errors
        )
        answer = extract_text_from_response(resp)
        if not answer.strip():
            answer = "⚠️ Model returned an empty response object. This may be an API issue."
    except Exception as e:
        answer = f"⚠️ OpenAI API error: {e}"

    # Update UI history as list of [user, assistant] pairs (new list, no mutation)
    new_history = history_pairs + [[user_message, answer]]

    return new_history, new_history, debug_retrieval


def clear_chat():
    """Reset the chat: empty chatbot display, empty history state, and a notice."""
    empty_chatbot, empty_history = [], []
    return empty_chatbot, empty_history, "Chat cleared."


# -------------------- UI LAYOUT --------------------

# Build the Gradio app: a left column for keys, presets and knowledge sources,
# a right column for the chat, followed by the event wiring for every control.
with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl") as demo:
    gr.Markdown(
        """
# 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl

1. Enter your **OpenAI API key** and click **Save**.  
2. (Optional) Enter your **Firecrawl API key** and save it.  
3. Choose a preset (e.g. **ZEN Sites Deep QA**) — this auto-loads URLs like `https://zenai.world`.  
4. Click **Grab / Retrieve Knowledge (Firecrawl + Lexical Index)** to scrape URLs + index everything.  
5. Ask questions — the bot will answer **only** from your knowledge and system instructions.
"""
    )

    # State holders passed between callbacks: the saved keys, the built
    # knowledge base, and the chat history.
    api_key_state = gr.State("")
    firecrawl_key_state = gr.State("")
    kb_state = gr.State([])
    chat_state = gr.State([])  # list of [user, assistant] pairs

    # default preset on load -> ZEN
    default_preset_name = "ZEN Sites Deep QA (zenai.world + AI Arena)"
    default_preset_cfg = PRESET_CONFIGS[default_preset_name]

    with gr.Row():
        # Left column: API keys, preset selection, system prompt, knowledge sources.
        with gr.Column(scale=1):
            gr.Markdown("### 🔑 API & System")

            api_key_box = gr.Textbox(
                label="OpenAI API Key",
                placeholder="sk-...",
                type="password",
            )
            save_api_btn = gr.Button("Save OpenAI API Key", variant="primary")
            save_status = gr.Markdown("OpenAI API key not set.")

            firecrawl_key_box = gr.Textbox(
                label="Firecrawl API Key (optional)",
                placeholder="fc-...",
                type="password",
            )
            save_firecrawl_btn = gr.Button("Save Firecrawl Key")
            firecrawl_status = gr.Markdown(
                "Firecrawl key not set (will fall back to simple URL fetch)."
            )

            preset_dropdown = gr.Dropdown(
                label="Presets",
                choices=list(PRESET_CONFIGS.keys()),
                value=default_preset_name,
            )

            # The three boxes below start with the default preset's values and
            # are overwritten by apply_preset when the dropdown changes.
            system_box = gr.Textbox(
                label="System Instructions",
                lines=8,
                value=default_preset_cfg["system"],
            )

            gr.Markdown("### 📚 Knowledge Sources")

            urls_box = gr.Textbox(
                label="Knowledge URLs (one per line)",
                lines=4,
                value=default_preset_cfg["urls"],
                placeholder="https://zenai.world\nhttps://us.zenai.biz",
            )

            raw_text_box = gr.Textbox(
                label="Additional Knowledge Text",
                lines=6,
                value=default_preset_cfg["text"],
                placeholder="Paste any notes, docs, or reference text here...",
            )

            # type="filepath": build_knowledge_base receives file paths, which
            # it hands to read_file_text.
            files_input = gr.File(
                label="Upload Knowledge Files (.txt, .md, .csv, .json)",
                file_count="multiple",
                type="filepath",
            )

            grab_kb_btn = gr.Button(
                "Grab / Retrieve Knowledge (Firecrawl + Lexical Index)",
                variant="secondary",
            )
            kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")

        # Right column: chat display, question box, send/clear buttons, debug line.
        with gr.Column(scale=2):
            gr.Markdown("### 💬 RAG Chat")

            # Classic Chatbot format: list of [user, assistant] pairs
            chatbot = gr.Chatbot(
                label="RAG Chatbot (GPT-5)",
                height=450,
            )

            user_input = gr.Textbox(
                label="Ask a question",
                lines=3,
                placeholder="Ask about zenai.world, AI Arena, or your uploaded docs...",
            )

            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

            debug_md = gr.Markdown(
                "ℹ️ Retrieval debug info will appear here after each answer."
            )

    # Wiring: save OpenAI API key
    save_api_btn.click(
        fn=save_api_key,
        inputs=[api_key_box],
        outputs=[save_status, api_key_state],
    )

    # Wiring: save Firecrawl API key
    save_firecrawl_btn.click(
        fn=save_firecrawl_key,
        inputs=[firecrawl_key_box],
        outputs=[firecrawl_status, firecrawl_key_state],
    )

    # Wiring: presets
    preset_dropdown.change(
        fn=apply_preset,
        inputs=[preset_dropdown],
        outputs=[system_box, urls_box, raw_text_box],
    )

    # Wiring: build knowledge base (Firecrawl + lexical index)
    # Reads the *saved* keys from state, not the key textboxes.
    grab_kb_btn.click(
        fn=build_knowledge_base,
        inputs=[api_key_state, firecrawl_key_state, urls_box, raw_text_box, files_input],
        outputs=[kb_status_md, kb_state],
    )

    # Wiring: chat send (button)
    # NOTE(review): user_input is not among the outputs, so the question box
    # is not cleared after sending. Confirm whether that is intended.
    send_btn.click(
        fn=chat_with_rag,
        inputs=[user_input, api_key_state, kb_state, system_box, chat_state],
        outputs=[chatbot, chat_state, debug_md],
    )

    # Wiring: chat send (Enter key); same callback, inputs and outputs as the Send button
    user_input.submit(
        fn=chat_with_rag,
        inputs=[user_input, api_key_state, kb_state, system_box, chat_state],
        outputs=[chatbot, chat_state, debug_md],
    )

    # Wiring: clear chat
    clear_btn.click(
        fn=clear_chat,
        inputs=[],
        outputs=[chatbot, chat_state, debug_md],
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()