Spaces:

ZENLLC
/

RAGmod4

Sleeping

App Files Files Community

ZENLLC committed on Nov 11, 2025

Commit

2230519

verified ·

1 Parent(s): d223e34

Update app.py

Browse files

Files changed (1) hide show

app.py +202 -28

app.py CHANGED Viewed

@@ -6,11 +6,24 @@ import requests
 import gradio as gr
 from openai import OpenAI
 # -------------------- CONFIG --------------------
 CHAT_MODEL = "gpt-5"  # main chat model
-# Use a model your project actually has access to
-EMBED_MODEL = "text-embedding-3-small"
 DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
@@ -37,11 +50,11 @@ PRESET_CONFIGS = {
             "with youth, homeschool, and professional tracks and blockchain-verified credentials."
         ),
     },
-    "Policy Explainer (external PDFs / links)": {
         "system": DEFAULT_SYSTEM_PROMPT
         + "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
-        "urls": "",
-        "text": "This preset is for uploading AI policy PDFs, legal texts, and governance reports.",
     },
     "Research Notebook / Personal RAG Sandbox": {
         "system": DEFAULT_SYSTEM_PROMPT
@@ -84,11 +97,40 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
     return dot / (norm_a * norm_b)
 # -------------------- DATA SOURCE HELPERS --------------------
 def fetch_url_text(url: str) -> str:
-    """Fetch text from a URL in a lightweight way."""
     try:
         resp = requests.get(url, timeout=12)
         resp.raise_for_status()
@@ -119,6 +161,73 @@ def read_file_text(path: str) -> str:
         return f"[Error reading file {os.path.basename(path)}: {e}]"
 # -------------------- EMBEDDING / KB BUILD --------------------
@@ -131,6 +240,11 @@ def build_embeddings(
         return [], "⚠️ No documents to index."
     client = OpenAI(api_key=api_key)
     kb_chunks: List[Dict[str, Any]] = []
     total_chunks = 0
@@ -142,7 +256,7 @@ def build_embeddings(
         for idx, ch in enumerate(chunks):
             try:
                 emb_resp = client.embeddings.create(
-                    model=EMBED_MODEL,
                     input=ch,
                 )
                 emb = emb_resp.data[0].embedding
@@ -165,7 +279,10 @@ def build_embeddings(
                     }
                 )
-    status = f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks."
     return kb_chunks, status
@@ -181,9 +298,14 @@ def retrieve_context(
         return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
     client = OpenAI(api_key=api_key)
     try:
         q_emb_resp = client.embeddings.create(
-            model=EMBED_MODEL,
             input=query,
         )
         q_emb = q_emb_resp.data[0].embedding
@@ -216,7 +338,11 @@ def retrieve_context(
         )
     context = "\n\n---\n\n".join(context_parts)
-    debug = f"📚 Retrieved {len(top)} chunks from KB (top_k={top_k}, threshold={similarity_threshold})."
     return context, debug
@@ -232,6 +358,15 @@ def save_api_key(api_key: str):
     return status, api_key
 def apply_preset(preset_name: str):
     cfg = PRESET_CONFIGS.get(preset_name) or PRESET_CONFIGS["None (manual setup)"]
     return cfg["system"], cfg["urls"], cfg["text"]
@@ -239,21 +374,43 @@ def apply_preset(preset_name: str):
 def build_knowledge_base(
     api_key: str,
     urls_text: str,
     raw_text: str,
     file_paths: Optional[List[str]],
 ):
     api_key = (api_key or "").strip()
     if not api_key:
         return "❌ Please save your OpenAI API key first.", []
     docs: List[Dict[str, Any]] = []
     # URLs
     urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
     for u in urls:
-        txt = fetch_url_text(u)
-        docs.append({"source": u, "text": txt})
     # Raw text
     if raw_text and raw_text.strip():
@@ -353,19 +510,21 @@ def clear_chat():
 # -------------------- UI LAYOUT --------------------
-with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
     gr.Markdown(
         """
-# 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text
 1. Enter your **OpenAI API key** and click **Save**.
-2. Add knowledge via **URLs**, **uploaded files**, and/or **raw text**.
-3. Click **Build / Refresh Knowledge Base**.
-4. Ask questions — the bot will answer **only** from your knowledge and system instructions.
 """
     )
     api_key_state = gr.State("")
     kb_state = gr.State([])
     chat_state = gr.State([])
@@ -378,13 +537,21 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
                 placeholder="sk-...",
                 type="password",
             )
-            save_api_btn = gr.Button("Save API Key", variant="primary")
-            save_status = gr.Markdown("API key not set.")
             preset_dropdown = gr.Dropdown(
                 label="Presets",
                 choices=list(PRESET_CONFIGS.keys()),
-                value="None (manual setup)",
             )
             system_box = gr.Textbox(
@@ -398,7 +565,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
             urls_box = gr.Textbox(
                 label="Knowledge URLs (one per line)",
                 lines=4,
-                placeholder="https://example.com/docs\nhttps://zenai.world",
             )
             raw_text_box = gr.Textbox(
@@ -413,8 +580,8 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
                 type="filepath",
             )
-            build_kb_btn = gr.Button(
-                "Build / Refresh Knowledge Base",
                 variant="secondary",
             )
             kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
@@ -431,7 +598,7 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
             user_input = gr.Textbox(
                 label="Ask a question",
                 lines=3,
-                placeholder="Ask about the content of your URLs, files, or pasted text...",
             )
             with gr.Row():
@@ -442,13 +609,20 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
                 "ℹ️ Retrieval debug info will appear here after each answer."
             )
-    # Wiring: save API key
     save_api_btn.click(
         fn=save_api_key,
         inputs=[api_key_box],
         outputs=[save_status, api_key_state],
     )
     # Wiring: presets
     preset_dropdown.change(
         fn=apply_preset,
@@ -456,10 +630,10 @@ with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text") as demo:
         outputs=[system_box, urls_box, raw_text_box],
     )
-    # Wiring: build knowledge base
-    build_kb_btn.click(
         fn=build_knowledge_base,
-        inputs=[api_key_state, urls_box, raw_text_box, files_input],
         outputs=[kb_status_md, kb_state],
     )

 import gradio as gr
 from openai import OpenAI
+# Firecrawl SDK (used for crawling URLs into markdown)
+try:
+    from firecrawl import Firecrawl
+except ImportError:
+    Firecrawl = None  # we’ll handle this gracefully later
 # -------------------- CONFIG --------------------
 CHAT_MODEL = "gpt-5"  # main chat model
+# Candidate embedding models – we'll auto-select one your project has access to
+EMBED_MODEL_CANDIDATES = [
+    "text-embedding-3-small",
+    "text-embedding-3-large",
+    "text-embedding-ada-002",
+]
+SELECTED_EMBED_MODEL: Optional[str] = None  # set at runtime once discovered
 DEFAULT_SYSTEM_PROMPT = """You are a Retrieval-Augmented Generation (RAG) assistant.
             "with youth, homeschool, and professional tracks and blockchain-verified credentials."
         ),
     },
+    "AI Policy & Governance Starter": {
         "system": DEFAULT_SYSTEM_PROMPT
         + "\n\nYou act as a neutral policy explainer. Summarize clearly, highlight key risks, opportunities, and practical implications.",
+        "urls": "https://oecd.ai/en/ai-principles",
+        "text": "Use this preset for high-level AI policy, governance, and principles exploration.",
     },
     "Research Notebook / Personal RAG Sandbox": {
         "system": DEFAULT_SYSTEM_PROMPT
     return dot / (norm_a * norm_b)
+# -------------------- EMBEDDING MODEL SELECTION --------------------
+def pick_embedding_model(client: OpenAI) -> str:
+    """
+    Discover a usable embedding model for this project.
+
+    Tries each entry of EMBED_MODEL_CANDIDATES in order by issuing a tiny
+    embeddings request, and caches the first one that succeeds in the
+    module-level SELECTED_EMBED_MODEL. Once a model is cached, later calls
+    return it immediately without making any request.
+
+    NOTE(review): the cache is a single module-level value and is not keyed
+    by client / API key, so a model selected with one key is reused for every
+    later client. Confirm this is acceptable for multi-user sessions.
+
+    Returns:
+        The name of the selected embedding model.
+
+    Raises:
+        RuntimeError: if every candidate fails. The last underlying error is
+            attached as the exception's cause.
+    """
+    global SELECTED_EMBED_MODEL
+    if SELECTED_EMBED_MODEL:
+        return SELECTED_EMBED_MODEL
+    last_error: Optional[Exception] = None
+    for model_name in EMBED_MODEL_CANDIDATES:
+        try:
+            # cheap sanity call
+            client.embeddings.create(model=model_name, input="test")
+        except Exception as e:
+            # Any failure simply means "try the next candidate"; remember
+            # the error so the final message can explain what went wrong.
+            last_error = e
+        else:
+            SELECTED_EMBED_MODEL = model_name
+            return model_name
+    # Chain the last failure so the original traceback is not lost.
+    raise RuntimeError(
+        "No usable embedding model found for this project. "
+        f"Tried: {EMBED_MODEL_CANDIDATES}. Last error: {last_error}"
+    ) from last_error
 # -------------------- DATA SOURCE HELPERS --------------------
 def fetch_url_text(url: str) -> str:
+    """Fallback: fetch text from a URL via simple HTTP."""
     try:
         resp = requests.get(url, timeout=12)
         resp.raise_for_status()
         return f"[Error reading file {os.path.basename(path)}: {e}]"
+# -------------------- FIRECRAWL HELPERS --------------------
+def extract_markdown_from_firecrawl_result(result: Any) -> str:
+    """
+    Flatten a Firecrawl crawl(...) result into one markdown string.
+
+    The following shapes are handled, in any nesting:
+    - A Document-like object with a non-blank string ``.markdown`` attribute
+    - An object with a ``.data`` attribute holding Documents
+    - Dict structures with 'markdown' and/or 'data' keys
+    - A list/tuple of any of the above
+
+    Collected markdown pieces are joined with a blank line between them.
+    If no markdown is found at all, ``str(result)`` is returned as a last
+    resort.
+    """
+    texts: List[str] = []
+
+    def _collect(obj: Any) -> None:
+        if obj is None:
+            return
+        # Document-like object with attribute markdown
+        md = getattr(obj, "markdown", None)
+        if isinstance(md, str) and md.strip():
+            texts.append(md)
+            return
+        # Dict-shaped
+        if isinstance(obj, dict):
+            if isinstance(obj.get("markdown"), str):
+                texts.append(obj["markdown"])
+            data = obj.get("data")
+            if data is not None:
+                _collect(data)
+            return
+        # Iterable (list/tuple of docs)
+        if isinstance(obj, (list, tuple)):
+            for item in obj:
+                _collect(item)
+            return
+        # Wrapper object exposing its documents via a .data attribute.
+        # Without this branch such a result yields no markdown and the
+        # function falls back to the object's string representation.
+        _collect(getattr(obj, "data", None))
+
+    _collect(result)
+    if texts:
+        return "\n\n".join(texts)
+    # Fallback: string representation if nothing else worked
+    return str(result)
+def firecrawl_crawl_url(firecrawl_api_key: str, url: str) -> str:
+    """
+    Crawl ``url`` with Firecrawl and return the markdown of all pages, concatenated.
+
+    Problems are reported in-band rather than raised: a missing key, a missing
+    SDK, or any Exception raised while crawling produces a string that starts
+    with "[Firecrawl error", which the caller can detect to fall back.
+    """
+    key = (firecrawl_api_key or "").strip()
+    if key == "":
+        return "[Firecrawl error: no Firecrawl API key provided.]"
+    if Firecrawl is None:
+        return "[Firecrawl error: firecrawl-py is not installed. Add it to requirements.txt.]"
+    try:
+        # Whole-site crawl; the page limit is kept modest for speed
+        crawled = Firecrawl(api_key=key).crawl(url=url, limit=50)
+        return extract_markdown_from_firecrawl_result(crawled)
+    except Exception as err:
+        return f"[Firecrawl error for {url}: {err}]"
 # -------------------- EMBEDDING / KB BUILD --------------------
         return [], "⚠️ No documents to index."
     client = OpenAI(api_key=api_key)
+    try:
+        embed_model = pick_embedding_model(client)
+    except Exception as e:
+        return [], f"❌ Failed to select an embedding model: {e}"
     kb_chunks: List[Dict[str, Any]] = []
     total_chunks = 0
         for idx, ch in enumerate(chunks):
             try:
                 emb_resp = client.embeddings.create(
+                    model=embed_model,
                     input=ch,
                 )
                 emb = emb_resp.data[0].embedding
                     }
                 )
+    status = (
+        f"✅ Knowledge base built with {len(docs)} documents and {total_chunks} chunks. "
+        f"Embedding model: `{SELECTED_EMBED_MODEL}`"
+    )
     return kb_chunks, status
         return "", "ℹ️ No knowledge base yet. The model will answer from instructions only."
     client = OpenAI(api_key=api_key)
+    try:
+        embed_model = pick_embedding_model(client)
+    except Exception as e:
+        return "", f"❌ Failed to select an embedding model: {e}"
     try:
         q_emb_resp = client.embeddings.create(
+            model=embed_model,
             input=query,
         )
         q_emb = q_emb_resp.data[0].embedding
         )
     context = "\n\n---\n\n".join(context_parts)
+    debug = (
+        f"📚 Retrieved {len(top)} chunks from KB "
+        f"(top_k={top_k}, threshold={similarity_threshold}). "
+        f"Embedding model: `{SELECTED_EMBED_MODEL}`"
+    )
     return context, debug
     return status, api_key
+def save_firecrawl_key(fc_key: str):
+    """
+    Clean up a Firecrawl API key and build a status message for it.
+
+    Returns a ``(status, key)`` tuple. ``key`` is the whitespace-stripped
+    input, or "" when nothing usable was given. The status shows only a
+    masked form of the key: first 3 and last 4 characters when it is at
+    least 8 characters long, otherwise a fixed row of asterisks.
+    """
+    cleaned = (fc_key or "").strip()
+    if cleaned == "":
+        return "⚠️ No Firecrawl API key provided.", ""
+    if len(cleaned) < 8:
+        masked = "******"
+    else:
+        masked = cleaned[:3] + "..." + cleaned[-4:]
+    return f"✅ Firecrawl key saved for this session: `{masked}`", cleaned
 def apply_preset(preset_name: str):
     cfg = PRESET_CONFIGS.get(preset_name) or PRESET_CONFIGS["None (manual setup)"]
     return cfg["system"], cfg["urls"], cfg["text"]
 def build_knowledge_base(
     api_key: str,
+    firecrawl_api_key: str,
     urls_text: str,
     raw_text: str,
     file_paths: Optional[List[str]],
 ):
+    """
+    Build knowledge base using:
+    - Firecrawl for URLs (if Firecrawl key provided and SDK available)
+    - Fallback to simple HTTP fetch if Firecrawl not available
+    - Raw text
+    - Files
+    """
     api_key = (api_key or "").strip()
     if not api_key:
         return "❌ Please save your OpenAI API key first.", []
+    firecrawl_api_key = (firecrawl_api_key or "").strip()
     docs: List[Dict[str, Any]] = []
     # URLs
     urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
     for u in urls:
+        text_from_url = ""
+        if firecrawl_api_key:
+            # Try Firecrawl first
+            fc_text = firecrawl_crawl_url(firecrawl_api_key, u)
+            if not fc_text.startswith("[Firecrawl error"):
+                text_from_url = fc_text
+            else:
+                # Firecrawl failed; fallback to simple fetch
+                text_from_url = fetch_url_text(u)
+        else:
+            # No Firecrawl key → simple fetch
+            text_from_url = fetch_url_text(u)
+        docs.append({"source": u, "text": text_from_url})
     # Raw text
     if raw_text and raw_text.strip():
 # -------------------- UI LAYOUT --------------------
+with gr.Blocks(title="RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl") as demo:
     gr.Markdown(
         """
+# 🔍 RAG Chatbot — GPT-5 + URLs / Files / Text + Firecrawl
 1. Enter your **OpenAI API key** and click **Save**.
+2. (Optional) Enter your **Firecrawl API key** and save it.
+3. Add knowledge via **URLs** (e.g. `https://zenai.world`), **uploaded files**, and/or **raw text**.
+4. Click **Grab / Retrieve Knowledge (Firecrawl)** to crawl URLs + index everything.
+5. Ask questions — the bot will answer **only** from your knowledge and system instructions.
 """
     )
     api_key_state = gr.State("")
+    firecrawl_key_state = gr.State("")
     kb_state = gr.State([])
     chat_state = gr.State([])
                 placeholder="sk-...",
                 type="password",
             )
+            save_api_btn = gr.Button("Save OpenAI API Key", variant="primary")
+            save_status = gr.Markdown("OpenAI API key not set.")
+            firecrawl_key_box = gr.Textbox(
+                label="Firecrawl API Key (optional)",
+                placeholder="fc-...",
+                type="password",
+            )
+            save_firecrawl_btn = gr.Button("Save Firecrawl Key")
+            firecrawl_status = gr.Markdown("Firecrawl key not set (fallback to simple URL fetch).")
             preset_dropdown = gr.Dropdown(
                 label="Presets",
                 choices=list(PRESET_CONFIGS.keys()),
+                value="ZEN Sites Deep QA (zenai.world + AI Arena)",
             )
             system_box = gr.Textbox(
             urls_box = gr.Textbox(
                 label="Knowledge URLs (one per line)",
                 lines=4,
+                placeholder="https://zenai.world\nhttps://us.zenai.biz",
             )
             raw_text_box = gr.Textbox(
                 type="filepath",
             )
+            grab_kb_btn = gr.Button(
+                "Grab / Retrieve Knowledge (Firecrawl + Embeddings)",
                 variant="secondary",
             )
             kb_status_md = gr.Markdown("ℹ️ No knowledge base built yet.")
             user_input = gr.Textbox(
                 label="Ask a question",
                 lines=3,
+                placeholder="Ask about the content of zenai.world, AI Arena, or your uploaded docs...",
             )
             with gr.Row():
                 "ℹ️ Retrieval debug info will appear here after each answer."
             )
+    # Wiring: save OpenAI API key
     save_api_btn.click(
         fn=save_api_key,
         inputs=[api_key_box],
         outputs=[save_status, api_key_state],
     )
+    # Wiring: save Firecrawl API key
+    save_firecrawl_btn.click(
+        fn=save_firecrawl_key,
+        inputs=[firecrawl_key_box],
+        outputs=[firecrawl_status, firecrawl_key_state],
+    )
     # Wiring: presets
     preset_dropdown.change(
         fn=apply_preset,
         outputs=[system_box, urls_box, raw_text_box],
     )
+    # Wiring: build knowledge base (Firecrawl + embeddings)
+    grab_kb_btn.click(
         fn=build_knowledge_base,
+        inputs=[api_key_state, firecrawl_key_state, urls_box, raw_text_box, files_input],
         outputs=[kb_status_md, kb_state],
     )