Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
-
import os, io, json, zipfile,
|
| 2 |
-
from typing import List, Dict, Any, Optional
|
| 3 |
import gradio as gr
|
| 4 |
-
from pydantic import BaseModel
|
| 5 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 6 |
|
| 7 |
-
# --- Optional .env support
|
| 8 |
try:
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
load_dotenv()
|
|
@@ -24,13 +24,43 @@ try:
|
|
| 24 |
except Exception:
|
| 25 |
anthropic = None
|
| 26 |
|
| 27 |
-
# Firecrawl
|
| 28 |
-
|
| 29 |
-
# Example usage shows Firecrawl(api_key).search(..., scrape_options={formats: [...]})
|
| 30 |
-
from firecrawl import Firecrawl # type: ignore
|
| 31 |
|
| 32 |
# --------------------------
|
| 33 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# --------------------------
|
| 35 |
class Keys(BaseModel):
|
| 36 |
openai: Optional[str] = None
|
|
@@ -38,11 +68,10 @@ class Keys(BaseModel):
|
|
| 38 |
firecrawl: Optional[str] = None
|
| 39 |
|
| 40 |
def resolve_keys(session: Keys) -> Keys:
|
| 41 |
-
"""Priority: UI session input > environment variables > None."""
|
| 42 |
return Keys(
|
| 43 |
-
openai
|
| 44 |
-
anthropic
|
| 45 |
-
firecrawl
|
| 46 |
)
|
| 47 |
|
| 48 |
# --------------------------
|
|
@@ -51,7 +80,7 @@ def resolve_keys(session: Keys) -> Keys:
|
|
| 51 |
def fc_client(session: Keys) -> Firecrawl:
|
| 52 |
keys = resolve_keys(session)
|
| 53 |
if not keys.firecrawl:
|
| 54 |
-
raise gr.Error("Missing FIRECRAWL_API_KEY. Enter it in Keys
|
| 55 |
return Firecrawl(api_key=keys.firecrawl)
|
| 56 |
|
| 57 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
|
|
@@ -61,9 +90,9 @@ def fc_search(session: Keys, query: str, limit: int = 5, scrape_formats: Optiona
|
|
| 61 |
if location:
|
| 62 |
kwargs["location"] = location
|
| 63 |
if scrape_formats:
|
| 64 |
-
# per docs: search(..., scrape_options={"formats": [...]})
|
| 65 |
kwargs["scrape_options"] = {"formats": scrape_formats}
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
|
| 69 |
def fc_scrape(session: Keys, url: str, formats: Optional[List[str]] = None, timeout_ms: Optional[int] = None, mobile: bool = False) -> Dict[str, Any]:
|
|
@@ -75,7 +104,8 @@ def fc_scrape(session: Keys, url: str, formats: Optional[List[str]] = None, time
|
|
| 75 |
kwargs["timeout"] = timeout_ms
|
| 76 |
if mobile:
|
| 77 |
kwargs["mobile"] = True
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
|
| 81 |
def fc_crawl(session: Keys, url: str, max_pages: int = 25, formats: Optional[List[str]] = None) -> Dict[str, Any]:
|
|
@@ -83,11 +113,18 @@ def fc_crawl(session: Keys, url: str, max_pages: int = 25, formats: Optional[Lis
|
|
| 83 |
kwargs: Dict[str, Any] = {"url": url, "limit": max_pages}
|
| 84 |
if formats:
|
| 85 |
kwargs["scrape_options"] = {"formats": formats}
|
| 86 |
-
|
|
|
|
| 87 |
|
| 88 |
# --------------------------
|
| 89 |
-
# LLM helpers
|
| 90 |
# --------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def use_openai(session: Keys):
|
| 92 |
keys = resolve_keys(session)
|
| 93 |
if not keys.openai:
|
|
@@ -104,36 +141,34 @@ def use_anthropic(session: Keys):
|
|
| 104 |
raise gr.Error("Anthropic SDK not installed.")
|
| 105 |
return anthropic.Anthropic(api_key=keys.anthropic)
|
| 106 |
|
| 107 |
-
SYSTEM_STEER = (
|
| 108 |
-
"You are ZEN's VibeCoder: extract web insights, generate clean scaffolds, "
|
| 109 |
-
"and produce production-ready artifacts. Prefer structured outlines, code blocks, and checklists. "
|
| 110 |
-
"When asked to clone or refactor, output file trees and exact text."
|
| 111 |
-
)
|
| 112 |
-
|
| 113 |
def llm_summarize(session: Keys, provider: str, model_name: str, prompt: str, context_md: str, temp: float=0.4) -> str:
|
| 114 |
if provider == "openai":
|
| 115 |
client = use_openai(session)
|
| 116 |
resp = client.chat.completions.create(
|
| 117 |
-
model=model_name
|
| 118 |
temperature=temp,
|
| 119 |
messages=[
|
| 120 |
{"role": "system", "content": SYSTEM_STEER},
|
| 121 |
-
{"role": "user", "content": f"{prompt}\n\n=== SOURCE (markdown) ===\n{context_md[:150000]}"},
|
| 122 |
],
|
| 123 |
)
|
| 124 |
-
return resp.choices[0].message.content or ""
|
| 125 |
else:
|
| 126 |
client = use_anthropic(session)
|
| 127 |
resp = client.messages.create(
|
| 128 |
-
model=model_name
|
| 129 |
max_tokens=4000,
|
| 130 |
temperature=temp,
|
| 131 |
system=SYSTEM_STEER,
|
| 132 |
-
messages=[
|
| 133 |
-
{"role": "user", "content": f"{prompt}\n\n=== SOURCE (markdown) ===\n{context_md[:150000]}"},
|
| 134 |
-
],
|
| 135 |
)
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# --------------------------
|
| 139 |
# ZIP export
|
|
@@ -144,18 +179,15 @@ def pack_zip(pages: List[Dict[str, Any]]) -> bytes:
|
|
| 144 |
manifest = []
|
| 145 |
for i, p in enumerate(pages, start=1):
|
| 146 |
url = p.get("url") or p.get("metadata", {}).get("sourceURL") or f"page_{i}"
|
| 147 |
-
slug = hashlib.sha1(url.encode("utf-8")).hexdigest()[:10]
|
| 148 |
md = p.get("markdown") or p.get("data", {}).get("markdown") or p.get("content") or ""
|
| 149 |
html = p.get("html") or p.get("data", {}).get("html") or ""
|
| 150 |
links = p.get("links") or p.get("data", {}).get("links") or []
|
| 151 |
-
# write markdown/html if present
|
| 152 |
if md:
|
| 153 |
zf.writestr(f"{i:03d}_{slug}.md", md)
|
| 154 |
if html:
|
| 155 |
zf.writestr(f"{i:03d}_{slug}.html", html)
|
| 156 |
-
|
| 157 |
-
record = {"url": url, "title": p.get("title") or p.get("metadata", {}).get("title"), "links": links}
|
| 158 |
-
manifest.append(record)
|
| 159 |
zf.writestr("manifest.json", json.dumps(manifest, indent=2))
|
| 160 |
mem.seek(0)
|
| 161 |
return mem.read()
|
|
@@ -164,36 +196,41 @@ def pack_zip(pages: List[Dict[str, Any]]) -> bytes:
|
|
| 164 |
# Gradio actions
|
| 165 |
# --------------------------
|
| 166 |
def save_keys(openai_key, anthropic_key, firecrawl_key):
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
|
| 172 |
def action_search(session: Keys, query: str, limit: int, scrape_content: bool, location: str):
|
| 173 |
if not query.strip():
|
| 174 |
raise gr.Error("Enter a search query.")
|
| 175 |
formats = ["markdown", "links"] if scrape_content else None
|
| 176 |
-
result = fc_search(session, query=query.strip(), limit=limit, scrape_formats=formats, location=location or None)
|
| 177 |
-
#
|
| 178 |
-
data = result.get("data")
|
| 179 |
-
items = []
|
| 180 |
if isinstance(data, dict):
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
for it in data.get(bucket, []):
|
| 184 |
-
items.append(it)
|
| 185 |
elif isinstance(data, list):
|
| 186 |
-
items = data
|
|
|
|
|
|
|
| 187 |
return json.dumps(items, indent=2)
|
| 188 |
|
| 189 |
def action_scrape(session: Keys, url: str, mobile: bool, formats_sel: List[str], timeout_ms: int):
|
| 190 |
if not url.strip():
|
| 191 |
raise gr.Error("Enter a URL.")
|
| 192 |
formats = formats_sel or ["markdown", "html", "links"]
|
| 193 |
-
out = fc_scrape(session, url.strip(), formats=formats, timeout_ms=timeout_ms or None, mobile=mobile)
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
return pretty, md
|
| 198 |
|
| 199 |
def action_crawl(session: Keys, base_url: str, max_pages: int, formats_sel: List[str]):
|
|
@@ -201,9 +238,8 @@ def action_crawl(session: Keys, base_url: str, max_pages: int, formats_sel: List
|
|
| 201 |
raise gr.Error("Enter a base URL to crawl.")
|
| 202 |
formats = formats_sel or ["markdown", "links"]
|
| 203 |
out = fc_crawl(session, base_url.strip(), max_pages=max_pages, formats=formats)
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
if not isinstance(pages, list) or len(pages) == 0:
|
| 207 |
raise gr.Error("Crawl returned no pages.")
|
| 208 |
zip_bytes = pack_zip(pages)
|
| 209 |
return gr.File.update(value=io.BytesIO(zip_bytes), visible=True, filename="site_clone.zip"), f"Crawled {len(pages)} pages. ZIP is ready."
|
|
@@ -211,7 +247,7 @@ def action_crawl(session: Keys, base_url: str, max_pages: int, formats_sel: List
|
|
| 211 |
def action_generate(session: Keys, provider: str, model_name: str, sys_prompt: str, user_prompt: str, context_md: str, temp: float):
|
| 212 |
if not user_prompt.strip():
|
| 213 |
raise gr.Error("Enter a prompt or click a starter tile.")
|
| 214 |
-
model = model_name.strip() or ("gpt-5" if provider == "openai" else "claude-3-5-sonnet-latest")
|
| 215 |
steer = (sys_prompt or "").strip()
|
| 216 |
prompt = (("SYSTEM:\n" + steer + "\n\n") if steer else "") + user_prompt.strip()
|
| 217 |
out = llm_summarize(session, provider, model, prompt, context_md or "", temp=temp)
|
|
@@ -220,11 +256,9 @@ def action_generate(session: Keys, provider: str, model_name: str, sys_prompt: s
|
|
| 220 |
# --------------------------
|
| 221 |
# UI (Blocks)
|
| 222 |
# --------------------------
|
| 223 |
-
with gr.Blocks(css=""
|
| 224 |
-
#keys .wrap.svelte-1ipelgc { filter: none !important; }
|
| 225 |
-
""") as demo:
|
| 226 |
gr.Markdown("## ZEN VibeCoder — Web Clone & Research Foundry")
|
| 227 |
-
session_state = gr.State(Keys())
|
| 228 |
|
| 229 |
with gr.Accordion("🔐 Keys (session)", open=True):
|
| 230 |
with gr.Row():
|
|
@@ -236,7 +270,6 @@ with gr.Blocks(css="""
|
|
| 236 |
save_btn.click(save_keys, [openai_key, anthropic_key, firecrawl_key], [session_state, save_msg])
|
| 237 |
|
| 238 |
with gr.Tabs():
|
| 239 |
-
# --- TAB: Search ---
|
| 240 |
with gr.Tab("🔎 Search"):
|
| 241 |
query = gr.Textbox(label="Query", placeholder='ex: "best open-source vector databases in 2025 site:docs"')
|
| 242 |
with gr.Row():
|
|
@@ -247,7 +280,6 @@ with gr.Blocks(css="""
|
|
| 247 |
search_json = gr.Code(label="Results JSON", language="json")
|
| 248 |
go_search.click(action_search, [session_state, query, limit, scrape_content, location], [search_json])
|
| 249 |
|
| 250 |
-
# --- TAB: Scrape / Crawl / Clone ---
|
| 251 |
with gr.Tab("🕸️ Scrape • Crawl • Clone"):
|
| 252 |
with gr.Row():
|
| 253 |
target_url = gr.Textbox(label="URL to Scrape", placeholder="https://example.com")
|
|
@@ -271,7 +303,6 @@ with gr.Blocks(css="""
|
|
| 271 |
crawl_status = gr.Markdown()
|
| 272 |
run_crawl.click(action_crawl, [session_state, base_url, max_pages, formats_crawl], [zip_file, crawl_status])
|
| 273 |
|
| 274 |
-
# --- TAB: Vibe Code (LLM Synthesis) ---
|
| 275 |
with gr.Tab("✨ Vibe Code (Synthesis)"):
|
| 276 |
with gr.Row():
|
| 277 |
provider = gr.Radio(choices=["openai","anthropic"], value="openai", label="Provider")
|
|
@@ -287,7 +318,6 @@ with gr.Blocks(css="""
|
|
| 287 |
gen_btn = gr.Button("Generate", variant="primary")
|
| 288 |
out_md = gr.Markdown()
|
| 289 |
|
| 290 |
-
# Starter Tiles
|
| 291 |
gr.Markdown("**Starter Tiles**")
|
| 292 |
with gr.Row():
|
| 293 |
t1 = gr.Button("🔧 Clone Docs ➜ Clean Markdown ➜ README")
|
|
@@ -297,14 +327,14 @@ with gr.Blocks(css="""
|
|
| 297 |
t5 = gr.Button("📊 Dataset Outline ➜ Schema + Fields + ETL")
|
| 298 |
|
| 299 |
def fill_tile(tile: str):
|
| 300 |
-
|
| 301 |
-
"t1":
|
| 302 |
-
"t2":
|
| 303 |
-
"t3":
|
| 304 |
-
"t4":
|
| 305 |
-
"t5":
|
| 306 |
}
|
| 307 |
-
return
|
| 308 |
|
| 309 |
t1.click(lambda: fill_tile("t1"), outputs=[user_prompt])
|
| 310 |
t2.click(lambda: fill_tile("t2"), outputs=[user_prompt])
|
|
@@ -314,11 +344,8 @@ with gr.Blocks(css="""
|
|
| 314 |
|
| 315 |
gen_btn.click(action_generate, [session_state, provider, model_name, sys_prompt, user_prompt, ctx_md, temp], [out_md])
|
| 316 |
|
| 317 |
-
gr.Markdown(
|
| 318 |
-
"Built for **ZEN Arena** pipelines. Export ZIPs → ingest → credentialize achievements via ZEN Cards.\n"
|
| 319 |
-
"Docs used for Firecrawl behavior: search/scrape/crawl endpoints."
|
| 320 |
-
)
|
| 321 |
|
| 322 |
if __name__ == "__main__":
|
| 323 |
-
#
|
| 324 |
demo.launch()
|
|
|
|
| 1 |
+
import os, io, json, zipfile, hashlib
|
| 2 |
+
from typing import List, Dict, Any, Optional, Union
|
| 3 |
import gradio as gr
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 6 |
|
| 7 |
+
# --- Optional .env support ---
|
| 8 |
try:
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
load_dotenv()
|
|
|
|
| 24 |
except Exception:
|
| 25 |
anthropic = None
|
| 26 |
|
| 27 |
+
# Firecrawl SDK
|
| 28 |
+
from firecrawl import Firecrawl # v2.x
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# --------------------------
|
| 31 |
+
# Utilities
|
| 32 |
+
# --------------------------
|
| 33 |
+
def _to_dict(obj: Any) -> Any:
    """Recursively flatten SDK response objects into plain Python data.

    Conversion rules, checked in this order:

    * Pydantic models are dumped with ``model_dump()`` and returned as-is.
    * Dicts are rebuilt with every value converted (keys are kept).
    * Lists and tuples become lists of converted items.
    * Any other non-string object exposing ``__dict__`` becomes a dict of
      its converted instance attributes.

    Anything else, or an object whose attributes cannot be converted, is
    returned unchanged.
    """
    if isinstance(obj, BaseModel):
        return obj.model_dump()

    if isinstance(obj, dict):
        return dict((key, _to_dict(value)) for key, value in obj.items())

    if isinstance(obj, (list, tuple)):
        return list(map(_to_dict, obj))

    # str/bytes subclasses may carry a __dict__ but must stay scalar values.
    if hasattr(obj, "__dict__") and not isinstance(obj, (str, bytes)):
        try:
            attributes = vars(obj)
            return {name: _to_dict(value) for name, value in attributes.items()}
        except Exception:
            # Best effort: fall through and hand the object back untouched.
            pass

    return obj
|
| 54 |
+
|
| 55 |
+
def _pretty_json(data: Any, limit: int = 300_000) -> str:
    """Render ``data`` as indented JSON text, cut to at most ``limit`` characters.

    ``data`` is first normalised with ``_to_dict``. The cut is a plain string
    slice, so output longer than ``limit`` is no longer valid JSON. Any
    failure is reported as a ``"<!> Could not serialize to JSON: ..."``
    string instead of being raised.
    """
    try:
        plain = _to_dict(data)
        return json.dumps(plain, indent=2)[:limit]
    except Exception as err:
        return f"<!> Could not serialize to JSON: {err}"
|
| 61 |
+
|
| 62 |
+
# --------------------------
|
| 63 |
+
# Session keys
|
| 64 |
# --------------------------
|
| 65 |
class Keys(BaseModel):
|
| 66 |
openai: Optional[str] = None
|
|
|
|
| 68 |
firecrawl: Optional[str] = None
|
| 69 |
|
| 70 |
def resolve_keys(session: Keys) -> Keys:
    """Return the effective API keys for this session.

    For each provider the value stored on ``session`` wins when it is truthy;
    otherwise the matching environment variable is used, which yields
    ``None`` when that variable is unset.
    """
    env_var_for = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "firecrawl": "FIRECRAWL_API_KEY",
    }
    resolved = {
        field: getattr(session, field) or os.getenv(env_var)
        for field, env_var in env_var_for.items()
    }
    return Keys(**resolved)
|
| 76 |
|
| 77 |
# --------------------------
|
|
|
|
| 80 |
def fc_client(session: Keys) -> Firecrawl:
    """Build a Firecrawl client from the resolved session/environment key.

    Raises:
        gr.Error: when no Firecrawl API key can be resolved.
    """
    api_key = resolve_keys(session).firecrawl
    if api_key:
        return Firecrawl(api_key=api_key)
    raise gr.Error("Missing FIRECRAWL_API_KEY. Enter it in Keys → Save.")
|
| 85 |
|
| 86 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
|
|
|
|
| 90 |
if location:
|
| 91 |
kwargs["location"] = location
|
| 92 |
if scrape_formats:
|
|
|
|
| 93 |
kwargs["scrape_options"] = {"formats": scrape_formats}
|
| 94 |
+
result = fc.search(**kwargs) # returns a Pydantic model
|
| 95 |
+
return _to_dict(result)
|
| 96 |
|
| 97 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
|
| 98 |
def fc_scrape(session: Keys, url: str, formats: Optional[List[str]] = None, timeout_ms: Optional[int] = None, mobile: bool = False) -> Dict[str, Any]:
|
|
|
|
| 104 |
kwargs["timeout"] = timeout_ms
|
| 105 |
if mobile:
|
| 106 |
kwargs["mobile"] = True
|
| 107 |
+
result = fc.scrape(**kwargs) # Pydantic model
|
| 108 |
+
return _to_dict(result)
|
| 109 |
|
| 110 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
|
| 111 |
def fc_crawl(session: Keys, url: str, max_pages: int = 25, formats: Optional[List[str]] = None) -> Dict[str, Any]:
|
|
|
|
| 113 |
kwargs: Dict[str, Any] = {"url": url, "limit": max_pages}
|
| 114 |
if formats:
|
| 115 |
kwargs["scrape_options"] = {"formats": formats}
|
| 116 |
+
result = fc.crawl(**kwargs) # Pydantic model
|
| 117 |
+
return _to_dict(result)
|
| 118 |
|
| 119 |
# --------------------------
|
| 120 |
+
# LLM helpers
|
| 121 |
# --------------------------
|
| 122 |
+
# System prompt sent with every LLM request, whichever provider is used.
SYSTEM_STEER = " ".join(
    (
        "You are ZEN's VibeCoder: extract web insights, generate clean scaffolds,",
        "and produce production-ready artifacts. Prefer structured outlines, code blocks, and checklists.",
        "When asked to clone or refactor, output file trees and exact text.",
    )
)
|
| 127 |
+
|
| 128 |
def use_openai(session: Keys):
|
| 129 |
keys = resolve_keys(session)
|
| 130 |
if not keys.openai:
|
|
|
|
| 141 |
raise gr.Error("Anthropic SDK not installed.")
|
| 142 |
return anthropic.Anthropic(api_key=keys.anthropic)
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
def llm_summarize(session: Keys, provider: str, model_name: str, prompt: str, context_md: str, temp: float = 0.4) -> str:
    """Send ``prompt`` plus source markdown to the chosen LLM and return its text.

    Args:
        session: Session keys used to build the provider client.
        provider: ``"openai"`` selects the OpenAI chat-completions API; any
            other value selects Anthropic.
        model_name: Model id; an empty value falls back to ``"gpt-5"`` or
            ``"claude-3-5-sonnet-latest"`` depending on the provider.
        prompt: Instruction text placed ahead of the source material.
        context_md: Source markdown; only its first 150000 characters are sent.
        temp: Sampling temperature forwarded to the provider.

    Returns:
        The model's reply with surrounding whitespace stripped (``""`` when
        the provider returns no text).
    """
    wants_openai = provider == "openai"
    client = use_openai(session) if wants_openai else use_anthropic(session)
    user_content = f"{prompt}\n\n=== SOURCE (markdown) ===\n{(context_md or '')[:150000]}"

    if not wants_openai:
        reply = client.messages.create(
            model=model_name or "claude-3-5-sonnet-latest",
            max_tokens=4000,
            temperature=temp,
            system=SYSTEM_STEER,
            messages=[{"role": "user", "content": user_content}],
        )
        # Content blocks without a truthy ``text`` attribute are skipped.
        pieces = (getattr(block, "text", None) for block in reply.content)
        return "".join(piece for piece in pieces if piece).strip()

    reply = client.chat.completions.create(
        model=model_name or "gpt-5",
        temperature=temp,
        messages=[
            {"role": "system", "content": SYSTEM_STEER},
            {"role": "user", "content": user_content},
        ],
    )
    return (reply.choices[0].message.content or "").strip()
|
| 172 |
|
| 173 |
# --------------------------
|
| 174 |
# ZIP export
|
|
|
|
| 179 |
manifest = []
|
| 180 |
for i, p in enumerate(pages, start=1):
|
| 181 |
url = p.get("url") or p.get("metadata", {}).get("sourceURL") or f"page_{i}"
|
| 182 |
+
slug = hashlib.sha1(str(url).encode("utf-8")).hexdigest()[:10]
|
| 183 |
md = p.get("markdown") or p.get("data", {}).get("markdown") or p.get("content") or ""
|
| 184 |
html = p.get("html") or p.get("data", {}).get("html") or ""
|
| 185 |
links = p.get("links") or p.get("data", {}).get("links") or []
|
|
|
|
| 186 |
if md:
|
| 187 |
zf.writestr(f"{i:03d}_{slug}.md", md)
|
| 188 |
if html:
|
| 189 |
zf.writestr(f"{i:03d}_{slug}.html", html)
|
| 190 |
+
manifest.append({"url": url, "title": p.get("title") or p.get("metadata", {}).get("title"), "links": links})
|
|
|
|
|
|
|
| 191 |
zf.writestr("manifest.json", json.dumps(manifest, indent=2))
|
| 192 |
mem.seek(0)
|
| 193 |
return mem.read()
|
|
|
|
| 196 |
# Gradio actions
|
| 197 |
# --------------------------
|
| 198 |
def save_keys(openai_key: Optional[str], anthropic_key: Optional[str], firecrawl_key: Optional[str]):
    """Store the API keys typed into the UI for the current session.

    Each value is stripped of surrounding whitespace; blank or missing input
    is stored as ``None``.

    Returns:
        A ``(Keys, message)`` pair: the new session state and the confirmation
        text for the status output.
    """

    def _clean(value: Optional[str]) -> Optional[str]:
        # Normalise "", whitespace-only and None to None.
        return (value or "").strip() or None

    keys = Keys(
        openai=_clean(openai_key),
        anthropic=_clean(anthropic_key),
        firecrawl=_clean(firecrawl_key),
    )
    message = "Keys saved to this session. (Env vars still apply if set.)"
    # gr.Info() only raises a toast and returns None, so returning its result
    # left the status output empty. Fire the toast, then return the text.
    gr.Info(message)
    return keys, message
|
| 204 |
|
| 205 |
def action_search(session: Keys, query: str, limit: int, scrape_content: bool, location: str):
    """Run a Firecrawl search and return the hits as pretty-printed JSON.

    Args:
        session: Session keys used to build the Firecrawl client.
        query: Search text; must not be blank.
        limit: Maximum number of results, forwarded to ``fc_search``.
        scrape_content: When true, request ``markdown`` and ``links`` for hits.
        location: Optional location hint; an empty value is sent as ``None``.

    Returns:
        A JSON string holding one flat list of result items.

    Raises:
        gr.Error: when ``query`` is empty or whitespace only.
    """
    cleaned_query = (query or "").strip()
    if not cleaned_query:
        raise gr.Error("Enter a search query.")

    formats = ["markdown", "links"] if scrape_content else None
    result = fc_search(session, query=cleaned_query, limit=limit, scrape_formats=formats, location=(location or None))

    # Tolerate both {'data': {...}} / {'data': [...]} and a bare payload.
    # A "data" key holding None is treated the same as a missing key.
    data = result.get("data")
    if data is None:
        data = result

    items: List[Any] = []
    if isinstance(data, dict):
        for bucket in ("web", "news", "images", "videos", "discussion"):
            # A bucket key can be present with a None value, so a
            # `.get(bucket, [])` default does not protect `extend`.
            hits = data.get(bucket)
            if isinstance(hits, (list, tuple)):
                items.extend(hits)
            elif hits:
                items.append(hits)
    elif isinstance(data, list):
        items = data
    else:
        items = [data]
    return json.dumps(items, indent=2)
|
| 221 |
|
| 222 |
def action_scrape(session: Keys, url: str, mobile: bool, formats_sel: List[str], timeout_ms: int):
    """Scrape one URL and return ``(pretty_json, markdown)`` for the UI.

    Args:
        session: Session keys used to build the Firecrawl client.
        url: Page to scrape; must not be blank.
        mobile: Forwarded to ``fc_scrape``.
        formats_sel: Requested formats; defaults to markdown, html and links
            when nothing is selected.
        timeout_ms: Timeout in milliseconds; a falsy value is sent as ``None``.

    Returns:
        The scrape result rendered by ``_pretty_json`` and the page markdown
        (``""`` when the result carries none).

    Raises:
        gr.Error: when ``url`` is empty or whitespace only.
    """
    cleaned_url = (url or "").strip()
    if not cleaned_url:
        raise gr.Error("Enter a URL.")

    formats = formats_sel or ["markdown", "html", "links"]
    out = fc_scrape(session, cleaned_url, formats=formats, timeout_ms=(timeout_ms or None), mobile=mobile)
    pretty = _pretty_json(out)

    # "data" may be absent, None or not a mapping; chaining .get() on it
    # directly raised AttributeError in the latter two cases.
    nested = out.get("data")
    if not isinstance(nested, dict):
        nested = {}
    md = out.get("markdown") or nested.get("markdown") or out.get("content") or ""
    return pretty, md
|
| 235 |
|
| 236 |
def action_crawl(session: Keys, base_url: str, max_pages: int, formats_sel: List[str]):
|
|
|
|
| 238 |
raise gr.Error("Enter a base URL to crawl.")
|
| 239 |
formats = formats_sel or ["markdown", "links"]
|
| 240 |
out = fc_crawl(session, base_url.strip(), max_pages=max_pages, formats=formats)
|
| 241 |
+
pages = out.get("data")
|
| 242 |
+
if not isinstance(pages, list) or not pages:
|
|
|
|
| 243 |
raise gr.Error("Crawl returned no pages.")
|
| 244 |
zip_bytes = pack_zip(pages)
|
| 245 |
return gr.File.update(value=io.BytesIO(zip_bytes), visible=True, filename="site_clone.zip"), f"Crawled {len(pages)} pages. ZIP is ready."
|
|
|
|
| 247 |
def action_generate(session: Keys, provider: str, model_name: str, sys_prompt: str, user_prompt: str, context_md: str, temp: float):
|
| 248 |
if not user_prompt.strip():
|
| 249 |
raise gr.Error("Enter a prompt or click a starter tile.")
|
| 250 |
+
model = (model_name or "").strip() or ("gpt-5" if provider == "openai" else "claude-3-5-sonnet-latest")
|
| 251 |
steer = (sys_prompt or "").strip()
|
| 252 |
prompt = (("SYSTEM:\n" + steer + "\n\n") if steer else "") + user_prompt.strip()
|
| 253 |
out = llm_summarize(session, provider, model, prompt, context_md or "", temp=temp)
|
|
|
|
| 256 |
# --------------------------
|
| 257 |
# UI (Blocks)
|
| 258 |
# --------------------------
|
| 259 |
+
with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as demo:
|
|
|
|
|
|
|
| 260 |
gr.Markdown("## ZEN VibeCoder — Web Clone & Research Foundry")
|
| 261 |
+
session_state = gr.State(Keys())
|
| 262 |
|
| 263 |
with gr.Accordion("🔐 Keys (session)", open=True):
|
| 264 |
with gr.Row():
|
|
|
|
| 270 |
save_btn.click(save_keys, [openai_key, anthropic_key, firecrawl_key], [session_state, save_msg])
|
| 271 |
|
| 272 |
with gr.Tabs():
|
|
|
|
| 273 |
with gr.Tab("🔎 Search"):
|
| 274 |
query = gr.Textbox(label="Query", placeholder='ex: "best open-source vector databases in 2025 site:docs"')
|
| 275 |
with gr.Row():
|
|
|
|
| 280 |
search_json = gr.Code(label="Results JSON", language="json")
|
| 281 |
go_search.click(action_search, [session_state, query, limit, scrape_content, location], [search_json])
|
| 282 |
|
|
|
|
| 283 |
with gr.Tab("🕸️ Scrape • Crawl • Clone"):
|
| 284 |
with gr.Row():
|
| 285 |
target_url = gr.Textbox(label="URL to Scrape", placeholder="https://example.com")
|
|
|
|
| 303 |
crawl_status = gr.Markdown()
|
| 304 |
run_crawl.click(action_crawl, [session_state, base_url, max_pages, formats_crawl], [zip_file, crawl_status])
|
| 305 |
|
|
|
|
| 306 |
with gr.Tab("✨ Vibe Code (Synthesis)"):
|
| 307 |
with gr.Row():
|
| 308 |
provider = gr.Radio(choices=["openai","anthropic"], value="openai", label="Provider")
|
|
|
|
| 318 |
gen_btn = gr.Button("Generate", variant="primary")
|
| 319 |
out_md = gr.Markdown()
|
| 320 |
|
|
|
|
| 321 |
gr.Markdown("**Starter Tiles**")
|
| 322 |
with gr.Row():
|
| 323 |
t1 = gr.Button("🔧 Clone Docs ➜ Clean Markdown ➜ README")
|
|
|
|
| 327 |
t5 = gr.Button("📊 Dataset Outline ➜ Schema + Fields + ETL")
|
| 328 |
|
| 329 |
def fill_tile(tile: str):
    """Return the starter prompt text for a tile id (``"t1"`` to ``"t5"``).

    Raises ``KeyError`` for any other id.
    """
    readme_pack = (
        "Create a clean knowledge pack from the context, then output a README.md with:\n"
        "- Overview\n"
        "- Key features\n"
        "- Quickstart\n"
        "- API endpoints (if any)\n"
        "- Notes & gotchas\n"
        "- License\n"
        "Also produce a /docs/ tree outline with suggested pages and headings."
    )
    market_brief = (
        "From the context, produce a feature matrix, pricing table, ICP notes, moats/risks, and a market POV. "
        "Conclude with a ZEN playbook: 5 lever moves for advantage."
    )
    api_client = (
        "Using the context, design a Python client that wraps the target API with retry/backoff and typed responses. Output:\n"
        "- package layout\n"
        "- requirements\n"
        "- client.py\n"
        "- examples/\n"
        "- README with usage.\n"
        "Include robust error handling."
    )
    landing_rewrite = (
        "Rewrite the landing page in ZEN brand voice: crisp headline, 3 value props, social proof, CTA, and a concise FAQ. "
        "Provide HTML sections and copy blocks."
    )
    dataset_outline = (
        "Propose a dataset schema based on the context. "
        "Output a table of fields, types, constraints, and an ETL plan (sources, transforms, validation, freshness, monitoring)."
    )
    prompts_by_tile = {
        "t1": readme_pack,
        "t2": market_brief,
        "t3": api_client,
        "t4": landing_rewrite,
        "t5": dataset_outline,
    }
    return prompts_by_tile[tile]
|
| 338 |
|
| 339 |
t1.click(lambda: fill_tile("t1"), outputs=[user_prompt])
|
| 340 |
t2.click(lambda: fill_tile("t2"), outputs=[user_prompt])
|
|
|
|
| 344 |
|
| 345 |
gen_btn.click(action_generate, [session_state, provider, model_name, sys_prompt, user_prompt, ctx_md, temp], [out_md])
|
| 346 |
|
| 347 |
+
gr.Markdown("Built for **ZEN Arena** pipelines. Export ZIPs → ingest → credentialize via ZEN Cards.")
|
|
|
|
|
|
|
|
|
|
| 348 |
|
| 349 |
if __name__ == "__main__":
|
| 350 |
+
# If SSR causes issues in your Space, you can disable it by: demo.launch(ssr_mode=False)
|
| 351 |
demo.launch()
|