Spaces:

nazib61
/

crawl4AI

Sleeping

App Files Files Community

nazib61 committed on Feb 14

Commit

b66aa32

verified ·

1 Parent(s): c925750

Create app.py

Browse files

Files changed (1) hide show

app.py +81 -0

app.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import asyncio
+import json
+import gradio as gr
+import nest_asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+# This allows us to run the crawler's async loop inside Gradio's loop
+nest_asyncio.apply()
+async def extract_with_gemini(url, api_key, prompt):
+    """Crawl ``url`` and run a Gemini-backed LLM extraction over the page.
+
+    Args:
+        url: Address of the page to crawl.
+        api_key: Gemini API key, passed to the extraction strategy as its token.
+        prompt: Instruction telling the model what to extract.
+
+    Returns:
+        A value for display: pretty-printed JSON when the extracted content
+        parses as JSON, the extracted content unchanged when it does not,
+        or a human-readable message when input is missing, the crawl reports
+        failure ("Error: ...") or an exception is raised ("Runtime Error: ...").
+    """
+    # Pasted values often carry stray spaces/newlines; normalise them so a
+    # blank-looking field is rejected instead of being sent to the crawler.
+    url = (url or "").strip()
+    api_key = (api_key or "").strip()
+    if not url or not api_key:
+        return "Please provide both a URL and your Gemini API Key."
+
+    # 1. Set up the Gemini extraction strategy.
+    # 'gemini/gemini-1.5-flash' is used here; 'gemini/gemini-1.5-pro' is the
+    # alternative mentioned by the original author.
+    extraction_strategy = LLMExtractionStrategy(
+        provider="gemini/gemini-1.5-flash",
+        api_token=api_key,
+        instruction=prompt,
+        verbose=True
+    )
+    # 2. Configure the browser.
+    browser_config = BrowserConfig(headless=True)
+    # 3. Configure the run (strategy + cache settings).
+    run_config = CrawlerRunConfig(
+        extraction_strategy=extraction_strategy,
+        cache_mode=CacheMode.BYPASS  # Ensures fresh crawl every time
+    )
+    try:
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            # Execute the crawl and extraction.
+            result = await crawler.arun(url=url, config=run_config)
+            if not result.success:
+                return f"Error: {result.error_message}"
+
+            raw = result.extracted_content
+            # The extracted content is typically a JSON string; re-indent it
+            # when it parses, otherwise hand it back untouched.
+            try:
+                data = json.loads(raw)
+            except (json.JSONDecodeError, TypeError):
+                # JSONDecodeError: not valid JSON. TypeError: content is not
+                # a str/bytes value (e.g. None).
+                return raw
+            return json.dumps(data, indent=2)
+    except Exception as e:
+        # UI boundary: surface the failure as text rather than crashing the
+        # Gradio handler.
+        return f"Runtime Error: {e}"
+# Wrapper for Gradio
+def gradio_wrapper(url, api_key, prompt):
+    """Synchronous adapter around the ``extract_with_gemini`` coroutine.
+
+    Builds the coroutine for the given arguments, drives it to completion
+    with ``asyncio.run`` and returns its result unchanged.
+    """
+    extraction = extract_with_gemini(url, api_key, prompt)
+    return asyncio.run(extraction)
+# --- Gradio UI ---
+# Builds the page as a Blocks app bound to the module-level name `demo`:
+# a title, a short description, and one row split into two columns.
+with gr.Blocks(theme=gr.themes.Default()) as demo:
+    gr.Markdown("# 🕷️ Crawl4AI + Gemini Extraction")
+    gr.Markdown("Extract structured data from any website using Google's Gemini models.")
+    with gr.Row():
+        # First column: the three inputs and the trigger button.
+        with gr.Column():
+            url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
+            # type="password" so the key is entered through a password-style field.
+            api_key = gr.Textbox(label="Gemini API Key", type="password", placeholder="AIzaSy...")
+            instruction = gr.Textbox(
+                label="What to extract?",
+                placeholder="Extract all product names and prices into a JSON list.",
+                lines=4
+            )
+            btn = gr.Button("Start Extraction", variant="primary")
+        # Second column: the result viewer, configured for JSON.
+        with gr.Column():
+            output_text = gr.Code(label="Extracted JSON", language="json")
+    # Clicking the button calls gradio_wrapper(url, api_key, prompt) with the
+    # three textbox values (in this order) and writes its return value to
+    # the code viewer.
+    btn.click(
+        fn=gradio_wrapper,
+        inputs=[url_input, api_key, instruction],
+        outputs=output_text
+    )
+# Start the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()