nazib61 committed on
Commit
b66aa32
·
verified ·
1 Parent(s): c925750

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import gradio as gr
4
+ import nest_asyncio
5
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
6
+ from crawl4ai.extraction_strategy import LLMExtractionStrategy
7
+
# Patch the running event loop so the crawler's asyncio code can be driven
# from inside Gradio's own loop (asyncio.run would otherwise raise
# "cannot be called from a running event loop").
nest_asyncio.apply()
async def extract_with_gemini(url, api_key, prompt):
    """Crawl *url* and extract structured data with a Gemini model.

    Parameters
    ----------
    url : str
        Page to crawl.
    api_key : str
        Gemini API key, forwarded to the LLM extraction strategy.
    prompt : str
        Natural-language instruction describing what to extract.

    Returns
    -------
    str
        Pretty-printed JSON on success; the raw extracted content if it is
        not valid JSON; otherwise a human-readable error message.
    """
    if not url or not api_key:
        return "Please provide both a URL and your Gemini API Key."

    # 1. Setup the Gemini extraction strategy.
    #    'gemini/gemini-1.5-flash' is fast & cheap; 'gemini/gemini-1.5-pro'
    #    is the stronger alternative.
    extraction_strategy = LLMExtractionStrategy(
        provider="gemini/gemini-1.5-flash",
        api_token=api_key,
        instruction=prompt,
        verbose=True,
    )

    # 2. Configure the browser (headless: no visible window).
    browser_config = BrowserConfig(headless=True)

    # 3. Configure the run: strategy + cache settings.
    run_config = CrawlerRunConfig(
        extraction_strategy=extraction_strategy,
        cache_mode=CacheMode.BYPASS,  # ensures a fresh crawl every time
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Execute the crawl and extraction.
            result = await crawler.arun(url=url, config=run_config)

            if result.success:
                # extracted_content is typically a JSON string; pretty-print
                # it when possible, otherwise return the raw model output.
                try:
                    data = json.loads(result.extracted_content)
                except (json.JSONDecodeError, TypeError):
                    # Not valid JSON (or None) — pass it through unchanged.
                    # (Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    return result.extracted_content
                return json.dumps(data, indent=2)
            return f"Error: {result.error_message}"

    except Exception as e:
        # Surface crawler/LLM failures to the UI instead of crashing Gradio.
        return f"Runtime Error: {str(e)}"
def gradio_wrapper(url, api_key, prompt):
    """Synchronous bridge so Gradio can invoke the async extractor."""
    coroutine = extract_with_gemini(url, api_key, prompt)
    return asyncio.run(coroutine)
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Default()) as demo:
    gr.Markdown("# 🕷️ Crawl4AI + Gemini Extraction")
    gr.Markdown("Extract structured data from any website using Google's Gemini models.")

    with gr.Row():
        # Left column: user inputs and the trigger button.
        with gr.Column():
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com",
            )
            api_key = gr.Textbox(
                label="Gemini API Key",
                type="password",
                placeholder="AIzaSy...",
            )
            instruction = gr.Textbox(
                label="What to extract?",
                placeholder="Extract all product names and prices into a JSON list.",
                lines=4,
            )
            btn = gr.Button("Start Extraction", variant="primary")

        # Right column: the extraction result.
        with gr.Column():
            output_text = gr.Code(label="Extracted JSON", language="json")

    # Wire the button to the synchronous wrapper around the async crawler.
    btn.click(fn=gradio_wrapper, inputs=[url_input, api_key, instruction], outputs=output_text)

if __name__ == "__main__":
    demo.launch()