Hugging Face Space: AnyCrawl Gradio demo (status at capture time: Sleeping)
| import gradio as gr | |
| import requests | |
| from datetime import datetime | |
def scrape_website(url, engine, api_key):
    """Scrape a website via the AnyCrawl API and return Markdown output.

    API keys can be obtained from https://anycrawl.dev.

    Args:
        url: The page URL to scrape.
        engine: Scraping engine to use ("puppeteer", "playwright" or "cheerio").
        api_key: AnyCrawl API key, sent as a Bearer token.

    Returns:
        A ``(markdown_content, metadata_md)`` tuple of Markdown strings; on
        failure the first element is ``"Error occurred"`` and the second
        carries the error message.
    """
    payload = {"url": url, "engine": engine}
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    try:
        # Timeout keeps the Gradio worker from hanging forever if the
        # upstream service stalls (scrapes can be slow, so allow 60s).
        response = requests.post(
            "https://api.anycrawl.dev/v1/scrape",
            headers=headers,
            json=payload,
            timeout=60,
        )
        response.raise_for_status()
        result = response.json()

        # Extract the scraped content from the response envelope.
        data = result.get("data", {})
        markdown_content = data.get("markdown", "No markdown content found")

        # Build a Markdown summary of the job-level metadata fields.
        metadata_md = "### Metadata\n"
        for label, key in (
            ("URL", "url"),
            ("Status", "status"),
            ("Job ID", "job_id"),
            ("Title", "title"),
            ("Timestamp", "timestamp"),
        ):
            metadata_md += f"- **{label}**: {data.get(key, 'N/A')}\n"

        # Append page-level metadata (meta tags etc.) when the API returns it.
        if "metadata" in data:
            metadata_md += "\n### Page Metadata\n"
            for key, value in data["metadata"].items():
                metadata_md += f"- **{key}**: {value}\n"

        return markdown_content, metadata_md
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: any failure
        # (network, HTTP status, bad JSON) is rendered in the output panes.
        return "Error occurred", f"### Error\n{str(e)}"
def search_content(query, pages, api_key):
    """Search the web via the AnyCrawl API and return results as Markdown.

    Args:
        query: Search query string.
        pages: Number of result pages to fetch. Coerced to ``int`` because
            Gradio's Number component delivers floats.
        api_key: AnyCrawl API key, sent as a Bearer token.

    Returns:
        A Markdown string with the formatted results, ``"No results found"``
        when the API returns an empty list, or an error message on failure.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",  # consistent with scrape_website
        "Authorization": f"Bearer {api_key}",
    }
    # int() coercion: gr.Number passes 1.0, not 1; send a proper integer.
    payload = {"query": query, "pages": int(pages)}
    try:
        # Timeout keeps the Gradio worker from hanging on a stalled request.
        response = requests.post(
            "https://api.anycrawl.dev/v1/search",
            headers=headers,
            json=payload,
            timeout=60,
        )
        response.raise_for_status()
        result = response.json()

        data = result.get("data", [])
        if not data:
            return "No results found"

        # Format each hit as a small Markdown section.
        search_results = "### Search Results\n\n"
        for item in data:
            search_results += f"#### {item.get('title', 'Untitled')}\n"
            search_results += f"- **URL**: {item.get('url', 'N/A')}\n"
            search_results += f"- **Description**: {item.get('description', 'N/A')}\n"
            search_results += f"- **Source**: {item.get('source', 'N/A')}\n\n"
        return search_results
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: render any failure.
        return f"### Error\n{str(e)}"
# ---------------------------------------------------------------------------
# Gradio UI: one tab for single-page scraping, one for web search. Both tabs
# share the API-key field declared above the tab group.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# AnyCrawl, Turning web into LLM-ready.")
    gr.Markdown(
        "## We are open-sourced under the MIT License, and you can deploy it yourself anytime."
    )
    gr.Markdown(
        "Get your API key from [AnyCrawl.dev](https://anycrawl.dev), The free test api_key: ac-d4b8045a313b7a6c694fe046ff3a7, the free is limited, if you like AnyCrawl, please sign up and get your own api_key"
    )

    # Credential input shared by both tabs (pre-filled with the free test key).
    api_key = gr.Textbox(
        label="API Key", type="password", value="ac-d4b8045a313b7a6c694fe046ff3a7"
    )

    with gr.Tabs() as tabs:
        # --- Tab 1: scrape a single URL into Markdown ---
        with gr.Tab("Scrape Website"):
            with gr.Row():
                target_url = gr.Textbox(
                    label="URL",
                    placeholder="Enter website URL you want to scrape",
                )
            with gr.Row():
                engine_choice = gr.Dropdown(
                    choices=["puppeteer", "playwright", "cheerio"],
                    value="playwright",
                    label="Scraping Engine",
                )
            scrape_button = gr.Button("Scrape Website")
            with gr.Row():
                content_view = gr.Markdown(label="Content")
                metadata_view = gr.Markdown(label="Metadata")

            # Route button clicks to the scraping backend.
            scrape_button.click(
                fn=scrape_website,
                inputs=[target_url, engine_choice, api_key],
                outputs=[content_view, metadata_view],
            )

        # --- Tab 2: keyword search across the web ---
        with gr.Tab("Search Content"):
            with gr.Row():
                query_box = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter your search query",
                )
                page_count = gr.Number(
                    label="Number of Pages",
                    value=1,
                    minimum=1,
                    maximum=10,
                    step=1,
                )
            search_button = gr.Button("Search")
            results_view = gr.Markdown(label="Search Results")

            # Route button clicks to the search backend.
            search_button.click(
                fn=search_content,
                inputs=[query_box, page_count, api_key],
                outputs=results_view,
            )

# mcp_server=True also exposes the two functions as MCP tools for LLM clients.
demo.launch(mcp_server=True)