import gradio as gr
import requests


def scrape_website(url, engine, api_key):
    """
    Scrape a website using the AnyCrawl API.

    An API key can be obtained from: https://anycrawl.dev
    """
    # Prepare the request payload and headers
    payload = {"url": url, "engine": engine}
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    try:
        response = requests.post(
            "https://api.anycrawl.dev/v1/scrape", headers=headers, json=payload
        )
        response.raise_for_status()
        result = response.json()

        # Extract data from the response
        data = result.get("data", {})
        markdown_content = data.get("markdown", "No markdown content found")

        # Build a Markdown summary of the job metadata
        metadata_md = "### Metadata\n"
        metadata_md += f"- **URL**: {data.get('url', 'N/A')}\n"
        metadata_md += f"- **Status**: {data.get('status', 'N/A')}\n"
        metadata_md += f"- **Job ID**: {data.get('job_id', 'N/A')}\n"
        metadata_md += f"- **Title**: {data.get('title', 'N/A')}\n"
        metadata_md += f"- **Timestamp**: {data.get('timestamp', 'N/A')}\n"

        # Add page metadata if available
        if "metadata" in data:
            metadata_md += "\n### Page Metadata\n"
            for key, value in data["metadata"].items():
                metadata_md += f"- **{key}**: {value}\n"

        return markdown_content, metadata_md
    except Exception as e:
        return "Error occurred", f"### Error\n{str(e)}"


def search_content(query, pages, api_key):
    """
    Search content using the AnyCrawl API.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {"query": query, "pages": pages}

    try:
        response = requests.post(
            "https://api.anycrawl.dev/v1/search", headers=headers, json=payload
        )
        response.raise_for_status()
        result = response.json()

        # Format search results as Markdown
        search_results = "### Search Results\n\n"
        data = result.get("data", [])
        if not data:
            return "No results found"

        for item in data:
            search_results += f"#### {item.get('title', 'Untitled')}\n"
            search_results += f"- **URL**: {item.get('url', 'N/A')}\n"
            search_results += f"- **Description**: {item.get('description', 'N/A')}\n"
            search_results += f"- **Source**: {item.get('source', 'N/A')}\n\n"

        return search_results
    except Exception as e:
        return f"### Error\n{str(e)}"


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# AnyCrawl: Turning the web into LLM-ready data.")
    gr.Markdown(
        "## AnyCrawl is open source under the MIT License, so you can deploy it yourself at any time."
    )
    gr.Markdown(
        "Get your API key from [AnyCrawl.dev](https://anycrawl.dev). "
        "A free test API key is provided: ac-d4b8045a313b7a6c694fe046ff3a7. "
        "The free key is rate-limited, so if you like AnyCrawl, please sign up and get your own API key."
    )

    api_key = gr.Textbox(
        label="API Key", type="password", value="ac-d4b8045a313b7a6c694fe046ff3a7"
    )

    with gr.Tabs() as tabs:
        with gr.Tab("Scrape Website"):
            with gr.Row():
                url_input = gr.Textbox(
                    label="URL",
                    placeholder="Enter the website URL you want to scrape",
                )
            with gr.Row():
                engine_input = gr.Dropdown(
                    choices=["puppeteer", "playwright", "cheerio"],
                    value="playwright",
                    label="Scraping Engine",
                )
            scrape_btn = gr.Button("Scrape Website")

            with gr.Row():
                markdown_output = gr.Markdown(label="Content")
                metadata_output = gr.Markdown(label="Metadata")

            scrape_btn.click(
                fn=scrape_website,
                inputs=[
                    url_input,
                    engine_input,
                    api_key,
                ],
                outputs=[markdown_output, metadata_output],
            )

        with gr.Tab("Search Content"):
            with gr.Row():
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter your search query",
                )
                pages_input = gr.Number(
                    label="Number of Pages",
                    value=1,
                    minimum=1,
                    maximum=10,
                    step=1,
                )
            search_btn = gr.Button("Search")
            search_output = gr.Markdown(label="Search Results")

            search_btn.click(
                fn=search_content,
                inputs=[
                    search_input,
                    pages_input,
                    api_key,
                ],
                outputs=search_output,
            )

demo.launch(mcp_server=True)