# AnyCrawl / app.py
# Author: ntbperst — "Update app.py" (commit 453801a, verified)
import gradio as gr
import requests
from datetime import datetime
def scrape_website(url, engine, api_key):
    """
    Scrape a website via the AnyCrawl API and return its markdown content.

    API keys can be obtained from: https://anycrawl.dev

    Args:
        url: Website URL to scrape.
        engine: Scraping engine name ("puppeteer", "playwright", or "cheerio").
        api_key: AnyCrawl API bearer token.

    Returns:
        A ``(markdown_content, metadata_markdown)`` tuple. On failure the
        first element is ``"Error occurred"`` and the second describes the
        error as markdown.
    """
    # Fail fast on an empty URL instead of sending a request that is
    # guaranteed to fail.
    if not url or not url.strip():
        return "Error occurred", "### Error\nURL must not be empty"
    payload = {"url": url, "engine": engine}
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    try:
        # A timeout keeps a stalled connection from hanging the UI forever.
        response = requests.post(
            "https://api.anycrawl.dev/v1/scrape",
            headers=headers,
            json=payload,
            timeout=60,
        )
        response.raise_for_status()
        result = response.json()
        # Extract data from the response.
        data = result.get("data", {})
        markdown_content = data.get("markdown", "No markdown content found")
        # Build a human-readable metadata summary from the response fields.
        metadata_md = "### Metadata\n"
        for label, key in (
            ("URL", "url"),
            ("Status", "status"),
            ("Job ID", "job_id"),
            ("Title", "title"),
            ("Timestamp", "timestamp"),
        ):
            metadata_md += f"- **{label}**: {data.get(key, 'N/A')}\n"
        # Append page-level metadata when the API provides it.
        if "metadata" in data:
            metadata_md += "\n### Page Metadata\n"
            for key, value in data["metadata"].items():
                metadata_md += f"- **{key}**: {value}\n"
        return markdown_content, metadata_md
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP errors;
        # ValueError covers a non-JSON response body from .json().
        return "Error occurred", f"### Error\n{str(e)}"
def search_content(query, pages, api_key):
    """
    Search the web via the AnyCrawl API and return results as markdown.

    Args:
        query: Search query string.
        pages: Number of result pages to fetch. Coerced to ``int`` because
            Gradio's Number component delivers floats.
        api_key: AnyCrawl API bearer token.

    Returns:
        Markdown-formatted search results, ``"No results found"`` when the
        API returns an empty list, or a markdown error message on failure.
    """
    # Fail fast on an empty query instead of sending a request that is
    # guaranteed to fail.
    if not query or not query.strip():
        return "### Error\nSearch query must not be empty"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    # gr.Number yields floats (e.g. 1.0); the API expects an integer count.
    payload = {"query": query, "pages": int(pages)}
    try:
        # A timeout keeps a stalled connection from hanging the UI forever.
        response = requests.post(
            "https://api.anycrawl.dev/v1/search",
            headers=headers,
            json=payload,
            timeout=60,
        )
        response.raise_for_status()
        result = response.json()
        data = result.get("data", [])
        if not data:
            return "No results found"
        # Render each hit as a small markdown section.
        search_results = "### Search Results\n\n"
        for item in data:
            search_results += f"#### {item.get('title', 'Untitled')}\n"
            search_results += f"- **URL**: {item.get('url', 'N/A')}\n"
            search_results += f"- **Description**: {item.get('description', 'N/A')}\n"
            search_results += f"- **Source**: {item.get('source', 'N/A')}\n\n"
        return search_results
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP errors;
        # ValueError covers a non-JSON response body from .json().
        return f"### Error\n{str(e)}"
# --- Gradio interface -------------------------------------------------------
# Two-tab UI: "Scrape Website" drives scrape_website(); "Search Content"
# drives search_content(). Both tabs share one API-key textbox.
with gr.Blocks() as demo:
    gr.Markdown("# AnyCrawl, Turning web into LLM-ready.")
    gr.Markdown("## We are open-sourced under the MIT License, and you can deploy it yourself anytime.")
    gr.Markdown(
        "Get your API key from [AnyCrawl.dev](https://anycrawl.dev), The free test api_key: ac-d4b8045a313b7a6c694fe046ff3a7, the free is limited, if you like AnyCrawl, please sign up and get your own api_key"
    )
    # Shared bearer token, masked in the UI; pre-filled with the public
    # rate-limited demo key advertised in the banner above.
    api_key = gr.Textbox(
        label="API Key", type="password", value="ac-d4b8045a313b7a6c694fe046ff3a7"
    )
    with gr.Tabs() as tabs:
        with gr.Tab("Scrape Website"):
            with gr.Row():
                url_input = gr.Textbox(
                    label="URL",
                    placeholder="Enter website URL you want to scrape",
                )
            with gr.Row():
                # Engine names mirror what the AnyCrawl /v1/scrape endpoint accepts.
                engine_input = gr.Dropdown(
                    choices=["puppeteer", "playwright", "cheerio"],
                    value="playwright",
                    label="Scraping Engine",
                )
            scrape_btn = gr.Button("Scrape Website")
            with gr.Row():
                # Side-by-side panes: scraped page content and its metadata.
                markdown_output = gr.Markdown(label="Content")
                metadata_output = gr.Markdown(label="Metadata")
            # scrape_website returns a (content, metadata) tuple, matching
            # the two outputs in order.
            scrape_btn.click(
                fn=scrape_website,
                inputs=[
                    url_input,
                    engine_input,
                    api_key,
                ],
                outputs=[markdown_output, metadata_output],
            )
        with gr.Tab("Search Content"):
            with gr.Row():
                search_input = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter your search query",
                )
                # NOTE(review): gr.Number delivers floats to the callback —
                # search_content sends this value as the "pages" field.
                pages_input = gr.Number(
                    label="Number of Pages",
                    value=1,
                    minimum=1,
                    maximum=10,
                    step=1,
                )
            search_btn = gr.Button("Search")
            search_output = gr.Markdown(label="Search Results")
            search_btn.click(
                fn=search_content,
                inputs=[
                    search_input,
                    pages_input,
                    api_key,
                ],
                outputs=search_output,
            )

# mcp_server=True additionally exposes the app's functions over the
# Model Context Protocol so LLM clients can call them as tools.
demo.launch(mcp_server=True)