z4hid committed on
Commit
f8db296
·
verified ·
1 Parent(s): 23e65a7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -0
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import os
4
+ import tempfile
5
+ from dotenv import load_dotenv
6
+ from typing import Tuple
7
+
8
# Pull settings from a local .env file into the process environment
load_dotenv()

# API key for the Jina endpoint; None when the variable is not set
JINA_API_KEY = os.environ.get('JINA_API_KEY')
12
+
13
def web_scraper(url: str, timeout: float = 30.0) -> str:
    """
    Scrape the content of a given URL using the Jina API.

    The target URL is appended to the ``https://r.jina.ai/`` endpoint and
    fetched with a single GET request.

    Args:
        url (str): The URL to scrape. Surrounding whitespace is ignored.
        timeout (float): Seconds to wait for the endpoint before giving up.
            Defaults to 30 seconds.

    Returns:
        str: The response body as text. The request asks the endpoint for
            the 'text' return format. The body is returned as-is whatever
            the HTTP status code, so an error response is returned too.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    headers = {
        'X-Locale': 'en-US',
        'X-Return-Format': 'text',
        'X-With-Generated-Alt': 'true',
        'X-With-Links-Summary': 'true',
    }
    # Only authenticate when a key is configured; otherwise the header would
    # literally read "Bearer None".
    # NOTE(review): assumes the endpoint accepts unauthenticated requests - confirm.
    if JINA_API_KEY:
        headers['Authorization'] = f'Bearer {JINA_API_KEY}'

    scrape_pattern = f'https://r.jina.ai/{url.strip()}'

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(scrape_pattern, headers=headers, timeout=timeout)
    return response.text
34
+
35
def scrape_and_display(url: str) -> Tuple[str, str]:
    """
    Scrape the content of a given URL and prepare it for display and download.

    Args:
        url (str): The URL to scrape.

    Returns:
        Tuple[str, str]: The scraped content, and the filesystem path of a
            ``.md`` temporary file holding that same content.
    """
    scraped_content = web_scraper(url)

    # delete=False keeps the file on disk once the handle is closed, so the
    # returned path stays valid for the caller. Nothing in this function
    # removes the file afterwards.
    # UTF-8 is set explicitly so non-ASCII page content does not depend on
    # the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(scraped_content)

    # Leaving the `with` block flushed and closed the handle.
    return scraped_content, temp_file.name
53
+
54
def create_gradio_interface() -> gr.Interface:
    """
    Build the Gradio interface that wraps ``scrape_and_display``.

    The interface has one textbox input for the URL and two outputs: a
    markdown panel with the scraped content and a file component for the
    download.

    Returns:
        gr.Interface: The configured Gradio interface (not yet launched).
    """
    url_input = gr.Textbox(label="Enter URL to scrape")
    content_view = gr.Markdown(label="Scraped Content")
    download_link = gr.File(label="Download Markdown")

    # Single example row pre-filled in the UI
    example_rows = [["https://www.robots.ox.ac.uk/~vgg/data/flowers/102/categories.html"]]

    interface = gr.Interface(
        fn=scrape_and_display,
        inputs=url_input,
        outputs=[content_view, download_link],
        title="Web Scraper",
        description="Enter a URL to scrape and view the content in markdown format. You can also download the markdown file.",
        examples=example_rows,
        allow_flagging="never",
    )
    return interface
73
+
74
# Start the web UI only when this file is executed as a script
if __name__ == "__main__":
    create_gradio_interface().launch()