rstallman and dwancin committed
Commit c68cb37 · 0 Parent(s)

Duplicate from dwancin/web-scraping


Co-authored-by: DW <dwancin@users.noreply.huggingface.co>

Files changed (4)
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +160 -0
  4. requirements.txt +11 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Web Scraping
+ emoji: 🕵️
+ colorFrom: red
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.35.2
+ app_file: app.py
+ pinned: false
+ duplicated_from: dwancin/web-scraping
+ ---
+
+ https://huggingface.co/spaces/dwancin/web-scraping
app.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ import re
+ import requests
+ import uuid
+ import zipfile
+ import hashlib
+ import shutil
+ import gradio as gr
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+
+ # Function to validate URLs
+ def is_valid(url):
+     parsed = urlparse(url)
+     return bool(parsed.netloc) and bool(parsed.scheme)
+
+ # Function to find files on a webpage
+ def find_files(url, soup, file_type):
+     files = []
+     if file_type == "image":
+         extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif']
+         for tag in soup.find_all('img'):
+             file = tag.get('src')
+             if file and any(ext in file for ext in extensions):
+                 file_url = file
+                 if not is_valid(file_url):
+                     file_url = urljoin(url, file_url)  # Resolve relative URLs against the page URL
+                 files.append(file_url)
+     elif file_type == "text":
+         text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
+         for tag in text_tags:
+             for element in soup.find_all(tag):
+                 files.append(element.get_text())
+     else:
+         for link in soup.find_all('a'):
+             file = link.get('href')
+             if file and file_type in file:
+                 file_url = file
+                 if not is_valid(file_url):
+                     file_url = urljoin(url, file_url)
+                 files.append(file_url)
+     return files
+
+
+
+
+ # Function to download files
+ def download_files(urls, folder_name):
+     os.makedirs(folder_name, exist_ok=True)
+     for url in urls:
+         response = requests.get(url, stream=True, timeout=10)
+         file_extension = url.split(".")[-1].split("?")[0].split("&")[0]  # Drop any query string from the extension
+         # Build a collision-resistant name from the URL hash plus a random suffix
+         url_hash = hashlib.md5(url.encode()).hexdigest()
+         unique_id = str(uuid.uuid4())[:8]
+         file_name = f'{url_hash}-{unique_id}.{file_extension}'
+         file_name = file_name[:255]  # Truncate to stay within filesystem name limits
+         file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)  # Replace special characters with underscores
+         with open(f'{folder_name}/{file_name}', 'wb') as out_file:
+             out_file.write(response.content)
+         print(f"Downloaded file: {file_name}")
+
+ # Function to create a zip file
+ def create_zip_file(folder_name):
+     # Only create a zip file if there are files in the directory
+     if os.listdir(folder_name):
+         with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
+             for file in os.listdir(folder_name):
+                 zipf.write(f'{folder_name}/{file}')
+         return f'{folder_name}.zip'
+     else:
+         return ""
+
+
+
+ # Function to scrape a website
+ def scrape_website(url, images=False, text=False):
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()  # Raise an exception if the GET request was unsuccessful
+     except (requests.exceptions.RequestException, ValueError):
+         raise gr.Error(f"Unable to access URL: {url}")
+
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     # Clear the contents of the folders
+     if images:
+         shutil.rmtree('images', ignore_errors=True)
+     if text:
+         shutil.rmtree('text', ignore_errors=True)
+
+     # Download files
+     if images:
+         image_urls = find_files(url, soup, 'image')
+         download_files(image_urls, 'images')
+     if text:
+         text_content = find_files(url, soup, 'text')
+         os.makedirs('text', exist_ok=True)  # Make sure the directory exists before writing
+         if text_content:  # Only create the file if there is text to write
+             with open('text/content.txt', 'w', encoding='utf-8') as text_file:
+                 for line in text_content:
+                     text_file.write(line + '\n')
+
+     # Create zip files and return their paths
+     images_zip_file, text_zip_file = None, None
+     if images and os.path.exists('images') and os.listdir('images'):
+         images_zip_file = create_zip_file('images')
+     if text and os.path.exists('text') and os.listdir('text'):
+         text_zip_file = create_zip_file('text')
+
+     return images_zip_file, text_zip_file
+
+
+
+ # Function for web scraping
+ def web_scraping(url, file_types):
+     # Check if the URL is empty
+     if not url:
+         raise gr.Error("URL cannot be empty.")
+
+     # Check if the URL begins with https://
+     if not url.startswith("https://"):
+         raise gr.Error("The URL must begin with https://")
+
+     # Check if at least one checkbox is selected
+     if not file_types:
+         raise gr.Error("At least one media type must be selected.")
+
+     images = "Images" in file_types
+     text = "Text" in file_types
+     return scrape_website(url, images, text)
+
+ with gr.Blocks(theme="dwancin/yellow", css=".lg.svelte-1ipelgc {max-height: 60px !important;}") as app:
+     with gr.Row():
+         with gr.Column(scale=2):
+             url_name = gr.Textbox(
+                 placeholder="Enter URL here",
+                 show_label=True,
+                 label="Website",
+                 info="Example: https://en.wikipedia.org/wiki/Main_Page",
+             )
+             media_types = gr.CheckboxGroup(
+                 [
+                     "Images",
+                     "Text",
+                 ],
+                 value=["Images"],  # CheckboxGroup expects a list of selected values
+                 label="Media types",
+             )
+             submit_button = gr.Button(
+                 "Scrape",
+                 variant="primary",
+                 interactive=True,
+             )
+         with gr.Column(scale=1):
+             output_images_zip_file = gr.File(label="Images ZIP-file")
+             output_text_zip_file = gr.File(label="Text ZIP-file")
+
+     submit_button.click(web_scraping, inputs=[url_name, media_types], outputs=[output_images_zip_file, output_text_zip_file])
+ app.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio>=3.35.2
+ beautifulsoup4>=4.12.2
+ Pillow>=9.5.0
+ requests>=2.31.0
+ Flask>=2.3.2
+ zipfile2>=0.0.12
+ urllib3>=2.0.3
+ pytest-shutil>=1.7.0
+ mime>=0.1.0
+ mimetypes-extensions>=0.1.0
+ uuid>=1.30
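
For reference, the scraping flow in app.py can be exercised outside of Gradio. Below is a minimal standalone sketch of the image-collection step (an illustration, not part of this commit; it assumes requests and beautifulsoup4 from requirements.txt are installed, and page_url is a hypothetical example). The logic is replicated inline rather than imported, because app.py calls app.launch() at module top level, so importing from it would also start the UI.

```python
# Standalone sketch of the Space's image-scraping step (assumption: run
# anywhere with requests and beautifulsoup4 installed; not part of this commit).
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = "https://en.wikipedia.org/wiki/Main_Page"  # hypothetical example URL
response = requests.get(page_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Mirror find_files(url, soup, "image"): collect absolute image URLs,
# resolving relative src attributes against the page URL.
image_urls = [
    urljoin(page_url, img["src"])
    for img in soup.find_all("img")
    if img.get("src")
]
print(f"Found {len(image_urls)} image URLs")
```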