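# Scrape a web page for images and/or visible text, save the results locally,
# and offer them for download as ZIP files through a Gradio interface.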
import os
import re
import requests
import uuid
import zipfile
import hashlib
import shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Function to validate URLs
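# e.g. is_valid("https://example.com/pic.png") -> True; is_valid("/pic.png") -> False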
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

# Function to find files on webpage
def find_files(url, soup, file_type):
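    # file_type selects the branch: "image" for <img> sources, "text" for visible
    # text elements, or any substring (e.g. ".pdf") matched against <a href> values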
    files = []
    if file_type == "image":
        tags = ['jpg', 'jpeg', 'png', 'svg', 'gif']
        for tag in soup.find_all('img'):
            file = tag.get('src')
            if any(tag in file for tag in tags):
                file_url = file
                if not is_valid(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    elif file_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            # Skip <a> tags without an href
            if file and file_type in file:
                file_url = file
                if not is_valid(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files


# Function to download files
def download_files(urls, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        try:
            response = requests.get(url, stream=True, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            print(f"Skipped file (request failed): {url}")
            continue
        file_extension = url.split(".")[-1].split("?")[0].split("&")[0]  # Drop any query string from the extension
        # Name each file after a hash of its URL plus a short UUID so names never collide
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)  # Replace special characters with underscores
        file_name = file_name[:255]  # Truncate the file name to stay under the common length limit
        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")

# Function to create a zip file from the contents of a folder
def create_zip_file(folder_name):
    # Only create a zip file if there are files in the directory
    if os.listdir(folder_name):
        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
            for file in os.listdir(folder_name):
                zipf.write(f'{folder_name}/{file}')
        return f'{folder_name}.zip'
    return None


# Function to scrape website
def scrape_website(url, images=False, text=False):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception if the GET request was unsuccessful
    except (requests.exceptions.RequestException, ValueError):
        # gr.Error aborts the handler and surfaces the message in the Gradio UI
        raise gr.Error(f"Unable to access URL: {url}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Clear the contents of the folders
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)

    # Download files
    if images:
        image_urls = find_files(url, soup, 'image')
        download_files(image_urls, 'images')
    if text:
        text_content = find_files(url, soup, 'text')
        os.makedirs('text', exist_ok=True)  # Make sure the directory exists before writing
        if text_content:  # Only create the file if there is text to write
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')

    # Create zip files and return paths
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = create_zip_file('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = create_zip_file('text')

    return images_zip_file, text_zip_file


# Function for web scraping
def web_scraping(url, file_types):
    # Check if the URL is empty
    if not url:
        raise gr.Error("URL cannot be empty.")

    # Check if the URL begins with https://
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")

    # Check if at least one checkbox is selected
    if not file_types:
        raise gr.Error("At least one media type must be selected.")

    images = "Images" in file_types
    text = "Text" in file_types
    return scrape_website(url, images, text)
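
# Build the Gradio interface: URL input and media-type checkboxes on the left,
# the generated ZIP files for download on the right.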

with gr.Blocks(theme="dwancin/yellow", css=".lg.svelte-1ipelgc {max-height: 60px !important;}") as app:
    with gr.Row():
        with gr.Column(scale=2):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
                info="Example: https://en.wikipedia.org/wiki/Main_Page",
            )
            media_types = gr.CheckboxGroup(
                [
                    "Images",
                    "Text",
                ],
                value=["Images"],  # CheckboxGroup values are lists
                label="Media types",
            )
            submit_button = gr.Button(
                "Scrape",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=1): 
            output_images_zip_file = gr.File(label="Images ZIP-file")
            output_text_zip_file = gr.File(label="Text ZIP-file")

    submit_button.click(web_scraping, inputs=[url_name, media_types], outputs=[output_images_zip_file, output_text_zip_file])
app.launch()