# Spaces status at capture time: Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| import time | |
| from cryptography.fernet import Fernet | |
| import gradio as gr | |
# URL template for a catalogue page; "{}" is filled with the page number
# via BASE_URL.format(page).
BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"
# Scrape a single page
def scrape_page(page, timeout=10):
    """Scrape one catalogue page and return the books listed on it.

    Args:
        page: Page number substituted into ``BASE_URL``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        A list of dicts with "Title", "Price" and "Rating" keys. The list is
        empty when the request fails or the response status is not 200.
        Fields that cannot be found in a book card are reported as "N/A".
    """
    url = BASE_URL.format(page)
    try:
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout
        )
    except requests.RequestException:
        # Treat network failures like a non-200 response: no books.
        return []
    if response.status_code != 200:
        return []

    # Parse the raw bytes so BeautifulSoup can detect the encoding from the
    # document itself. response.text falls back to ISO-8859-1 when the
    # Content-Type header carries no charset, which garbles characters such
    # as the pound sign in prices.
    soup = BeautifulSoup(response.content, "html.parser")

    books = []
    for book in soup.select("article.product_pod"):
        title_link = book.select_one("h3 a")
        price_tag = book.select_one("p.price_color")
        rating_tag = book.select_one("p.star-rating")

        title = title_link.get("title", "N/A") if title_link else "N/A"
        price = price_tag.text if price_tag else "N/A"
        # The rating is the class that accompanies "star-rating", e.g. 'Three'.
        rating_classes = rating_tag.get("class", []) if rating_tag else []
        rating = next(
            (name for name in rating_classes if name != "star-rating"), "N/A"
        )

        books.append({"Title": title, "Price": price, "Rating": rating})
    return books
# Scrape multiple pages
def scrape_books(pages=1):
    """Scrape catalogue pages 1..pages and return the books as a DataFrame.

    Args:
        pages: Number of pages to scrape. Coerced with ``int`` so that a
            float such as ``3.0`` coming from a UI widget is accepted.

    Returns:
        A DataFrame with "Title", "Price" and "Rating" columns. The columns
        are present even when nothing was scraped, so downstream code that
        indexes by column name keeps working on an empty result.
    """
    pages = int(pages)
    all_books = []
    for page in range(1, pages + 1):
        all_books.extend(scrape_page(page))
        if page < pages:
            # ethical scraping: pause between requests, but not after the
            # last one, where it would only delay the result
            time.sleep(1)
    return pd.DataFrame(all_books, columns=["Title", "Price", "Rating"])
# Mask titles (original title removed)
def mask_titles_only(df):
    """Return a copy of ``df`` whose titles are reduced to one char + "***".

    Args:
        df: DataFrame of books; the input is never modified.

    Returns:
        A copy of ``df``. Each string in the "Title" column becomes its first
        character followed by "***" (an empty title becomes "***"). The
        placeholder "N/A" and non-string values are left unchanged. If there
        is no "Title" column the copy is returned as is.
    """
    df_masked = df.copy()
    if "Title" not in df_masked.columns:
        # Nothing to mask, e.g. an empty frame from a failed scrape.
        return df_masked

    def _mask(title):
        if not isinstance(title, str) or title == "N/A":
            return title
        # Slicing instead of indexing keeps empty titles from raising.
        return title[:1] + "***"

    df_masked["Title"] = df_masked["Title"].apply(_mask)
    return df_masked
# Encrypt CSV
def encrypt_csv(df):
    """Write ``df`` as Fernet-encrypted CSV and return the path and key.

    A fresh Fernet key is generated on every call. The CSV text (without the
    index) is encoded to bytes, encrypted, and written to
    "books_encrypted.csv".

    Returns:
        A ``(file_path, key)`` tuple, where ``key`` is the generated key
        decoded to ``str``.
    """
    secret = Fernet.generate_key()
    plaintext = df.to_csv(index=False).encode()
    target = "books_encrypted.csv"
    with open(target, "wb") as handle:
        handle.write(Fernet(secret).encrypt(plaintext))
    return target, secret.decode()
# Gradio interface function
def app_interface(pages, security_option):
    """Scrape books and prepare a downloadable file for the chosen option.

    Args:
        pages: Number of catalogue pages to scrape.
        security_option: "Mask Titles", "Encrypt CSV", or anything else for
            a plain CSV export.

    Returns:
        A ``(dataframe, file_path, message)`` tuple for the three outputs.
    """
    df = scrape_books(pages)

    # Encryption writes its own file and reports the key, so handle it first.
    if security_option == "Encrypt CSV":
        encrypted_file, key = encrypt_csv(df)
        return df, encrypted_file, f"Encrypted CSV ready to download. Key: {key}"

    if security_option == "Mask Titles":
        df = mask_titles_only(df)
        file_path, message = "books_masked.csv", "Masked CSV ready to download."
    else:
        file_path, message = "books.csv", "CSV ready to download."

    df.to_csv(file_path, index=False)
    return df, file_path, message
# Gradio App
# Input widgets: how many pages to scrape and which protection to apply.
page_count_input = gr.Slider(
    minimum=1,
    maximum=50,
    value=1,
    step=1,
    label="Number of Pages to Scrape",
)
security_input = gr.Radio(
    choices=["None", "Mask Titles", "Encrypt CSV"],
    label="Security Option",
)

# Output widgets, in the order app_interface returns its values.
result_outputs = [
    gr.Dataframe(label="Books Data"),
    gr.File(label="Download CSV File"),
    gr.Textbox(label="Info / Encryption Key"),
]

iface = gr.Interface(
    fn=app_interface,
    inputs=[page_count_input, security_input],
    outputs=result_outputs,
    title="Secure Book Scraper",
    description="Scrape book details from 'Books to Scrape'. Mask book titles or encrypt the CSV for secure storage.",
)

iface.launch()