Commit 5ca8483
Parent(s): 0bba599
Create app.py
Files changed:
- app.py +13 -2
- crawler.py +326 -0
- main_script.py +95 -0
- metadata_extractor.py +110 -0
app.py
CHANGED
@@ -6,12 +6,14 @@ from PIL import Image
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
 
+import main_script
+
 driver = None
 
 
 def get_chrome_options():
     options = webdriver.ChromeOptions()
-    options.add_argument('--headless')
+    # options.add_argument('--headless')
     options.add_argument('--no-sandbox')
     options.add_argument('--disable-dev-shm-usage')
 
@@ -56,9 +58,18 @@ def take_screenshot(url):
 
     return images
 
+def call_main_script():
+    main_script.main()
+
+def main(url):
+    return call_main_script()
+    # return take_screenshot(url)
+
+
+
 
 iface = gr.Interface(
-    fn=take_screenshot,
+    fn=main,
     inputs=gr.Textbox(label="Website URL", value="https://www.google.com/"),
     outputs=gr.Gallery(label="Screenshots", columns=3, height="auto"),
     title="Website Screenshots",
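Review note on the new wiring in app.py: `main(url)` ignores its `url` argument and returns the result of `call_main_script()`, which is always None because `main_script.main()` has no return statement, so the `gr.Gallery` output never receives any images. A minimal sketch of a wrapper that surfaces the crawler's screenshots instead (assuming the `(song_page_urls, images)` return shape of `crawl_pagalgana_site` from crawler.py below; the filenames here are placeholders, not the committed configuration):

    # Sketch only, not the committed code: feed the crawler's screenshots to the Gallery.
    import crawler

    def main(url):
        _, images = crawler.crawl_pagalgana_site(
            base_url=url,
            song_pages_json_file="song_pages.json",  # placeholder filename
            state_filename="crawl_state.json",       # placeholder filename
            max_crawl_depth=2,
            save_interval=10,
            images=[],
        )
        return images  # a list of PIL images, which gr.Gallery can render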
crawler.py
ADDED
@@ -0,0 +1,326 @@
+import requests
+from lxml import html
+from collections import deque
+import json
+import time
+import os
+
+# --- Choose your Selenium setup ---
+# OPTION A: Standard Selenium
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+
+# OPTION B: undetected_chromedriver (Uncomment these if you want to use UC)
+# import undetected_chromedriver as uc
+
+
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
+
+
+from PIL import Image
+from io import BytesIO
+
+
+def set_screenshot(driver, images=[]):
+    png = driver.get_screenshot_as_png()
+    image = Image.open(BytesIO(png))
+    images.append(image)
+    return images
+
+
+def get_chrome_options():
+    options = webdriver.ChromeOptions()
+    # options.add_argument('--headless')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+
+    return options
+
+
+def set_driver():
+    options = get_chrome_options()
+
+    try:
+        web_driver = webdriver.Chrome(options=options)
+        web_driver.set_window_size(1080, 720)  # Adjust the window size here
+    except WebDriverException as e:
+        return Image.new('RGB', (1, 1))
+
+    return web_driver
+
+
+
+
+
+
+
+
+# --- Selenium setup functions (choose one based on your choice above) ---
+
+# OPTION A: Standard Selenium (Use this if you prefer standard selenium)
+# def get_chrome_options():
+#     options = webdriver.ChromeOptions()
+#     options.add_argument("--headless")
+#     options.add_argument("--no-sandbox")
+#     options.add_argument("--disable-gpu")
+#     options.add_argument("--disable-dev-shm-usage")
+#     options.add_argument("--window-size=1920,1080")
+#     options.add_argument(
+#         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+#     return options
+
+
+def create_webdriver_instance(browser_type="chrome"):
+    if browser_type.lower() == "chrome":
+        chrome_options = get_chrome_options()
+        try:
+            # Assumes chromedriver is in PATH or specified path (e.g., /usr/bin/chromedriver on GitHub Actions)
+            service = Service(executable_path="/usr/bin/chromedriver")
+            driver = webdriver.Chrome(service=service, options=chrome_options)
+            return driver
+        except WebDriverException as e:
+            print(f"Error initializing ChromeDriver. Error: {e}")
+            return None
+    else:
+        raise ValueError("Unsupported browser type.")
+
+
+# OPTION B: undetected_chromedriver (Uncomment this block and comment OPTION A if you want to use UC)
+# def get_chrome_options():
+#     options = uc.ChromeOptions()
+#     options.add_argument("--headless")
+#     options.add_argument("--no-sandbox")
+#     options.add_argument("--disable-gpu")
+#     options.add_argument("--disable-dev-shm-usage")
+#     options.add_argument("--window-size=1920,1080")
+#     return options
+
+# def create_webdriver_instance(browser_type="chrome"):
+#     if browser_type.lower() == "chrome":
+#         chrome_options = get_chrome_options()
+#         try:
+#             driver = uc.Chrome(options=chrome_options)
+#             return driver
+#         except WebDriverException as e:
+#             print(f"Error initializing undetected_chromedriver. Error: {e}")
+#             return None
+#     else:
+#         raise ValueError("Unsupported browser type.")
+
+
+# --- Resumable Crawling Logic ---
+
+def save_crawl_state(to_visit_deque, visited_set, song_urls_list, state_filename="crawl_state.json",
+                     song_pages_json_file="pagalgana_song_pages.json"):
+    """Saves the current state of the crawler to JSON files."""
+    try:
+        with open(song_pages_json_file, 'w', encoding='utf-8') as f:
+            json.dump(song_urls_list, f, indent=4)
+
+        crawl_state_data = {
+            "to_visit": list(to_visit_deque),
+            "visited_urls": list(visited_set)
+        }
+        with open(state_filename, 'w', encoding='utf-8') as f:
+            json.dump(crawl_state_data, f, indent=4)
+        print(
+            f"--- Crawl state saved. URLs to visit: {len(to_visit_deque)}, Visited: {len(visited_set)}, Song pages found: {len(song_urls_list)} ---")
+    except IOError as e:
+        print(f"Error saving crawl state: {e}")
+    except Exception as e:
+        print(f"An unexpected error occurred while saving state: {e}")
+
+
+def load_crawl_state(state_filename="crawl_state.json", song_pages_json_file="pagalgana_song_pages.json"):
+    """Loads previous crawl state if files exist."""
+    to_visit_deque = deque()
+    visited_set = set()
+    song_urls_list = []
+
+    if os.path.exists(song_pages_json_file):
+        try:
+            with open(song_pages_json_file, 'r', encoding='utf-8') as f:
+                song_urls_list = json.load(f)
+            print(f"Loaded {len(song_urls_list)} song URLs from '{song_pages_json_file}'.")
+        except json.JSONDecodeError:
+            print(f"Warning: '{song_pages_json_file}' corrupted or empty. Starting fresh song list.")
+            song_urls_list = []
+        except Exception as e:
+            print(f"Error loading '{song_pages_json_file}': {e}")
+
+    if os.path.exists(state_filename):
+        try:
+            with open(state_filename, 'r', encoding='utf-8') as f:
+                crawl_state_data = json.load(f)
+            to_visit_deque = deque(crawl_state_data.get("to_visit", []))
+            visited_set = set(crawl_state_data.get("visited_urls", []))
+            print(f"Loaded crawl state: {len(to_visit_deque)} URLs to visit, {len(visited_set)} visited.")
+        except json.JSONDecodeError:
+            print(f"Warning: '{state_filename}' corrupted or empty. Starting fresh state.")
+            to_visit_deque = deque()
+            visited_set = set()
+        except Exception as e:
+            print(f"Error loading '{state_filename}': {e}")
+
+    return to_visit_deque, visited_set, song_urls_list
+
+
+def crawl_pagalgana_site(base_url: str, song_pages_json_file: str, max_crawl_depth: int, state_filename: str,
+                         save_interval: int, images):
+    """
+    Crawls Pagalgana.com to find and save song page URLs.
+    Supports resuming a crawl.
+    """
+    # driver = create_webdriver_instance()
+    driver = set_driver()
+    if not driver:
+        print("Failed to initialize WebDriver. Exiting.")
+        return []  # Return empty list if WebDriver fails
+
+    to_visit, visited_urls, song_page_urls = load_crawl_state(state_filename, song_pages_json_file)
+
+    if not to_visit and not visited_urls:
+        print("No previous crawl state found. Starting fresh.")
+        to_visit.append((base_url, 0))
+    else:
+        print("Resuming crawl from previous state.")
+        if base_url not in visited_urls and (base_url, 0) not in to_visit:
+            to_visit.appendleft((base_url, 0))
+
+    AUDIO_CONTAINER_XPATH = '//*[@id="audio-container"]'
+    LOAD_MORE_BUTTON_XPATH = '//a[@class="button" and contains(@onclick, "loadMoreCategory")]'
+
+    print(f"Starting/Resuming crawl with base: {base_url}, max depth: {max_crawl_depth}")
+    print(
+        f"Initial Queue size: {len(to_visit)}, Initial Visited size: {len(visited_urls)}, Song page URLs: {len(song_page_urls)}")
+
+    processed_count = 0
+    while to_visit:
+        current_url, current_depth = to_visit.popleft()
+
+        if current_url in visited_urls:
+            continue
+
+        if current_depth > max_crawl_depth:
+            print(f"Skipping {current_url} - max depth reached ({max_crawl_depth})")
+            continue
+
+        print(f"\n--- Visiting ({current_depth}): {current_url} ---")
+        visited_urls.add(current_url)
+        processed_count += 1
+
+        try:
+            driver.get(current_url)
+            time.sleep(3)  # Give page more time to load and execute JS
+
+            print(f"  Page title: {driver.title}")
+            print(f"  Current URL after load: {driver.current_url}")
+            images = set_screenshot(driver=driver, images=images)
+
+            # Optional: print HTML snippet for debugging. Remove for cleaner logs in production.
+            # print("  --- HTML snippet (first 2000 chars) ---")
+            # print(driver.page_source[:2000])
+            # print("  --- End HTML snippet ---")
+
+            # Check for Cloudflare challenge (if using standard Selenium)
+            if "Attention Required" in driver.title or "cloudflare" in driver.page_source.lower():
+                print(
+                    "  --> Cloudflare challenge detected! Try switching to undetected_chromedriver or add a longer sleep.")
+                print("  --> Skipping current URL due to Cloudflare challenge.")
+                images = set_screenshot(driver=driver, images=images)
+                continue  # Skip this URL if Cloudflare is blocking it
+
+            # Check if it's a song page
+            audio_container_elements = driver.find_elements(By.XPATH, AUDIO_CONTAINER_XPATH)
+            if audio_container_elements:
+                print(f"  --> FOUND AUDIO CONTAINER! This is a song page: {current_url}")
+                if current_url not in song_page_urls:
+                    song_page_urls.append(current_url)
+
+            # Handle "Load More" button if present
+            load_more_found_and_clicked = False
+            while True:
+                try:
+                    load_more_button = WebDriverWait(driver, 15).until(
+                        EC.element_to_be_clickable((By.XPATH, LOAD_MORE_BUTTON_XPATH))
+                    )
+
+                    last_height = driver.execute_script("return document.body.scrollHeight")
+
+                    print("  Clicking 'Load More' button...")
+                    load_more_button.click()
+                    load_more_found_and_clicked = True
+
+                    new_height = last_height
+                    scroll_attempts = 0
+                    while new_height == last_height and scroll_attempts < 7:
+                        time.sleep(2)
+                        new_height = driver.execute_script("return document.body.scrollHeight")
+                        scroll_attempts += 1
+
+                    if new_height == last_height:
+                        print("  No more content loaded after click, or button disappeared.")
+                        break
+
+                except (NoSuchElementException, TimeoutException):
+                    if not load_more_found_and_clicked:
+                        print("  'Load More' button not found or not clickable.")
+                    else:
+                        print("  'Load More' button no longer present (all content likely loaded).")
+                    break
+                except Exception as e:
+                    print(f"  Error clicking 'Load More': {e}")
+                    break
+
+            # After all content is loaded, parse the HTML
+            tree = html.fromstring(driver.page_source)
+
+            # Extract nested links from the fully loaded page
+            links = tree.xpath('//a/@href')
+            print(f"  Found {len(links)} raw links on the page.")
+
+            links_added_to_queue = 0
+            for link in links:
+                absolute_url = requests.compat.urljoin(current_url, link)
+
+                if "pagalgana.com" in absolute_url and "#" not in absolute_url and "?" not in absolute_url:
+                    if not (absolute_url.endswith(
+                            ('.mp3', '.zip', '.rar', '.jpg', '.png', '.gif', '.pdf', '.txt', '.xml', '.css', '.js'))):
+                        if absolute_url not in visited_urls and (absolute_url, current_depth + 1) not in to_visit:
+                            if absolute_url not in song_page_urls:  # Don't re-add if already identified as a song page
+                                to_visit.append((absolute_url, current_depth + 1))
+                                links_added_to_queue += 1
+            # print(f"  Added {links_added_to_queue} new valid links to the queue from {current_url}.")
+
+        except Exception as e:
+            print(f"  An unexpected error occurred for {current_url}: {e}")
+        finally:
+            if processed_count % save_interval == 0:
+                print(f"--- Processed {processed_count} pages. Saving current crawl state... ---")
+                save_crawl_state(to_visit, visited_urls, song_page_urls, state_filename, song_pages_json_file)
+
+    driver.quit()
+
+    print("\n--- Crawl finished. Performing final save of song page URLs. ---")
+    save_crawl_state(to_visit, visited_urls, song_page_urls, state_filename, song_pages_json_file)
+    print(f"\nCrawl complete. Total {len(song_page_urls)} song pages found and saved to '{song_pages_json_file}'.")
+    images = set_screenshot(driver=driver, images=images)
+    return song_page_urls, images  # Return the list of discovered song pages
+
+
+# This __name__ block is for testing `crawler.py` independently
+if __name__ == "__main__":
+    # Example usage for standalone testing of the crawler
+    # When run via main_script.py, this block won't execute
+    images = []
+    discovered_urls, images = crawl_pagalgana_site(
+        base_url="https://pagalgana.com/category/bollywood-mp3-songs.html",
+        song_pages_json_file="bollywood_song_pages.json",
+        state_filename="bollywood_crawl_state.json",
+        max_crawl_depth=2,  # Keep low for testing
+        save_interval=5,
+        images=images
+    )
+    print(f"Crawler finished. Discovered {len(discovered_urls)} song URLs.")
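Review notes on crawler.py as committed: `set_screenshot`'s `images=[]` default is a shared mutable list, so callers that omit `images` keep appending to the same object across invocations. `set_driver` returns a 1x1 PIL Image on `WebDriverException`; that object is truthy, so the `if not driver:` guard in `crawl_pagalgana_site` never fires and the crawl then fails on `driver.get(...)`. The final `set_screenshot` call also runs after `driver.quit()`, which raises once the session is closed, and the early `return []` does not match the `(urls, images)` tuple that callers unpack. A sketch of the fixes, reusing the committed names:

    def set_screenshot(driver, images=None):
        if images is None:  # avoid the shared mutable-default list
            images = []
        images.append(Image.open(BytesIO(driver.get_screenshot_as_png())))
        return images

    def set_driver():
        try:
            web_driver = webdriver.Chrome(options=get_chrome_options())
            web_driver.set_window_size(1080, 720)
            return web_driver
        except WebDriverException as e:
            print(f"Error initializing ChromeDriver: {e}")
            return None  # falsy, so the `if not driver:` guard actually triggers

    # At the end of crawl_pagalgana_site: screenshot first, then quit,
    # and keep the failure return shape consistent with the success path:
    #     images = set_screenshot(driver=driver, images=images)
    #     driver.quit()
    # ...and on WebDriver failure: `return [], images` instead of `return []`.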
main_script.py
ADDED
@@ -0,0 +1,95 @@
+import json
+import os
+import time
+from typing import List, Dict
+
+# Import functions from your separate files
+from crawler import crawl_pagalgana_site, load_crawl_state, save_crawl_state
+from metadata_extractor import extract_song_metadata
+
+
+def main():
+    images = []
+    # --- Configuration ---
+    BASE_URL = "https://pagalgana.com/12-baje-le-chalau-blender-2025-raushan-rohi-bhojp-uuw.html"
+    MAX_CRAWL_DEPTH = 10  # Adjust this for how deep you want to crawl
+    CRAWL_STATE_FILE = "bollywood_crawl_state.json"
+    SONG_PAGES_FILE = "bollywood_song_pages.json"  # Output from crawler
+    METADATA_OUTPUT_FILE = "bollywood_song_metadata.json"  # Final output with detailed metadata
+    CRAWLER_SAVE_INTERVAL = 10  # Save crawler state every X pages
+    METADATA_SAVE_INTERVAL = 50  # Save metadata periodically every X songs extracted
+
+    print("Starting Pagalgana Web Scraper and Metadata Extractor.")
+
+    # --- Phase 1: Crawl the site to find song page URLs ---
+    print("\n## Phase 1: Discovering Song Page URLs ##")
+    # This function will handle loading/saving its own state
+    discovered_song_urls, images = crawl_pagalgana_site(
+        base_url=BASE_URL,
+        song_pages_json_file=SONG_PAGES_FILE,
+        state_filename=CRAWL_STATE_FILE,
+        max_crawl_depth=MAX_CRAWL_DEPTH,
+        save_interval=CRAWLER_SAVE_INTERVAL,
+        images=images
+    )
+    print(f"\nPhase 1 Complete. Found {len(discovered_song_urls)} unique song page URLs.")
+
+    # --- Phase 2: Extract metadata from discovered song URLs ---
+    print("\n## Phase 2: Extracting Metadata from Song Pages ##")
+
+    # Load previously extracted metadata to enable resuming this phase
+    # We use a dummy state_filename for this load to just get the metadata list
+    _, _, _, existing_metadata = load_crawl_state(
+        state_filename="dummy_state_for_metadata_load.json",  # This specific file won't be used by crawler
+        song_pages_json_file=SONG_PAGES_FILE,  # This is loaded by the crawler
+        metadata_json_file=METADATA_OUTPUT_FILE  # This is the file we care about loading here
+    )
+
+    # Create a set of URLs for which we already have metadata
+    processed_metadata_urls = {entry.get("URL") for entry in existing_metadata if
+                               isinstance(entry, dict) and "URL" in entry}
+
+    metadata_extracted_count = 0
+    new_metadata_entries: List[Dict] = []  # To store new entries from this run
+
+    # Iterate through each discovered song URL
+    for url in discovered_song_urls:
+        if url in processed_metadata_urls:
+            print(f"  Metadata for {url} already extracted. Skipping.")
+            continue
+
+        metadata = extract_song_metadata(url)
+        new_metadata_entries.append(metadata)
+        metadata_extracted_count += 1
+
+        # Add the URL to our tracking set to avoid duplicates in this run
+        processed_metadata_urls.add(url)
+
+        # Save metadata periodically
+        if metadata_extracted_count % METADATA_SAVE_INTERVAL == 0:
+            # Combine existing and new metadata for periodic save
+            combined_metadata = existing_metadata + new_metadata_entries
+            try:
+                with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
+                    json.dump(combined_metadata, f, indent=4, ensure_ascii=False)
+                print(f"  --- Saved {len(combined_metadata)} metadata entries to '{METADATA_OUTPUT_FILE}'. ---")
+            except IOError as e:
+                print(f"  Error saving metadata periodically: {e}")
+
+        time.sleep(0.5)  # Be kind to the server, small delay between fetches
+
+    # Final save of all metadata
+    final_metadata = existing_metadata + new_metadata_entries
+    try:
+        with open(METADATA_OUTPUT_FILE, 'w', encoding='utf-8') as f:
+            json.dump(final_metadata, f, indent=4, ensure_ascii=False)
+    except IOError as e:
+        print(f"Error saving final metadata to '{METADATA_OUTPUT_FILE}': {e}")
+
+    print(f"\nPhase 2 Complete. Extracted metadata for {len(new_metadata_entries)} new song pages.")
+    print(f"Total {len(final_metadata)} unique song metadata entries saved to '{METADATA_OUTPUT_FILE}'.")
+    print("\nScraping process finished.")
+
+
+if __name__ == "__main__":
+    main()
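Review note on main_script.py: the Phase 2 call does not match the `load_crawl_state` actually defined in crawler.py, which accepts only `(state_filename, song_pages_json_file)` and returns three values; passing a `metadata_json_file` keyword and unpacking four values will raise a TypeError at runtime. (Note also that BASE_URL points at a single song page rather than a category listing.) A sketch of a small dedicated loader Phase 2 could use instead:

    # Sketch: load previously extracted metadata directly, rather than
    # overloading crawler.load_crawl_state with a parameter it doesn't have.
    def load_existing_metadata(metadata_json_file: str) -> list:
        if not os.path.exists(metadata_json_file):
            return []
        try:
            with open(metadata_json_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (json.JSONDecodeError, IOError):
            return []  # corrupted or unreadable file: start fresh

    # existing_metadata = load_existing_metadata(METADATA_OUTPUT_FILE)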
metadata_extractor.py
ADDED
@@ -0,0 +1,110 @@
+import requests
+from lxml import html
+from bs4 import BeautifulSoup
+import json
+import re
+
+def fetch_html_tree_requests(url: str) -> tuple:
+    """Fetches HTML using requests and returns lxml tree and raw HTML."""
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+        return html.fromstring(response.content), response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url} with requests: {e}")
+        return None, None
+
+def extract_tbody_html(tree: html.HtmlElement, xpath: str = "/html/body/div[3]/table/tbody") -> str:
+    """Extracts the tbody HTML string from an lxml tree."""
+    result = tree.xpath(xpath)
+    if not result:
+        return None
+    return html.tostring(result[0], encoding='unicode')
+
+def extract_thumbnail(tree: html.HtmlElement) -> str:
+    """Extracts the thumbnail URL from JSON-LD script tags."""
+    scripts = tree.xpath("//script[@type='application/ld+json']/text()")
+    for script in scripts:
+        try:
+            json_data = json.loads(script.strip())
+            if isinstance(json_data, dict) and "image" in json_data:
+                return json_data["image"]
+        except json.JSONDecodeError:
+            continue
+    return None
+
+def extract_audio_url(html_text: str) -> str:
+    """Extracts the MP3 audio URL using regex from raw HTML."""
+    match = re.search(r'new Audio\(["\'](https://[^"\']+\.mp3)["\']\)', html_text)
+    return match.group(1) if match else None
+
+def tbody_to_json(html_tbody: str) -> dict:
+    """Parses tbody HTML using BeautifulSoup and converts to a dictionary."""
+    if not html_tbody:
+        return {}
+    soup = BeautifulSoup(html_tbody, "html.parser")
+    data = {}
+
+    for tr in soup.find_all("tr", class_="tr"):
+        tds = tr.find_all("td")
+        if len(tds) < 2:
+            continue
+
+        key = tds[0].get_text(strip=True).rstrip(":")
+        value_cell = tds[1]
+
+        if key == "Rating":
+            stars = value_cell.find_all("span")
+            if stars:
+                stars_str = ''.join(star.get_text(strip=True) for star in stars)
+                data[key] = {
+                    "stars": stars_str,
+                    "out_of": 5,
+                    "value": stars_str.count("★") + 0.5 * stars_str.count("☆")
+                }
+            continue
+
+        value = value_cell.get_text(" ", strip=True)
+        data[key] = value
+
+    return data
+
+def extract_song_metadata(url: str) -> dict:
+    """Fetches a song page and extracts all relevant metadata."""
+    print(f"  Attempting to extract metadata from: {url}")
+    tree, html_text = fetch_html_tree_requests(url)
+    if tree is None:
+        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}
+
+    metadata = {"URL": url}
+
+    try:
+        tbody_html = extract_tbody_html(tree)
+        if tbody_html:
+            metadata.update(tbody_to_json(tbody_html))
+        else:
+            metadata["tbody_data_present"] = False
+
+        thumbnail_url = extract_thumbnail(tree)
+        if thumbnail_url:
+            metadata["Thumbnail"] = thumbnail_url
+
+        audio_url = extract_audio_url(html_text)
+        if audio_url:
+            metadata["Play Online"] = audio_url
+        else:
+            metadata["Play Online"] = None
+
+    except Exception as e:
+        metadata["error_extracting_metadata"] = str(e)
+        print(f"  Error extracting metadata for {url}: {e}")
+
+    return metadata
+
+# This __name__ block is for testing `metadata_extractor.py` independently
+if __name__ == "__main__":
+    # Example usage for standalone testing
+    test_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
+    metadata = extract_song_metadata(test_url)
+    print(json.dumps(metadata, indent=4, ensure_ascii=False))
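Review note on metadata_extractor.py: the rating heuristic in `tbody_to_json` scores each `☆` as half a star, although `☆` usually renders an empty star; whether that matches the site's markup is worth verifying against a live page. The parser itself can be exercised offline with a small fixture (hypothetical markup that mirrors the `tr.tr`/`td` structure the code expects):

    # Sketch: offline check of tbody_to_json against a minimal fixture.
    sample_tbody = """
    <tbody>
      <tr class="tr"><td>Song Name:</td><td>Example Track</td></tr>
      <tr class="tr"><td>Singer:</td><td>Example Artist</td></tr>
      <tr class="tr"><td>Rating:</td><td><span>★</span><span>★</span><span>☆</span></td></tr>
    </tbody>
    """
    print(json.dumps(tbody_to_json(sample_tbody), ensure_ascii=False, indent=2))
    # -> {"Song Name": "Example Track", "Singer": "Example Artist",
    #     "Rating": {"stars": "★★☆", "out_of": 5, "value": 2.5}}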