Spaces:

amit0987
/

selenium-screenshot-gradio

Paused

App Files Files Community

selenium-screenshot-gradio / metadata_extractor.py

niharika17032001

Create app.py

5ca8483 11 months ago

raw

history blame contribute delete

4.03 kB

	import requests
	from lxml import html
	from bs4 import BeautifulSoup
	import json
	import re

	def fetch_html_tree_requests(url: str) -> tuple:
	    """Download *url* with ``requests`` and parse the body with lxml.

	    Args:
	        url: Address of the page to download.

	    Returns:
	        A ``(tree, html_text)`` pair: the parsed lxml tree and the decoded
	        response text. ``(None, None)`` is returned when the request fails,
	        the server answers with a 4xx/5xx status, or the body cannot be
	        parsed into a document (for example an empty response).
	    """
	    # Local import: only needed to name the parser exception below.
	    from lxml import etree

	    # Browser-like User-Agent is sent with every request.
	    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
	    try:
	        response = requests.get(url, headers=headers, timeout=10)
	        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
	    except requests.exceptions.RequestException as e:
	        print(f"Error fetching {url} with requests: {e}")
	        return None, None

	    try:
	        tree = html.fromstring(response.content)
	    except etree.ParserError as e:
	        # lxml raises ParserError (e.g. "Document is empty") for bodies it
	        # cannot turn into a document; report it like any other failure
	        # instead of letting it escape to the caller.
	        print(f"Error parsing HTML from {url}: {e}")
	        return None, None

	    return tree, response.text

	def extract_tbody_html(tree: html.HtmlElement, xpath: str = "/html/body/div[3]/table/tbody") -> str:
	    """Serialize the first node matched by *xpath* as an HTML string.

	    Args:
	        tree: Parsed document exposing an ``xpath`` method.
	        xpath: Expression locating the table body; defaults to an absolute
	            path to the ``tbody`` inside the third ``div`` of ``body``.

	    Returns:
	        The matched node rendered as unicode HTML, or ``None`` when the
	        expression matches nothing.
	    """
	    matches = tree.xpath(xpath)
	    if matches:
	        # Only the first match is serialized; later matches are ignored.
	        return html.tostring(matches[0], encoding='unicode')
	    return None

	def _jsonld_image_url(image):
	    """Return the first URL string found in a JSON-LD ``image`` value, else None."""
	    if isinstance(image, str):
	        return image
	    if isinstance(image, dict):
	        # ImageObject form, e.g. {"@type": "ImageObject", "url": "..."}.
	        return _jsonld_image_url(image.get("url"))
	    if isinstance(image, list):
	        for item in image:
	            url = _jsonld_image_url(item)
	            if url:
	                return url
	    return None

	def extract_thumbnail(tree: html.HtmlElement) -> str:
	    """Extract the thumbnail URL from the page's JSON-LD script tags.

	    Every ``<script type="application/ld+json">`` text node is decoded in
	    document order. A payload may be a single object or an array of
	    objects; the first object carrying an ``image`` entry that resolves to
	    a URL wins. ``image`` may be a plain string, an object with a ``url``
	    field, or a list of either.

	    Args:
	        tree: Parsed document exposing an ``xpath`` method.

	    Returns:
	        The thumbnail URL as a string, or ``None`` when no script yields one.
	    """
	    scripts = tree.xpath("//script[@type='application/ld+json']/text()")
	    for script in scripts:
	        try:
	            payload = json.loads(script.strip())
	        except json.JSONDecodeError:
	            # Malformed JSON-LD is skipped so later scripts still get a chance.
	            continue

	        # Treat a single object and an array of objects uniformly.
	        nodes = payload if isinstance(payload, list) else [payload]
	        for node in nodes:
	            if not isinstance(node, dict) or "image" not in node:
	                continue
	            url = _jsonld_image_url(node["image"])
	            if url is not None:
	                return url
	    return None

	def extract_audio_url(html_text: str) -> str:
	    """Find the MP3 URL passed to a ``new Audio(...)`` call in raw HTML.

	    Only a quoted ``https://`` URL ending in ``.mp3`` that is the sole
	    argument of ``new Audio(`` is recognised.

	    Args:
	        html_text: Raw page source to search.

	    Returns:
	        The first matching URL, or ``None`` when the page has no such call.
	    """
	    found = re.search(r'new Audio\(["\'](https://[^"\']+\.mp3)["\']\)', html_text)
	    if found is None:
	        return None
	    return found.group(1)

	def tbody_to_json(html_tbody: str) -> dict:
	    """Convert a ``<tbody>`` HTML fragment into a label -> value dictionary.

	    Each ``<tr class="tr">`` row with at least two ``<td>`` cells contributes
	    one entry: the first cell's text (trailing colons removed) is the key
	    and the second cell's text is the value. A row labelled ``Rating`` whose
	    value cell contains ``<span>`` elements is stored as a dict with
	    ``stars``, ``out_of`` and ``value``; a ``Rating`` row without spans is
	    kept as plain text like any other row.

	    Args:
	        html_tbody: HTML of the table body; may be empty or ``None``.

	    Returns:
	        The extracted mapping, or ``{}`` when *html_tbody* is empty.
	    """
	    if not html_tbody:
	        return {}

	    soup = BeautifulSoup(html_tbody, "html.parser")
	    data = {}

	    for tr in soup.find_all("tr", class_="tr"):
	        tds = tr.find_all("td")
	        if len(tds) < 2:
	            # Rows without a label/value pair carry no metadata.
	            continue

	        key = tds[0].get_text(strip=True).rstrip(":")
	        value_cell = tds[1]

	        if key == "Rating":
	            stars = value_cell.find_all("span")
	            if stars:
	                stars_str = ''.join(star.get_text(strip=True) for star in stars)
	                data[key] = {
	                    "stars": stars_str,
	                    "out_of": 5,
	                    # Each "★" counts 1 and each "☆" counts 0.5.
	                    # NOTE(review): confirm the site uses "☆" for a half
	                    # star rather than an empty one.
	                    "value": stars_str.count("★") + 0.5 * stars_str.count("☆"),
	                }
	                continue
	            # No star spans: fall through so the rating text is not dropped.

	        data[key] = value_cell.get_text(" ", strip=True)

	    return data

	def extract_song_metadata(url: str) -> dict:
	    """Download a song page and collect its metadata into one dictionary.

	    The result always contains ``"URL"``. When the page cannot be fetched
	    only an ``"error"`` entry is added. Otherwise the table rows are merged
	    in (or ``"tbody_data_present"`` is set to ``False`` when the table is
	    missing), ``"Thumbnail"`` is added when one is found, and
	    ``"Play Online"`` holds the audio URL or ``None``. Any exception raised
	    while extracting is recorded under ``"error_extracting_metadata"``
	    instead of being propagated.

	    Args:
	        url: Address of the song page.

	    Returns:
	        The metadata dictionary described above.
	    """
	    print(f" Attempting to extract metadata from: {url}")
	    tree, html_text = fetch_html_tree_requests(url)
	    if tree is None:
	        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}

	    metadata = {"URL": url}

	    try:
	        table_markup = extract_tbody_html(tree)
	        if not table_markup:
	            metadata["tbody_data_present"] = False
	        else:
	            metadata.update(tbody_to_json(table_markup))

	        thumb = extract_thumbnail(tree)
	        if thumb:
	            metadata["Thumbnail"] = thumb

	        # A missing or empty audio URL is recorded explicitly as None.
	        metadata["Play Online"] = extract_audio_url(html_text) or None

	    except Exception as e:
	        # Best effort: keep whatever was collected and note the failure.
	        metadata["error_extracting_metadata"] = str(e)
	        print(f" Error extracting metadata for {url}: {e}")

	    return metadata

	# Standalone smoke test: running this module directly fetches one sample
	# song page and prints the extracted metadata as indented JSON.
	if __name__ == "__main__":
	    sample_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
	    result = extract_song_metadata(sample_url)
	    # ensure_ascii=False keeps non-ASCII characters readable in the output.
	    print(json.dumps(result, ensure_ascii=False, indent=4))