import json
import re
from typing import Optional

import requests
from bs4 import BeautifulSoup
from lxml import html
|
|
def fetch_html_tree_requests(url: str) -> tuple:
    """Download *url* and return ``(lxml_tree, raw_html_text)``.

    The page is requested with a desktop-browser User-Agent and a 10 second
    timeout. The tree is parsed from the raw response bytes; the text is the
    decoded body as provided by ``requests``.

    Returns:
        A 2-tuple ``(tree, text)``. ``(None, None)`` is returned when the
        request fails, the server answers with an error status, or the body
        cannot be parsed as HTML. The failure is printed, not raised.
    """
    # Local import: only the parse-error base class is needed here.
    from lxml import etree

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return html.fromstring(response.content), response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} with requests: {e}")
    except etree.LxmlError as e:
        # An empty or unparseable body makes html.fromstring raise; treat it
        # like any other fetch failure instead of crashing the caller.
        print(f"Error parsing HTML from {url}: {e}")
    return None, None
|
|
def extract_tbody_html(tree: html.HtmlElement, xpath: str = "/html/body/div[3]/table/tbody") -> Optional[str]:
    """Serialize the first element matched by *xpath* to an HTML string.

    Args:
        tree: Parsed lxml document to search.
        xpath: Location of the metadata table body.

    Returns:
        The matched element as a unicode HTML string, or ``None`` when
        nothing matches.

    XPaths copied from a browser inspector usually end in ``/tbody`` because
    browsers insert that element into the DOM, while the raw markup may not
    contain it. When the given path ends in ``/tbody`` and matches nothing,
    the parent path (normally the ``<table>``) is tried instead.
    """
    matches = tree.xpath(xpath)

    suffix = "/tbody"
    if not matches and xpath.endswith(suffix):
        parent_xpath = xpath[: -len(suffix)]
        # Skip degenerate parents such as "" or "/" (from "/tbody", "//tbody").
        if parent_xpath and not parent_xpath.endswith("/"):
            matches = tree.xpath(parent_xpath)

    if not matches:
        return None
    return html.tostring(matches[0], encoding='unicode')
|
|
def _iter_jsonld_nodes(payload):
    """Yield every dict node of a JSON-LD payload (object, array, or @graph)."""
    if isinstance(payload, list):
        for item in payload:
            yield from _iter_jsonld_nodes(item)
    elif isinstance(payload, dict):
        yield payload
        yield from _iter_jsonld_nodes(payload.get("@graph"))


def _image_url(image):
    """Reduce a JSON-LD ``image`` value (string, object, or list) to one URL."""
    if isinstance(image, str):
        return image or None
    if isinstance(image, dict):
        return _image_url(image.get("url"))
    if isinstance(image, list):
        for entry in image:
            url = _image_url(entry)
            if url:
                return url
    return None


def extract_thumbnail(tree: "html.HtmlElement") -> Optional[str]:
    """Return the first image URL found in the page's JSON-LD script tags.

    Each ``<script type="application/ld+json">`` is parsed as JSON; scripts
    that are not valid JSON are skipped. The payload may be a single object,
    an array of objects, or an object with an ``@graph`` member. The
    ``image`` value may be a plain URL string, an object carrying a ``url``
    key, or a list of either.

    Args:
        tree: Parsed document; only its ``xpath`` method is used.

    Returns:
        The first non-empty image URL, or ``None`` when there is none.
    """
    for raw in tree.xpath("//script[@type='application/ld+json']/text()"):
        try:
            payload = json.loads(raw.strip())
        except json.JSONDecodeError:
            continue
        for node in _iter_jsonld_nodes(payload):
            url = _image_url(node.get("image"))
            if url:
                return url
    return None
|
|
def extract_audio_url(html_text: str) -> Optional[str]:
    """Find the MP3 URL passed to a ``new Audio(...)`` call in raw HTML.

    Matches a quoted ``http://`` or ``https://`` URL ending in ``.mp3``,
    optionally followed by a query string. Whitespace around the argument
    is tolerated.

    Args:
        html_text: Raw page source; ``None`` or an empty string is accepted.

    Returns:
        The URL of the first match, or ``None`` when there is no match or
        no input.
    """
    if not html_text:
        return None
    match = re.search(
        r'new\s+Audio\(\s*["\'](https?://[^"\']+\.mp3(?:\?[^"\']*)?)["\']\s*\)',
        html_text,
    )
    return match.group(1) if match else None
|
|
def _parse_rating(value_cell):
    """Build the rating dict from the star ``<span>``s of a cell, or None."""
    spans = value_cell.find_all("span")
    if not spans:
        return None
    stars_str = ''.join(span.get_text(strip=True) for span in spans)
    return {
        "stars": stars_str,
        "out_of": 5,
        # NOTE(review): "☆" is counted as half a star here; if the site uses
        # it as an *empty* star this over-reports the rating -- confirm.
        "value": stars_str.count("★") + 0.5 * stars_str.count("☆"),
    }


def tbody_to_json(html_tbody: str) -> dict:
    """Convert the metadata table HTML into a ``{label: value}`` dictionary.

    Only ``<tr class="tr">`` rows with at least two ``<td>`` cells are used.
    The first cell's text, with trailing colons removed, is the key. The
    second cell's text, joined with single spaces, is the value.

    A row labelled ``Rating`` whose value cell contains ``<span>`` elements
    is stored as ``{"stars": <glyphs>, "out_of": 5, "value": <number>}``.
    A ``Rating`` row without spans is stored as plain text like any other
    row.

    Args:
        html_tbody: HTML fragment containing the rows; may be empty or None.

    Returns:
        The extracted mapping; ``{}`` when *html_tbody* is empty or None.
    """
    if not html_tbody:
        return {}

    soup = BeautifulSoup(html_tbody, "html.parser")
    data = {}

    for row in soup.find_all("tr", class_="tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        key = cells[0].get_text(strip=True).rstrip(":")
        value_cell = cells[1]

        if key == "Rating":
            rating = _parse_rating(value_cell)
            if rating is not None:
                data[key] = rating
                continue

        data[key] = value_cell.get_text(" ", strip=True)

    return data
|
|
def extract_song_metadata(url: str) -> dict:
    """Download a song page and gather its table fields, thumbnail and audio link.

    The result always contains ``"URL"``. When the page cannot be fetched,
    only ``"URL"`` and ``"error"`` are returned. Otherwise the table rows are
    merged in (or ``"tbody_data_present": False`` is set when no table was
    found), ``"Thumbnail"`` is added when available, and ``"Play Online"``
    holds the audio URL or ``None``. Any exception raised while extracting is
    recorded under ``"error_extracting_metadata"`` and printed, and whatever
    was collected so far is returned.
    """
    print(f" Attempting to extract metadata from: {url}")

    tree, html_text = fetch_html_tree_requests(url)
    if tree is None:
        return {"URL": url, "error": "Failed to fetch page with requests or network issue."}

    metadata = {"URL": url}
    try:
        table_html = extract_tbody_html(tree)
        if not table_html:
            metadata["tbody_data_present"] = False
        else:
            metadata.update(tbody_to_json(table_html))

        thumb = extract_thumbnail(tree)
        if thumb:
            metadata["Thumbnail"] = thumb

        # Key is always present; any falsy result is stored as None.
        metadata["Play Online"] = extract_audio_url(html_text) or None
    except Exception as exc:
        metadata["error_extracting_metadata"] = str(exc)
        print(f" Error extracting metadata for {url}: {exc}")

    return metadata
|
|
| |
if __name__ == "__main__":
    # Manual smoke test: scrape one known song page and pretty-print the result.
    test_url = "https://pagalgana.com/0mp-Mechanical-sundariye-2.0-hindiLl.html"
    print(json.dumps(extract_song_metadata(test_url), indent=4, ensure_ascii=False))