Spaces:
Running
Running
| # scraper.py | |
| import urllib.request | |
| from bs4 import BeautifulSoup | |
def extract_content(url, *, timeout=30):
    """Download a web page and pull out its main content pieces.

    Args:
        url: Address of the page to fetch; passed straight to
            ``urllib.request.urlopen``.
        timeout: Seconds to wait on the network before giving up
            (keyword-only). Without a limit a stalled server would block
            the caller indefinitely.

    Returns:
        A dict with the following keys, or ``None`` if fetching or parsing
        raised any exception (the error is printed, not re-raised):

        - ``"headings"``: text of every h1-h6 tag, grouped by level
          (all h1 first, then all h2, and so on).
        - ``"paragraphs"``: stripped text of every non-empty ``<p>`` tag.
        - ``"images"``: ``src`` attribute of every ``<img>`` that has one,
          exactly as written in the markup.
        - ``"links"``: ``href`` attribute of every ``<a>`` that has one,
          exactly as written in the markup.
        - ``"text"``: the page's text nodes joined with single spaces.
    """
    try:
        # The context manager closes the HTTP response even when read() fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            page_data = response.read()

        soup = BeautifulSoup(page_data, "html5lib")

        # Headings are collected level by level, not in document order.
        headings = []
        for level in range(1, 7):
            headings.extend(
                h.get_text(strip=True) for h in soup.find_all(f'h{level}')
            )

        # Strip each paragraph once, then drop the ones that end up empty.
        stripped = (p.get_text(strip=True) for p in soup.find_all('p'))
        paragraphs = [para for para in stripped if para]

        # src/href values are returned raw; they are not resolved against url.
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]

        # NOTE(review): depending on the bs4 version and parser, this may
        # also contain <script>/<style> contents -- confirm if strictly
        # human-visible text is required.
        text = soup.get_text(separator=' ', strip=True)

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
        }
    except Exception as e:
        # Deliberate catch-all: callers get None instead of an exception.
        print("❌ Error while fetching webpage:", e)
        return None