Create data/scripts/scraper.py
Browse files- data/scripts/scraper.py +55 -0
data/scripts/scraper.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
import time
|
| 5 |
+
from urllib.parse import urljoin
|
| 6 |
+
|
| 7 |
+
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.

    Crawls an index page and downloads the documents it links to
    (Indian Gazettes, Environmental Reports, Global Satellite Metadata)
    into a local storage directory for later training/ingestion.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page URL to crawl for document links.
            storage_path: Directory downloaded files are written to;
                created if it does not already exist.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok=True replaces the racy "check then create" pattern.
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf"):
        """Crawl the index page and download every link ending in *file_ext*.

        Args:
            file_ext: Suffix of hrefs to treat as downloadable documents.
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on HTTP errors instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Identify links to documents (Gazettes/Reports)
            for link in soup.find_all('a', href=True):
                href = link['href']
                if not href.endswith(file_ext):
                    continue
                download_url = urljoin(self.base_url, href)
                file_name = href.split('/')[-1]
                self._save_file(download_url, file_name)

                # Respectful delay to prevent server overload
                time.sleep(1)
        except Exception as e:
            # Top-level boundary: report the failure and keep the caller alive.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Stream *url* to the storage directory as *name*; skip existing files."""
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested

        print(f"Ingesting: {name}")
        try:
            # timeout fixes the original's unbounded hang on a dead server;
            # raise_for_status stops error pages being saved as documents.
            with requests.get(url, stream=True, timeout=15) as r:
                r.raise_for_status()
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except Exception:
            # Don't leave a truncated file behind: the exists-check above
            # would treat it as fully ingested on every future run.
            if os.path.exists(path):
                os.remove(path)
            raise
|
| 50 |
+
|
| 51 |
+
if __name__ == "__main__":
    # Demo target: The Gazette of India (Archive Node).
    # Point this at specific Environmental Impact Assessment URLs as needed.
    SovereignScraper("https://egazette.gov.in/").crawl_and_download()
|