Create data/scripts/scraper.py
Browse files- data/scripts/scraper.py +55 -0
data/scripts/scraper.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
import time
|
| 5 |
+
from urllib.parse import urljoin
|
| 6 |
+
|
| 7 |
+
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.

    Crawls an index page and downloads the documents it links to
    (Indian Gazettes, Environmental Reports, Global Satellite Metadata)
    into a local storage directory for later training/ingestion.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page URL to crawl for document links.
            storage_path: Directory downloaded files are written to;
                created if it does not already exist.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok=True replaces the racy "check then create" pattern.
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf"):
        """Crawl the index page and download every link ending in *file_ext*.

        Args:
            file_ext: Suffix of hrefs to treat as downloadable documents.
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on HTTP errors instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Identify links to documents (Gazettes/Reports)
            for link in soup.find_all('a', href=True):
                href = link['href']
                if not href.endswith(file_ext):
                    continue
                download_url = urljoin(self.base_url, href)
                file_name = href.split('/')[-1]
                self._save_file(download_url, file_name)

                # Respectful delay to prevent server overload
                time.sleep(1)
        except Exception as e:
            # Top-level boundary: report the failure and keep the caller alive.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Stream *url* to the storage directory as *name*; skip existing files."""
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested

        print(f"Ingesting: {name}")
        try:
            # timeout fixes the original's unbounded hang on a dead server;
            # raise_for_status stops error pages being saved as documents.
            with requests.get(url, stream=True, timeout=15) as r:
                r.raise_for_status()
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except Exception:
            # Don't leave a truncated file behind: the exists-check above
            # would treat it as fully ingested on every future run.
            if os.path.exists(path):
                os.remove(path)
            raise
|
| 50 |
+
|
| 51 |
+
if __name__ == "__main__":
    # Demo target: The Gazette of India (Archive Node).
    # Point this at specific Environmental Impact Assessment URLs as needed.
    SovereignScraper("https://egazette.gov.in/").crawl_and_download()
|