Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
iamkoder001 committed on
Commit
c7c6bc0
·
verified ·
1 Parent(s): fc5998a

Create data/scripts/scraper.py

Browse files
Files changed (1) hide show
  1. data/scripts/scraper.py +55 -0
data/scripts/scraper.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import time
5
+ from urllib.parse import urljoin
6
+
7
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.
    Focus: Indian Gazettes, Environmental Reports, and Global Satellite Metadata.

    Crawls a single index page, collects every anchor whose href ends with a
    given file extension, and downloads each target into a local storage
    directory. Downloads are idempotent: files already present on disk are
    skipped.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page to crawl; also the base for resolving
                relative document links.
            storage_path: Directory where downloads land (created if missing).
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok=True replaces the original exists()/makedirs() pair,
        # which raced between the check and the creation.
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf"):
        """Crawl the index page and download every link ending in *file_ext*.

        Best-effort by design: any network or parse error is reported and
        swallowed so a partial ingestion run does not crash the caller.
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on 4xx/5xx instead of parsing an error page as HTML.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Identify links to documents (Gazettes/Reports).
            for link in soup.find_all('a', href=True):
                href = link['href']
                if not href.endswith(file_ext):
                    continue
                download_url = urljoin(self.base_url, href)
                file_name = href.split('/')[-1]
                self._save_file(download_url, file_name)

                # Respectful delay to prevent server overload.
                time.sleep(1)
        except Exception as e:
            # Deliberate best-effort boundary: log and return.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Download *url* into the storage directory as *name*.

        Skips files that already exist. Raises on HTTP errors (caught by
        the caller's best-effort boundary) rather than saving an error
        page to disk as if it were a document.
        """
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested.

        print(f"Ingesting: {name}")
        # stream=True keeps large PDFs out of memory; the context manager
        # releases the connection; timeout= prevents a stalled server from
        # hanging the run indefinitely (the original request had no timeout).
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
+
51
if __name__ == "__main__":
    # Example target: The Gazette of India (archive node).
    # NOTE: substitute specific URLs for Environmental Impact Assessments.
    SovereignScraper("https://egazette.gov.in/").crawl_and_download()