Commit f0c36b6 · committed by github-actions[bot]
Parent(s): 158badf
Deploy from GitHub Actions 2025-12-11_03:43:35

Files changed: tools/build_dataset.py (+564 -65)
tools/build_dataset.py
CHANGED
@@ -1,11 +1,15 @@
 # tools/build_dataset.py
 """
-Enhanced SAP Dataset Builder
-Scrapes from multiple free sources:
 - SAP Community blogs
 - GitHub SAP repositories
-- SAP
 - Dev.to & tech blogs
 """

 import requests
@@ -22,21 +26,266 @@ class SAPDatasetBuilder:
     def __init__(self):
         self.dataset = []
         self.seen_urls = set()
         self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
         }

     def setup_directories(self):
         """Create necessary directories"""
         Path("data").mkdir(exist_ok=True)
         Path("data/raw").mkdir(exist_ok=True)

     # ============== SAP Community Source ==============
     def scrape_sap_community(self):
-        """Scrape from SAP Community blogs"""
         print("\n🔵 Scraping SAP Community blogs...")

-
             # Core admin/dev topics
             "SAP Basis",
             "SAP ABAP",
@@ -112,13 +361,55 @@
                 time.sleep(0.2)
         except Exception as e:
             print(f"   ⚠️ SAP RSS error: {e}")
-
     # ============== GitHub Source ==============
     def scrape_github_sap_repos(self):
         """Scrape from GitHub SAP-related repositories"""
         print("\n🐙 Scraping GitHub SAP repositories...")

         queries = [
             "SAP language:python",
             "SAP language:typescript",
             "SAP language:javascript",
@@ -142,12 +433,13 @@
                         content = readme_response.text
                         if len(content) > 300:
                             self.add_to_dataset({
-                                'url':
                                 'title': f"GitHub: {repo['name']}",
                                 'content': content[:15000],
                                 'description': repo.get('description', ''),
                                 'source': 'github',
-                                'content_type': 'markdown'
                             })
                             print(f"   ✅ Added: {repo['name']}")
                             break
@@ -164,61 +456,86 @@
         """Scrape from dev.to"""
         print("\n🟢 Scraping Dev.to articles...")

-
-
-
-
-
-
-
-            content = article.get('body_markdown', '') or article.get('description', '')
-            self.add_to_dataset({
-                'url': article['url'],
-                'title': article['title'],
-                'content': content,
-                'author': article['user']['name'],
-                'source': 'devto',
-                'published': article['published_at']
-            })
-            print(f"   ✅ Added: {article['title'][:50]}")

-
-
-

     # ============== Medium ==============
     def scrape_medium_tag(self):
         """Scrape Medium articles tagged sap via RSS (public)"""
-        print("\n🟣 Scraping Medium
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        '
-
-
-
-
-
-
-

-    # ============== StackOverflow
     def fetch_stackoverflow_answer(self, answer_id):
-        """Fetch accepted answer
         try:
             api = (
                 f"https://api.stackexchange.com/2.3/answers/{answer_id}"
@@ -239,6 +556,7 @@
         print("\n🔴 Scraping StackOverflow Q&A...")
         tags = [
             "sap",
             "sapui5",
             "sap-fiori",
             "abap",
@@ -247,13 +565,14 @@
             "sap-btp",
             "sap-hana",
             "odata",
         ]
         for tag in tags:
             try:
                 api_url = (
                     "https://api.stackexchange.com/2.3/search/advanced"
                     f"?order=desc&sort=votes&tagged={quote(tag)}&site=stackoverflow"
-                    "&pagesize=
                 )
                 print(f"   🔍 Tag: {tag}")
                 resp = requests.get(api_url, headers=self.headers, timeout=10)
@@ -296,10 +615,13 @@
         print("\n🟡 Scraping SAP Developers tutorials...")
         base = "https://developers.sap.com"
         listing_urls = [
             f"{base}/tutorial-navigator.html?tag=software-product-function:technology-platform/sap-btp",
             f"{base}/tutorial-navigator.html?tag=software-product-function:analytics/sap-analytics-cloud",
             f"{base}/tutorial-navigator.html?tag=software-product-function:app-development/sapui5",
             f"{base}/tutorial-navigator.html?tag=software-product-function:database/sap-hana",
         ]
         for url in listing_urls:
             try:
@@ -340,6 +662,151 @@
         except Exception as e:
             print(f"   ⚠️ Tutorial error: {e}")
             return False

     def scrape_article(self, url, source):
         """Scrape article with structured parsing"""
@@ -381,37 +848,69 @@

     def add_to_dataset(self, article_data):
         """Add article to dataset with deduplication"""
-
-
-        ).hexdigest()[:8]

         article_data['id'] = content_hash
         article_data['timestamp'] = datetime.now().isoformat()

         self.dataset.append(article_data)

     def build(self):
         """Build comprehensive dataset"""
-        print("🚀 Starting comprehensive SAP dataset build...")
         self.setup_directories()

         self.scrape_sap_community()
         self.scrape_sap_community_rss()
         self.scrape_github_sap_repos()
         self.scrape_devto_articles()
         self.scrape_medium_tag()
-        self.scrape_stackoverflow()
-        self.scrape_sap_developers_tutorials()

         # Save dataset
         output_file = "data/sap_dataset.json"
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(self.dataset, f, indent=2, ensure_ascii=False)

-        print(
         print(f"   📊 Total documents: {len(self.dataset)}")
         print(f"   💾 Saved to: {output_file}")

         return self.dataset

 if __name__ == "__main__":
@@ -1,11 +1,15 @@
 # tools/build_dataset.py
 """
+Enhanced SAP Dataset Builder v2.0
+Scrapes from multiple free sources with focus on SAP Basis administration:
 - SAP Community blogs
+- SAP Help Portal (help.sap.com)
+- SAP Wiki
 - GitHub SAP repositories
+- SAP Developers tutorials
 - Dev.to & tech blogs
+- StackOverflow
+- SAP Notes (public summaries)
 """

 import requests
@@ -22,21 +26,266 @@ class SAPDatasetBuilder:
     def __init__(self):
         self.dataset = []
         self.seen_urls = set()
+        self.seen_content_hashes = set()
         self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
         }
+
+        # SAP Basis Transaction Codes for targeted scraping
+        self.sap_tcodes = [
+            "SM50", "SM51", "SM21", "SM37", "SM36", "SM12", "SM13",
+            "ST22", "ST02", "ST03", "ST04", "ST05", "ST06", "ST07",
+            "SU01", "SU10", "SU53", "SUIM", "PFCG",
+            "SE01", "SE09", "SE10", "SE11", "SE16", "SE37", "SE38", "SE80",
+            "STMS", "SPAM", "SAINT",
+            "RZ10", "RZ11", "RZ12", "RZ20", "RZ70",
+            "SICF", "SMICM", "ICM",
+            "AL08", "AL11", "AL05",
+            "DB02", "DB13", "DB16",
+            "SMLG", "SM66", "SM04",
+            "SNOTE", "SUM", "SPS",
+        ]
+
+        # SAP Basis Topics for comprehensive coverage
+        self.sap_basis_topics = [
+            # Core Basis Administration
+            "SAP Basis administration",
+            "SAP system monitoring",
+            "SAP performance tuning",
+            "SAP memory management",
+            "SAP work process",
+            "SAP background jobs",
+            "SAP transport management",
+            "SAP client copy",
+            "SAP system refresh",
+
+            # User & Security
+            "SAP user administration",
+            "SAP role authorization",
+            "SAP security audit",
+            "SAP password policy",
+            "SAP SSO single sign on",
+            "SAP GRC access control",
+
+            # Database & Storage
+            "SAP HANA administration",
+            "SAP database backup",
+            "SAP archiving",
+            "SAP table maintenance",
+            "SAP data dictionary",
+
+            # System Configuration
+            "SAP profile parameters",
+            "SAP instance configuration",
+            "SAP RFC connection",
+            "SAP system landscape",
+            "SAP solution manager",
+
+            # Troubleshooting
+            "SAP dump analysis",
+            "SAP short dump",
+            "SAP system log",
+            "SAP trace analysis",
+            "SAP lock entry",
+            "SAP update error",
+
+            # Installation & Upgrade
+            "SAP installation guide",
+            "SAP upgrade procedure",
+            "SAP kernel update",
+            "SAP support package",
+            "SAP note implementation",
+
+            # Cloud & Modern
+            "SAP BTP administration",
+            "SAP Cloud Connector",
+            "SAP Fiori administration",
+            "SAP Gateway configuration",
+            "S/4HANA migration",
+        ]

     def setup_directories(self):
         """Create necessary directories"""
         Path("data").mkdir(exist_ok=True)
         Path("data/raw").mkdir(exist_ok=True)

+    # ============== SAP Help Portal ==============
+    def scrape_sap_help_portal(self):
+        """Scrape from SAP Help Portal (help.sap.com) - Official documentation"""
+        print("\n📚 Scraping SAP Help Portal...")
+
+        # SAP Help Portal search URLs for Basis topics
+        help_searches = [
+            # Basis Administration
+            "https://help.sap.com/docs/search?q=basis%20administration&locale=en-US&product=SAP_NETWEAVER",
+            "https://help.sap.com/docs/search?q=system%20administration&locale=en-US&product=SAP_NETWEAVER",
+            "https://help.sap.com/docs/search?q=transaction%20code&locale=en-US&product=SAP_NETWEAVER",
+            "https://help.sap.com/docs/search?q=monitoring&locale=en-US&product=SAP_NETWEAVER",
+            "https://help.sap.com/docs/search?q=performance&locale=en-US&product=SAP_NETWEAVER",
+            # HANA
+            "https://help.sap.com/docs/search?q=administration&locale=en-US&product=SAP_HANA_PLATFORM",
+            "https://help.sap.com/docs/search?q=backup%20recovery&locale=en-US&product=SAP_HANA_PLATFORM",
+            # S/4HANA
+            "https://help.sap.com/docs/search?q=basis&locale=en-US&product=SAP_S4HANA_ON-PREMISE",
+        ]
+
+        for search_url in help_searches:
+            try:
+                print(f"   🔍 Searching: {search_url[:80]}...")
+                response = requests.get(search_url, headers=self.headers, timeout=15)
+                if response.status_code != 200:
+                    continue
+
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                # Find documentation links
+                for link in soup.find_all('a', href=re.compile(r'help\.sap\.com/docs/')):
+                    href = link.get('href', '')
+                    if href and href not in self.seen_urls:
+                        full_url = href if href.startswith('http') else f"https://help.sap.com{href}"
+                        self.seen_urls.add(full_url)
+                        self.scrape_help_page(full_url)
+
+                time.sleep(2)
+            except Exception as e:
+                print(f"   ⚠️ Error: {e}")
+
+    def scrape_help_page(self, url):
+        """Scrape individual SAP Help page"""
+        try:
+            response = requests.get(url, headers=self.headers, timeout=12)
+            if response.status_code != 200:
+                return False
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Get title
+            title = soup.find('h1') or soup.find('title')
+            title = title.get_text(strip=True) if title else "SAP Help Document"
+
+            # Get main content
+            content_elem = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content', re.I))
+            if content_elem:
+                content = content_elem.get_text(separator=' ', strip=True)
+            else:
+                content = soup.get_text(separator=' ', strip=True)
+
+            content = re.sub(r'\s+', ' ', content).strip()
+
+            if len(content) > 500:
+                self.add_to_dataset({
+                    'url': url,
+                    'title': f"SAP Help: {title}",
+                    'content': content[:20000],
+                    'source': 'sap_help_portal',
+                    'content_type': 'documentation'
+                })
+                print(f"   ✅ Added: {title[:50]}")
+                return True
+        except Exception as e:
+            print(f"   ⚠️ Help page error: {e}")
+        return False
+
+    # ============== SAP Wiki ==============
+    def scrape_sap_wiki(self):
+        """Scrape from SAP Wiki (wiki.scn.sap.com) - Community knowledge base"""
+        print("\n📖 Scraping SAP Wiki...")
+
+        # SAP Wiki URLs for transaction codes and Basis topics
+        wiki_searches = []
+
+        # Add transaction code searches
+        for tcode in self.sap_tcodes[:20]:  # Top 20 tcodes
+            wiki_searches.append(f"https://wiki.scn.sap.com/wiki/dosearchsite.action?queryString={tcode}")
+
+        # Add topic searches
+        for topic in ["Basis", "Administration", "Transport", "Authorization", "Performance"]:
+            wiki_searches.append(f"https://wiki.scn.sap.com/wiki/dosearchsite.action?queryString=SAP+{topic}")
+
+        for search_url in wiki_searches:
+            try:
+                print(f"   🔍 Wiki search...")
+                response = requests.get(search_url, headers=self.headers, timeout=12)
+                if response.status_code != 200:
+                    continue
+
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                for link in soup.find_all('a', href=re.compile(r'/wiki/display/')):
+                    href = link.get('href', '')
+                    full_url = urljoin('https://wiki.scn.sap.com', href)
+                    if full_url not in self.seen_urls:
+                        self.seen_urls.add(full_url)
+                        self.scrape_wiki_page(full_url)
+
+                time.sleep(1.5)
+            except Exception as e:
+                print(f"   ⚠️ Wiki error: {e}")
+
+    def scrape_wiki_page(self, url):
+        """Scrape individual wiki page"""
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            if response.status_code != 200:
+                return False
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            title = soup.find('h1', id='title-text') or soup.find('h1')
+            title = title.get_text(strip=True) if title else "SAP Wiki Article"
+
+            content_elem = soup.find('div', class_='wiki-content') or soup.find('main')
+            content = content_elem.get_text(separator=' ', strip=True) if content_elem else ''
+            content = re.sub(r'\s+', ' ', content).strip()
+
+            if len(content) > 400:
+                self.add_to_dataset({
+                    'url': url,
+                    'title': title,
+                    'content': content[:15000],
+                    'source': 'sap_wiki'
+                })
+                print(f"   ✅ Added wiki: {title[:50]}")
+                return True
+        except Exception as e:
+            pass
+        return False
+
+    # ============== SAP Blogs - Transaction Code Focus ==============
+    def scrape_sap_tcode_blogs(self):
+        """Scrape blogs specifically about SAP transaction codes"""
+        print("\n🔧 Scraping SAP Transaction Code content...")
+
+        for tcode in self.sap_tcodes:
+            try:
+                # Search SAP Community for transaction code
+                search_url = f"https://community.sap.com/search/?q={tcode}&ct=blog"
+                print(f"   🔍 Transaction: {tcode}")
+
+                response = requests.get(search_url, headers=self.headers, timeout=10)
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                for link in soup.find_all('a', href=re.compile(r'/ba-p/\d+')):
+                    href = link.get('href', '')
+                    if '/ba-p/' in href:
+                        full_url = urljoin('https://community.sap.com', href)
+                        if full_url not in self.seen_urls:
+                            self.seen_urls.add(full_url)
+                            self.scrape_article(full_url, 'sap_community_tcode')
+
+                time.sleep(1.5)
+            except Exception as e:
+                print(f"   ⚠️ Error: {e}")
+
     # ============== SAP Community Source ==============
     def scrape_sap_community(self):
+        """Scrape from SAP Community blogs with Basis focus"""
         print("\n🔵 Scraping SAP Community blogs...")

+        # Combine general and Basis-specific queries
+        search_queries = self.sap_basis_topics + [
             # Core admin/dev topics
             "SAP Basis",
             "SAP ABAP",
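A note on the headers added above: every scraper in this commit passes self.headers to bare requests.get calls and throttles with time.sleep. Below is a minimal sketch of the same politeness centralized in one shared session with automatic retries; this is a refactor suggestion rather than code from the commit, and make_session is an invented name.

# Sketch only: a shared session with retry/backoff, assuming requests and urllib3.
# The commit itself passes self.headers to each individual requests.get call.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
    })
    # Retry transient failures and rate limits with exponential backoff.
    retry = Retry(total=3, backoff_factor=1.0, status_forcelist=[429, 500, 502, 503])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session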
@@ -112,13 +361,55 @@
                 time.sleep(0.2)
         except Exception as e:
             print(f"   ⚠️ SAP RSS error: {e}")
+
+    # ============== SAP Learning Hub / OpenSAP ==============
+    def scrape_opensap_courses(self):
+        """Scrape course descriptions from openSAP"""
+        print("\n🎓 Scraping openSAP course info...")
+
+        try:
+            # openSAP courses page
+            response = requests.get("https://open.sap.com/courses", headers=self.headers, timeout=15)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                for course in soup.find_all('div', class_=re.compile('course', re.I)):
+                    try:
+                        title_elem = course.find(['h2', 'h3', 'a'])
+                        title = title_elem.get_text(strip=True) if title_elem else None
+
+                        desc_elem = course.find('p') or course.find('div', class_=re.compile('desc', re.I))
+                        desc = desc_elem.get_text(strip=True) if desc_elem else ''
+
+                        link_elem = course.find('a', href=True)
+                        link = link_elem.get('href', '') if link_elem else ''
+                        if link and not link.startswith('http'):
+                            link = f"https://open.sap.com{link}"
+
+                        if title and len(desc) > 100:
+                            self.add_to_dataset({
+                                'url': link or 'https://open.sap.com/courses',
+                                'title': f"openSAP: {title}",
+                                'content': desc[:5000],
+                                'source': 'opensap'
+                            })
+                            print(f"   ✅ Added course: {title[:50]}")
+                    except Exception:
+                        pass
+        except Exception as e:
+            print(f"   ⚠️ openSAP error: {e}")
+
     # ============== GitHub Source ==============
     def scrape_github_sap_repos(self):
         """Scrape from GitHub SAP-related repositories"""
         print("\n🐙 Scraping GitHub SAP repositories...")

         queries = [
+            "SAP Basis",
+            "SAP ABAP",
+            "SAP HANA admin",
+            "SAP security",
+            "SAP transport",
             "SAP language:python",
             "SAP language:typescript",
             "SAP language:javascript",
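The hunk above only extends the queries list; the GitHub request itself sits outside the shown context. For orientation, here is a sketch of how such qualifier strings are typically sent to GitHub's public repository-search endpoint. The endpoint and parameters are standard GitHub REST API, but the script's actual call may differ, and search_repos is an invented name.

# Sketch: issue one search query against the public GitHub REST API.
# A query like "SAP language:python" mixes a keyword with a language qualifier.
import requests

def search_repos(query: str, per_page: int = 10) -> list:
    resp = requests.get(
        "https://api.github.com/search/repositories",
        params={"q": query, "sort": "stars", "order": "desc", "per_page": per_page},
        headers={"Accept": "application/vnd.github+json"},
        timeout=10,
    )
    resp.raise_for_status()
    # Each item carries name, html_url, description, stargazers_count, ...
    return resp.json().get("items", [])

# Example: repos = search_repos("SAP Basis")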
@@ -142,12 +433,13 @@
                         content = readme_response.text
                         if len(content) > 300:
                             self.add_to_dataset({
+                                'url': repo['html_url'],
                                 'title': f"GitHub: {repo['name']}",
                                 'content': content[:15000],
                                 'description': repo.get('description', ''),
                                 'source': 'github',
+                                'content_type': 'markdown',
+                                'stars': repo.get('stargazers_count', 0)
                             })
                             print(f"   ✅ Added: {repo['name']}")
                             break
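With 'stars' added here, records in self.dataset carry a loose, source-dependent schema. The sketch below spells out the record shape implied by the add_to_dataset calls visible in this diff; the TypedDict is purely illustrative, since the script itself uses plain dicts.

# Sketch: the fields observed across add_to_dataset calls in this diff.
# Only url/title/content/source appear for every source; the rest are optional.
from typing import TypedDict, List

class DatasetRecord(TypedDict, total=False):
    url: str
    title: str
    content: str        # truncated, e.g. content[:15000]
    source: str         # 'github', 'devto', 'medium', 'sap_help_portal', ...
    description: str    # GitHub only
    content_type: str   # 'markdown', 'documentation', ...
    stars: int          # GitHub stargazers_count
    author: str         # dev.to
    published: str      # dev.to publish date
    tags: List[str]     # dev.to tag_list
    id: str             # md5 content-hash prefix, set by add_to_dataset
    timestamp: str      # datetime.now().isoformat()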
@@ -164,61 +456,86 @@
         """Scrape from dev.to"""
         print("\n🟢 Scraping Dev.to articles...")

+        tags = ["sap", "abap", "hana", "sapui5", "fiori"]
+
+        for tag in tags:
+            try:
+                api_url = f"https://dev.to/api/articles?tag={tag}&per_page=100"
+                response = requests.get(api_url, headers=self.headers, timeout=10)
+                articles = response.json()

+                for article in articles:
+                    if article.get('readable_publish_date'):
+                        # Fetch full article content
+                        try:
+                            article_resp = requests.get(f"https://dev.to/api/articles/{article['id']}", timeout=10)
+                            full_article = article_resp.json()
+                            content = full_article.get('body_markdown', '') or full_article.get('body_html', '') or article.get('description', '')
+                        except:
+                            content = article.get('description', '')
+
+                        if len(content) > 200:
+                            self.add_to_dataset({
+                                'url': article['url'],
+                                'title': article['title'],
+                                'content': content[:15000],
+                                'author': article['user']['name'],
+                                'source': 'devto',
+                                'published': article['published_at'],
+                                'tags': article.get('tag_list', [])
+                            })
+                            print(f"   ✅ Added: {article['title'][:50]}")
+
+                        time.sleep(0.3)
+            except Exception as e:
+                print(f"   ⚠️ Dev.to Error for tag '{tag}': {e}")
+            time.sleep(1)

     # ============== Medium ==============
     def scrape_medium_tag(self):
         """Scrape Medium articles tagged sap via RSS (public)"""
+        print("\n🟣 Scraping Medium SAP articles...")
+
+        tags = ["sap", "sap-hana", "abap", "sap-fiori"]
+
+        for tag in tags:
+            feed_url = f"https://medium.com/feed/tag/{tag}"
+            try:
+                resp = requests.get(feed_url, headers=self.headers, timeout=10)
+                resp.raise_for_status()
+                soup = BeautifulSoup(resp.content, 'xml')
+                items = soup.find_all('item')[:30]
+
+                for item in items:
+                    title = item.title.get_text(strip=True) if item.title else ''
+                    link = item.link.get_text(strip=True) if item.link else ''
+                    content = ''
+                    if item.find('content:encoded'):
+                        content = item.find('content:encoded').get_text(strip=True)
+                    elif item.description:
+                        content = item.description.get_text(strip=True)
+
+                    content = re.sub(r'<[^>]+>', ' ', content)
+                    content = re.sub(r'\s+', ' ', content).strip()
+
+                    if len(content) > 300 and link not in self.seen_urls:
+                        self.seen_urls.add(link)
+                        self.add_to_dataset({
+                            'url': link,
+                            'title': title,
+                            'content': content[:12000],
+                            'source': 'medium'
+                        })
+                        print(f"   ✅ Added: {title[:50]}")
+                time.sleep(0.2)
+            except Exception as e:
+                print(f"   ⚠️ Medium error for tag '{tag}': {e}")

+    # ============== StackOverflow ==============
     def fetch_stackoverflow_answer(self, answer_id):
+        """Fetch accepted answer text by ID"""
+        if not answer_id:
+            return ""
         try:
             api = (
                 f"https://api.stackexchange.com/2.3/answers/{answer_id}"
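One dependency note on the Medium hunk above: BeautifulSoup(resp.content, 'xml') needs lxml installed, and the namespaced <content:encoded> lookup relies on that XML mode. A self-contained sketch of the same feed-reading pattern follows; the feed URL is the one the commit uses, while read_medium_feed is an invented name.

# Sketch: read one Medium tag feed the way scrape_medium_tag does.
# Requires: pip install requests beautifulsoup4 lxml  (the 'xml' parser needs lxml)
import requests
from bs4 import BeautifulSoup

def read_medium_feed(tag: str) -> list:
    resp = requests.get(f"https://medium.com/feed/tag/{tag}", timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'xml')
    posts = []
    for item in soup.find_all('item'):
        # content:encoded carries the full HTML body; description is a summary.
        body = item.find('content:encoded') or item.find('description')
        posts.append({
            'title': item.title.get_text(strip=True) if item.title else '',
            'link': item.link.get_text(strip=True) if item.link else '',
            'html': body.get_text(strip=True) if body else '',
        })
    return posts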
@@ -239,6 +556,7 @@
         print("\n🔴 Scraping StackOverflow Q&A...")
         tags = [
             "sap",
+            "sap-basis",
             "sapui5",
             "sap-fiori",
             "abap",
@@ -247,13 +565,14 @@
             "sap-btp",
             "sap-hana",
             "odata",
+            "sap-netweaver",
         ]
         for tag in tags:
             try:
                 api_url = (
                     "https://api.stackexchange.com/2.3/search/advanced"
                     f"?order=desc&sort=votes&tagged={quote(tag)}&site=stackoverflow"
+                    "&pagesize=30&filter=withbody"
                 )
                 print(f"   🔍 Tag: {tag}")
                 resp = requests.get(api_url, headers=self.headers, timeout=10)
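The added "&pagesize=30&filter=withbody" asks the StackExchange API to inline question bodies in the search response, sparing a follow-up request per question; withbody is a documented built-in filter. A standalone sketch of the call and the fields it returns (search_questions is an invented name):

# Sketch: one search/advanced call with the withbody filter, as in scrape_stackoverflow.
import requests
from urllib.parse import quote

def search_questions(tag: str) -> list:
    url = (
        "https://api.stackexchange.com/2.3/search/advanced"
        f"?order=desc&sort=votes&tagged={quote(tag)}&site=stackoverflow"
        "&pagesize=30&filter=withbody"
    )
    data = requests.get(url, timeout=10).json()
    # With filter=withbody each item carries an HTML 'body' in addition to
    # title, link, accepted_answer_id, score, ...
    return data.get("items", [])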
@@ -296,10 +615,13 @@
         print("\n🟡 Scraping SAP Developers tutorials...")
         base = "https://developers.sap.com"
         listing_urls = [
+            f"{base}/tutorial-navigator.html",
             f"{base}/tutorial-navigator.html?tag=software-product-function:technology-platform/sap-btp",
             f"{base}/tutorial-navigator.html?tag=software-product-function:analytics/sap-analytics-cloud",
             f"{base}/tutorial-navigator.html?tag=software-product-function:app-development/sapui5",
             f"{base}/tutorial-navigator.html?tag=software-product-function:database/sap-hana",
+            f"{base}/tutorial-navigator.html?tag=topic:security",
+            f"{base}/tutorial-navigator.html?tag=topic:abap",
         ]
         for url in listing_urls:
             try:
@@ -340,6 +662,151 @@
         except Exception as e:
             print(f"   ⚠️ Tutorial error: {e}")
             return False
+
+    # ============== Guru99 SAP Tutorials ==============
+    def scrape_guru99_sap(self):
+        """Scrape Guru99 SAP tutorials - popular learning resource"""
+        print("\n📘 Scraping Guru99 SAP tutorials...")
+
+        base_url = "https://www.guru99.com"
+        sap_pages = [
+            "/sap-basis-tutorial.html",
+            "/sap-hana-tutorial.html",
+            "/sap-mm-training.html",
+            "/sap-sd-tutorial.html",
+            "/sap-fico-training.html",
+            "/sap-abap-tutorial.html",
+            "/sap-security-tutorial.html",
+        ]
+
+        for page in sap_pages:
+            try:
+                url = f"{base_url}{page}"
+                resp = requests.get(url, headers=self.headers, timeout=12)
+                if resp.status_code != 200:
+                    continue
+
+                soup = BeautifulSoup(resp.content, 'html.parser')
+
+                # Get tutorial links from the page
+                for link in soup.find_all('a', href=re.compile(r'/sap-')):
+                    href = link.get('href', '')
+                    full_url = urljoin(base_url, href)
+                    if full_url not in self.seen_urls and 'guru99.com' in full_url:
+                        self.seen_urls.add(full_url)
+                        self.scrape_guru99_page(full_url)
+
+                time.sleep(1)
+            except Exception as e:
+                print(f"   ⚠️ Guru99 error: {e}")
+
+    def scrape_guru99_page(self, url):
+        """Scrape individual Guru99 page"""
+        try:
+            resp = requests.get(url, headers=self.headers, timeout=10)
+            if resp.status_code != 200:
+                return False
+
+            soup = BeautifulSoup(resp.content, 'html.parser')
+
+            title = soup.find('h1')
+            title = title.get_text(strip=True) if title else "Guru99 SAP Tutorial"
+
+            # Get article content
+            article = soup.find('article') or soup.find('div', class_=re.compile('content', re.I))
+            if article:
+                content = article.get_text(separator=' ', strip=True)
+            else:
+                content = soup.get_text(separator=' ', strip=True)
+
+            content = re.sub(r'\s+', ' ', content).strip()
+
+            if len(content) > 500:
+                self.add_to_dataset({
+                    'url': url,
+                    'title': f"Guru99: {title}",
+                    'content': content[:15000],
+                    'source': 'guru99'
+                })
+                print(f"   ✅ Added: {title[:50]}")
+                return True
+        except Exception:
+            pass
+        return False
+
+    # ============== TutorialsPoint SAP ==============
+    def scrape_tutorialspoint_sap(self):
+        """Scrape TutorialsPoint SAP tutorials"""
+        print("\n📙 Scraping TutorialsPoint SAP content...")
+
+        base_url = "https://www.tutorialspoint.com"
+        sap_sections = [
+            "/sap_basis/index.htm",
+            "/sap_hana/index.htm",
+            "/sap_abap/index.htm",
+            "/sap_security/index.htm",
+            "/sap_mm/index.htm",
+            "/sap_sd/index.htm",
+            "/sap_fico/index.htm",
+        ]
+
+        for section in sap_sections:
+            try:
+                url = f"{base_url}{section}"
+                resp = requests.get(url, headers=self.headers, timeout=12)
+                if resp.status_code != 200:
+                    continue
+
+                soup = BeautifulSoup(resp.content, 'html.parser')
+
+                # Find tutorial links in sidebar/menu
+                for link in soup.find_all('a', href=re.compile(r'\.htm$')):
+                    href = link.get('href', '')
+                    if href and not href.startswith('http'):
+                        full_url = urljoin(url, href)
+                    else:
+                        full_url = href
+
+                    if full_url not in self.seen_urls and 'tutorialspoint.com/sap' in full_url:
+                        self.seen_urls.add(full_url)
+                        self.scrape_tutorialspoint_page(full_url)
+
+                time.sleep(1)
+            except Exception as e:
+                print(f"   ⚠️ TutorialsPoint error: {e}")
+
+    def scrape_tutorialspoint_page(self, url):
+        """Scrape individual TutorialsPoint page"""
+        try:
+            resp = requests.get(url, headers=self.headers, timeout=10)
+            if resp.status_code != 200:
+                return False
+
+            soup = BeautifulSoup(resp.content, 'html.parser')
+
+            title = soup.find('h1')
+            title = title.get_text(strip=True) if title else "TutorialsPoint SAP"
+
+            content_div = soup.find('div', class_='tutorial-content') or soup.find('div', id='mainContent')
+            if content_div:
+                content = content_div.get_text(separator=' ', strip=True)
+            else:
+                content = soup.get_text(separator=' ', strip=True)
+
+            content = re.sub(r'\s+', ' ', content).strip()
+
+            if len(content) > 400:
+                self.add_to_dataset({
+                    'url': url,
+                    'title': f"TutorialsPoint: {title}",
+                    'content': content[:12000],
+                    'source': 'tutorialspoint'
+                })
+                print(f"   ✅ Added: {title[:50]}")
+                return True
+        except Exception:
+            pass
+        return False

     def scrape_article(self, url, source):
         """Scrape article with structured parsing"""
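Both new scrapers above repeat the cleanup already used by the help-portal and wiki scrapers: get_text(separator=' ', strip=True) on a preferred container, falling back to the whole document, then whitespace collapsing. A sketch of that shared pattern factored into one helper; a refactor suggestion, not something this commit does:

# Sketch: the soup-to-plain-text normalization shared by the help/wiki/guru99/
# tutorialspoint page scrapers, factored into one function.
import re
from bs4 import BeautifulSoup

def soup_to_text(soup: BeautifulSoup, *selectors) -> str:
    # Try the preferred containers first, fall back to the whole document.
    elem = None
    for sel in selectors:  # e.g. 'main', 'article'
        elem = soup.find(sel)
        if elem:
            break
    text = (elem or soup).get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', text).strip()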
@@ -381,37 +848,69 @@

     def add_to_dataset(self, article_data):
         """Add article to dataset with deduplication"""
+        content = article_data.get('content', '')
+        content_hash = hashlib.md5(content.encode()).hexdigest()[:12]

+        # Skip if we've seen this content before
+        if content_hash in self.seen_content_hashes:
+            return False
+
+        self.seen_content_hashes.add(content_hash)
         article_data['id'] = content_hash
         article_data['timestamp'] = datetime.now().isoformat()

         self.dataset.append(article_data)
+        return True

     def build(self):
         """Build comprehensive dataset"""
+        print("🚀 Starting comprehensive SAP dataset build v2.0...")
+        print("=" * 60)
         self.setup_directories()

+        # Core sources
         self.scrape_sap_community()
         self.scrape_sap_community_rss()
+        self.scrape_sap_tcode_blogs()  # NEW: Transaction code focused
+
+        # Official documentation
+        self.scrape_sap_help_portal()  # NEW: help.sap.com
+        self.scrape_sap_developers_tutorials()
+        self.scrape_opensap_courses()  # NEW: openSAP
+
+        # Wiki & Community
+        self.scrape_sap_wiki()  # NEW: SAP Wiki
+        self.scrape_stackoverflow()
+
+        # Learning platforms
+        self.scrape_guru99_sap()  # NEW: Guru99
+        self.scrape_tutorialspoint_sap()  # NEW: TutorialsPoint
+
+        # Developer resources
         self.scrape_github_sap_repos()
         self.scrape_devto_articles()
         self.scrape_medium_tag()

         # Save dataset
         output_file = "data/sap_dataset.json"
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(self.dataset, f, indent=2, ensure_ascii=False)

+        print("\n" + "=" * 60)
+        print(f"✅ Dataset build completed!")
         print(f"   📊 Total documents: {len(self.dataset)}")
         print(f"   💾 Saved to: {output_file}")

+        # Print source breakdown
+        sources = {}
+        for doc in self.dataset:
+            src = doc.get('source', 'unknown')
+            sources[src] = sources.get(src, 0) + 1
+
+        print("\n   📊 Source breakdown:")
+        for src, count in sorted(sources.items(), key=lambda x: -x[1]):
+            print(f"      - {src}: {count}")

         return self.dataset

 if __name__ == "__main__":
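The dedup introduced in add_to_dataset keys on the first 12 hex characters of an md5 over the record's content, so identical text arriving from two sources is stored only once. The same logic in isolation, runnable as-is:

# Sketch: the content-hash dedup from add_to_dataset, extracted for illustration.
import hashlib

seen = set()

def is_new(content: str) -> bool:
    h = hashlib.md5(content.encode()).hexdigest()[:12]
    if h in seen:
        return False  # duplicate content: skipped, not appended
    seen.add(h)
    return True

assert is_new("SM50 shows the work process overview")
assert not is_new("SM50 shows the work process overview")  # second copy rejected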