github-actions[bot] committed on
Commit f0c36b6 · 1 Parent(s): 158badf

Deploy from GitHub Actions 2025-12-11_03:43:35

Files changed (1)
  1. tools/build_dataset.py +564 -65
tools/build_dataset.py CHANGED

Old version of the changed sections:

@@ -1,11 +1,15 @@
  # tools/build_dataset.py
  """
- Enhanced SAP Dataset Builder
- Scrapes from multiple free sources:
  - SAP Community blogs
  - GitHub SAP repositories
- - SAP official documentation
  - Dev.to & tech blogs
  """

  import requests
@@ -22,21 +26,266 @@ class SAPDatasetBuilder:
      def __init__(self):
          self.dataset = []
          self.seen_urls = set()
          self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
          }

      def setup_directories(self):
          """Create necessary directories"""
          Path("data").mkdir(exist_ok=True)
          Path("data/raw").mkdir(exist_ok=True)

      # ============== SAP Community Source ==============
      def scrape_sap_community(self):
-         """Scrape from SAP Community blogs"""
          print("\n🔵 Scraping SAP Community blogs...")

-         search_queries = [
              # Core admin/dev topics
              "SAP Basis",
              "SAP ABAP",
@@ -112,13 +361,55 @@ class SAPDatasetBuilder:
                  time.sleep(0.2)
              except Exception as e:
                  print(f" ⚠️ SAP RSS error: {e}")
-
      # ============== GitHub Source ==============
      def scrape_github_sap_repos(self):
          """Scrape from GitHub SAP-related repositories"""
          print("\n🟠 Scraping GitHub SAP repositories...")

          queries = [
              "SAP language:python",
              "SAP language:typescript",
              "SAP language:javascript",
@@ -142,12 +433,13 @@ class SAPDatasetBuilder:
                      content = readme_response.text
                      if len(content) > 300:
                          self.add_to_dataset({
-                             'url': readme_url,
                              'title': f"GitHub: {repo['name']}",
                              'content': content[:15000],
                              'description': repo.get('description', ''),
                              'source': 'github',
-                             'content_type': 'markdown'
                          })
                          print(f" ✅ Added: {repo['name']}")
                          break
@@ -164,61 +456,86 @@ class SAPDatasetBuilder:
          """Scrape from dev.to"""
          print("\n🟢 Scraping Dev.to articles...")

-         try:
-             api_url = "https://dev.to/api/articles?tag=sap&per_page=100"
-             response = requests.get(api_url, headers=self.headers, timeout=10)
-             articles = response.json()
-
-             for article in articles:
-                 if article['readable_publish_date']:
-                     content = article.get('body_markdown', '') or article.get('description', '')
-                     self.add_to_dataset({
-                         'url': article['url'],
-                         'title': article['title'],
-                         'content': content,
-                         'author': article['user']['name'],
-                         'source': 'devto',
-                         'published': article['published_at']
-                     })
-                     print(f" ✅ Added: {article['title'][:50]}")

-                 time.sleep(0.5)
-         except Exception as e:
-             print(f" ⚠️ Error: {e}")

      # ============== Medium ==============
      def scrape_medium_tag(self):
          """Scrape Medium articles tagged sap via RSS (public)"""
-         print("\n🟣 Scraping Medium tag: sap ...")
-         feed_url = "https://medium.com/feed/tag/sap"
-         try:
-             resp = requests.get(feed_url, headers=self.headers, timeout=10)
-             resp.raise_for_status()
-             soup = BeautifulSoup(resp.content, 'xml')
-             items = soup.find_all('item')[:50]
-             for item in items:
-                 title = item.title.get_text(strip=True)
-                 link = item.link.get_text(strip=True)
-                 content = item.find('content:encoded')
-                 content_text = content.get_text(strip=True) if content else ''
-                 # Basic cleanup
-                 content_text = re.sub(r'<[^>]+>', ' ', content_text)
-                 content_text = re.sub(r'\s+', ' ', content_text).strip()
-                 if len(content_text) > 300:
-                     self.add_to_dataset({
-                         'url': link,
-                         'title': title,
-                         'content': content_text[:15000],
-                         'source': 'medium'
-                     })
-                     print(f" ✅ Added: {title[:60]}")
-                 time.sleep(0.3)
-         except Exception as e:
-             print(f" ⚠️ Medium scrape error: {e}")

-     # ============== StackOverflow (free, public API) ==============
      def fetch_stackoverflow_answer(self, answer_id):
-         """Fetch accepted answer body via Stack Exchange API"""
          try:
              api = (
                  f"https://api.stackexchange.com/2.3/answers/{answer_id}"
@@ -239,6 +556,7 @@ class SAPDatasetBuilder:
          print("\n🔴 Scraping StackOverflow Q&A...")
          tags = [
              "sap",
              "sapui5",
              "sap-fiori",
              "abap",
@@ -247,13 +565,14 @@ class SAPDatasetBuilder:
              "sap-btp",
              "sap-hana",
              "odata",
          ]
          for tag in tags:
              try:
                  api_url = (
                      "https://api.stackexchange.com/2.3/search/advanced"
                      f"?order=desc&sort=votes&tagged={quote(tag)}&site=stackoverflow"
-                     "&pagesize=25&filter=withbody"
                  )
                  print(f" 🔍 Tag: {tag}")
                  resp = requests.get(api_url, headers=self.headers, timeout=10)
@@ -296,10 +615,13 @@ class SAPDatasetBuilder:
          print("\n🟡 Scraping SAP Developers tutorials...")
          base = "https://developers.sap.com"
          listing_urls = [
              f"{base}/tutorial-navigator.html?tag=software-product-function:technology-platform/sap-btp",
              f"{base}/tutorial-navigator.html?tag=software-product-function:analytics/sap-analytics-cloud",
              f"{base}/tutorial-navigator.html?tag=software-product-function:app-development/sapui5",
              f"{base}/tutorial-navigator.html?tag=software-product-function:database/sap-hana",
          ]
          for url in listing_urls:
              try:
@@ -340,6 +662,151 @@ class SAPDatasetBuilder:
          except Exception as e:
              print(f" ⚠️ Tutorial error: {e}")
          return False

      def scrape_article(self, url, source):
          """Scrape article with structured parsing"""
@@ -381,37 +848,69 @@ class SAPDatasetBuilder:

      def add_to_dataset(self, article_data):
          """Add article to dataset with deduplication"""
-         content_hash = hashlib.md5(
-             article_data.get('content', '').encode()
-         ).hexdigest()[:8]

          article_data['id'] = content_hash
          article_data['timestamp'] = datetime.now().isoformat()

          self.dataset.append(article_data)

      def build(self):
          """Build comprehensive dataset"""
-         print("🚀 Starting comprehensive SAP dataset build...")
          self.setup_directories()

          self.scrape_sap_community()
          self.scrape_sap_community_rss()
          self.scrape_github_sap_repos()
          self.scrape_devto_articles()
          self.scrape_medium_tag()
-         self.scrape_stackoverflow()
-         self.scrape_sap_developers_tutorials()

          # Save dataset
          output_file = "data/sap_dataset.json"
          with open(output_file, 'w', encoding='utf-8') as f:
              json.dump(self.dataset, f, indent=2, ensure_ascii=False)

-         print(f"\n✅ Dataset build completed!")
          print(f" 📊 Total documents: {len(self.dataset)}")
          print(f" 💾 Saved to: {output_file}")

          return self.dataset

  if __name__ == "__main__":
 
New version of the changed sections:

  # tools/build_dataset.py
  """
+ Enhanced SAP Dataset Builder v2.0
+ Scrapes from multiple free sources with focus on SAP Basis administration:
  - SAP Community blogs
+ - SAP Help Portal (help.sap.com)
+ - SAP Wiki
  - GitHub SAP repositories
+ - SAP Developers tutorials
  - Dev.to & tech blogs
+ - StackOverflow
+ - SAP Notes (public summaries)
  """

  import requests
 
      def __init__(self):
          self.dataset = []
          self.seen_urls = set()
+         self.seen_content_hashes = set()
          self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
          }
+
+         # SAP Basis Transaction Codes for targeted scraping
+         self.sap_tcodes = [
+             "SM50", "SM51", "SM21", "SM37", "SM36", "SM12", "SM13",
+             "ST22", "ST02", "ST03", "ST04", "ST05", "ST06", "ST07",
+             "SU01", "SU10", "SU53", "SUIM", "PFCG",
+             "SE01", "SE09", "SE10", "SE11", "SE16", "SE37", "SE38", "SE80",
+             "STMS", "SPAM", "SAINT",
+             "RZ10", "RZ11", "RZ12", "RZ20", "RZ70",
+             "SICF", "SMICM", "ICM",
+             "AL08", "AL11", "AL05",
+             "DB02", "DB13", "DB16",
+             "SMLG", "SM66", "SM04",
+             "SNOTE", "SUM", "SPS",
+         ]
+
+         # SAP Basis Topics for comprehensive coverage
+         self.sap_basis_topics = [
+             # Core Basis Administration
+             "SAP Basis administration",
+             "SAP system monitoring",
+             "SAP performance tuning",
+             "SAP memory management",
+             "SAP work process",
+             "SAP background jobs",
+             "SAP transport management",
+             "SAP client copy",
+             "SAP system refresh",
+
+             # User & Security
+             "SAP user administration",
+             "SAP role authorization",
+             "SAP security audit",
+             "SAP password policy",
+             "SAP SSO single sign on",
+             "SAP GRC access control",
+
+             # Database & Storage
+             "SAP HANA administration",
+             "SAP database backup",
+             "SAP archiving",
+             "SAP table maintenance",
+             "SAP data dictionary",
+
+             # System Configuration
+             "SAP profile parameters",
+             "SAP instance configuration",
+             "SAP RFC connection",
+             "SAP system landscape",
+             "SAP solution manager",
+
+             # Troubleshooting
+             "SAP dump analysis",
+             "SAP short dump",
+             "SAP system log",
+             "SAP trace analysis",
+             "SAP lock entry",
+             "SAP update error",
+
+             # Installation & Upgrade
+             "SAP installation guide",
+             "SAP upgrade procedure",
+             "SAP kernel update",
+             "SAP support package",
+             "SAP note implementation",
+
+             # Cloud & Modern
+             "SAP BTP administration",
+             "SAP Cloud Connector",
+             "SAP Fiori administration",
+             "SAP Gateway configuration",
+             "S/4HANA migration",
+         ]

      def setup_directories(self):
          """Create necessary directories"""
          Path("data").mkdir(exist_ok=True)
          Path("data/raw").mkdir(exist_ok=True)

+ # ============== SAP Help Portal ==============
115
+ def scrape_sap_help_portal(self):
116
+ """Scrape from SAP Help Portal (help.sap.com) - Official documentation"""
117
+ print("\nπŸ“˜ Scraping SAP Help Portal...")
118
+
119
+ # SAP Help Portal search URLs for Basis topics
120
+ help_searches = [
121
+ # Basis Administration
122
+ "https://help.sap.com/docs/search?q=basis%20administration&locale=en-US&product=SAP_NETWEAVER",
123
+ "https://help.sap.com/docs/search?q=system%20administration&locale=en-US&product=SAP_NETWEAVER",
124
+ "https://help.sap.com/docs/search?q=transaction%20code&locale=en-US&product=SAP_NETWEAVER",
125
+ "https://help.sap.com/docs/search?q=monitoring&locale=en-US&product=SAP_NETWEAVER",
126
+ "https://help.sap.com/docs/search?q=performance&locale=en-US&product=SAP_NETWEAVER",
127
+ # HANA
128
+ "https://help.sap.com/docs/search?q=administration&locale=en-US&product=SAP_HANA_PLATFORM",
129
+ "https://help.sap.com/docs/search?q=backup%20recovery&locale=en-US&product=SAP_HANA_PLATFORM",
130
+ # S/4HANA
131
+ "https://help.sap.com/docs/search?q=basis&locale=en-US&product=SAP_S4HANA_ON-PREMISE",
132
+ ]
133
+
134
+ for search_url in help_searches:
135
+ try:
136
+ print(f" πŸ” Searching: {search_url[:80]}...")
137
+ response = requests.get(search_url, headers=self.headers, timeout=15)
138
+ if response.status_code != 200:
139
+ continue
140
+
141
+ soup = BeautifulSoup(response.content, 'html.parser')
142
+
143
+ # Find documentation links
144
+ for link in soup.find_all('a', href=re.compile(r'help\.sap\.com/docs/')):
145
+ href = link.get('href', '')
146
+ if href and href not in self.seen_urls:
147
+ full_url = href if href.startswith('http') else f"https://help.sap.com{href}"
148
+ self.seen_urls.add(full_url)
149
+ self.scrape_help_page(full_url)
150
+
151
+ time.sleep(2)
152
+ except Exception as e:
153
+ print(f" ⚠️ Error: {e}")
154
+
155
+ def scrape_help_page(self, url):
156
+ """Scrape individual SAP Help page"""
157
+ try:
158
+ response = requests.get(url, headers=self.headers, timeout=12)
159
+ if response.status_code != 200:
160
+ return False
161
+
162
+ soup = BeautifulSoup(response.content, 'html.parser')
163
+
164
+ # Get title
165
+ title = soup.find('h1') or soup.find('title')
166
+ title = title.get_text(strip=True) if title else "SAP Help Document"
167
+
168
+ # Get main content
169
+ content_elem = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile('content', re.I))
170
+ if content_elem:
171
+ content = content_elem.get_text(separator=' ', strip=True)
172
+ else:
173
+ content = soup.get_text(separator=' ', strip=True)
174
+
175
+ content = re.sub(r'\s+', ' ', content).strip()
176
+
177
+ if len(content) > 500:
178
+ self.add_to_dataset({
179
+ 'url': url,
180
+ 'title': f"SAP Help: {title}",
181
+ 'content': content[:20000],
182
+ 'source': 'sap_help_portal',
183
+ 'content_type': 'documentation'
184
+ })
185
+ print(f" βœ… Added: {title[:50]}")
186
+ return True
187
+ except Exception as e:
188
+ print(f" ⚠️ Help page error: {e}")
189
+ return False
190
+
191
+ # ============== SAP Wiki ==============
192
+ def scrape_sap_wiki(self):
193
+ """Scrape from SAP Wiki (wiki.scn.sap.com) - Community knowledge base"""
194
+ print("\nπŸ“š Scraping SAP Wiki...")
195
+
196
+ # SAP Wiki URLs for transaction codes and Basis topics
197
+ wiki_searches = []
198
+
199
+ # Add transaction code searches
200
+ for tcode in self.sap_tcodes[:20]: # Top 20 tcodes
201
+ wiki_searches.append(f"https://wiki.scn.sap.com/wiki/dosearchsite.action?queryString={tcode}")
202
+
203
+ # Add topic searches
204
+ for topic in ["Basis", "Administration", "Transport", "Authorization", "Performance"]:
205
+ wiki_searches.append(f"https://wiki.scn.sap.com/wiki/dosearchsite.action?queryString=SAP+{topic}")
206
+
207
+ for search_url in wiki_searches:
208
+ try:
209
+ print(f" πŸ” Wiki search...")
210
+ response = requests.get(search_url, headers=self.headers, timeout=12)
211
+ if response.status_code != 200:
212
+ continue
213
+
214
+ soup = BeautifulSoup(response.content, 'html.parser')
215
+
216
+ for link in soup.find_all('a', href=re.compile(r'/wiki/display/')):
217
+ href = link.get('href', '')
218
+ full_url = urljoin('https://wiki.scn.sap.com', href)
219
+ if full_url not in self.seen_urls:
220
+ self.seen_urls.add(full_url)
221
+ self.scrape_wiki_page(full_url)
222
+
223
+ time.sleep(1.5)
224
+ except Exception as e:
225
+ print(f" ⚠️ Wiki error: {e}")
226
+
227
+ def scrape_wiki_page(self, url):
228
+ """Scrape individual wiki page"""
229
+ try:
230
+ response = requests.get(url, headers=self.headers, timeout=10)
231
+ if response.status_code != 200:
232
+ return False
233
+
234
+ soup = BeautifulSoup(response.content, 'html.parser')
235
+
236
+ title = soup.find('h1', id='title-text') or soup.find('h1')
237
+ title = title.get_text(strip=True) if title else "SAP Wiki Article"
238
+
239
+ content_elem = soup.find('div', class_='wiki-content') or soup.find('main')
240
+ content = content_elem.get_text(separator=' ', strip=True) if content_elem else ''
241
+ content = re.sub(r'\s+', ' ', content).strip()
242
+
243
+ if len(content) > 400:
244
+ self.add_to_dataset({
245
+ 'url': url,
246
+ 'title': title,
247
+ 'content': content[:15000],
248
+ 'source': 'sap_wiki'
249
+ })
250
+ print(f" βœ… Added wiki: {title[:50]}")
251
+ return True
252
+ except Exception as e:
253
+ pass
254
+ return False
255
+
256
+ # ============== SAP Blogs - Transaction Code Focus ==============
257
+ def scrape_sap_tcode_blogs(self):
258
+ """Scrape blogs specifically about SAP transaction codes"""
259
+ print("\nπŸ”§ Scraping SAP Transaction Code content...")
260
+
261
+ for tcode in self.sap_tcodes:
262
+ try:
263
+ # Search SAP Community for transaction code
264
+ search_url = f"https://community.sap.com/search/?q={tcode}&ct=blog"
265
+ print(f" πŸ” Transaction: {tcode}")
266
+
267
+ response = requests.get(search_url, headers=self.headers, timeout=10)
268
+ soup = BeautifulSoup(response.content, 'html.parser')
269
+
270
+ for link in soup.find_all('a', href=re.compile(r'/ba-p/\d+')):
271
+ href = link.get('href', '')
272
+ if '/ba-p/' in href:
273
+ full_url = urljoin('https://community.sap.com', href)
274
+ if full_url not in self.seen_urls:
275
+ self.seen_urls.add(full_url)
276
+ self.scrape_article(full_url, 'sap_community_tcode')
277
+
278
+ time.sleep(1.5)
279
+ except Exception as e:
280
+ print(f" ⚠️ Error: {e}")
281
+
      # ============== SAP Community Source ==============
      def scrape_sap_community(self):
+         """Scrape from SAP Community blogs with Basis focus"""
          print("\n🔵 Scraping SAP Community blogs...")

+         # Combine general and Basis-specific queries
+         search_queries = self.sap_basis_topics + [
              # Core admin/dev topics
              "SAP Basis",
              "SAP ABAP",

                  time.sleep(0.2)
              except Exception as e:
                  print(f" ⚠️ SAP RSS error: {e}")
+
+     # ============== SAP Learning Hub / OpenSAP ==============
+     def scrape_opensap_courses(self):
+         """Scrape course descriptions from openSAP"""
+         print("\n🎓 Scraping openSAP course info...")
+
+         try:
+             # openSAP courses page
+             response = requests.get("https://open.sap.com/courses", headers=self.headers, timeout=15)
+             if response.status_code == 200:
+                 soup = BeautifulSoup(response.content, 'html.parser')
+
+                 for course in soup.find_all('div', class_=re.compile('course', re.I)):
+                     try:
+                         title_elem = course.find(['h2', 'h3', 'a'])
+                         title = title_elem.get_text(strip=True) if title_elem else None
+
+                         desc_elem = course.find('p') or course.find('div', class_=re.compile('desc', re.I))
+                         desc = desc_elem.get_text(strip=True) if desc_elem else ''
+
+                         link_elem = course.find('a', href=True)
+                         link = link_elem.get('href', '') if link_elem else ''
+                         if link and not link.startswith('http'):
+                             link = f"https://open.sap.com{link}"
+
+                         if title and len(desc) > 100:
+                             self.add_to_dataset({
+                                 'url': link or 'https://open.sap.com/courses',
+                                 'title': f"openSAP: {title}",
+                                 'content': desc[:5000],
+                                 'source': 'opensap'
+                             })
+                             print(f" ✅ Added course: {title[:50]}")
+                     except Exception:
+                         pass
+         except Exception as e:
+             print(f" ⚠️ openSAP error: {e}")
+
      # ============== GitHub Source ==============
      def scrape_github_sap_repos(self):
          """Scrape from GitHub SAP-related repositories"""
          print("\n🟠 Scraping GitHub SAP repositories...")

          queries = [
+             "SAP Basis",
+             "SAP ABAP",
+             "SAP HANA admin",
+             "SAP security",
+             "SAP transport",
              "SAP language:python",
              "SAP language:typescript",
              "SAP language:javascript",

                      content = readme_response.text
                      if len(content) > 300:
                          self.add_to_dataset({
+                             'url': repo['html_url'],
                              'title': f"GitHub: {repo['name']}",
                              'content': content[:15000],
                              'description': repo.get('description', ''),
                              'source': 'github',
+                             'content_type': 'markdown',
+                             'stars': repo.get('stargazers_count', 0)
                          })
                          print(f" ✅ Added: {repo['name']}")
                          break
456
  """Scrape from dev.to"""
457
  print("\n🟒 Scraping Dev.to articles...")
458
 
459
+ tags = ["sap", "abap", "hana", "sapui5", "fiori"]
460
+
461
+ for tag in tags:
462
+ try:
463
+ api_url = f"https://dev.to/api/articles?tag={tag}&per_page=100"
464
+ response = requests.get(api_url, headers=self.headers, timeout=10)
465
+ articles = response.json()
 
 
 
 
 
 
 
 
 
 
466
 
467
+ for article in articles:
468
+ if article.get('readable_publish_date'):
469
+ # Fetch full article content
470
+ try:
471
+ article_resp = requests.get(f"https://dev.to/api/articles/{article['id']}", timeout=10)
472
+ full_article = article_resp.json()
473
+ content = full_article.get('body_markdown', '') or full_article.get('body_html', '') or article.get('description', '')
474
+ except:
475
+ content = article.get('description', '')
476
+
477
+ if len(content) > 200:
478
+ self.add_to_dataset({
479
+ 'url': article['url'],
480
+ 'title': article['title'],
481
+ 'content': content[:15000],
482
+ 'author': article['user']['name'],
483
+ 'source': 'devto',
484
+ 'published': article['published_at'],
485
+ 'tags': article.get('tag_list', [])
486
+ })
487
+ print(f" βœ… Added: {article['title'][:50]}")
488
+
489
+ time.sleep(0.3)
490
+ except Exception as e:
491
+ print(f" ⚠️ Dev.to Error for tag '{tag}': {e}")
492
+ time.sleep(1)
493
 
494
  # ============== Medium ==============
495
  def scrape_medium_tag(self):
496
  """Scrape Medium articles tagged sap via RSS (public)"""
497
+ print("\n🟣 Scraping Medium SAP articles...")
498
+
499
+ tags = ["sap", "sap-hana", "abap", "sap-fiori"]
500
+
501
+ for tag in tags:
502
+ feed_url = f"https://medium.com/feed/tag/{tag}"
503
+ try:
504
+ resp = requests.get(feed_url, headers=self.headers, timeout=10)
505
+ resp.raise_for_status()
506
+ soup = BeautifulSoup(resp.content, 'xml')
507
+ items = soup.find_all('item')[:30]
508
+
509
+ for item in items:
510
+ title = item.title.get_text(strip=True) if item.title else ''
511
+ link = item.link.get_text(strip=True) if item.link else ''
512
+ content = ''
513
+ if item.find('content:encoded'):
514
+ content = item.find('content:encoded').get_text(strip=True)
515
+ elif item.description:
516
+ content = item.description.get_text(strip=True)
517
+
518
+ content = re.sub(r'<[^>]+>', ' ', content)
519
+ content = re.sub(r'\s+', ' ', content).strip()
520
+
521
+ if len(content) > 300 and link not in self.seen_urls:
522
+ self.seen_urls.add(link)
523
+ self.add_to_dataset({
524
+ 'url': link,
525
+ 'title': title,
526
+ 'content': content[:12000],
527
+ 'source': 'medium'
528
+ })
529
+ print(f" βœ… Added: {title[:50]}")
530
+ time.sleep(0.2)
531
+ except Exception as e:
532
+ print(f" ⚠️ Medium error for tag '{tag}': {e}")
533
 
+     # ============== StackOverflow ==============
      def fetch_stackoverflow_answer(self, answer_id):
+         """Fetch accepted answer text by ID"""
+         if not answer_id:
+             return ""
          try:
              api = (
                  f"https://api.stackexchange.com/2.3/answers/{answer_id}"

          print("\n🔴 Scraping StackOverflow Q&A...")
          tags = [
              "sap",
+             "sap-basis",
              "sapui5",
              "sap-fiori",
              "abap",

              "sap-btp",
              "sap-hana",
              "odata",
+             "sap-netweaver",
          ]
          for tag in tags:
              try:
                  api_url = (
                      "https://api.stackexchange.com/2.3/search/advanced"
                      f"?order=desc&sort=votes&tagged={quote(tag)}&site=stackoverflow"
+                     "&pagesize=30&filter=withbody"
                  )
                  print(f" 🔍 Tag: {tag}")
                  resp = requests.get(api_url, headers=self.headers, timeout=10)
 
          print("\n🟡 Scraping SAP Developers tutorials...")
          base = "https://developers.sap.com"
          listing_urls = [
+             f"{base}/tutorial-navigator.html",
              f"{base}/tutorial-navigator.html?tag=software-product-function:technology-platform/sap-btp",
              f"{base}/tutorial-navigator.html?tag=software-product-function:analytics/sap-analytics-cloud",
              f"{base}/tutorial-navigator.html?tag=software-product-function:app-development/sapui5",
              f"{base}/tutorial-navigator.html?tag=software-product-function:database/sap-hana",
+             f"{base}/tutorial-navigator.html?tag=topic:security",
+             f"{base}/tutorial-navigator.html?tag=topic:abap",
          ]
          for url in listing_urls:
              try:
662
  except Exception as e:
663
  print(f" ⚠️ Tutorial error: {e}")
664
  return False
665
+
666
+ # ============== Guru99 SAP Tutorials ==============
667
+ def scrape_guru99_sap(self):
668
+ """Scrape Guru99 SAP tutorials - popular learning resource"""
669
+ print("\nπŸ“– Scraping Guru99 SAP tutorials...")
670
+
671
+ base_url = "https://www.guru99.com"
672
+ sap_pages = [
673
+ "/sap-basis-tutorial.html",
674
+ "/sap-hana-tutorial.html",
675
+ "/sap-mm-training.html",
676
+ "/sap-sd-tutorial.html",
677
+ "/sap-fico-training.html",
678
+ "/sap-abap-tutorial.html",
679
+ "/sap-security-tutorial.html",
680
+ ]
681
+
682
+ for page in sap_pages:
683
+ try:
684
+ url = f"{base_url}{page}"
685
+ resp = requests.get(url, headers=self.headers, timeout=12)
686
+ if resp.status_code != 200:
687
+ continue
688
+
689
+ soup = BeautifulSoup(resp.content, 'html.parser')
690
+
691
+ # Get tutorial links from the page
692
+ for link in soup.find_all('a', href=re.compile(r'/sap-')):
693
+ href = link.get('href', '')
694
+ full_url = urljoin(base_url, href)
695
+ if full_url not in self.seen_urls and 'guru99.com' in full_url:
696
+ self.seen_urls.add(full_url)
697
+ self.scrape_guru99_page(full_url)
698
+
699
+ time.sleep(1)
700
+ except Exception as e:
701
+ print(f" ⚠️ Guru99 error: {e}")
702
+
703
+ def scrape_guru99_page(self, url):
704
+ """Scrape individual Guru99 page"""
705
+ try:
706
+ resp = requests.get(url, headers=self.headers, timeout=10)
707
+ if resp.status_code != 200:
708
+ return False
709
+
710
+ soup = BeautifulSoup(resp.content, 'html.parser')
711
+
712
+ title = soup.find('h1')
713
+ title = title.get_text(strip=True) if title else "Guru99 SAP Tutorial"
714
+
715
+ # Get article content
716
+ article = soup.find('article') or soup.find('div', class_=re.compile('content', re.I))
717
+ if article:
718
+ content = article.get_text(separator=' ', strip=True)
719
+ else:
720
+ content = soup.get_text(separator=' ', strip=True)
721
+
722
+ content = re.sub(r'\s+', ' ', content).strip()
723
+
724
+ if len(content) > 500:
725
+ self.add_to_dataset({
726
+ 'url': url,
727
+ 'title': f"Guru99: {title}",
728
+ 'content': content[:15000],
729
+ 'source': 'guru99'
730
+ })
731
+ print(f" βœ… Added: {title[:50]}")
732
+ return True
733
+ except Exception:
734
+ pass
735
+ return False
736
+
737
+ # ============== TutorialsPoint SAP ==============
738
+ def scrape_tutorialspoint_sap(self):
739
+ """Scrape TutorialsPoint SAP tutorials"""
740
+ print("\nπŸ“— Scraping TutorialsPoint SAP content...")
741
+
742
+ base_url = "https://www.tutorialspoint.com"
743
+ sap_sections = [
744
+ "/sap_basis/index.htm",
745
+ "/sap_hana/index.htm",
746
+ "/sap_abap/index.htm",
747
+ "/sap_security/index.htm",
748
+ "/sap_mm/index.htm",
749
+ "/sap_sd/index.htm",
750
+ "/sap_fico/index.htm",
751
+ ]
752
+
753
+ for section in sap_sections:
754
+ try:
755
+ url = f"{base_url}{section}"
756
+ resp = requests.get(url, headers=self.headers, timeout=12)
757
+ if resp.status_code != 200:
758
+ continue
759
+
760
+ soup = BeautifulSoup(resp.content, 'html.parser')
761
+
762
+ # Find tutorial links in sidebar/menu
763
+ for link in soup.find_all('a', href=re.compile(r'\.htm$')):
764
+ href = link.get('href', '')
765
+ if href and not href.startswith('http'):
766
+ full_url = urljoin(url, href)
767
+ else:
768
+ full_url = href
769
+
770
+ if full_url not in self.seen_urls and 'tutorialspoint.com/sap' in full_url:
771
+ self.seen_urls.add(full_url)
772
+ self.scrape_tutorialspoint_page(full_url)
773
+
774
+ time.sleep(1)
775
+ except Exception as e:
776
+ print(f" ⚠️ TutorialsPoint error: {e}")
777
+
778
+ def scrape_tutorialspoint_page(self, url):
779
+ """Scrape individual TutorialsPoint page"""
780
+ try:
781
+ resp = requests.get(url, headers=self.headers, timeout=10)
782
+ if resp.status_code != 200:
783
+ return False
784
+
785
+ soup = BeautifulSoup(resp.content, 'html.parser')
786
+
787
+ title = soup.find('h1')
788
+ title = title.get_text(strip=True) if title else "TutorialsPoint SAP"
789
+
790
+ content_div = soup.find('div', class_='tutorial-content') or soup.find('div', id='mainContent')
791
+ if content_div:
792
+ content = content_div.get_text(separator=' ', strip=True)
793
+ else:
794
+ content = soup.get_text(separator=' ', strip=True)
795
+
796
+ content = re.sub(r'\s+', ' ', content).strip()
797
+
798
+ if len(content) > 400:
799
+ self.add_to_dataset({
800
+ 'url': url,
801
+ 'title': f"TutorialsPoint: {title}",
802
+ 'content': content[:12000],
803
+ 'source': 'tutorialspoint'
804
+ })
805
+ print(f" βœ… Added: {title[:50]}")
806
+ return True
807
+ except Exception:
808
+ pass
809
+ return False
810
 
811
  def scrape_article(self, url, source):
812
  """Scrape article with structured parsing"""
 

      def add_to_dataset(self, article_data):
          """Add article to dataset with deduplication"""
+         content = article_data.get('content', '')
+         content_hash = hashlib.md5(content.encode()).hexdigest()[:12]

+         # Skip if we've seen this content before
+         if content_hash in self.seen_content_hashes:
+             return False
+
+         self.seen_content_hashes.add(content_hash)
          article_data['id'] = content_hash
          article_data['timestamp'] = datetime.now().isoformat()

          self.dataset.append(article_data)
+         return True

      def build(self):
          """Build comprehensive dataset"""
+         print("🚀 Starting comprehensive SAP dataset build v2.0...")
+         print("=" * 60)
          self.setup_directories()

+         # Core sources
          self.scrape_sap_community()
          self.scrape_sap_community_rss()
+         self.scrape_sap_tcode_blogs()  # NEW: Transaction code focused
+
+         # Official documentation
+         self.scrape_sap_help_portal()  # NEW: help.sap.com
+         self.scrape_sap_developers_tutorials()
+         self.scrape_opensap_courses()  # NEW: openSAP
+
+         # Wiki & Community
+         self.scrape_sap_wiki()  # NEW: SAP Wiki
+         self.scrape_stackoverflow()
+
+         # Learning platforms
+         self.scrape_guru99_sap()  # NEW: Guru99
+         self.scrape_tutorialspoint_sap()  # NEW: TutorialsPoint
+
+         # Developer resources
          self.scrape_github_sap_repos()
          self.scrape_devto_articles()
          self.scrape_medium_tag()

          # Save dataset
          output_file = "data/sap_dataset.json"
          with open(output_file, 'w', encoding='utf-8') as f:
              json.dump(self.dataset, f, indent=2, ensure_ascii=False)

+         print("\n" + "=" * 60)
+         print(f"✅ Dataset build completed!")
          print(f" 📊 Total documents: {len(self.dataset)}")
          print(f" 💾 Saved to: {output_file}")

+         # Print source breakdown
+         sources = {}
+         for doc in self.dataset:
+             src = doc.get('source', 'unknown')
+             sources[src] = sources.get(src, 0) + 1
+
+         print("\n 📈 Source breakdown:")
+         for src, count in sorted(sources.items(), key=lambda x: -x[1]):
+             print(f" - {src}: {count}")
+
          return self.dataset

  if __name__ == "__main__":
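
The diff stops at the `if __name__ == "__main__":` guard, so the entry-point body is not shown. Judging from the class and output path above, it presumably just instantiates the builder and runs it; a minimal, hypothetical usage sketch (not part of this commit):

    import json
    from pathlib import Path

    builder = SAPDatasetBuilder()   # class defined in tools/build_dataset.py
    dataset = builder.build()       # scrapes all sources and writes data/sap_dataset.json

    # quick sanity check on the saved output
    docs = json.loads(Path("data/sap_dataset.json").read_text(encoding="utf-8"))
    print(len(docs), "documents across", len({d.get("source") for d in docs}), "sources")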