Spaces:
Running
Running
| import requests, re | |
| from lxml import html | |
# Download the PMC article page. A browser-style User-Agent is sent and the
# request gives up after 15 seconds.
r = requests.get(
    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10725812/',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=15,
)
# Stop on 4xx/5xx responses: without this check an error page would be parsed
# below as if it were the article, and every later probe would report on the
# wrong document.
r.raise_for_status()
# Parse the raw bytes (not r.text) so lxml handles the encoding itself.
tree = html.fromstring(r.content)
# Locate the main article container so navigation/header markup is skipped.
# Candidates are tried in order; the first XPath that matches anything wins.
article = []
for candidate_xpath in (
    '//article',
    '//*[@role="main"]',
    '//div[@class="article"]',
):
    article = tree.xpath(candidate_xpath)
    if article:
        break

# Fall back to the whole document when none of the candidates matched.
if article:
    root = article[0]
else:
    root = tree

root_class = root.get('class', '')
print('Using root:', root.tag, root_class[:40])
# Collect every <section> under the chosen root and report how many exist.
sections = root.xpath('.//section')
print(f'\nTotal sections: {len(sections)}')

# Dump a preview of the first section whose first <h3> mentions
# "Recommendation".
for section in sections:
    headings = section.xpath('.//h3')
    if not headings:
        continue
    heading_text = headings[0].text_content()
    if 'Recommendation' not in heading_text:
        continue

    print('\n--- RECOMMENDATIONS SECTION ---')
    print('H3:', heading_text.strip())
    # Look at the first 8 list items / paragraphs of the section and print
    # those longer than 20 characters, truncated to 200 characters.
    for node in section.xpath('.//li | .//p')[:8]:
        snippet = node.text_content().strip()
        if len(snippet) > 20:
            print(' TEXT:', snippet[:200])
    # Only the first matching section is shown.
    break
# Inspect how recommendation numbers appear: print the first five paragraphs
# whose text starts with an "N.N" pattern (optionally followed by a lowercase
# letter), then whitespace and a word character.
all_p = root.xpath('.//p')
print('\n--- PARAGRAPHS WITH REC NUMBERS ---')
rec_re = re.compile(r'^\s*\d+\.\d+[a-z]?\s+\w')

# Lazy pipeline: paragraphs are only read until the fifth match is printed.
stripped_texts = (para.text_content().strip() for para in all_p)
numbered_texts = (text for text in stripped_texts if rec_re.match(text))

count = 0
for count, text in enumerate(numbered_texts, start=1):
    print(' REC:', text[:200])
    if count >= 5:
        break
# Describe the first <section> that contains an <h2> anywhere inside it:
# its heading text and the tags of its first ten direct children.
print('\n--- FIRST H2 SECTION STRUCTURE ---')
h2_secs = root.xpath('.//section[.//h2]')
if h2_secs:
    sec = h2_secs[0]
    first_h2 = sec.xpath('.//h2')[0]
    h2_title = first_h2.text_content().strip()
    print('H2:', h2_title[:60])
    # Iterating an lxml element yields its direct children.
    children = list(sec)
    child_tags = [child.tag for child in children[:10]]
    print('Direct children tags:', child_tags)