Spaces:

joytheslothh
/

MediRAG-API

Running

App Files Files Community

MediRAG-API / scripts /debug_pmc.py

joytheslothh

deploy: clean build

b6f9fa8 1 day ago

raw

history blame contribute delete

1.81 kB

	import requests, re
	from lxml import html

	# Fetch the PMC article page. A browser-like User-Agent header is sent and
	# the request is bounded by a 15-second timeout.
	r = requests.get(
	    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10725812/',
	    headers={'User-Agent': 'Mozilla/5.0'},
	    timeout=15,
	)
	# Fail loudly on an HTTP error status rather than parsing an error page
	# and printing misleading structure information further down.
	r.raise_for_status()
	tree = html.fromstring(r.content)

	# Find main article body — skip nav/header. Candidates are tried in order:
	# <article>, any element with role="main", then <div class="article">.
	# When none of them match, the whole parsed document is used as the root.
	article = (
	    tree.xpath('//article')
	    or tree.xpath('//*[@role="main"]')
	    or tree.xpath('//div[@class="article"]')
	)
	root = article[0] if article else tree
	# Only the first 40 characters of the class attribute are shown.
	print('Using root:', root.tag, root.get('class', '')[:40])

	# Collect every <section> element below the chosen root and report how
	# many were found.
	sections = root.xpath('.//section')
	print(f'\nTotal sections: {len(sections)}')

	# Show first Recommendations section content: the first section whose first
	# descendant <h3> mentions "Recommendation" is dumped, then the scan stops.
	for sec in sections:
	    h3 = sec.xpath('.//h3')
	    if h3 and 'Recommendation' in h3[0].text_content():
	        print('\n--- RECOMMENDATIONS SECTION ---')
	        print('H3:', h3[0].text_content().strip())
	        # Get all list items and paragraphs in this section. "|" is the
	        # XPath union operator; it must not be preceded by a backslash.
	        items = sec.xpath('.//li | .//p')
	        # Look at the first 8 nodes only and skip short fragments
	        # (20 characters or fewer after stripping).
	        for item in items[:8]:
	            t = item.text_content().strip()
	            if len(t) > 20:
	                print(' TEXT:', t[:200])
	        break

	# Check how rec numbers look — find paragraphs starting with N.N pattern
	# (digits, a dot, digits, an optional lowercase letter, whitespace, then a
	# word character), e.g. text beginning "1.2 ..." or "3.4a ...".
	all_p = root.xpath('.//p')
	print('\n--- PARAGRAPHS WITH REC NUMBERS ---')
	rec_re = re.compile(r'^\s*\d+\.\d+[a-z]?\s+\w')
	count = 0
	for p in all_p:
	    t = p.text_content().strip()
	    if rec_re.match(t):
	        print(' REC:', t[:200])
	        count += 1
	        # Five matching paragraphs are enough to see the numbering format.
	        if count >= 5:
	            break

	# Show structure of first H2 section: print the heading text (first 60
	# characters) and the tag names of up to 10 direct children of the first
	# <section> that contains an <h2> anywhere beneath it.
	print('\n--- FIRST H2 SECTION STRUCTURE ---')
	h2_secs = root.xpath('.//section[.//h2]')
	if h2_secs:
	    sec = h2_secs[0]
	    # The XPath predicate above guarantees at least one <h2> descendant,
	    # so indexing [0] here is safe.
	    print('H2:', sec.xpath('.//h2')[0].text_content().strip()[:60])
	    # Iterating an element yields its direct children.
	    children = list(sec)
	    print('Direct children tags:', [c.tag for c in children[:10]])