# MediRAG-API / scripts / debug_pmc.py
# joytheslothh's picture
# deploy: clean build
# b6f9fa8
import requests, re
from lxml import html
# Fetch a single PMC article page; a browser-like User-Agent header is sent
# and the request gives up after 15 seconds.
r = requests.get(
    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10725812/',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=15,
)
# Fail loudly on a 4xx/5xx response so that an error page is never parsed and
# inspected below as if it were the article itself.
r.raise_for_status()
tree = html.fromstring(r.content)
# Find main article body — skip nav/header. The candidates are tried in order
# and the first XPath query that returns a non-empty list wins.
article = (
    tree.xpath('//article')
    or tree.xpath('//*[@role="main"]')
    or tree.xpath('//div[@class="article"]')
)
# Fall back to the whole document when none of the candidates matched.
root = article[0] if article else tree
# Show which element was picked (tag plus the first 40 chars of its class).
print('Using root:', root.tag, root.get('class', '')[:40])
# Collect every <section> below the chosen root and report how many exist.
sections = root.xpath('.//section')
print(f'\nTotal sections: {len(sections)}')
# Dump the first section whose first descendant <h3> mentions "Recommendation".
for section in sections:
    headings = section.xpath('.//h3')
    if not headings:
        continue
    heading_text = headings[0].text_content()
    if 'Recommendation' not in heading_text:
        continue
    print('\n--- RECOMMENDATIONS SECTION ---')
    print('H3:', heading_text.strip())
    # Only the first eight <li>/<p> nodes are considered; fragments of
    # 20 characters or fewer are skipped, and each line is cut at 200 chars.
    for node in section.xpath('.//li | .//p')[:8]:
        snippet = node.text_content().strip()
        if len(snippet) > 20:
            print(' TEXT:', snippet[:200])
    # Only the first matching section is shown.
    break
# Inspect how recommendation numbers appear: print paragraphs whose stripped
# text begins with an "N.N" number (optionally followed by a lowercase letter),
# then whitespace and a word character.
all_p = root.xpath('.//p')
print('\n--- PARAGRAPHS WITH REC NUMBERS ---')
rec_re = re.compile(r'^\s*\d+\.\d+[a-z]?\s+\w')
stripped_texts = (para.text_content().strip() for para in all_p)
numbered = (text for text in stripped_texts if rec_re.match(text))
# Pairing with range(5) caps the output at the first five matching paragraphs.
for _, text in zip(range(5), numbered):
    print(' REC:', text[:200])
# Outline the first <section> that contains an <h2> anywhere beneath it:
# print its heading (first 60 chars) and the tags of its first ten children.
print('\n--- FIRST H2 SECTION STRUCTURE ---')
h2_sections = root.xpath('.//section[.//h2]')
if h2_sections:
    first_section = h2_sections[0]
    # The XPath predicate above guarantees at least one <h2> is present.
    title = first_section.xpath('.//h2')[0].text_content().strip()
    print('H2:', title[:60])
    child_tags = [child.tag for child in list(first_section)[:10]]
    print('Direct children tags:', child_tags)