# NurseLex / download_notes.py
# NurseCitizenDeveloper's picture
# feat: complete local embedding search with i-dot-ai HF model
# 19a3093
import json
import httpx
from concurrent.futures import ThreadPoolExecutor, as_completed
# Base URL of the service queried for explanatory notes; endpoint paths are
# appended to it when building request URLs.
BASE_URL = 'https://lex.lab.i.ai.gov.uk'
# Path of the JSON file that main() loads the list of sections from.
INPUT_FILE = 'nursing_sections.json'
def fetch_note_for_section(section):
    """Look up the explanatory note for a single legislation section.

    Posts a search query built from the section's Act title and number to the
    ``/explanatory_note/section/search`` endpoint under ``BASE_URL`` and
    returns the first hit whose ``legislation_id`` contains the section's own
    ``legislation_id``.

    NOTE: the section number is only part of the search query; the returned
    note's text or title is not cross-checked against it.

    Args:
        section: Mapping describing a section. The keys ``act_name``,
            ``number``, ``legislation_id`` and ``uri`` are read.

    Returns:
        A dict with the keys ``section_uri``, ``act_name``,
        ``section_number`` and ``note_text``, or ``None`` when the section
        has no Act title, number or legislation id, when no hit belongs to
        the same legislation, or when the request fails (the error is
        printed, not raised).
    """
    act_title = section.get('act_name', '')
    section_number = section.get('number', '')
    parent_leg_id = section.get('legislation_id', '')

    if not act_title or not section_number:
        return None
    # An empty id is a substring of every string, so without this guard the
    # first note returned for *any* Act would be accepted as belonging to
    # this section. With no id there is nothing to verify ownership against.
    if not parent_leg_id:
        return None

    url = f'{BASE_URL}/explanatory_note/section/search'
    query = f'"{act_title}" Section {section_number}'
    payload = {
        'query': query,
        'limit': 5,
    }

    try:
        r = httpx.post(url, json=payload, timeout=15)
        r.raise_for_status()
        data = r.json()
        # Only accept a note that actually belongs to this Act.
        if isinstance(data, list):
            for note in data:
                note_leg_id = note.get('legislation_id', '')
                if note_leg_id and parent_leg_id in note_leg_id:
                    return {
                        'section_uri': section.get('uri'),
                        'act_name': act_title,
                        'section_number': section_number,
                        'note_text': note.get('text', ''),
                    }
    except Exception as e:
        # Deliberately broad: this runs as a best-effort worker, so any
        # transport, HTTP-status, decoding or payload-shape problem is
        # reported and treated as "no note" instead of aborting the batch.
        print(f"Error for {query}: {e}")
    return None
def _select_test_sections(sections, numbers=(2, 3, 5, 136), limit=20):
    """Pick up to ``limit`` sections with a wanted number, keeping a Capacity-Act one if any exists."""
    chosen = []
    has_mca = False
    for s in sections:
        if s.get('number') not in numbers:
            continue
        # ``or ''`` also covers an explicit null act_name in the input file.
        is_mca = 'Capacity' in (s.get('act_name') or '')
        if len(chosen) < limit:
            chosen.append(s)
            has_mca = has_mca or is_mca
        elif is_mca and not has_mca and chosen:
            # The batch is full but still has no Capacity-Act section: give
            # it the last slot so the sample covers more than one Act.
            chosen[-1] = s
            has_mca = True
        if len(chosen) >= limit and has_mca:
            break
    return chosen


def main():
    """Fetch explanatory notes for a small test batch of sections.

    Loads the sections from ``INPUT_FILE``, selects at most 20 of them whose
    ``number`` is 2, 3, 5 or 136 (making sure a section whose Act name
    contains "Capacity" is included when one is available), fetches their
    notes with five worker threads, prints one line per section and writes
    the notes found, keyed by section URI, to ``test_notes.json``.
    """
    print("Loading sections...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        sections = json.load(f)
    print(f"Loaded {len(sections)} sections.")

    # Test on a small but diverse subset (MHA 1983, MCA 2005).
    test_sections = _select_test_sections(sections)
    print(f"Testing {len(test_sections)} sections...")

    notes = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_section = {
            executor.submit(fetch_note_for_section, s): s for s in test_sections
        }
        for future in as_completed(future_to_section):
            s = future_to_section[future]
            result = future.result()
            if not result:
                print(f"❌ No note found for {s.get('act_name')} S.{s.get('number')}")
                continue
            uri = s.get('uri')
            if uri is None:
                # Without a URI there is no key to store the note under;
                # skip it rather than lose the whole batch to a KeyError.
                print(f"Skipping note for {result['act_name']} S.{result['section_number']}: section has no uri")
                continue
            notes[uri] = result
            print(f"✅ Found note for {result['act_name']} S.{result['section_number']}")

    print(f"Found {len(notes)} notes in test batch.")
    with open('test_notes.json', 'w', encoding='utf-8') as f:
        json.dump(notes, f, indent=2)
# Script entry point: run the test batch only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()