NurseCitizenDeveloper committed on
Commit
4da3879
·
verified ·
1 Parent(s): 3f3fea1

Upload streamlit_app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. streamlit_app.py +66 -17
streamlit_app.py CHANGED
@@ -14,6 +14,12 @@ import re
14
  import sys
15
  import os
16
 
 
 
 
 
 
 
17
  sys.path.insert(0, os.path.dirname(__file__))
18
  from wiki.starter import get_starter_wiki
19
  from core.compiler import compile_source, rebuild_index
@@ -137,6 +143,17 @@ def add_or_update_article(article: dict):
137
  wiki["metadata"]["article_count"] = len(wiki["articles"])
138
 
139
 
 
 
 
 
 
 
 
 
 
 
 
140
  def export_wiki_zip() -> bytes:
141
  buf = io.BytesIO()
142
  with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
@@ -387,14 +404,9 @@ with tab_sources:
387
  st.markdown("""
388
  Add source material to the wiki. Claude will integrate it when you run **Compile**.
389
 
390
- Suitable sources include:
391
- - NICE clinical guidelines (copy and paste the text)
392
- - NMC guidance documents
393
- - Research paper abstracts or full text
394
- - NHS trust protocols
395
- - Textbook chapters
396
- - Clinical audit findings
397
- - Your own clinical notes or case studies
398
  """)
399
 
400
  col_add, col_list_src = st.columns([1, 1])
@@ -403,25 +415,60 @@ Suitable sources include:
403
  st.markdown("#### Add New Source")
404
  src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β€” Sepsis (2016)")
405
  src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
406
- src_content = st.text_area(
407
- "Source content (paste text here)",
408
- height=300,
409
- placeholder="Paste the full text of the guideline, paper, or document here...",
410
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
  if st.button("βž• Add Source", type="primary", disabled=not (src_title and src_content)):
413
  src_id = f"src_{len(wiki.get('sources', {})) + 1:04d}"
414
  if "sources" not in wiki:
415
  wiki["sources"] = {}
416
- wiki["sources"][src_id] = {
417
  "title": src_title,
418
  "type": src_type,
419
  "content": src_content,
420
  "added": datetime.date.today().isoformat(),
421
  "processed": False,
422
  }
423
- log(f"ingest | Added source: {src_title}")
424
- st.success(f"Source added: {src_title}")
 
 
 
 
425
  st.rerun()
426
 
427
  with col_list_src:
@@ -476,7 +523,9 @@ This is the core Karpathy pattern: **you add sources, Claude maintains the knowl
476
  results_container = st.container()
477
 
478
  for i, (src_id, src) in enumerate(pending.items()):
479
- status.markdown(f"βš™οΈ Compiling: **{src['title']}** ({i+1}/{len(pending)})...")
 
 
480
  try:
481
  result = compile_source(
482
  client=client,
 
14
  import sys
15
  import os
16
 
17
+ try:
18
+ from pypdf import PdfReader
19
+ _PDF_AVAILABLE = True
20
+ except ImportError:
21
+ _PDF_AVAILABLE = False
22
+
23
  sys.path.insert(0, os.path.dirname(__file__))
24
  from wiki.starter import get_starter_wiki
25
  from core.compiler import compile_source, rebuild_index
 
143
  wiki["metadata"]["article_count"] = len(wiki["articles"])
144
 
145
 
146
def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
    """Pull the text out of an in-memory PDF.

    Each page that yields non-blank text becomes a section headed by a
    ``--- Page N ---`` marker, where N is the 1-based page number. Pages
    whose extracted text is empty or whitespace-only are left out of the
    text but still count towards the page total.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        A ``(text, page_count)`` pair: the page sections joined by blank
        lines, and the total number of pages reported by the reader.
    """
    pdf = PdfReader(io.BytesIO(file_bytes))
    # A falsy result from extract_text() is treated as an empty page.
    numbered = (
        (number, page.extract_text() or "")
        for number, page in enumerate(pdf.pages, start=1)
    )
    sections = [
        f"--- Page {number} ---\n{body}"
        for number, body in numbered
        if body.strip()
    ]
    return "\n\n".join(sections), len(pdf.pages)
155
+
156
+
157
  def export_wiki_zip() -> bytes:
158
  buf = io.BytesIO()
159
  with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
 
404
  st.markdown("""
405
  Add source material to the wiki. Claude will integrate it when you run **Compile**.
406
 
407
+ Suitable sources include NICE clinical guidelines, NMC documents, NHS trust protocols,
408
  research papers, textbook chapters, or clinical audit findings — as **PDF or pasted text**.
409
+ Large PDFs (100+ pages) are supported; text is extracted from every page automatically.
 
 
 
 
 
410
  """)
411
 
412
  col_add, col_list_src = st.columns([1, 1])
 
415
  st.markdown("#### Add New Source")
416
  src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β€” Sepsis (2016)")
417
  src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
418
+
419
+ input_method = st.radio("Input method", ["Upload PDF", "Paste text"], horizontal=True)
420
+
421
+ src_content = ""
422
+ pdf_meta = None
423
+
424
+ if input_method == "Upload PDF":
425
+ if not _PDF_AVAILABLE:
426
+ st.error("pypdf not installed β€” PDF upload unavailable.")
427
+ else:
428
+ uploaded_pdf = st.file_uploader(
429
+ "Upload PDF (up to 500 MB)",
430
+ type=["pdf"],
431
+ key="pdf_upload",
432
+ help="Text is extracted from every page. Large documents are fully supported.",
433
+ )
434
+ if uploaded_pdf is not None:
435
+ with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
436
+ raw_bytes = uploaded_pdf.read()
437
+ try:
438
+ extracted, page_count = extract_pdf_text(raw_bytes)
439
+ src_content = extracted
440
+ pdf_meta = {"pages": page_count, "size_kb": len(raw_bytes) // 1024}
441
+ st.success(f"Extracted {page_count} pages / {len(extracted):,} characters")
442
+ with st.expander("Preview extracted text"):
443
+ st.text(extracted[:1500] + ("..." if len(extracted) > 1500 else ""))
444
+ except Exception as e:
445
+ st.error(f"PDF extraction failed: {e}")
446
+ if not src_title and uploaded_pdf:
447
+ src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
448
+ else:
449
+ src_content = st.text_area(
450
+ "Paste text here",
451
+ height=300,
452
+ placeholder="Paste the full text of the guideline, paper, or document here...",
453
+ )
454
 
455
  if st.button("βž• Add Source", type="primary", disabled=not (src_title and src_content)):
456
  src_id = f"src_{len(wiki.get('sources', {})) + 1:04d}"
457
  if "sources" not in wiki:
458
  wiki["sources"] = {}
459
+ entry = {
460
  "title": src_title,
461
  "type": src_type,
462
  "content": src_content,
463
  "added": datetime.date.today().isoformat(),
464
  "processed": False,
465
  }
466
+ if pdf_meta:
467
+ entry["pdf_pages"] = pdf_meta["pages"]
468
+ entry["pdf_size_kb"] = pdf_meta["size_kb"]
469
+ wiki["sources"][src_id] = entry
470
+ log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
471
+ st.success(f"Source added: **{src_title}**")
472
  st.rerun()
473
 
474
  with col_list_src:
 
523
  results_container = st.container()
524
 
525
  for i, (src_id, src) in enumerate(pending.items()):
526
+ char_count = len(src["content"])
527
+ chunk_note = f" β€” {char_count:,} chars, will chunk" if char_count > 7000 else ""
528
+ status.markdown(f"βš™οΈ Compiling: **{src['title']}** ({i+1}/{len(pending)}){chunk_note}...")
529
  try:
530
  result = compile_source(
531
  client=client,