Spaces:

NurseCitizenDeveloper
/

nursing-knowledge-base

Sleeping

App Files Files Community

NurseCitizenDeveloper committed on Apr 4

Commit

4da3879

verified ·

1 Parent(s): 3f3fea1

Upload streamlit_app.py with huggingface_hub

Browse files

Files changed (1) hide show

streamlit_app.py +66 -17

streamlit_app.py CHANGED Viewed

@@ -14,6 +14,12 @@ import re
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
 from wiki.starter import get_starter_wiki
 from core.compiler import compile_source, rebuild_index
@@ -137,6 +143,17 @@ def add_or_update_article(article: dict):
     wiki["metadata"]["article_count"] = len(wiki["articles"])
 def export_wiki_zip() -> bytes:
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
@@ -387,14 +404,9 @@ with tab_sources:
     st.markdown("""
 Add source material to the wiki. Claude will integrate it when you run **Compile**.
-Suitable sources include:
-- NICE clinical guidelines (copy and paste the text)
-- NMC guidance documents
-- Research paper abstracts or full text
-- NHS trust protocols
-- Textbook chapters
-- Clinical audit findings
-- Your own clinical notes or case studies
 """)
     col_add, col_list_src = st.columns([1, 1])
@@ -403,25 +415,60 @@ Suitable sources include:
         st.markdown("#### Add New Source")
         src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 — Sepsis (2016)")
         src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
-        src_content = st.text_area(
-            "Source content (paste text here)",
-            height=300,
-            placeholder="Paste the full text of the guideline, paper, or document here...",
-        )
         if st.button("➕ Add Source", type="primary", disabled=not (src_title and src_content)):
             src_id = f"src_{len(wiki.get('sources', {})) + 1:04d}"
             if "sources" not in wiki:
                 wiki["sources"] = {}
-            wiki["sources"][src_id] = {
                 "title": src_title,
                 "type": src_type,
                 "content": src_content,
                 "added": datetime.date.today().isoformat(),
                 "processed": False,
             }
-            log(f"ingest | Added source: {src_title}")
-            st.success(f"Source added: {src_title}")
             st.rerun()
     with col_list_src:
@@ -476,7 +523,9 @@ This is the core Karpathy pattern: **you add sources, Claude maintains the knowl
                 results_container = st.container()
                 for i, (src_id, src) in enumerate(pending.items()):
-                    status.markdown(f"⚙️ Compiling: **{src['title']}** ({i+1}/{len(pending)})...")
                     try:
                         result = compile_source(
                             client=client,

 import sys
 import os
+try:
+    from pypdf import PdfReader
+    _PDF_AVAILABLE = True
+except ImportError:
+    _PDF_AVAILABLE = False
 sys.path.insert(0, os.path.dirname(__file__))
 from wiki.starter import get_starter_wiki
 from core.compiler import compile_source, rebuild_index
     wiki["metadata"]["article_count"] = len(wiki["articles"])
+def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
+    """Extract the text of every page of a PDF held in memory.
+
+    Args:
+        file_bytes: Raw bytes of the PDF document.
+
+    Returns:
+        A ``(text, page_count)`` tuple. ``text`` joins the pages that
+        yielded non-blank text, each preceded by a ``--- Page N ---``
+        header (N is 1-based) and separated from the next by a blank
+        line. ``page_count`` is the total number of pages in the
+        document, including pages that produced no text.
+    """
+    # Wrap the raw bytes in an in-memory buffer before handing them to PdfReader.
+    reader = PdfReader(io.BytesIO(file_bytes))
+    pages = []
+    for i, page in enumerate(reader.pages):
+        # A falsy extract_text() result is normalised to an empty string.
+        text = page.extract_text() or ""
+        # Pages with only whitespace are skipped (presumably blank or image-only pages).
+        if text.strip():
+            # Headers use the original 1-based page number, so skipped pages leave gaps.
+            pages.append(f"--- Page {i + 1} ---\n{text}")
+    return "\n\n".join(pages), len(reader.pages)
 def export_wiki_zip() -> bytes:
     buf = io.BytesIO()
     with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
     st.markdown("""
 Add source material to the wiki. Claude will integrate it when you run **Compile**.
+Suitable sources include NICE clinical guidelines, NMC documents, NHS trust protocols,
+research papers, textbook chapters, or clinical audit findings — as **PDF or pasted text**.
+Large PDFs (100+ pages) are supported; text is extracted from every page automatically.
 """)
     col_add, col_list_src = st.columns([1, 1])
         st.markdown("#### Add New Source")
         src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 — Sepsis (2016)")
         src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
+        input_method = st.radio("Input method", ["Upload PDF", "Paste text"], horizontal=True)
+        src_content = ""
+        pdf_meta = None
+        if input_method == "Upload PDF":
+            if not _PDF_AVAILABLE:
+                st.error("pypdf not installed — PDF upload unavailable.")
+            else:
+                uploaded_pdf = st.file_uploader(
+                    "Upload PDF (up to 500 MB)",
+                    type=["pdf"],
+                    key="pdf_upload",
+                    help="Text is extracted from every page. Large documents are fully supported.",
+                )
+                if uploaded_pdf is not None:
+                    with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
+                        raw_bytes = uploaded_pdf.read()
+                        try:
+                            extracted, page_count = extract_pdf_text(raw_bytes)
+                            src_content = extracted
+                            pdf_meta = {"pages": page_count, "size_kb": len(raw_bytes) // 1024}
+                            st.success(f"Extracted {page_count} pages / {len(extracted):,} characters")
+                            with st.expander("Preview extracted text"):
+                                st.text(extracted[:1500] + ("..." if len(extracted) > 1500 else ""))
+                        except Exception as e:
+                            st.error(f"PDF extraction failed: {e}")
+                    if not src_title and uploaded_pdf:
+                        src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
+        else:
+            src_content = st.text_area(
+                "Paste text here",
+                height=300,
+                placeholder="Paste the full text of the guideline, paper, or document here...",
+            )
         if st.button("➕ Add Source", type="primary", disabled=not (src_title and src_content)):
             src_id = f"src_{len(wiki.get('sources', {})) + 1:04d}"
             if "sources" not in wiki:
                 wiki["sources"] = {}
+            entry = {
                 "title": src_title,
                 "type": src_type,
                 "content": src_content,
                 "added": datetime.date.today().isoformat(),
                 "processed": False,
             }
+            if pdf_meta:
+                entry["pdf_pages"] = pdf_meta["pages"]
+                entry["pdf_size_kb"] = pdf_meta["size_kb"]
+            wiki["sources"][src_id] = entry
+            log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
+            st.success(f"Source added: **{src_title}**")
             st.rerun()
     with col_list_src:
                 results_container = st.container()
                 for i, (src_id, src) in enumerate(pending.items()):
+                    char_count = len(src["content"])
+                    chunk_note = f" — {char_count:,} chars, will chunk" if char_count > 7000 else ""
+                    status.markdown(f"⚙️ Compiling: **{src['title']}** ({i+1}/{len(pending)}){chunk_note}...")
                     try:
                         result = compile_source(
                             client=client,