Upload streamlit_app.py with huggingface_hub
Browse files- streamlit_app.py +66 -17
streamlit_app.py
CHANGED
|
@@ -14,6 +14,12 @@ import re
|
|
| 14 |
import sys
|
| 15 |
import os
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
sys.path.insert(0, os.path.dirname(__file__))
|
| 18 |
from wiki.starter import get_starter_wiki
|
| 19 |
from core.compiler import compile_source, rebuild_index
|
|
@@ -137,6 +143,17 @@ def add_or_update_article(article: dict):
|
|
| 137 |
wiki["metadata"]["article_count"] = len(wiki["articles"])
|
| 138 |
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def export_wiki_zip() -> bytes:
|
| 141 |
buf = io.BytesIO()
|
| 142 |
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
@@ -387,14 +404,9 @@ with tab_sources:
|
|
| 387 |
st.markdown("""
|
| 388 |
Add source material to the wiki. Claude will integrate it when you run **Compile**.
|
| 389 |
|
| 390 |
-
Suitable sources include
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
- Research paper abstracts or full text
|
| 394 |
-
- NHS trust protocols
|
| 395 |
-
- Textbook chapters
|
| 396 |
-
- Clinical audit findings
|
| 397 |
-
- Your own clinical notes or case studies
|
| 398 |
""")
|
| 399 |
|
| 400 |
col_add, col_list_src = st.columns([1, 1])
|
|
@@ -403,25 +415,60 @@ Suitable sources include:
|
|
| 403 |
st.markdown("#### Add New Source")
|
| 404 |
src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β Sepsis (2016)")
|
| 405 |
src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
if st.button("β Add Source", type="primary", disabled=not (src_title and src_content)):
|
| 413 |
src_id = f"src_{len(wiki.get('sources', {})) + 1:04d}"
|
| 414 |
if "sources" not in wiki:
|
| 415 |
wiki["sources"] = {}
|
| 416 |
-
|
| 417 |
"title": src_title,
|
| 418 |
"type": src_type,
|
| 419 |
"content": src_content,
|
| 420 |
"added": datetime.date.today().isoformat(),
|
| 421 |
"processed": False,
|
| 422 |
}
|
| 423 |
-
|
| 424 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
st.rerun()
|
| 426 |
|
| 427 |
with col_list_src:
|
|
@@ -476,7 +523,9 @@ This is the core Karpathy pattern: **you add sources, Claude maintains the knowl
|
|
| 476 |
results_container = st.container()
|
| 477 |
|
| 478 |
for i, (src_id, src) in enumerate(pending.items()):
|
| 479 |
-
|
|
|
|
|
|
|
| 480 |
try:
|
| 481 |
result = compile_source(
|
| 482 |
client=client,
|
|
|
|
| 14 |
import sys
|
| 15 |
import os
|
| 16 |
|
| 17 |
+
try:
|
| 18 |
+
from pypdf import PdfReader
|
| 19 |
+
_PDF_AVAILABLE = True
|
| 20 |
+
except ImportError:
|
| 21 |
+
_PDF_AVAILABLE = False
|
| 22 |
+
|
| 23 |
sys.path.insert(0, os.path.dirname(__file__))
|
| 24 |
from wiki.starter import get_starter_wiki
|
| 25 |
from core.compiler import compile_source, rebuild_index
|
|
|
|
| 143 |
wiki["metadata"]["article_count"] = len(wiki["articles"])
|
| 144 |
|
| 145 |
|
| 146 |
+
def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
    """Pull the text content out of an in-memory PDF.

    Every page that yields non-whitespace text contributes one section,
    headed by a ``--- Page N ---`` marker where N is the 1-based page
    number. Pages with no extractable text are omitted from the output.
    Sections are separated by a blank line.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        A ``(text, page_count)`` pair. ``page_count`` is the total number of
        pages in the document, including pages that produced no text.
    """
    document = PdfReader(io.BytesIO(file_bytes))
    sections = []
    for page_number, pdf_page in enumerate(document.pages, start=1):
        # extract_text() may give back nothing; normalise that to "".
        page_text = pdf_page.extract_text() or ""
        if not page_text.strip():
            continue
        sections.append(f"--- Page {page_number} ---\n{page_text}")
    combined = "\n\n".join(sections)
    return combined, len(document.pages)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
def export_wiki_zip() -> bytes:
|
| 158 |
buf = io.BytesIO()
|
| 159 |
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
|
|
| 404 |
st.markdown("""
|
| 405 |
Add source material to the wiki. Claude will integrate it when you run **Compile**.
|
| 406 |
|
| 407 |
+
Suitable sources include NICE clinical guidelines, NMC documents, NHS trust protocols,
|
| 408 |
+
research papers, textbook chapters, or clinical audit findings — as **PDF or pasted text**.
|
| 409 |
+
Large PDFs (100+ pages) are supported; text is extracted from every page automatically.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
""")
|
| 411 |
|
| 412 |
col_add, col_list_src = st.columns([1, 1])
|
|
|
|
| 415 |
st.markdown("#### Add New Source")
|
| 416 |
src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β Sepsis (2016)")
|
| 417 |
src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
|
| 418 |
+
|
| 419 |
+
input_method = st.radio("Input method", ["Upload PDF", "Paste text"], horizontal=True)
|
| 420 |
+
|
| 421 |
+
src_content = ""
|
| 422 |
+
pdf_meta = None
|
| 423 |
+
|
| 424 |
+
if input_method == "Upload PDF":
|
| 425 |
+
if not _PDF_AVAILABLE:
|
| 426 |
+
st.error("pypdf not installed β PDF upload unavailable.")
|
| 427 |
+
else:
|
| 428 |
+
uploaded_pdf = st.file_uploader(
|
| 429 |
+
"Upload PDF (up to 500 MB)",
|
| 430 |
+
type=["pdf"],
|
| 431 |
+
key="pdf_upload",
|
| 432 |
+
help="Text is extracted from every page. Large documents are fully supported.",
|
| 433 |
+
)
|
| 434 |
+
if uploaded_pdf is not None:
|
| 435 |
+
with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
|
| 436 |
+
raw_bytes = uploaded_pdf.read()
|
| 437 |
+
try:
|
| 438 |
+
extracted, page_count = extract_pdf_text(raw_bytes)
|
| 439 |
+
src_content = extracted
|
| 440 |
+
pdf_meta = {"pages": page_count, "size_kb": len(raw_bytes) // 1024}
|
| 441 |
+
st.success(f"Extracted {page_count} pages / {len(extracted):,} characters")
|
| 442 |
+
with st.expander("Preview extracted text"):
|
| 443 |
+
st.text(extracted[:1500] + ("..." if len(extracted) > 1500 else ""))
|
| 444 |
+
except Exception as e:
|
| 445 |
+
st.error(f"PDF extraction failed: {e}")
|
| 446 |
+
if not src_title and uploaded_pdf:
|
| 447 |
+
src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
|
| 448 |
+
else:
|
| 449 |
+
src_content = st.text_area(
|
| 450 |
+
"Paste text here",
|
| 451 |
+
height=300,
|
| 452 |
+
placeholder="Paste the full text of the guideline, paper, or document here...",
|
| 453 |
+
)
|
| 454 |
|
| 455 |
if st.button("β Add Source", type="primary", disabled=not (src_title and src_content)):
|
| 456 |
src_id = f"src_{len(wiki.get('sources', {})) + 1:04d}"
|
| 457 |
if "sources" not in wiki:
|
| 458 |
wiki["sources"] = {}
|
| 459 |
+
entry = {
|
| 460 |
"title": src_title,
|
| 461 |
"type": src_type,
|
| 462 |
"content": src_content,
|
| 463 |
"added": datetime.date.today().isoformat(),
|
| 464 |
"processed": False,
|
| 465 |
}
|
| 466 |
+
if pdf_meta:
|
| 467 |
+
entry["pdf_pages"] = pdf_meta["pages"]
|
| 468 |
+
entry["pdf_size_kb"] = pdf_meta["size_kb"]
|
| 469 |
+
wiki["sources"][src_id] = entry
|
| 470 |
+
log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
|
| 471 |
+
st.success(f"Source added: **{src_title}**")
|
| 472 |
st.rerun()
|
| 473 |
|
| 474 |
with col_list_src:
|
|
|
|
| 523 |
results_container = st.container()
|
| 524 |
|
| 525 |
for i, (src_id, src) in enumerate(pending.items()):
|
| 526 |
+
char_count = len(src["content"])
|
| 527 |
+
chunk_note = f" β {char_count:,} chars, will chunk" if char_count > 7000 else ""
|
| 528 |
+
status.markdown(f"βοΈ Compiling: **{src['title']}** ({i+1}/{len(pending)}){chunk_note}...")
|
| 529 |
try:
|
| 530 |
result = compile_source(
|
| 531 |
client=client,
|