import streamlit as st
import sys
import os
import html
import re
import json
import uuid
import streamlit.components.v1 as components
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from pipeline import run_query, PipelineResult
from retrieval.retriever import RetrievedDocument
from retrieval.refine import refine_with_user_query
from evaluation.feedback import log_feedback
# Configure the Streamlit page: browser-tab title and icon, wide layout, and a
# sidebar that starts collapsed.
st.set_page_config(
page_title="Digital Commonwealth Β· BPL Search",
page_icon="π",
layout="wide",
initial_sidebar_state="collapsed",
)
# Inject raw markup into the page (unsafe_allow_html=True).
# NOTE(review): the literal below holds only a newline in this view; presumably
# it carried the app's stylesheet -- confirm against the deployed file.
st.markdown("""
""", unsafe_allow_html=True)
# ── Constants ─────────────────────────────────────────────────────────────────
# Sample queries; each one is rendered as a one-click button below the search
# form, and clicking it runs that query.
EXAMPLE_QUERIES = [
"What happened in Boston in 1900?",
"Find photographs of Greece",
"Show me circus posters",
"Victorian era correspondence",
"Boston Traveler newspaper 1900",
"Women's suffrage documents",
]
# ── Helpers ───────────────────────────────────────────────────────────────────
def linkify_citations(text: str, num_docs: int) -> str:
# Rewrite bracketed numeric citations such as "[3]" found in `text`.
# A citation is rewritten only when its number lies between 1 and `num_docs`
# inclusive; every other "[n]" is returned exactly as it was written.
def replace(match):
n = int(match.group(1))
if 1 <= n <= num_docs:
# NOTE(review): the first literal is empty in this view, so the replacement
# is just "[n]"; presumably link markup belongs here -- confirm.
return (
''
'[' + str(n) + ']'
)
return match.group(0)
return re.sub(r'\[(\d+)\]', replace, text)
def format_card(doc: RetrievedDocument) -> dict:
    """Convert a retrieved document into the plain dict used to render a card.

    Args:
        doc: Retrieved document. The attributes read here are ``topics``,
            ``geography``, ``title``, ``issue_date``, ``year``,
            ``best_chunk_text``, ``ark_id``, ``exemplary_image_id``,
            ``institution`` and ``final_score``.

    Returns:
        A dict with the keys ``type``, ``title``, ``date``, ``collection``,
        ``snippet``, ``full_text``, ``tags``, ``score``, ``url`` and
        ``thumbnail``. ``thumbnail`` is ``""`` when the document has no
        usable image id.
    """
    topics = doc.topics or []
    geography = doc.geography or []
    topic_names = {t.lower() for t in topics}
    title_lower = (doc.title or "").lower()

    # The first matching rule wins, so the order of these checks matters.
    if topic_names & {"photograph", "photography", "photographs"}:
        doc_type = "Photograph"
    elif topic_names & {"map", "maps", "cartography"}:
        doc_type = "Map"
    elif any(
        word in title_lower
        for word in ("traveler", "globe", "herald", "gazette", "journal", "tribune")
    ):
        doc_type = "Newspaper"
    elif topic_names & {"correspondence", "manuscript", "letter", "papers"}:
        doc_type = "Manuscript"
    else:
        doc_type = "Document"

    date_str = doc.issue_date or (str(doc.year[0]) if doc.year else "Date unknown")

    text = doc.best_chunk_text or ""

    # dict.fromkeys de-duplicates while keeping first-seen order, so the same
    # document always shows the same five tags in the same order (a set would
    # pick and order them arbitrarily from run to run).
    tags = list(dict.fromkeys([*topics, *geography]))[:5]

    # Build the URL from the stripped id so stray whitespace cannot leak into it.
    image_id = (doc.exemplary_image_id or "").strip()
    thumbnail_url = (
        f"https://iiif.digitalcommonwealth.org/iiif/2/{image_id}/full/400,/0/default.jpg"
        if image_id
        else ""
    )

    return {
        "type": doc_type,
        "title": doc.title or "Untitled",
        "date": date_str,
        "collection": doc.institution or "Boston Public Library",
        "snippet": text[:300],
        "full_text": text,
        "tags": tags,
        "score": round(doc.final_score, 2),
        "url": f"https://www.digitalcommonwealth.org/search/commonwealth:{doc.ark_id}",
        "thumbnail": thumbnail_url,
    }
def build_card_html(r: dict, i: int) -> str:
# Build the HTML string for one result card.
# `r` is a card dict; the keys read here are "score", "tags", "thumbnail",
# "full_text", "type", "title", "date", "collection" and "snippet".
# Every text field is passed through html.escape before concatenation.
# NOTE(review): the markup inside the string literals is missing in this view
# (several literals are split across lines), and `i` and `score_pct` are not
# referenced in the lines visible here -- confirm against the deployed file.
# Score as a whole-number percentage, capped at 100.
score_pct = min(int(r["score"] * 100), 100)
# One escaped fragment per tag, concatenated without a separator.
tags_html = "".join(
'' + html.escape(t) + '' for t in r["tags"]
)
# Thumbnail fragment is produced only for https:// thumbnail URLs.
thumb = (
'
'
if r.get("thumbnail", "").startswith("https://") else ""
)
full_text = r.get("full_text", "")
# The "Full Text" section is added only when the text is longer than 300
# characters.
if full_text and len(full_text) > 300:
expander = (
''
'Full Text
'
'' + html.escape(full_text) + '
'
' '
)
else:
expander = ""
# Assemble the card: thumbnail, type, title, date/collection line, snippet,
# optional full-text section, then tags.
return (
'
'
+ thumb
+ '
' + html.escape(r["type"]) + '
'
+ '
' + html.escape(r["title"]) + '
'
+ '
'
+ html.escape(r["date"]) + ' Β· ' + html.escape(r["collection"])
+ '
'
+ '
' + html.escape(r["snippet"]) + '
'
+ expander
+ '
' + tags_html + '
'
+ '
'
+ '
'
)
# ── Session state ─────────────────────────────────────────────────────────────
# Seed each session key with its default the first time the script runs in a
# browser session; keys that already exist are left untouched.
session_defaults = {
    "query": "",
    "results": None,
    "searched": False,
    "context": "",
    "latency_ms": 0,
    "_last_ran": "",
    "page": 0,
    "query_id": None,
    "docs": [],
    "thumbs": {},
    "missing_text": "",
    "refined_with": [],
    "_scroll_to_top": False,
}
for state_key, default_value in session_defaults.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value

# One random identifier per browser session, generated once.
if "session_id" not in st.session_state:
    st.session_state["session_id"] = str(uuid.uuid4())
# ── Masthead ──────────────────────────────────────────────────────────────────
# Masthead markup.
# NOTE(review): the literal below holds only a newline in this view; the
# masthead HTML appears to be missing -- confirm against the deployed file.
st.markdown("""
""", unsafe_allow_html=True)
# ── Search box ────────────────────────────────────────────────────────────────
st.markdown('Natural Language Query
', unsafe_allow_html=True)
# Search form: a text input pre-filled from session_state.query, placed in a
# 5:1 column layout next to the submit button.
with st.form(key="search_form", border=False):
col_input, col_btn = st.columns([5, 1])
with col_input:
typed = st.text_input(
"Natural Language Query",
value=st.session_state.query,
placeholder='e.g. "Find photographs of Boston Harbor from the 1800s"',
label_visibility="collapsed",
key="text_input_box",
)
with col_btn:
search_clicked = st.form_submit_button(
"Search β", type="primary", use_container_width=True
)
# Heading shown above the example-query buttons.
st.markdown(
''
'Try an example
',
unsafe_allow_html=True,
)
# Example-query "pills", laid out PILLS_PER_ROW to a row. Rows are created on
# demand, so the grid works for any number of entries in EXAMPLE_QUERIES.
PILLS_PER_ROW = 3
pill_clicked = None
for row_start in range(0, len(EXAMPLE_QUERIES), PILLS_PER_ROW):
    row_queries = EXAMPLE_QUERIES[row_start:row_start + PILLS_PER_ROW]
    # Always request a full row of columns so a short final row keeps the
    # same pill width as the rows above it.
    row_cols = st.columns(PILLS_PER_ROW)
    for offset, (col, q) in enumerate(zip(row_cols, row_queries)):
        with col:
            # use_container_width makes each pill fill its column.
            if st.button(
                q,
                key=f"pill_{row_start + offset}",
                use_container_width=True,
            ):
                pill_clicked = q
# ── Determine active query ────────────────────────────────────────────────────
# A clicked example pill takes precedence over the typed query; a submitted
# form only counts when the typed text is non-blank.
if pill_clicked:
    st.session_state.query = pill_clicked
elif search_clicked and typed.strip():
    st.session_state.query = typed.strip()
active_query = st.session_state.query.strip()

# ── Run pipeline ──────────────────────────────────────────────────────────────
# The script re-executes on every interaction, so the backend is only called
# when the active query differs from the last one that was run.
if active_query and active_query != st.session_state["_last_ran"]:
    st.session_state["_last_ran"] = active_query
    # A new query invalidates paging, votes and refinement state.
    st.session_state["page"] = 0
    st.session_state["thumbs"] = {}
    st.session_state["missing_text"] = ""
    st.session_state["refined_with"] = []
    with st.spinner("Searching the archive…"):
        try:
            result: PipelineResult = run_query(
                active_query,
                session_id=st.session_state["session_id"],
            )
            cards = [format_card(doc) for doc in result.documents]
            st.session_state.results = cards
            st.session_state.docs = result.documents
            st.session_state.query_id = result.query_id
            st.session_state.context = result.generation.response
            st.session_state.latency_ms = result.latency_ms
            st.session_state.searched = True
        except Exception as e:
            # Forget the failed query so submitting it again retries the
            # search instead of being skipped as "already ran".
            st.session_state["_last_ran"] = ""
            st.error(f"Search failed: {e}")
            st.session_state.searched = False
# ── Results ───────────────────────────────────────────────────────────────────
# Render the results area once a search has completed and stored a result list
# (the list itself may be empty).
# NOTE(review): most HTML inside the string literals of this section is missing
# in this view, and several literals are split across lines -- confirm against
# the deployed file before editing the markup.
if st.session_state.searched and st.session_state.results is not None:
results = st.session_state.results
context = st.session_state.context
# NOTE(review): `latency` is not referenced again in the lines visible here.
latency = st.session_state.latency_ms
st.markdown('
', unsafe_allow_html=True)
if results:
# Banner listing the follow-up queries from the last refinement; each one is
# HTML-escaped before being joined.
if st.session_state["refined_with"]:
chips = " Β· ".join(html.escape(q) for q in st.session_state["refined_with"])
st.markdown(
''
'Refined search. ' + chips + '
',
unsafe_allow_html=True,
)
# One-shot flag: it is cleared before the component is emitted, so the
# component is rendered on a single run only.
# NOTE(review): the embedded snippet is empty in this view.
if st.session_state["_scroll_to_top"]:
st.session_state["_scroll_to_top"] = False
components.html(
"""
""",
height=0,
)
# `context` is concatenated into raw HTML after citation rewriting, without
# html.escape. NOTE(review): confirm the generated text is trusted, or escape it.
context_with_links = linkify_citations(context, len(results))
st.markdown(
'About these results. '
+ context_with_links + '
',
unsafe_allow_html=True,
)
st.markdown(
'',
unsafe_allow_html=True,
)
# Pagination: ten cards per page; ceiling division with a minimum of one page.
PAGE_SIZE = 10
total_pages = max(1, (len(results) + PAGE_SIZE - 1) // PAGE_SIZE)
page = st.session_state["page"]
start = page * PAGE_SIZE
end = start + PAGE_SIZE
# Cards and documents are sliced with the same bounds so they stay paired.
page_results = results[start:end]
page_docs = st.session_state.docs[start:end]
# enumerate starts at start + 1, so `i` is the 1-based position across all pages.
for i, (r, doc) in enumerate(zip(page_results, page_docs), start=start + 1):
with st.container():
st.markdown(build_card_html(r, i), unsafe_allow_html=True)
# ── Feedback row ──────────────────────────────────────────
# No use_container_width — buttons size naturally to their text.
# The narrow columns prevent them from ever stretching wide.
# The button matching the stored vote for this document is drawn as "primary".
current = st.session_state["thumbs"].get(doc.ark_id)
up_type = "primary" if current == "up" else "secondary"
down_type = "primary" if current == "down" else "secondary"
fb_col1, fb_col2, _ = st.columns([1, 1.6, 7])
with fb_col1:
if st.button(
"β Helpful",
key=f"up_{i}_{doc.ark_id}",
type=up_type,
help="Mark as helpful",
):
# Store the vote, log it, then rerun the script.
st.session_state["thumbs"][doc.ark_id] = "up"
log_feedback(
query_id = st.session_state["query_id"],
ark_id = doc.ark_id,
signal = "up",
session_id = st.session_state["session_id"],
raw_query = st.session_state["query"],
)
st.rerun()
with fb_col2:
if st.button(
"β Not relevant",
key=f"down_{i}_{doc.ark_id}",
type=down_type,
help="Mark as not relevant",
):
# Same flow as the "up" vote, with signal "down".
st.session_state["thumbs"][doc.ark_id] = "down"
log_feedback(
query_id = st.session_state["query_id"],
ark_id = doc.ark_id,
signal = "down",
session_id = st.session_state["session_id"],
raw_query = st.session_state["query"],
)
st.rerun()
st.markdown('', unsafe_allow_html=True)
# ── Pagination controls ───────────────────────────────────────────
# "Previous" is offered only after the first page, "Next" only before the last.
if total_pages > 1:
st.markdown('', unsafe_allow_html=True)
pcol_prev, pcol_info, pcol_next = st.columns([1, 2, 1])
with pcol_prev:
if page > 0:
if st.button("β Previous", key="prev_page", use_container_width=True):
st.session_state["page"] -= 1
st.rerun()
with pcol_info:
st.markdown(
''
'Page ' + str(page + 1) + ' of ' + str(total_pages)
+ ' · ' + str(len(results)) + ' total results'
+ '
',
unsafe_allow_html=True,
)
with pcol_next:
if page < total_pages - 1:
if st.button("Next β", key="next_page", use_container_width=True):
st.session_state["page"] += 1
st.rerun()
# ── Human-in-the-loop refinement ─────────────────────────────────
st.markdown('
', unsafe_allow_html=True)
st.markdown(
'Didn\'t find any relevant results?
',
unsafe_allow_html=True,
)
st.markdown(
''
'Refine your search. Be more specific about what you want.'
'
',
unsafe_allow_html=True,
)
# The text area is bound to the session key "missing_text".
st.text_area(
"Refine your search",
key="missing_text",
height=110,
placeholder="e.g. photographs of JFK as a senator in 1958, not newspaper clippings",
label_visibility="collapsed",
)
if st.button("Refine search", key="refine_btn", use_container_width=True):
user_text = st.session_state["missing_text"].strip()
if not user_text:
st.info("Type a refined query before clicking refine.")
else:
# Log the refinement text as a "missing" signal (no ark_id), then run the
# refined search.
log_feedback(
query_id = st.session_state["query_id"],
ark_id = "",
signal = "missing",
comment = user_text,
session_id = st.session_state["session_id"],
raw_query = st.session_state["query"],
)
with st.spinner("Searching with your refined queryβ¦"):
try:
# top_k is at least 50, or the current result count when that is larger.
merged, follow_ups, _child_ids = refine_with_user_query(
original_query = st.session_state["query"],
original_results = st.session_state.docs,
user_query = user_text,
top_k = max(50, len(st.session_state.docs)),
session_id = st.session_state["session_id"],
parent_query_id = st.session_state["query_id"],
)
# Results are replaced only when follow_ups is non-empty; otherwise the
# current results stay and a warning is shown.
if follow_ups:
st.session_state.docs = merged
st.session_state.results = [format_card(d) for d in merged]
st.session_state.refined_with = follow_ups
st.session_state["page"] = 0
st.session_state["_scroll_to_top"] = True
st.rerun()
else:
st.warning("Refinement search failed. Try again.")
except Exception as e:
st.error(f"Refine failed: {e}")
else:
# No documents: still show the generated context text, followed by an
# empty-state message.
context_with_links = linkify_citations(context, len(results))
st.markdown(
''
+ context_with_links +
'
',
unsafe_allow_html=True,
)
st.markdown(
''
'
ποΈ
'
'
No matching materials found
'
'
Try rephrasing your query, or use one of the example searches above.
'
'
',
unsafe_allow_html=True,
)
# ── Footer ────────────────────────────────────────────────────────────────────
# Page footer, emitted as raw markup.
# NOTE(review): the literal is empty in this view; the footer HTML appears to
# be missing -- confirm against the deployed file.
st.markdown(
'',
unsafe_allow_html=True,
)