"""Streamlit front end for natural-language search over the Digital Commonwealth BPL subset.

The script renders a search form, runs the retrieval/generation pipeline for the
active query, shows paginated result cards with per-document feedback buttons,
and offers a refinement box that merges a follow-up query into the results.

NOTE(review): the reviewed copy of this file had all HTML/CSS/JS stripped out of
its string literals.  The markup below is a minimal reconstruction; tag names
and CSS class names are placeholders and must be confirmed against the real
stylesheet before shipping.
"""

import html
import json
import os
import re
import sys
import uuid

import streamlit as st
import streamlit.components.v1 as components

# Make the sibling project packages importable when the app is launched directly.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from pipeline import run_query, PipelineResult
from retrieval.retriever import RetrievedDocument
from retrieval.refine import refine_with_user_query
from evaluation.feedback import log_feedback

st.set_page_config(
    page_title="Digital Commonwealth · BPL Search",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# NOTE(review): this call presumably injected the global <style> block; its
# contents were not present in the reviewed source and are left untouched.
st.markdown(""" """, unsafe_allow_html=True)

# ── Constants ─────────────────────────────────────────────────────────────────
EXAMPLE_QUERIES = [
    "What happened in Boston in 1900?",
    "Find photographs of Greece",
    "Show me circus posters",
    "Victorian era correspondence",
    "Boston Traveler newspaper 1900",
    "Women's suffrage documents",
]

PAGE_SIZE = 10
SNIPPET_CHARS = 300
MAX_TAGS = 5

_PHOTO_TOPICS = ("photograph", "photography", "photographs")
_MAP_TOPICS = ("map", "maps", "cartography")
_NEWSPAPER_TITLE_WORDS = ("traveler", "globe", "herald", "gazette", "journal", "tribune")
_MANUSCRIPT_TOPICS = ("correspondence", "manuscript", "letter", "papers")

_CITATION_RE = re.compile(r'\[(\d+)\]')


# ── Helpers ───────────────────────────────────────────────────────────────────
def linkify_citations(text: str, num_docs: int) -> str:
    """Turn ``[n]`` citation markers into in-page links to result card *n*.

    Only markers with ``1 <= n <= num_docs`` are linked; anything else is left
    exactly as written.  The function does not escape ``text`` — callers that
    render the result as HTML must escape untrusted text before calling it.
    """

    def replace(match):
        n = int(match.group(1))
        if 1 <= n <= num_docs:
            # NOTE(review): anchor markup reconstructed; the target id must
            # match the id emitted by build_card_html.
            return (
                '<a class="citation-link" href="#result-' + str(n) + '">'
                '[' + str(n) + ']'
                '</a>'
            )
        return match.group(0)

    return _CITATION_RE.sub(replace, text)


def format_card(doc: RetrievedDocument) -> dict:
    """Convert a retrieved document into the plain dict consumed by build_card_html.

    The document type is a heuristic based on topic labels and title keywords,
    checked in the order photograph, map, newspaper, manuscript, with
    "Document" as the fallback.
    """
    topics = doc.topics or []
    lowered_topics = [t.lower() for t in topics]
    title_lower = (doc.title or "").lower()

    if any(t in _PHOTO_TOPICS for t in lowered_topics):
        doc_type = "Photograph"
    elif any(t in _MAP_TOPICS for t in lowered_topics):
        doc_type = "Map"
    elif any(word in title_lower for word in _NEWSPAPER_TITLE_WORDS):
        doc_type = "Newspaper"
    elif any(t in _MANUSCRIPT_TOPICS for t in lowered_topics):
        doc_type = "Manuscript"
    else:
        doc_type = "Document"

    date_str = doc.issue_date or (str(doc.year[0]) if doc.year else "Date unknown")
    full_text = doc.best_chunk_text if doc.best_chunk_text else ""
    snippet = full_text[:SNIPPET_CHARS]

    # dict.fromkeys de-duplicates while keeping first-seen order; the previous
    # list(set(...)) made both the order and the choice of the five kept tags
    # vary from run to run.
    tags = list(dict.fromkeys(topics + (doc.geography or [])))[:MAX_TAGS]

    url = f"https://www.digitalcommonwealth.org/search/commonwealth:{doc.ark_id}"
    thumbnail_url = (
        f"https://iiif.digitalcommonwealth.org/iiif/2/{doc.exemplary_image_id}/full/400,/0/default.jpg"
        if doc.exemplary_image_id and doc.exemplary_image_id.strip()
        else ""
    )

    return {
        "type": doc_type,
        "title": doc.title or "Untitled",
        "date": date_str,
        "collection": doc.institution or "Boston Public Library",
        "snippet": snippet,
        "full_text": full_text,
        "tags": tags,
        "score": round(doc.final_score, 2),
        "url": url,
        "thumbnail": thumbnail_url,
    }


def build_card_html(r: dict, i: int) -> str:
    """Render one result card (as produced by format_card) to an HTML string.

    ``i`` is the 1-based position of the result in the full result list and is
    used as the anchor id that citation links point to.  All text and
    attribute values are HTML-escaped.

    NOTE(review): element and class names are reconstructed placeholders.
    """
    # Clamp to 0..100 so a negative or >1 score cannot produce an invalid bar width.
    score_pct = max(0, min(int(r["score"] * 100), 100))

    tags_html = "".join(
        '<span class="result-tag">' + html.escape(t) + '</span>'
        for t in r["tags"]
    )

    thumbnail = r.get("thumbnail") or ""
    thumb = (
        '<div class="result-thumb"><img src="'
        + html.escape(thumbnail, quote=True)
        + '" alt="" loading="lazy"/></div>'
        if thumbnail.startswith("https://")
        else ""
    )

    # Only offer the expander when there is more text than the snippet shows.
    full_text = r.get("full_text", "")
    if full_text and len(full_text) > SNIPPET_CHARS:
        expander = (
            '<details class="result-fulltext">'
            '<summary>Full Text</summary>'
            '<div class="result-fulltext-body">' + html.escape(full_text) + '</div>'
            '</details>'
        )
    else:
        expander = ""

    return (
        '<div class="result-card" id="result-' + str(i) + '">'
        + thumb
        + '<div class="result-type">' + html.escape(r["type"]) + '</div>'
        + '<div class="result-title">' + html.escape(r["title"]) + '</div>'
        + '<div class="result-meta">'
        + html.escape(r["date"]) + ' · ' + html.escape(r["collection"])
        + '</div>'
        + '<div class="result-snippet">' + html.escape(r["snippet"]) + '</div>'
        + expander
        + '<div class="result-tags">' + tags_html + '</div>'
        + '<div class="result-footer">'
        + '<span class="relevance-label">Relevance</span>'
        + '<div class="relevance-bar">'
        + '<div class="relevance-fill" style="width:' + str(score_pct) + '%"></div>'
        + '</div>'
        + '<span class="relevance-value">' + str(score_pct) + '%' + '</span> '
        + '<a class="result-link" target="_blank" rel="noopener" href="'
        + html.escape(r["url"], quote=True) + '">'
        + 'View in Digital Commonwealth ↗'
        + '</a>'
        + '</div>'
        + '</div>'
    )


# ── Session state ─────────────────────────────────────────────────────────────
# The list is rebuilt on every script run, so the mutable defaults ([] / {})
# are fresh objects and are not shared between sessions.
for k, v in [
    ("query", ""),
    ("results", None),
    ("searched", False),
    ("context", ""),
    ("latency_ms", 0),
    ("_last_ran", ""),
    ("page", 0),
    ("query_id", None),
    ("docs", []),
    ("thumbs", {}),
    ("missing_text", ""),
    ("refined_with", []),
    ("_scroll_to_top", False),
]:
    if k not in st.session_state:
        st.session_state[k] = v

if "session_id" not in st.session_state:
    st.session_state["session_id"] = str(uuid.uuid4())

# ── Masthead ──────────────────────────────────────────────────────────────────
# NOTE(review): masthead markup was not present in the reviewed source.
st.markdown(""" """, unsafe_allow_html=True)

# ── Search box ────────────────────────────────────────────────────────────────
st.markdown(
    '<div class="search-label">Natural Language Query</div>',
    unsafe_allow_html=True,
)

with st.form(key="search_form", border=False):
    col_input, col_btn = st.columns([5, 1])
    with col_input:
        typed = st.text_input(
            "Natural Language Query",
            value=st.session_state.query,
            placeholder='e.g. "Find photographs of Boston Harbor from the 1800s"',
            label_visibility="collapsed",
            key="text_input_box",
        )
    with col_btn:
        search_clicked = st.form_submit_button(
            "Search →", type="primary", use_container_width=True
        )

st.markdown(
    '<div class="examples-label">'
    'Try an example'
    '</div>',
    unsafe_allow_html=True,
)

pill_clicked = None
row1 = st.columns(3)
row2 = st.columns(3)
for i, q in enumerate(EXAMPLE_QUERIES):
    col = row1[i] if i < 3 else row2[i - 3]
    with col:
        # use_container_width fills the column — this is what makes pills look good
        if st.button(q, key=f"pill_{i}", use_container_width=True):
            pill_clicked = q

# ── Determine active query ────────────────────────────────────────────────────
if pill_clicked:
    st.session_state.query = pill_clicked
elif search_clicked and typed.strip():
    st.session_state.query = typed.strip()

active_query = st.session_state.query.strip()

# ── Run pipeline ──────────────────────────────────────────────────────────────
# A query normally runs once (tracked by _last_ran).  Because _last_ran is set
# before the search executes, a failed search would otherwise be impossible to
# retry with the same text; an explicit resubmission therefore re-runs it when
# the previous attempt did not succeed.
explicitly_submitted = bool(pill_clicked) or bool(search_clicked and typed.strip())
retry_after_failure = explicitly_submitted and not st.session_state.searched

if active_query and (
    active_query != st.session_state["_last_ran"] or retry_after_failure
):
    st.session_state["_last_ran"] = active_query
    st.session_state["page"] = 0
    st.session_state["thumbs"] = {}
    st.session_state["missing_text"] = ""
    st.session_state["refined_with"] = []
    with st.spinner("Searching the archive…"):
        try:
            result: PipelineResult = run_query(
                active_query,
                session_id=st.session_state["session_id"],
            )
            cards = [format_card(doc) for doc in result.documents]
            st.session_state.results = cards
            st.session_state.docs = result.documents
            st.session_state.query_id = result.query_id
            st.session_state.context = result.generation.response
            st.session_state.latency_ms = result.latency_ms
            st.session_state.searched = True
        except Exception as e:  # top-level UI boundary: surface the error to the user
            st.error(f"Search failed: {e}")
            st.session_state.searched = False

# ── Results ───────────────────────────────────────────────────────────────────
if st.session_state.searched and st.session_state.results is not None:
    results = st.session_state.results
    context = st.session_state.context
    latency = st.session_state.latency_ms

    # The generated summary is rendered with unsafe_allow_html, so it is
    # escaped first; only the citation anchors added afterwards are real markup.
    context_with_links = linkify_citations(html.escape(context or ""), len(results))

    st.markdown('<div class="section-divider"></div>', unsafe_allow_html=True)

    if results:
        if st.session_state["refined_with"]:
            chips = " · ".join(html.escape(q) for q in st.session_state["refined_with"])
            st.markdown(
                '<div class="refined-banner">'
                '<strong>Refined search.</strong> ' + chips
                + '</div>',
                unsafe_allow_html=True,
            )

        if st.session_state["_scroll_to_top"]:
            st.session_state["_scroll_to_top"] = False
            # NOTE(review): the original script body was lost; this is a
            # best-effort scroll-to-top for the parent Streamlit page.
            components.html(
                """
                <script>
                  const doc = window.parent.document;
                  const main = doc.querySelector('section.main')
                      || doc.querySelector('[data-testid="stMain"]');
                  if (main) { main.scrollTo(0, 0); }
                  window.parent.scrollTo(0, 0);
                </script>
                """,
                height=0,
            )

        st.markdown(
            '<div class="ai-summary">'
            '<strong>About these results.</strong> ' + context_with_links
            + '</div>',
            unsafe_allow_html=True,
        )

        st.markdown(
            '<div class="results-header">'
            + '<div class="results-count">Found ' + str(len(results))
            + ' items for “' + html.escape(st.session_state.query) + '”</div>'
            + '<div class="results-meta">Ranked by relevance · ' + str(latency)
            + 'ms · Digital Commonwealth BPL Subset</div>'
            + '</div>',
            unsafe_allow_html=True,
        )

        total_pages = max(1, (len(results) + PAGE_SIZE - 1) // PAGE_SIZE)
        # Clamp in case the stored page no longer exists for the current result set.
        page = min(max(st.session_state["page"], 0), total_pages - 1)
        st.session_state["page"] = page
        start = page * PAGE_SIZE
        end = start + PAGE_SIZE
        page_results = results[start:end]
        page_docs = st.session_state.docs[start:end]

        for i, (r, doc) in enumerate(zip(page_results, page_docs), start=start + 1):
            with st.container():
                st.markdown(build_card_html(r, i), unsafe_allow_html=True)

                # ── Feedback row ──────────────────────────────────────────
                # No use_container_width — buttons size naturally to their text.
                # The narrow columns prevent them from ever stretching wide.
                current = st.session_state["thumbs"].get(doc.ark_id)
                fb_col1, fb_col2, _ = st.columns([1, 1.6, 7])
                feedback_buttons = (
                    (fb_col1, "✓ Helpful", "up", "Mark as helpful"),
                    (fb_col2, "✗ Not relevant", "down", "Mark as not relevant"),
                )
                for fb_col, label, signal, tooltip in feedback_buttons:
                    with fb_col:
                        if st.button(
                            label,
                            key=f"{signal}_{i}_{doc.ark_id}",
                            # Highlight whichever vote is currently recorded.
                            type="primary" if current == signal else "secondary",
                            help=tooltip,
                        ):
                            st.session_state["thumbs"][doc.ark_id] = signal
                            log_feedback(
                                query_id=st.session_state["query_id"],
                                ark_id=doc.ark_id,
                                signal=signal,
                                session_id=st.session_state["session_id"],
                                raw_query=st.session_state["query"],
                            )
                            st.rerun()

                st.markdown('<div class="card-spacer"></div>', unsafe_allow_html=True)

        # ── Pagination controls ───────────────────────────────────────────
        if total_pages > 1:
            st.markdown('<div class="pagination-spacer"></div>', unsafe_allow_html=True)
            pcol_prev, pcol_info, pcol_next = st.columns([1, 2, 1])
            with pcol_prev:
                if page > 0:
                    if st.button("← Previous", key="prev_page", use_container_width=True):
                        st.session_state["page"] -= 1
                        st.rerun()
            with pcol_info:
                st.markdown(
                    '<div class="pagination-info">'
                    'Page ' + str(page + 1) + ' of ' + str(total_pages)
                    + ' · ' + str(len(results)) + ' total results'
                    + '</div>',
                    unsafe_allow_html=True,
                )
            with pcol_next:
                if page < total_pages - 1:
                    if st.button("Next →", key="next_page", use_container_width=True):
                        st.session_state["page"] += 1
                        st.rerun()

        # ── Human-in-the-loop refinement ─────────────────────────────────
        st.markdown('<div class="section-divider"></div>', unsafe_allow_html=True)
        st.markdown(
            '<div class="refine-title">Didn\'t find any relevant results?</div>',
            unsafe_allow_html=True,
        )
        st.markdown(
            '<div class="refine-help">'
            'Refine your search. Be more specific about what you want.'
            '</div>',
            unsafe_allow_html=True,
        )
        st.text_area(
            "Refine your search",
            key="missing_text",
            height=110,
            placeholder="e.g. photographs of JFK as a senator in 1958, not newspaper clippings",
            label_visibility="collapsed",
        )
        if st.button("Refine search", key="refine_btn", use_container_width=True):
            user_text = st.session_state["missing_text"].strip()
            if not user_text:
                st.info("Type a refined query before clicking refine.")
            else:
                # Record what the user felt was missing before re-searching.
                log_feedback(
                    query_id=st.session_state["query_id"],
                    ark_id="",
                    signal="missing",
                    comment=user_text,
                    session_id=st.session_state["session_id"],
                    raw_query=st.session_state["query"],
                )
                with st.spinner("Searching with your refined query…"):
                    try:
                        merged, follow_ups, _child_ids = refine_with_user_query(
                            original_query=st.session_state["query"],
                            original_results=st.session_state.docs,
                            user_query=user_text,
                            top_k=max(50, len(st.session_state.docs)),
                            session_id=st.session_state["session_id"],
                            parent_query_id=st.session_state["query_id"],
                        )
                        if follow_ups:
                            st.session_state.docs = merged
                            st.session_state.results = [format_card(d) for d in merged]
                            st.session_state.refined_with = follow_ups
                            st.session_state["page"] = 0
                            st.session_state["_scroll_to_top"] = True
                            st.rerun()
                        else:
                            st.warning("Refinement search failed. Try again.")
                    except Exception as e:  # top-level UI boundary
                        st.error(f"Refine failed: {e}")
    else:
        st.markdown(
            '<div class="ai-summary">' + context_with_links + '</div>',
            unsafe_allow_html=True,
        )
        st.markdown(
            '<div class="empty-state">'
            '<div class="empty-state-icon">🗂️</div>'
            '<div class="empty-state-title">No matching materials found</div>'
            '<div class="empty-state-text">'
            'Try rephrasing your query, or use one of the example searches above.'
            '</div>'
            '</div>',
            unsafe_allow_html=True,
        )

# ── Footer ────────────────────────────────────────────────────────────────────
# NOTE(review): footer markup was not present in the reviewed source.
st.markdown(
    '',
    unsafe_allow_html=True,
)

# NOTE(review): the reviewed source ended with the stray extracted text below;
# it is presumably a heading from the lost markup — confirm where it belongs.
# Search the Archive