"""Streamlit frontend for Dokumentintelligens-system. Calls the FastAPI backend at http://localhost:8000. Single-page document search interface with clean sans-serif design. """ import datetime import html import json import os import random import uuid import extra_streamlit_components as stx import streamlit as st import requests API_BASE = os.environ.get("API_BASE_URL", "http://localhost:8000") # Cookie name used to persist the per-browser session ID across page reloads. _SESSION_COOKIE_NAME = "kuda_session_id" _SESSION_COOKIE_TTL_DAYS = 30 # --------------------------------------------------------------------------- # Example questions — drawn from the documents in docs/ # --------------------------------------------------------------------------- EXAMPLE_QUESTIONS: list[str] = [ "Hvad er reglerne for brug af generativ AI til eksamen på KU?", "Hvordan håndteres uansøgt afsked begrundet i institutionens forhold?", "Hvad er de disciplinære foranstaltninger over for studerende?", "Hvordan skal klager over medarbejdere og ledere behandles?", "Hvad er retningslinjerne for afholdelse af MUS-samtaler?", "Hvordan er års- og skemastrukturen organiseret på KU?", "Hvilke regler gælder for eksamenstilmelding og afmelding?", "Hvordan skal studerende dokumentere brug af GAI i skriftlige opgaver?", "Hvad er kommunernes ansvar ved brug af generativ AI?", "Hvilke principper gælder for akademisk integritet ved brug af AI?", "Hvornår kan en leder afvise en klage som åbenbart grundløs?", "Hvad er reglerne for forlænget tid til eksamen?", ] # --------------------------------------------------------------------------- # Internationalisation — all UI strings live here # --------------------------------------------------------------------------- TEXTS: dict[str, dict[str, str]] = { "da": { "page_title": "Dokumentintelligens-system", "lang_label": "Sprog", "sidebar_heading": "Om systemet", "sidebar_body": ( "- **Python + FastAPI** REST-backend\n" "- **Ustruktureret data** — File-parsing, preprocessing, " "tre chunking-strategier\n" "- **Embedding-modeller** — flersproget semantisk " "vektorrepræsentation\n" "- **Vektordatabase + hybrid søgning** — Qdrant (semantisk) " "+ BM25 (leksikalsk)\n" "- **Reranking** — cross-encoder for præcis relevans\n" "- **RAG-arkitektur** — LangChain + LangGraph-orkestreret pipeline\n" "- **LLM-integration** — provider-agnostisk, prompt-styret " "svargenerering\n" "- **Evaluering** — RAGAS-baseret kvalitetsmåling\n" "- **Agent Flows** — LangGraph Plan-and-Execute med værktøjskald og samtalehukommelse\n" "- [**Kildedokumenter**](https://github.com/Xiiqiing/Dokumentassistent/tree/main/docs)" " — de dokumenter systemet er indekseret fra" ), "chunking_label": "Chunking-strategi", "chunking_help": "Vælg hvordan dokumenterne opdeles i tekststykker.", "topk_label": "Antal kilder (top_k)", "topk_help": "Antal dokumentfragmenter der hentes fra søgeindekset.", "title": "Dokumentintelligens-system", "title_badge": "", "subtitle": ( "Et dokumentintelligens-system bygget på en RAG-arkitektur, dækkende file-indlæsning, semantisk chunking, " "hybrid søgning med reranking " "og LLM-genererede svar med kildehenvisninger. LLM-laget er provider-agnostisk. " "To tilstande: en LangGraph Plan-and-Execute-agent (standard) med samtalehukommelse til komplekse forespørgsler, " "og en foruddefineret pipeline til lette modeller. Søgekvaliteten evalueres med RAGAS. " 'Vidensbasen indeholder eksempler på dokumenter om universitetsregler og -forskrifter. Prøv at stille et spørgsmål om den.' ), "search_label": "Stil et spørgsmål om ... ", "search_placeholder": "F.eks.: Hvad er reglerne for behandling af personoplysninger?", "search_button": "Søg", "example_button": "Tilfældigt eksempel", "spinner": "Søger i dokumenterne ...", "status_label": "Tænker ...", "status_done": "Færdig", "status_error": "Noget gik galt", "confidence_label": "Konfidensgrad", "intent_label": "Intent", "strategy_label": "Strategi", "no_answer": "Intet svar modtaget.", "sources_label": "Kilder", "page_label": "side", "no_sources": "Ingen kilder fundet for denne forespørgsel.", "empty_warning": "Indtast venligst et spørgsmål.", "err_connection": ( "Kunne ikke oprette forbindelse til API-serveren. " "Kontroller at backend kører på http://localhost:8000." ), "err_api": "API-fejl", "err_rate_limit": "For mange samtidige forespørgsler, eller API-kvoten er midlertidigt opbrugt. Vent venligst et øjeblik, og prøv igen.", "err_timeout": "Forespørgslen tog for lang tid. Prøv igen.", "unknown": "ukendt", "model_heading": "Aktuel model", "model_llm": "LLM", "model_embedding": "Embedding", "model_unavailable": "Kunne ikke hente modelinfo.", "pipeline_heading": "Pipeline-detaljer", "pipeline_translation": "Oversættelse", "pipeline_original": "Original forespørgsel", "pipeline_translated": "Oversat til dansk", "pipeline_lang": "Sprog registreret", "pipeline_no_translation": "Ingen oversættelse nødvendig", "pipeline_bm25": "BM25-resultater (leksikalsk søgning)", "pipeline_dense": "Vektorsøgning (semantisk)", "pipeline_fused": "RRF-fusioneret rækkefølge", "pipeline_reranked": "Reranking (endelig rækkefølge)", "pipeline_doc": "Dokument", "pipeline_score": "Score", "pipeline_rank": "#", "pipeline_no_results": "Ingen resultater", "pipeline_score_change": "Score-ændring", "pipeline_plan_steps": "Udførelsesplan", "pipeline_tool_calls": "Værktøjskald", "synthesize_status": "Syntetiserer endeligt svar ...", "example_note": "", }, "en": { "page_title": "Document Intelligence System", "lang_label": "Language", "sidebar_heading": "About the system", "sidebar_body": ( "- **Python + FastAPI** REST backend\n" "- **Unstructured data** — File parsing, preprocessing, " "three chunking strategies\n" "- **Embedding models** — multilingual semantic vector " "representations\n" "- **Vector database + hybrid search** — Qdrant (semantic) " "+ BM25 (lexical)\n" "- **Reranking** — cross-encoder for precise relevance\n" "- **RAG architecture** — LangChain + LangGraph-orchestrated pipeline\n" "- **LLM integration** — provider-agnostic, prompt-driven " "answer generation\n" "- **Evaluation** — RAGAS-based quality measurement\n" "- **Agent Flows** — LangGraph Plan-and-Execute with tool calling and conversation memory\n" "- [**Source documents**](https://github.com/Xiiqiing/Dokumentassistent/tree/main/docs)" " — the documents indexed into the knowledge base" ), "chunking_label": "Chunking strategy", "chunking_help": "Choose how documents are split into text chunks.", "topk_label": "Number of sources (top_k)", "topk_help": "Number of document fragments retrieved from the search index.", "title": "Document Intelligence System", "title_badge": "", "subtitle": ( "A document intelligence system built on a RAG architecture, covering file ingestion, semantic chunking, " "hybrid retrieval with reranking, " "and LLM-generated answers with source citations. The LLM layer is provider-agnostic. " "Two modes: a LangGraph Plan-and-Execute agent (default) with conversation memory for complex multi-step queries, " "and a predefined pipeline for lightweight models. " "Retrieval quality is evaluated with RAGAS. " 'The knowledge base contains example documents of university rules and regulations. Try to ask questions about it.' ), "search_label": "Ask a question ...", "search_placeholder": "E.g.: What are the rules for processing personal data?", "search_button": "Search", "example_button": "Random question", "spinner": "Searching documents ...", "status_label": "Thinking ...", "status_done": "Done", "status_error": "Something went wrong", "confidence_label": "Confidence", "intent_label": "Intent", "strategy_label": "Strategy", "no_answer": "No answer received.", "sources_label": "Sources", "page_label": "page", "no_sources": "No sources found for this query.", "empty_warning": "Please enter a question.", "err_connection": ( "Could not connect to the API server. " "Make sure the backend is running at http://localhost:8000." ), "err_api": "API error", "err_rate_limit": "Too many simultaneous requests, or API quota temporarily exhausted. Please wait a moment and try again.", "err_timeout": "The request took too long. Please try again.", "unknown": "unknown", "model_heading": "Current model", "model_llm": "LLM", "model_embedding": "Embedding", "model_unavailable": "Could not fetch model info.", "pipeline_heading": "Pipeline Details", "pipeline_translation": "Query Translation", "pipeline_original": "Original query", "pipeline_translated": "Translated to Danish", "pipeline_lang": "Detected language", "pipeline_no_translation": "No need for translation", "pipeline_bm25": "BM25 Results (lexical search)", "pipeline_dense": "Vector Search (semantic)", "pipeline_fused": "RRF Fused Ranking", "pipeline_reranked": "Reranked (final ranking)", "pipeline_doc": "Document", "pipeline_score": "Score", "pipeline_rank": "#", "pipeline_no_results": "No results", "pipeline_score_change": "Score change", "pipeline_plan_steps": "Execution Plan", "pipeline_tool_calls": "Tool Calls", "synthesize_status": "Synthesizing final answer ...", "example_note": "", }, } # --------------------------------------------------------------------------- # Page config # --------------------------------------------------------------------------- st.set_page_config( page_title="Dokumentintelligens-system", page_icon="📄", layout="centered", ) st.markdown('', unsafe_allow_html=True) # --------------------------------------------------------------------------- # Per-browser session ID — persisted in a cookie so chat history survives # page refreshes. Falls back to a freshly generated UUID if the cookie is # not yet readable (first visit, or before the JS component has initialised). # # CookieManager must be instantiated directly on every rerun (it cannot be # wrapped in @st.cache_resource because its constructor calls a widget # command). Streamlit treats it as the same widget across reruns thanks to # the stable `key` argument. # --------------------------------------------------------------------------- _cookie_manager = stx.CookieManager(key="kuda_cookie_manager") _cookies = _cookie_manager.get_all() # CookieManager loads cookies asynchronously via a JS component. On the very # first script run after a page load, get_all() returns None because the # component has not yet reported back. Stop here and wait for the rerun the # component triggers once it delivers the browser's cookies — otherwise we # would always see "no cookie" on first render and overwrite any existing # session_id with a fresh UUID. if _cookies is None: st.stop() _existing_sid = _cookies.get(_SESSION_COOKIE_NAME) if _existing_sid: # Cookie present → reuse it so the backend can find prior turns. st.session_state["session_id"] = _existing_sid elif "session_id" not in st.session_state: # No cookie yet → mint a fresh ID and persist it for next reload. new_sid = str(uuid.uuid4()) st.session_state["session_id"] = new_sid _cookie_manager.set( _SESSION_COOKIE_NAME, new_sid, expires_at=datetime.datetime.now() + datetime.timedelta(days=_SESSION_COOKIE_TTL_DAYS), key="kuda_set_session_cookie", ) # --------------------------------------------------------------------------- # Analytics — Umami Cloud # --------------------------------------------------------------------------- # `st.html` injects via React's dangerouslySetInnerHTML, and scripts inserted # through innerHTML never execute (HTML5 spec). We instead use a tiny iframe # bootstrap (via components.html) that attaches the real Umami script to the # parent document, so analytics track the actual Streamlit page URL. import streamlit.components.v1 as components # noqa: E402 components.html( """ """, height=0, width=0, ) # --------------------------------------------------------------------------- # Custom CSS -- Clean sans-serif design # --------------------------------------------------------------------------- st.markdown( """ """, unsafe_allow_html=True, ) # --------------------------------------------------------------------------- # Language selector -- right-aligned toggle styled in KU red # --------------------------------------------------------------------------- _col_spacer, _col_lang = st.columns([5, 1.5]) with _col_lang: lang = st.radio( "Language", options=["da", "en"], format_func=lambda c: "Dansk" if c == "da" else "English", index=0, horizontal=True, label_visibility="collapsed", ) t = TEXTS[lang] # --------------------------------------------------------------------------- # Sidebar # --------------------------------------------------------------------------- with st.sidebar: st.markdown( f'
', unsafe_allow_html=True, ) st.markdown(t["sidebar_body"]) st.markdown("---") strategy = st.selectbox( t["chunking_label"], options=["fixed_size", "recursive", "semantic"], index=2, help=t["chunking_help"], ) top_k = st.slider( t["topk_label"], min_value=1, max_value=20, value=5, help=t["topk_help"], ) st.markdown("---") try: _health = requests.get(f"{API_BASE}/health", timeout=5).json() _llm = _health.get("llm_model", "") _llm_prov = _health.get("llm_provider", "") _emb = _health.get("embedding_model", "") _emb_prov = _health.get("embedding_provider", "") st.markdown( f'', unsafe_allow_html=True, ) st.markdown( f'**{t["model_llm"]}:** {_llm} ({_llm_prov}) \n' f'**{t["model_embedding"]}:** {_emb} ({_emb_prov})' ) except Exception: st.caption(t["model_unavailable"]) # --------------------------------------------------------------------------- # Main content # --------------------------------------------------------------------------- # Accent line st.markdown('', unsafe_allow_html=True) # Title block st.markdown( f'