Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| from typing import Dict, List, Tuple | |
| import streamlit as st | |
| from dotenv import load_dotenv | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.runnables import RunnableLambda | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_groq import ChatGroq | |
| from ingest import build_index, INDEX_VERSION | |
# Title used for both the browser tab and the page header.
APP_TITLE = "SCDM Knowledge Assistant"

# Every data artefact lives in a "data" folder next to this module.
_DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
INDEX_PATH = os.path.join(_DATA_DIR, "index")  # vector index directory
SOURCE_LINKS_PATH = os.path.join(_DATA_DIR, "source_links.json")  # file name -> URL map
SUMMARIES_PATH = os.path.join(_DATA_DIR, "summaries")  # competency summary text files
def _manifest_path() -> str:
    """Return the path of ``manifest.json`` inside the index directory."""
    manifest_file = os.path.join(INDEX_PATH, "manifest.json")
    return manifest_file
def _needs_rebuild() -> bool:
    """Tell whether the on-disk index must be (re)built.

    Returns:
        True when the index directory or its manifest is missing, when the
        manifest cannot be read or parsed, or when the manifest's
        ``index_version`` is lower than ``INDEX_VERSION``; False otherwise.
    """
    manifest_file = _manifest_path()
    if not (os.path.exists(INDEX_PATH) and os.path.exists(manifest_file)):
        return True
    try:
        with open(manifest_file, "r", encoding="utf-8") as handle:
            stored = json.load(handle)
        stored_version = int(stored.get("index_version", 0))
        return stored_version < int(INDEX_VERSION)
    except Exception:
        # Any unreadable or malformed manifest is treated as "rebuild needed".
        return True
def load_vectorstore():
    """Load the FAISS index from ``INDEX_PATH``, rebuilding it first if needed.

    Returns:
        The object returned by ``FAISS.load_local`` for the index directory.

    Raises:
        FileNotFoundError: if the index is still reported as missing or
            outdated right after ``build_index()`` ran.
    """
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("Loading vector index from:", INDEX_PATH)
    stale = _needs_rebuild()
    if stale:
        print("Index missing or outdated, rebuilding...")
        build_index()
        if _needs_rebuild():
            raise FileNotFoundError(f"Index at {INDEX_PATH} missing or invalid after rebuild.")
    # NOTE(review): allow_dangerous_deserialization=True lets the loader
    # unpickle data from INDEX_PATH; only acceptable while that directory is
    # produced by this application's own build_index().
    return FAISS.load_local(INDEX_PATH, embedder, allow_dangerous_deserialization=True)
def load_source_links() -> Dict[str, str]:
    """Load the mapping stored in ``SOURCE_LINKS_PATH``.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing,
        is not valid JSON, or does not contain a JSON object. An empty
        mapping keeps callers that use ``.get(...)`` with a fallback working
        instead of failing on an absent or damaged links file.
    """
    try:
        with open(SOURCE_LINKS_PATH, "r", encoding="utf-8") as f:
            links = json.load(f)
    except FileNotFoundError:
        return {}
    except json.JSONDecodeError as exc:
        print(f"Ignoring malformed source links file {SOURCE_LINKS_PATH}: {exc}")
        return {}
    # Callers use dict methods on the result, so reject any other JSON type.
    return links if isinstance(links, dict) else {}
def load_competency_summary(competency: str) -> str:
    """Return the summary text for a competency, or a readable error message.

    Args:
        competency: Display name of the competency; it must be one of the
            keys of the file-name table below.

    Returns:
        The stripped contents of the matching file under ``SUMMARIES_PATH``.
        If the competency is unknown, the file is missing, or it cannot be
        read, a human-readable message is returned instead of raising.
    """
    competency_files = {
        "Risk-Based CDM": "risk_based_cdm.txt",
        "Soft Skills including Leadership and Executive Skills": "soft_skills_leadership.txt",
        "Clinical Data Competencies & Cross-Functional Interactions": "clinical_data_competencies.txt",
        "Technology & Data Platforms": "technology_data_platforms.txt",
        "AI & Cognitive Tech": "ai_cognitive_tech.txt",
        "Regulations & Standards": "regulations_standards.txt",
        "Clinical Trial Operations": "clinical_trial_operations.txt",
    }
    filename = competency_files.get(competency)
    if not filename:
        # There is no file name to report here, so the message names only the
        # competency (the previous text printed raw "{...}" placeholders).
        return (
            f"Summary for {competency} not found. "
            "No summary file is registered for this competency."
        )
    filepath = os.path.join(SUMMARIES_PATH, filename)
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        return f"Summary file for {competency} not found. Please add content to {filename}"
    except (OSError, UnicodeDecodeError) as e:
        return f"Error loading summary: {str(e)}"
def page_url(url: str, page: int) -> str:
    """Build a link that opens ``url`` at a given page.

    Args:
        url: Base document URL; may already carry a ``#fragment``.
        page: 1-based page number. Values that are not integers or are
            below 1 (such as the 0 used when the page is unknown) add no
            page anchor.

    Returns:
        ``""`` for an empty URL, the unchanged URL when there is no usable
        page number, otherwise the URL with a ``page=N`` fragment parameter.
    """
    if not url:
        return ""
    try:
        page_number = int(page)
    except (TypeError, ValueError):
        return url
    if page_number < 1:
        return url
    # Typical viewers support #page=; extra fragment parameters are joined
    # with "&" so an existing fragment is extended rather than duplicated.
    base, _, fragment = url.partition("#")
    if fragment:
        return f"{base}#{fragment}&page={page_number}"
    return f"{base}#page={page_number}"
def render_sources(sources: List[Dict]):
    """Render retrieved passages as one expander per (file, page) pair.

    Args:
        sources: Passage dicts; the keys read here are ``file_name``,
            ``page``, ``title``, ``url`` and ``text``.
    """
    # Bucket passages by document and page, keeping first-seen order.
    by_location: Dict[Tuple[str, int], List[Dict]] = {}
    for entry in sources:
        location = (entry.get("file_name", ""), entry.get("page", 0))
        if location not in by_location:
            by_location[location] = []
        by_location[location].append(entry)

    link_overrides = load_source_links()
    for (file_name, page), entries in by_location.items():
        first = entries[0]
        title = first.get("title") or file_name
        # The links file takes precedence over the URL stored with the passage.
        url = link_overrides.get(file_name, first.get("url", ""))
        human_url = page_url(url, page) if url else ""
        with st.expander(f"Source: {title} β page {page}"):
            if human_url:
                st.markdown(f"[Open source (page {page})]({human_url})")
            # Each distinct paragraph is shown once, as a block quote.
            shown = set()
            for entry in entries:
                paragraph = entry.get("text", "")
                if paragraph and paragraph not in shown:
                    shown.add(paragraph)
                    st.markdown("> " + "\n> ".join(paragraph.split("\n")))
| NOISE_SECTION_KEYWORDS = { | |
| "table of contents", | |
| "contents", | |
| "references", | |
| "bibliography", | |
| "glossary", | |
| "acknowledgements", | |
| "acknowledgments", | |
| "foreword", | |
| "index", | |
| "list of figures", | |
| "list of tables", | |
| } | |
| def _looks_like_toc(text: str) -> bool: | |
| import re as _re | |
| if not text: | |
| return False | |
| matches = _re.findall(r"\.{2,}\s*\d{1,3}\b", text) | |
| return len(matches) >= 5 | |
| def _is_noise_text(text: str, page: int) -> bool: | |
| lower = (text or "").lower() | |
| if page == 1 and ("table of contents" in lower or "contents" in lower): | |
| return True | |
| if any(kw in lower for kw in NOISE_SECTION_KEYWORDS): | |
| return True | |
| if _looks_like_toc(text): | |
| return True | |
| # Very short paragraphs are low-signal | |
| if len((text or "").strip()) < 40: | |
| return True | |
| return False | |
def build_llm(model: str, temperature: float) -> ChatGroq:
    """Create a ``ChatGroq`` client for the given model name and temperature."""
    llm_options = {"model": model, "temperature": temperature}
    return ChatGroq(**llm_options)
def classify_intent(llm: ChatGroq, user_input: str) -> str:
    """Classify a user message as ``"QA"``, ``"SUMMARIZE"`` or ``"QUIZ"``.

    Args:
        llm: Chat model used for the classification.
        user_input: Raw user message.

    Returns:
        One of the three labels. ``"QA"`` is returned when the model's reply
        is not a known label or when the call fails.
    """
    valid_labels = {"QA", "SUMMARIZE", "QUIZ"}
    system = (
        "You are an intent classifier for a clinical research assistant. "
        "Return one label only from: QA, SUMMARIZE, QUIZ. "
        "- QA: user asks a factual question or wants mapping/links. "
        "- SUMMARIZE: user asks to summarize, compare, or extract key points. "
        # Trailing space added: the last two sentences used to run together.
        "- QUIZ: user mentions QUIZ or MCQ. "
        "Respond with only the label."
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("user", "{q}"),
    ])
    try:
        chain = prompt | llm | StrOutputParser()
        raw = chain.invoke({"q": user_input})
        # Models often wrap the label in quotes, markdown or punctuation.
        label = raw.strip().strip(" .:;'\"`*").upper()
    except Exception:
        # Fallback to QA on any LLM classification error
        return "QA"
    return label if label in valid_labels else "QA"
def retrieve_context(vs: FAISS, query: str, k: int) -> List[Dict]:
    """Fetch up to ``k`` non-noise passages similar to ``query``.

    Args:
        vs: Vector store exposing ``similarity_search(query, k=...)``.
        query: Search text.
        k: Maximum number of passages to return.

    Returns:
        Dicts with ``text``, ``file_name``, ``title``, ``url``, ``page`` and
        ``paragraph_index``, in the order the store returned them.
    """
    # Over-fetch so enough candidates survive the noise filter.
    fetch_count = max(k * 4, 20)
    kept: List[Dict] = []
    for doc in vs.similarity_search(query, k=fetch_count):
        meta = doc.metadata or {}
        page = meta.get("page", 0)
        if _is_noise_text(doc.page_content, page):
            continue
        kept.append({
            "text": doc.page_content,
            "file_name": meta.get("file_name", ""),
            "title": meta.get("title", ""),
            "url": meta.get("url", ""),
            "page": page,
            "paragraph_index": meta.get("paragraph_index", 0),
        })
    return kept[:k]
def answer_with_citations(llm: ChatGroq, question: str, contexts: List[Dict]) -> str:
    """Ask the model to answer ``question`` from ``contexts`` with citations.

    Args:
        llm: Chat model used to generate the answer.
        question: The user's question.
        contexts: Passage dicts; ``text`` is required, while ``title``,
            ``file_name`` and ``page`` are read when present.

    Returns:
        The model's answer, or an explanatory message containing the error
        text if building or invoking the chain fails.
    """
    system = (
        "You answer with high precision using provided sources only. "
        "Always support key claims with quotes and human-readable citations in the form (Title, p. X). "
        "Be timeline-aware and note when guidance differs by year."
    )
    user_tmpl = (
        "Question: {q}\n\n"
        "Sources:\n{ctx}\n\n"
        "Instructions:\n"
        "- Answer concisely and clearly for clinical data professionals.\n"
        "- Include short quotes for key claims.\n"
        "- Use citations like (Title, p. X).\n"
        "- If uncertain or conflicting, say so and present options."
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("user", user_tmpl),
    ])
    # One block per passage; the title falls back to the file name.
    context_str = "\n\n".join(
        f"Title: {c.get('title') or c.get('file_name')}\nPage: {c.get('page')}\nParagraph: {c['text']}"
        for c in contexts
    )
    try:
        chain = prompt | llm | StrOutputParser()
        return chain.invoke({"q": question, "ctx": context_str})
    except Exception as e:
        return (
            "I ran into an issue generating the answer. Please ensure dependencies are updated (groq and langchain-groq). "
            f"Error: {e}"
        )
def summarize_with_citations(llm: ChatGroq, task: str, contexts: List[Dict]) -> str:
    """Ask the model for a structured, cited summary of ``contexts``.

    Args:
        llm: Chat model used to generate the summary.
        task: The summarisation request in the user's words.
        contexts: Passage dicts; ``text`` is required, while ``title``,
            ``file_name`` and ``page`` are read when present.

    Returns:
        The model's summary, or an explanatory message containing the error
        text if building or invoking the chain fails.
    """
    blocks = []
    for c in contexts:
        heading = c.get("title") or c.get("file_name")
        blocks.append(f"Title: {heading}\nPage: {c.get('page')}\nParagraph: {c['text']}")
    context_str = "\n\n".join(blocks)

    system = (
        "You summarize clinical research documents for a professional audience. "
        "Use quotes sparingly but provide citations like (Title, p. X)."
    )
    user_tmpl = (
        "Task: {task}\n\nSources:\n{ctx}\n\n"
        "Produce a structured summary with bullets and a short concluding note."
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("user", user_tmpl),
    ])
    try:
        chain = prompt | llm | StrOutputParser()
        return chain.invoke({"task": task, "ctx": context_str})
    except Exception as e:
        return (
            "I ran into an issue generating the summary. Please ensure dependencies are updated (groq and langchain-groq). "
            f"Error: {e}"
        )
def quiz_from_context(llm: ChatGroq, instruction: str, contexts: List[Dict], num_q: int) -> str:
    """Ask the model to write ``num_q`` multiple-choice questions from ``contexts``.

    Args:
        llm: Chat model used to generate the quiz.
        instruction: Extra guidance passed through to the prompt.
        contexts: Passage dicts; ``text`` is required, while ``title``,
            ``file_name`` and ``page`` are read when present.
        num_q: Number of questions requested.

    Returns:
        The model's quiz text, or an explanatory message containing the
        error text if building or invoking the chain fails.
    """
    blocks = []
    for c in contexts:
        heading = c.get("title") or c.get("file_name")
        blocks.append(f"Title: {heading}\nPage: {c.get('page')}\nParagraph: {c['text']}")
    context_str = "\n\n".join(blocks)

    system = (
        "Generate professional multiple-choice quiz questions for clinical data science topics. "
        "Each question should have 4 options, correct answer, brief explanation, and at least one quote with (Title, p. X)."
    )
    user_tmpl = (
        "Create {n} MCQs based on the sources.\n\n"
        "Instruction: {inst}\n\n"
        "Sources:\n{ctx}\n\n"
        "Format with clear numbering and options A-D."
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("user", user_tmpl),
    ])
    try:
        chain = prompt | llm | StrOutputParser()
        return chain.invoke({"n": num_q, "inst": instruction, "ctx": context_str})
    except Exception as e:
        return (
            "I ran into an issue generating the quiz. Please ensure dependencies are updated (groq and langchain-groq). "
            f"Error: {e}"
        )
def ensure_session_state():
    """Create the session-state keys this app uses if they are not set yet."""
    # Built on every call so "messages" always gets its own fresh list.
    defaults = {
        "messages": [],
        "sample_question": None,
        "last_processed_question": None,
        "sample_question_placeholder": None,
    }
    for key, initial in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = initial
# Starter questions offered as one-click buttons in the sidebar.
SAMPLE_QUESTIONS = [
    "What is Clinical Data Science and how does it differ from Clinical Data Management?",
    "What are the key competencies for CDM professionals?",
    "How has the CDM profession evolved over the past 5 years?",
    "What are the main drivers for the transition to Clinical Data Science?",
    "What certifications does SCDM offer?",
    "What are the best practices for data integrity in clinical trials?",
]
| # Removed old competency rendering functions - now integrated into main chat interface | |
def render_sample_questions_sidebar():
    """Show one sidebar button per sample question.

    Clicking a button stores its question in
    ``st.session_state.sample_question`` and triggers a rerun.
    """
    sidebar = st.sidebar
    sidebar.markdown("## π‘ Sample Questions")
    sidebar.markdown("Click any question to get started:")
    for position, sample in enumerate(SAMPLE_QUESTIONS):
        clicked = sidebar.button(sample, key=f"sample_{position}", use_container_width=True)
        if not clicked:
            continue
        st.session_state.sample_question = sample
        st.rerun()
def render_about_sidebar():
    """Write the "about" heading, description and disclaimer to the sidebar."""
    description = (
        "This conversational assistant helps you explore SCDM and the SCDM Framework during the conference. "
        "It answers questions, explains concepts, and points you to relevant source documents."
    )
    disclaimer = "Disclaimer: All documents used are available publicly, this is a GenAI powered chatbot please verify your own information, not sanctioned by SCDM."
    sidebar = st.sidebar
    sidebar.markdown("## βΉοΈ About this chatbot")
    sidebar.markdown(description)
    sidebar.caption(disclaimer)
def render_sources_sidebar():
    """Write the "sources used" heading and its bullet list to the sidebar."""
    bullets = (
        "- SCDM Topic Briefs and whitepapers (e.g., eSource Playbooks, 5Vs, CDM Role Evolution)",
        "- ICH E6(R3) and E8(R1) guidelines",
    )
    st.sidebar.markdown("## π Sources used")
    # Each bullet is its own markdown call, as before.
    for bullet in bullets:
        st.sidebar.markdown(bullet)
def _answer_question(vs, llm, question: str, top_k: int) -> None:
    """Display one question, answer it from the index, and record both turns."""
    with st.chat_message("user"):
        st.markdown(question)
    st.session_state.messages.append({"role": "user", "content": question})
    st.session_state.last_processed_question = question
    with st.chat_message("assistant"):
        with st.spinner("π Searching knowledge base..."):
            contexts = retrieve_context(vs, question, k=top_k)
        with st.spinner("π€ Generating answer..."):
            answer = answer_with_citations(llm, question, contexts)
        st.markdown(answer)
        render_sources(contexts)
    st.session_state.messages.append({
        "role": "assistant",
        "content": answer,
        "sources": contexts,
    })


def main():
    """Run the Streamlit page: header, sidebar, history, competency buttons, chat.

    The function stops the script early (``st.stop()``) when ``GROQ_API_KEY``
    is not set or when the vector store / LLM cannot be initialised.
    """
    load_dotenv()
    st.set_page_config(page_title=APP_TITLE, page_icon="π")

    model_name = "llama-3.3-70b-versatile"
    temperature = 0.2
    top_k = 5

    # Create header with title and logo
    col1, col2 = st.columns([3, 1])
    with col1:
        st.title(APP_TITLE)
    with col2:
        # Look next to this module first, then in the working directory; a
        # missing logo is skipped instead of aborting the page.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        for logo in (os.path.join(module_dir, "logo1.png"), "logo1.png"):
            if os.path.exists(logo):
                st.image(logo, width=120)
                break

    # Initialize all heavy components upfront for better performance
    with st.spinner("π Initializing SCDM Assistant..."):
        # Check API key first
        api_key = os.getenv("GROQ_API_KEY", "")
        if not api_key:
            st.error("GROQ_API_KEY is not set. Add it to your .env file.")
            st.stop()
        try:
            # Streamlit re-executes this script on every interaction;
            # st.cache_resource keeps the loaded index across those reruns
            # so the embedding model and FAISS index are loaded only once.
            vs = st.cache_resource(show_spinner=False)(load_vectorstore)()
            llm = build_llm(model=model_name, temperature=temperature)
        except Exception as e:
            st.error(f"Failed to initialize: {e}")
            st.stop()

    ensure_session_state()

    # Render About, Sample Questions, and Sources in sidebar
    render_about_sidebar()
    render_sample_questions_sidebar()
    render_sources_sidebar()

    # Always show the main chat interface
    st.markdown("Ask me anything about SCDM, clinical data management, or explore competency areas!")

    # Chat history display
    for m in st.session_state.messages:
        with st.chat_message(m["role"]):
            st.markdown(m["content"])
            if m.get("sources"):
                render_sources(m["sources"])

    # Competency exploration buttons above chat input
    st.markdown("### π― Explore Competencies")
    competencies = [
        "Risk-Based CDM",
        "Soft Skills including Leadership and Executive Skills",
        "Clinical Data Competencies & Cross-Functional Interactions",
        "Technology & Data Platforms",
        "AI & Cognitive Tech",
        "Regulations & Standards",
        "Clinical Trial Operations",
    ]
    # Buttons are dealt round-robin into three columns.
    cols = st.columns(3)
    for i, competency in enumerate(competencies):
        with cols[i % 3]:
            if st.button(competency, key=f"comp_{i}", use_container_width=True,
                         help=f"Click to learn about {competency}"):
                summary = load_competency_summary(competency)
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": f"## {competency}\n\n{summary}\n\n---\n*Ask me follow-up questions about {competency}!*",
                    "sources": [],
                })
                st.rerun()

    # If a sample question was clicked, auto-run it as a message
    if st.session_state.sample_question:
        user_q = st.session_state.sample_question
        # Clear it first so the next rerun does not answer it again.
        st.session_state.sample_question = None
        _answer_question(vs, llm, user_q, top_k)

    # Always keep the chat input at the bottom of the page
    user_input = st.chat_input("Type your question hereβ¦")
    if user_input:
        _answer_question(vs, llm, user_input, top_k)

    # Show welcome prompt if there is no conversation yet
    if not st.session_state.messages:
        st.info("π€ I'm ready to answer your question. What is it?")


if __name__ == "__main__":
    main()