File size: 7,503 Bytes
7a80ec2
 
66b97d2
fa75b21
d5baf47
fa75b21
 
 
 
d5baf47
 
 
 
 
fa75b21
 
 
 
 
 
2ff1e19
fa75b21
5c15993
fa75b21
 
 
9e1b307
5c15993
 
 
 
 
 
 
 
 
fa75b21
9e1b307
4b05cf5
 
fa75b21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17c2c31
66b97d2
17c2c31
66b97d2
17c2c31
66b97d2
17c2c31
fa75b21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8852afc
fa75b21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c9f9c9
fa75b21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8852afc
 
 
 
 
fa75b21
 
 
 
 
 
 
 
 
 
 
 
 
 
0230093
 
fa75b21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0230093
 
fa75b21
 
 
 
 
 
 
 
985eea4
 
 
fa75b21
 
 
 
 
985eea4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
from src.document_processor import DocumentProcessor
from src.vector_store import FAISSVectorStore
from src.metadata_patcher import patch_metadata_for_store
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from pathlib import Path
import logging
import os

# Pull environment variables (including OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Fail fast at import time: the ChatOpenAI client constructed below is useless without a key.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OPENAI_API_KEY environment variable is not set")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): mid-file import — conventionally belongs in the import block above.
from langchain.callbacks.base import BaseCallbackHandler

class StreamHandler(BaseCallbackHandler):
    """Callback that accumulates streamed LLM tokens and mirrors them into a UI placeholder."""

    def __init__(self):
        self.current_text = ""
        self.placeholder = None  # will be set by the UI

    def on_llm_new_token(self, token: str, **kwargs):
        """Append the incoming token and refresh the in-flight answer display."""
        self.current_text = self.current_text + token
        if self.placeholder is None:
            return
        # keep an element id so JS can find & center the in-flight answer
        rendered = f"<div id='assistant-inflight'>{self.current_text}▌</div>"
        try:
            self.placeholder.markdown(rendered, unsafe_allow_html=True)
        except Exception:
            # placeholder may be invalid during reruns; ignore errors
            pass

    def get_text(self):
        """Return the full text streamed so far."""
        return self.current_text


class BioethicsChatbot:
    """RAG chatbot over a local corpus of bioethics PDFs.

    Loads (or builds) a FAISS index of document chunks, patches known
    metadata gaps, then answers questions with a streaming OpenAI chat
    model, grouping retrieved sources into citation-confidence tiers.
    """

    def __init__(self, data_dir: str = "data/sample_papers"):
        """Load the on-disk index, or build one from PDFs under *data_dir*.

        Raises:
            ValueError: if no index exists on disk and *data_dir* contains no PDFs.
        """
        self.processor = DocumentProcessor()
        self.vector_store = FAISSVectorStore()
        self.history = []
        # Similarity-score cutoffs mapping retrieval scores to citation tiers.
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.65,
            'low': 0.5,
        }

        if not self.vector_store.load_index():
            # Use the module logger consistently (original mixed print and logger).
            logger.info("No existing vector store, creating one...")
            pdf_files = list(Path(data_dir).glob("*.pdf"))
            if not pdf_files:
                raise ValueError(f"No PDFs found in {data_dir}")

            chunks = self.processor.process_documents([str(p) for p in pdf_files])
            self.vector_store.add_documents(chunks)
            logger.info("Indexed %d documents.", len(chunks))
        else:
            logger.info("Index loaded from disk")

        # Known author/year gaps in the extracted PDF metadata, keyed by filename/title.
        metadata_fixes = {
            "A_Theory_of_Bioethics.pdf": {"authors": "DeGrazia and Millum", "year": "2021"},
            "588.full": {"authors": "Wilkinson et al.", "year": "2024"},
            "The Concept of Personal Utility in Genomic Testing  Three Ethical Tensions": {"authors": "Watts and Newson", "year": "2025"},
        }
        patch_metadata_for_store(self.vector_store, metadata_fixes)

        self.stream_handler = StreamHandler()
        self.llm = ChatOpenAI(model="gpt-4o-mini", streaming=True,
                              callbacks=[self.stream_handler])

    def add_new_document(self, pdf_path: str):
        """Process and index a single PDF, skipping files already in the index."""
        filename = Path(pdf_path).name

        # Check if already in the index (matched by stored filename metadata).
        existing_files = {doc["metadata"].get("filename") for doc in self.vector_store.documents}
        if filename in existing_files:
            # BUG FIX: original printed the literal placeholder "(unknown)" instead of the filename.
            logger.info("Skipping %s: already indexed.", filename)
            return

        # Otherwise process & add
        chunks = self.processor.process_document(pdf_path)
        self.vector_store.add_documents(chunks)
        logger.info("Added %d chunks from %s", len(chunks), pdf_path)

    def get_citation_confidence(self, similarity_score: float) -> str:
        """Map a similarity score to a citation-confidence tier.

        Returns one of "high_confidence", "medium_confidence",
        "low_confidence", or "context_only" (below every threshold).
        """
        if similarity_score >= self.confidence_thresholds['high']:
            return "high_confidence"
        elif similarity_score >= self.confidence_thresholds['medium']:
            return "medium_confidence"
        elif similarity_score >= self.confidence_thresholds['low']:
            return "low_confidence"
        return "context_only"

    def ask(self, question: str, k: int = 10, history_pairs=None) -> str:
        """Answer *question* from the top-*k* retrieved chunks.

        Args:
            question: the user's query.
            k: number of chunks to retrieve.
            history_pairs: optional list of (user, bot) tuples; only the
                last 4 pairs are included in the prompt.

        Returns:
            The streamed answer text (also mirrored live via StreamHandler).
        """
        # Step 1: Retrieve relevant chunks
        results = self.vector_store.search(question, k=k)

        # Debug visibility into retrieval quality (top 3 only).
        logger.info("Found %d results for query: '%s'", len(results), question)
        for i, r in enumerate(results[:3]):
            score = r.get('similarity_score')
            # BUG FIX: original applied :.3f to the 'N/A' string default, raising ValueError.
            score_text = f"{score:.3f}" if isinstance(score, (int, float)) else "N/A"
            logger.info("Result %d (score: %s): %s...", i + 1, score_text, r['content'][:200])

        if not results:
            return "I couldn't find relevant information in the documents."

        # Step 2: Build context from retrieved chunks
        context_blocks = []
        citation_groups = {
            'high_confidence': [],
            'medium_confidence': [],
            'low_confidence': [],
            'context_only': []
        }
        for r in results:
            title = r["metadata"].get("title", None)
            authors = r["metadata"].get("authors", None)
            year = r["metadata"].get("year", "n.d.")

            confidence = self.get_citation_confidence(r["similarity_score"])

            block = (
                f"Source: {authors} ({year}). *{title}* "
                f"[chunk {r['metadata'].get('chunk_id', '?')}, confidence: {confidence}]\n"
                f"{r['content']}\n"
            )

            context_blocks.append(block)
            # Only citable sources (with known authors) go into the citation groups.
            if authors is not None and authors != "Unknown Author(s)":
                citation_groups[confidence].append(block)

        # Only the last 4 exchanges are carried into the prompt to bound its size.
        if history_pairs:
            limited = history_pairs[-4:]
            history_text = "\n".join([f"User: {u}\nBot: {b}" for u, b in limited])
        else:
            history_text = "No previous conversation."

        # Build text outside f-string
        joined_context = "\n\n".join(context_blocks)
        joined_high = "\n\n".join(citation_groups['high_confidence']) or "None"
        joined_medium = "\n\n".join(citation_groups['medium_confidence']) or "None"
        joined_low = "\n\n".join(citation_groups['low_confidence']) or "None"

        context = f"""
        Conversation so far:
        {history_text}

        Relevant sources (use them to guide your answer, but cite only the ones in citation groups):
        {joined_context}

        DO NOT CITE IF THE AUTHOR IS "Unknown Author(s)".

        CITATION GUIDELINES:
        - HIGH CONFIDENCE sources: Use direct citations "(Author, Year)"
        - MEDIUM CONFIDENCE sources: Use "According to Author (Year)..."
        - LOW CONFIDENCE sources: Use "(see Author, Year)"

        High confidence sources:
        {joined_high}

        Medium confidence sources:
        {joined_medium}

        Low confidence sources:
        {joined_low}
        """

        # Step 3: Construct prompt
        # BUG FIX: original said "doesn't concern neither ... nor" — a double
        # negative instructing the model the opposite of the intent.
        prompt = f"""
        You are a bioethics expert assistant. 
        Answer the user's question using the context provided below. 
        Draw justified connections between concepts even if not explicitly stated.
        If you need to make reasonable inferences based on the context, do so.
        If the context doesn't contain enough information, say what you do know from the context and indicate what information is missing.
        If the question concerns neither bioethics nor previous questions, inform the user about it and don't answer it. Do not
        be rude; respond to a greeting or goodbye. 
        Context:
        {context}

        Question: {question}
        Answer:
        """

        # Reset the stream buffer so previous answers don't leak into this one.
        self.stream_handler.current_text = ""

        # streaming happens here; the return value is discarded because the
        # handler accumulates the streamed tokens.
        _ = self.llm.invoke(prompt)
        answer = self.stream_handler.get_text()
        return answer