Ryan committed on
Commit
fd01d7b
·
1 Parent(s): a197ca5

upgrade to gpt-4o

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. citation_validator.py +22 -63
  3. rag_chat.py +1 -1
README.md CHANGED
@@ -24,7 +24,7 @@ A Retrieval-Augmented Generation (RAG) system that answers career-related questi
24
 
25
  1. Your question is converted to a vector embedding
26
  2. Relevant article chunks are retrieved from Qdrant vector database
27
- 3. GPT-4o-mini generates an answer with citations
28
  4. Citations are validated against source material
29
  5. You get an answer with verified quotes and source links
30
 
 
24
 
25
  1. Your question is converted to a vector embedding
26
  2. Relevant article chunks are retrieved from Qdrant vector database
27
+ 3. GPT-4o generates an answer with citations
28
  4. Citations are validated against source material
29
  5. You get an answer with verified quotes and source links
30
 
citation_validator.py CHANGED
@@ -172,81 +172,39 @@ def generate_answer_with_citations(
172
  """
173
  client = OpenAI(api_key=openai_api_key)
174
 
175
- system_prompt = """You are a helpful assistant that answers questions based on 80,000 Hours articles.
176
 
177
- You MUST return your response in valid JSON format with this exact structure:
178
- {
179
- "answer": "Your conversational answer with inline citation markers like [1], [2]",
180
- "citations": [
181
- {
182
- "citation_id": 1,
183
- "source_id": 1,
184
- "quote": "exact sentence or sentences from the source that support your claim"
185
- }
186
- ]
187
- }
188
-
189
- CITATION HARD RULES:
190
- 1. Copy quotes EXACTLY as they appear in the provided context
191
- - NO ellipses (...)
192
- - NO paraphrasing
193
- - NO punctuation changes
194
- - Word-for-word, character-for-character accuracy required
195
-
196
- 2. If the needed support is in two places, use TWO SEPARATE citation entries
197
- - Do NOT combine quotes from different sources or different parts of text
198
- - Each citation must contain a continuous, unmodified quote
199
-
200
- 3. Use the CORRECT source_id from the provided list
201
- - Source IDs are numbered [Source 1], [Source 2], etc. in the context
202
- - Verify the source_id matches where you found the quote
203
-
204
- CRITICAL RULES FOR CITATIONS:
205
- - For EVERY claim (advice, fact, statistic, recommendation), add an inline citation [1], [2], etc.
206
- - For each citation, extract and quote the EXACT sentence(s) from the source that directly support your claim
207
- - Find the specific sentence(s) in the source that contain the relevant information
208
- - Each quote should be at least 20 characters and contain complete sentence(s)
209
- - Multiple consecutive sentences can be quoted if needed to fully support the claim
210
 
211
- WRITING STYLE:
212
- - Write concisely in a natural, conversational tone
213
- - You may paraphrase information in your answer, but always cite the source with exact quotes
214
- - You can add brief context/transitions without citations, but cite all substantive claims
215
- - If the sources don't fully answer the question, acknowledge that briefly
216
- - Only use information from the provided sources - don't add external knowledge
217
 
218
- EXAMPLES:
 
 
 
 
219
 
220
- Example 1 - Single claim:
221
  {
222
- "answer": "One of the most effective ways to build career capital is to work at a high-performing organization where you can learn from talented colleagues [1].",
223
  "citations": [
224
  {
225
  "citation_id": 1,
226
  "source_id": 2,
227
- "quote": "Working at a high-performing organization is one of the fastest ways to build career capital because you learn from talented colleagues and develop strong professional networks."
228
- }
229
- ]
230
- }
231
-
232
- Example 2 - Multiple claims:
233
- {
234
- "answer": "AI safety is considered one of the most pressing problems of our time [1]. Experts estimate that advanced AI could be developed within the next few decades [2], and there's a significant talent gap in the field [3]. This means your contributions could have an outsized impact.",
235
- "citations": [
236
- {
237
- "citation_id": 1,
238
- "source_id": 1,
239
- "quote": "We believe that risks from artificial intelligence are one of the most pressing problems facing humanity today."
240
  },
241
  {
242
  "citation_id": 2,
243
- "source_id": 1,
244
- "quote": "Many AI researchers believe there's a 10-50% chance of human-level AI being developed by 2050."
245
- },
246
- {
247
- "citation_id": 3,
248
- "source_id": 3,
249
- "quote": "There are currently fewer than 300 people working full-time on technical AI safety research, despite the field's critical importance."
250
  }
251
  ]
252
  }"""
@@ -295,6 +253,7 @@ Provide your answer in JSON format with exact quotes from the sources."""
295
  "validation_errors": ["Failed to parse JSON response"]
296
  }
297
 
 
298
  # Validate each citation
299
  validation_start = time.time()
300
  validated_citations = []
 
172
  """
173
  client = OpenAI(api_key=openai_api_key)
174
 
175
+ system_prompt = """Answer the user's question using ONLY the provided sources from 80,000 Hours articles.
176
 
177
+ STEP 1: Write your answer
178
+ - Write a clear, concise answer to the question
179
+ - Use a natural, conversational tone
180
+ - After EACH substantive claim, add [1], [2], [3], etc. in order
181
+ - Example: "Career capital is important [1]. You can build it through work [2]."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
+ STEP 2: Provide citations
184
+ - For each [N] in your answer, provide a citation with:
185
+ * citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
186
+ * source_id: Which source it came from (see [Source N] in context below)
187
+ * quote: Copy the EXACT sentences from that source, word-for-word
 
188
 
189
+ CRITICAL RULES:
190
+ 1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
191
+ 2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
192
+ 3. Match source_id to where you found the quote ([Source 1] → source_id: 1)
193
+ 4. Each quote must be complete sentences from the source
194
 
195
+ OUTPUT FORMAT (valid JSON):
196
  {
197
+ "answer": "Your answer with [1], [2], [3] after each claim.",
198
  "citations": [
199
  {
200
  "citation_id": 1,
201
  "source_id": 2,
202
+ "quote": "Exact sentence from the source."
 
 
 
 
 
 
 
 
 
 
 
 
203
  },
204
  {
205
  "citation_id": 2,
206
+ "source_id": 5,
207
+ "quote": "Another exact sentence from a different source."
 
 
 
 
 
208
  }
209
  ]
210
  }"""
 
253
  "validation_errors": ["Failed to parse JSON response"]
254
  }
255
 
256
+
257
  # Validate each citation
258
  validation_start = time.time()
259
  validated_citations = []
rag_chat.py CHANGED
@@ -9,7 +9,7 @@ from config import MODEL_NAME, COLLECTION_NAME
9
 
10
  load_dotenv()
11
 
12
- LLM_MODEL = "gpt-4o-mini"
13
  SOURCE_COUNT = 10
14
  SCORE_THRESHOLD = 0.4
15
 
 
9
 
10
  load_dotenv()
11
 
12
+ LLM_MODEL = "gpt-4o"
13
  SOURCE_COUNT = 10
14
  SCORE_THRESHOLD = 0.4
15