Spaces:
Sleeping
Sleeping
Ryan committed on
Commit ·
fd01d7b
1
Parent(s): a197ca5
upgrade to gpt-4o
Browse files
- README.md +1 -1
- citation_validator.py +22 -63
- rag_chat.py +1 -1
README.md
CHANGED
|
@@ -24,7 +24,7 @@ A Retrieval-Augmented Generation (RAG) system that answers career-related questi
|
|
| 24 |
|
| 25 |
1. Your question is converted to a vector embedding
|
| 26 |
2. Relevant article chunks are retrieved from Qdrant vector database
|
| 27 |
-
3. GPT-4o
|
| 28 |
4. Citations are validated against source material
|
| 29 |
5. You get an answer with verified quotes and source links
|
| 30 |
|
|
|
|
| 24 |
|
| 25 |
1. Your question is converted to a vector embedding
|
| 26 |
2. Relevant article chunks are retrieved from Qdrant vector database
|
| 27 |
+
3. GPT-4o generates an answer with citations
|
| 28 |
4. Citations are validated against source material
|
| 29 |
5. You get an answer with verified quotes and source links
|
| 30 |
|
citation_validator.py
CHANGED
|
@@ -172,81 +172,39 @@ def generate_answer_with_citations(
|
|
| 172 |
"""
|
| 173 |
client = OpenAI(api_key=openai_api_key)
|
| 174 |
|
| 175 |
-
system_prompt = """
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
"citation_id": 1,
|
| 183 |
-
"source_id": 1,
|
| 184 |
-
"quote": "exact sentence or sentences from the source that support your claim"
|
| 185 |
-
}
|
| 186 |
-
]
|
| 187 |
-
}
|
| 188 |
-
|
| 189 |
-
CITATION HARD RULES:
|
| 190 |
-
1. Copy quotes EXACTLY as they appear in the provided context
|
| 191 |
-
- NO ellipses (...)
|
| 192 |
-
- NO paraphrasing
|
| 193 |
-
- NO punctuation changes
|
| 194 |
-
- Word-for-word, character-for-character accuracy required
|
| 195 |
-
|
| 196 |
-
2. If the needed support is in two places, use TWO SEPARATE citation entries
|
| 197 |
-
- Do NOT combine quotes from different sources or different parts of text
|
| 198 |
-
- Each citation must contain a continuous, unmodified quote
|
| 199 |
-
|
| 200 |
-
3. Use the CORRECT source_id from the provided list
|
| 201 |
-
- Source IDs are numbered [Source 1], [Source 2], etc. in the context
|
| 202 |
-
- Verify the source_id matches where you found the quote
|
| 203 |
-
|
| 204 |
-
CRITICAL RULES FOR CITATIONS:
|
| 205 |
-
- For EVERY claim (advice, fact, statistic, recommendation), add an inline citation [1], [2], etc.
|
| 206 |
-
- For each citation, extract and quote the EXACT sentence(s) from the source that directly support your claim
|
| 207 |
-
- Find the specific sentence(s) in the source that contain the relevant information
|
| 208 |
-
- Each quote should be at least 20 characters and contain complete sentence(s)
|
| 209 |
-
- Multiple consecutive sentences can be quoted if needed to fully support the claim
|
| 210 |
|
| 211 |
-
|
| 212 |
-
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
- Only use information from the provided sources - don't add external knowledge
|
| 217 |
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
{
|
| 222 |
-
"answer": "
|
| 223 |
"citations": [
|
| 224 |
{
|
| 225 |
"citation_id": 1,
|
| 226 |
"source_id": 2,
|
| 227 |
-
"quote": "
|
| 228 |
-
}
|
| 229 |
-
]
|
| 230 |
-
}
|
| 231 |
-
|
| 232 |
-
Example 2 - Multiple claims:
|
| 233 |
-
{
|
| 234 |
-
"answer": "AI safety is considered one of the most pressing problems of our time [1]. Experts estimate that advanced AI could be developed within the next few decades [2], and there's a significant talent gap in the field [3]. This means your contributions could have an outsized impact.",
|
| 235 |
-
"citations": [
|
| 236 |
-
{
|
| 237 |
-
"citation_id": 1,
|
| 238 |
-
"source_id": 1,
|
| 239 |
-
"quote": "We believe that risks from artificial intelligence are one of the most pressing problems facing humanity today."
|
| 240 |
},
|
| 241 |
{
|
| 242 |
"citation_id": 2,
|
| 243 |
-
"source_id":
|
| 244 |
-
"quote": "
|
| 245 |
-
},
|
| 246 |
-
{
|
| 247 |
-
"citation_id": 3,
|
| 248 |
-
"source_id": 3,
|
| 249 |
-
"quote": "There are currently fewer than 300 people working full-time on technical AI safety research, despite the field's critical importance."
|
| 250 |
}
|
| 251 |
]
|
| 252 |
}"""
|
|
@@ -295,6 +253,7 @@ Provide your answer in JSON format with exact quotes from the sources."""
|
|
| 295 |
"validation_errors": ["Failed to parse JSON response"]
|
| 296 |
}
|
| 297 |
|
|
|
|
| 298 |
# Validate each citation
|
| 299 |
validation_start = time.time()
|
| 300 |
validated_citations = []
|
|
|
|
| 172 |
"""
|
| 173 |
client = OpenAI(api_key=openai_api_key)
|
| 174 |
|
| 175 |
+
system_prompt = """Answer the user's question using ONLY the provided sources from 80,000 Hours articles.
|
| 176 |
|
| 177 |
+
STEP 1: Write your answer
|
| 178 |
+
- Write a clear, concise answer to the question
|
| 179 |
+
- Use a natural, conversational tone
|
| 180 |
+
- After EACH substantive claim, add [1], [2], [3], etc. in order
|
| 181 |
+
- Example: "Career capital is important [1]. You can build it through work [2]."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
STEP 2: Provide citations
|
| 184 |
+
- For each [N] in your answer, provide a citation with:
|
| 185 |
+
* citation_id: The number from your answer (1 for [1], 2 for [2], etc.)
|
| 186 |
+
* source_id: Which source it came from (see [Source N] in context below)
|
| 187 |
+
* quote: Copy the EXACT sentences from that source, word-for-word
|
|
|
|
| 188 |
|
| 189 |
+
CRITICAL RULES:
|
| 190 |
+
1. Number citations in ORDER: [1] is first, [2] is second, [3] is third, etc.
|
| 191 |
+
2. Copy quotes EXACTLY - no changes, no ellipses, no paraphrasing
|
| 192 |
+
3. Match source_id to where you found the quote ([Source 1] → source_id: 1)
|
| 193 |
+
4. Each quote must be complete sentences from the source
|
| 194 |
|
| 195 |
+
OUTPUT FORMAT (valid JSON):
|
| 196 |
{
|
| 197 |
+
"answer": "Your answer with [1], [2], [3] after each claim.",
|
| 198 |
"citations": [
|
| 199 |
{
|
| 200 |
"citation_id": 1,
|
| 201 |
"source_id": 2,
|
| 202 |
+
"quote": "Exact sentence from the source."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
},
|
| 204 |
{
|
| 205 |
"citation_id": 2,
|
| 206 |
+
"source_id": 5,
|
| 207 |
+
"quote": "Another exact sentence from a different source."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
}
|
| 209 |
]
|
| 210 |
}"""
|
|
|
|
| 253 |
"validation_errors": ["Failed to parse JSON response"]
|
| 254 |
}
|
| 255 |
|
| 256 |
+
|
| 257 |
# Validate each citation
|
| 258 |
validation_start = time.time()
|
| 259 |
validated_citations = []
|
rag_chat.py
CHANGED
|
@@ -9,7 +9,7 @@ from config import MODEL_NAME, COLLECTION_NAME
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
| 12 |
-
LLM_MODEL = "gpt-4o
|
| 13 |
SOURCE_COUNT = 10
|
| 14 |
SCORE_THRESHOLD = 0.4
|
| 15 |
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
| 12 |
+
LLM_MODEL = "gpt-4o"
|
| 13 |
SOURCE_COUNT = 10
|
| 14 |
SCORE_THRESHOLD = 0.4
|
| 15 |
|