Dhruv-Ty committed on
Commit
cb23461
·
verified ·
1 Parent(s): 4c99acf

Update src/model.py

Browse files
Files changed (1) hide show
  1. src/model.py +105 -81
src/model.py CHANGED
@@ -147,73 +147,70 @@ def extract_and_link_sources(text, evidence_snippets):
147
  for snippet in evidence_snippets:
148
  if source_id_match == snippet["id"]:
149
  source_map[source_id_match] = {
150
- "id": snippet["id"],
151
- "title": snippet["title"].strip(),
152
- "url": snippet["url"],
153
  "citation": snippet["citation"],
154
  "pmid": snippet.get("pmid", ""),
155
  "doi": snippet.get("doi", "")
156
- }
157
  break
158
 
159
- # Replace PMID citations with links
 
 
 
 
 
 
 
 
 
 
 
160
  linked_text = text
161
- for pmid_key in [f"PMID:{pmid}" for pmid in pmid_matches]:
162
- if pmid_key in source_map:
163
- source_data = source_map[pmid_key]
164
- safe_key = re.escape(pmid_key)
165
- pattern = f"\\[{safe_key}\\]"
166
-
167
- # Create a replacement with title and URL
168
- short_title = source_data['title'][:60] + "..." if len(source_data['title']) > 60 else source_data['title']
169
- replacement = f"[{short_title}]({source_data['url']})"
170
-
171
- linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
172
-
173
- # Replace DOI citations with links
174
- for doi_key in [f"DOI:{doi}" for doi in doi_matches]:
175
- if doi_key in source_map:
176
- source_data = source_map[doi_key]
177
- safe_key = re.escape(doi_key)
178
- pattern = f"\\[{safe_key}\\]"
179
-
180
- # Create a replacement with title and URL
181
- short_title = source_data['title'][:60] + "..." if len(source_data['title']) > 60 else source_data['title']
182
- replacement = f"[{short_title}]({source_data['url']})"
183
-
184
- linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
185
-
186
- # Replace other citation formats
187
- for source_id_key, source_data in source_map.items():
188
- if not (source_id_key.startswith("PMID:") or source_id_key.startswith("DOI:")):
189
- safe_id = re.escape(source_id_key)
190
- pattern = f"\\[{safe_id}\\]"
191
- replacement = f"[{source_data['title']}]({source_data['url']})"
192
- linked_text = re.sub(pattern, replacement, linked_text)
193
-
194
- # Handle generic [source_id] placeholder
195
- if "source_id" in source_matches:
196
- # Use the first snippet available if we have any
197
- if evidence_snippets and "source_id" not in source_map:
198
- snippet = evidence_snippets[0] # Use the first snippet
199
- if snippet.get("url") and snippet.get("title"):
200
- source_map["source_id"] = {
201
- "id": snippet["id"],
202
- "title": snippet["title"].strip(),
203
- "url": snippet["url"],
204
- "citation": snippet["citation"],
205
- "pmid": snippet.get("pmid", ""),
206
- "doi": snippet.get("doi", "")
207
- }
208
- replacement = f"[{snippet['title']}]({snippet['url']})"
209
  linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
 
 
 
 
 
 
 
 
 
 
210
 
211
  # Final fallback for any remaining placeholders
212
- linked_text = re.sub(r'\[source_id\]', "[Medical Reference]", linked_text)
213
- linked_text = re.sub(r'\[PMID:(\d+)\]', r'[PubMed Article]', linked_text)
214
- linked_text = re.sub(r'\[DOI:(10\.\d+\/[^\]]+)\]', r'[Europe PMC Article]', linked_text)
215
 
216
- return linked_text, source_map
 
 
 
 
 
217
 
218
  # Implement PubMed API integration for medical evidence retrieval
219
  def fetch_from_pubmed_api(query, max_results=3, api_key=None):
@@ -1290,7 +1287,20 @@ def parse_doctor_response(response_text):
1290
  sources_text = sources_match.group(2).strip()
1291
  # Split into individual sources
1292
  if '\n' in sources_text:
1293
- parsed["sources"] = [item.strip() for item in sources_text.split('\n') if item.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
1294
  else:
1295
  parsed["sources"] = [sources_text]
1296
 
@@ -1318,11 +1328,19 @@ def parse_doctor_response(response_text):
1318
 
1319
  parsed["main_response"] = '\n'.join(main_response_lines)
1320
 
1321
- # Extract citations in the text (format: [source_id])
1322
- citation_matches = re.findall(r'\[([\w\d:]+)\]', response_text)
1323
- for citation in citation_matches:
 
 
1324
  if citation not in parsed["sources"]:
1325
  parsed["sources"].append(citation)
 
 
 
 
 
 
1326
 
1327
  return parsed
1328
 
@@ -1386,16 +1404,19 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1386
  evidence_text += """CITATION INSTRUCTIONS:
1387
  1. IMPORTANT: Provide a direct answer first before asking follow-up questions. Even with limited information, give your best assessment.
1388
  2. You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
1389
- 3. When citing information from these articles, use the following formats:
1390
- • For PubMed articles: [PMID:123456] where 123456 is the actual PubMed ID
1391
- For Europe PMC articles without PMID: [DOI:10.xxxx/yyyy] where 10.xxxx/yyyy is the DOI
 
 
1392
 
1393
- Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
1394
- Example: "Current guidelines recommend a multidisciplinary approach [DOI:10.1234/abcd]."
1395
  4. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
1396
  5. When multiple sources support a claim, cite all of them for stronger evidence.
1397
- Example: "This approach is supported by multiple studies [PMID:12345678][PMID:87654321]."
1398
- 6. Include full citations in your Sources section with clickable URLs.
 
1399
  7. If the abstracts have conflicting information, acknowledge this and present both perspectives with citations.
1400
  8. Use the most recent sources when available, especially for treatment recommendations.
1401
  9. If full text is available (marked as "Open Access" or "Full Text Available"), prioritize information from those sources as they contain more complete data.
@@ -1421,13 +1442,17 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1421
  **Priority 2: Follow-up Questions**
1422
  After your direct answer, include a clearly labeled "Follow-up Questions:" section with specific questions that would help refine your assessment.
1423
 
 
 
1424
  **Main Response Structure:**
1425
  1. A direct answer to the patient's concerns WITHOUT the heading "Direct Answer:".
1426
  2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
1427
  3. Recommendations for a treatment plan or next steps.
1428
- 4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using either:
1429
- [PMID:123456] format for PubMed articles
1430
- • [DOI:10.xxxx/yyyy] format for Europe PMC articles without PMID
 
 
1431
 
1432
  Use no more than 3 sources and no fewer than 2 sources.
1433
 
@@ -1436,18 +1461,17 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1436
  Do NOT start the first question with asterisks (**). Format each question properly with just a number.
1437
  - **Reasoning**: Bullet points detailing your clinical reasoning.
1438
  Do NOT start the first point with asterisks (**). Format each bullet point properly.
1439
- - **Sources**: A list of all references cited in your main response (2-3 sources), formatted as:
1440
- - PMID: 12345678 - Author et al. (Year). Title. Journal.
1441
- URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
1442
- - DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
1443
- URL: https://doi.org/10.xxxx/yyyy
1444
 
1445
  **IMPORTANT FORMATTING NOTES:**
1446
- 1. Do NOT include technical information like URLs, PMIDs or DOIs in the main answer - these belong in the Sources section only.
1447
- 2. For follow-up questions, use numbered format (1. 2. 3.) not bullet points.
1448
- 3. Number the follow-up questions starting from 1, not from any other number.
1449
- 4. NEVER use markdown formatting like ** (asterisks) at the beginning of any points, questions, or lines.
1450
- 5. Make sure all bullet points and numbered items are clean, with no markdown formatting.
 
1451
 
1452
  IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references, PMIDs, or DOIs.
1453
  """
 
147
  for snippet in evidence_snippets:
148
  if source_id_match == snippet["id"]:
149
  source_map[source_id_match] = {
150
+ "id": snippet["id"],
151
+ "title": snippet["title"].strip(),
152
+ "url": snippet["url"],
153
  "citation": snippet["citation"],
154
  "pmid": snippet.get("pmid", ""),
155
  "doi": snippet.get("doi", "")
156
+ }
157
  break
158
 
159
+ # Create a numbered citation system
160
+ numbered_sources = {}
161
+ citation_number = 1
162
+ citation_map = {} # Maps original citation keys to numbers
163
+
164
+ # First create a numbering system for all sources
165
+ for key in source_map.keys():
166
+ citation_map[key] = citation_number
167
+ numbered_sources[citation_number] = source_map[key]
168
+ citation_number += 1
169
+
170
+ # Replace citations with numbered format
171
  linked_text = text
172
+ for source_key, number in citation_map.items():
173
+ source_data = source_map[source_key]
174
+ safe_key = re.escape(source_key)
175
+ pattern = f"\\[{safe_key}\\]"
176
+
177
+ # Create a colored, clickable numbered reference
178
+ colored_ref = f"<span style='color:#3366cc;'>[{number}]</span>"
179
+ replacement = f"<a href='{source_data['url']}' target='_blank'>{colored_ref}</a>"
180
+
181
+ # Replace all instances of this citation with the numbered format
182
+ linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
183
+
184
+ # Handle any remaining source placeholders
185
+ if "source_id" in source_matches and "source_id" not in citation_map:
186
+ if evidence_snippets:
187
+ snippet = evidence_snippets[0]
188
+ next_number = len(numbered_sources) + 1
189
+ colored_ref = f"<span style='color:#3366cc;'>[{next_number}]</span>"
190
+ replacement = f"<a href='{snippet['url']}' target='_blank'>{colored_ref}</a>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
192
+
193
+ # Add to numbered sources
194
+ numbered_sources[next_number] = {
195
+ "id": snippet["id"],
196
+ "title": snippet["title"].strip(),
197
+ "url": snippet["url"],
198
+ "citation": snippet["citation"],
199
+ "pmid": snippet.get("pmid", ""),
200
+ "doi": snippet.get("doi", "")
201
+ }
202
 
203
  # Final fallback for any remaining placeholders
204
+ linked_text = re.sub(r'\[source_id\]', "<span style='color:#999999;'>[?]</span>", linked_text)
205
+ linked_text = re.sub(r'\[PMID:(\d+)\]', r"<span style='color:#999999;'>[?]</span>", linked_text)
206
+ linked_text = re.sub(r'\[DOI:(10\.\d+\/[^\]]+)\]', r"<span style='color:#999999;'>[?]</span>", linked_text)
207
 
208
+ # Now update source_map to use the numbered format for the sources section
209
+ numbered_source_map = {}
210
+ for number, data in numbered_sources.items():
211
+ numbered_source_map[str(number)] = data
212
+
213
+ return linked_text, numbered_source_map
214
 
215
  # Implement PubMed API integration for medical evidence retrieval
216
  def fetch_from_pubmed_api(query, max_results=3, api_key=None):
 
1287
  sources_text = sources_match.group(2).strip()
1288
  # Split into individual sources
1289
  if '\n' in sources_text:
1290
+ # Parse each line as a potential source
1291
+ source_lines = [item.strip() for item in sources_text.split('\n') if item.strip()]
1292
+
1293
+ # Process source lines to adapt to new numbered format
1294
+ formatted_sources = []
1295
+ for line in source_lines:
1296
+ # Look for numbered source pattern: [1] or 1. or similar
1297
+ if re.match(r'^\d+[\.\)]|^\[\d+\]', line):
1298
+ formatted_sources.append(line)
1299
+ else:
1300
+ # If no number detected, just add the source
1301
+ formatted_sources.append(line)
1302
+
1303
+ parsed["sources"] = formatted_sources
1304
  else:
1305
  parsed["sources"] = [sources_text]
1306
 
 
1328
 
1329
  parsed["main_response"] = '\n'.join(main_response_lines)
1330
 
1331
+ # Extract citations in the text - both numbered [1] and PMID/DOI formats
1332
+ # Standard citation formats
1333
+ pmid_doi_citation_matches = re.findall(r'\[(PMID|DOI):([\w\d:\.\/]+)\]', response_text)
1334
+ for match in pmid_doi_citation_matches:
1335
+ citation = f"{match[0]}:{match[1]}"
1336
  if citation not in parsed["sources"]:
1337
  parsed["sources"].append(citation)
1338
+
1339
+ # Numbered citations like [1]
1340
+ numbered_citation_matches = re.findall(r'\[(\d+)\]', response_text)
1341
+ for num in numbered_citation_matches:
1342
+ if num not in parsed["sources"]:
1343
+ parsed["sources"].append(num)
1344
 
1345
  return parsed
1346
 
 
1404
  evidence_text += """CITATION INSTRUCTIONS:
1405
  1. IMPORTANT: Provide a direct answer first before asking follow-up questions. Even with limited information, give your best assessment.
1406
  2. You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
1407
+ 3. When citing information from these articles, use NUMBERED references [1], [2], [3], etc.
1408
+
1409
+ Example: "Recent studies have shown improved outcomes with early intervention [1]."
1410
+ Example: "Current guidelines recommend a multidisciplinary approach [2]."
1411
+ Example: "This approach is supported by multiple studies [1][3]."
1412
 
1413
+ DO NOT use formats like [PMID:123456] or [DOI:10.xxxx/yyyy] in the main text.
1414
+
1415
  4. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
1416
  5. When multiple sources support a claim, cite all of them for stronger evidence.
1417
+ 6. Include full citations in your Sources section with the format:
1418
+ [1] PMID: 12345678 - Author et al. (Year). Title. Journal.
1419
+ [2] DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
1420
  7. If the abstracts have conflicting information, acknowledge this and present both perspectives with citations.
1421
  8. Use the most recent sources when available, especially for treatment recommendations.
1422
  9. If full text is available (marked as "Open Access" or "Full Text Available"), prioritize information from those sources as they contain more complete data.
 
1442
  **Priority 2: Follow-up Questions**
1443
  After your direct answer, include a clearly labeled "Follow-up Questions:" section with specific questions that would help refine your assessment.
1444
 
1445
+ **IMPORTANT CITATION FORMAT CHANGE: Use numbered references [1], [2], [3] instead of PMID/DOI format**
1446
+
1447
  **Main Response Structure:**
1448
  1. A direct answer to the patient's concerns WITHOUT the heading "Direct Answer:".
1449
  2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
1450
  3. Recommendations for a treatment plan or next steps.
1451
+ 4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using numbered references [1], [2], [3].
1452
+ Each time you use information from a source, cite it with [1], [2], etc.
1453
+
1454
+ Example: "Recent studies have shown improved outcomes with early intervention [1]."
1455
+ Example: "This approach is supported by multiple studies [1][2]."
1456
 
1457
  Use no more than 3 sources and no fewer than 2 sources.
1458
 
 
1461
  Do NOT start the first question with asterisks (**). Format each question properly with just a number.
1462
  - **Reasoning**: Bullet points detailing your clinical reasoning.
1463
  Do NOT start the first point with asterisks (**). Format each bullet point properly.
1464
+ - **Sources**: A numbered list of all references cited in your main response (2-3 sources), formatted as:
1465
+ [1] PMID: 12345678 - Author et al. (Year). Title. Journal.
1466
+ [2] DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
 
 
1467
 
1468
  **IMPORTANT FORMATTING NOTES:**
1469
+ 1. Use numbered citations [1], [2], [3] instead of [PMID:12345678] or [DOI:10.xxxx/yyyy] in the main text.
1470
+ 2. In the Sources section, include the full citation details with the PMID or DOI.
1471
+ 3. For follow-up questions, use numbered format (1. 2. 3.) not bullet points.
1472
+ 4. Number the follow-up questions starting from 1, not from any other number.
1473
+ 5. NEVER use markdown formatting like ** (asterisks) at the beginning of any points, questions, or lines.
1474
+ 6. Make sure all bullet points and numbered items are clean, with no markdown formatting.
1475
 
1476
  IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references, PMIDs, or DOIs.
1477
  """