jmisak committed on
Commit
9619c6a
·
verified ·
1 Parent(s): ae6e4db

Upload validation.py

Browse files
Files changed (1) hide show
  1. validation.py +77 -7
validation.py CHANGED
@@ -242,33 +242,103 @@ def validate_summary_quality(summary: str, num_transcripts: int) -> Tuple[float,
242
  """Check summary for rigor and accuracy"""
243
  issues = []
244
  score = 1.0
245
-
246
  # Check for quantification
247
  if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
248
  issues.append("No quantified findings (must include counts/percentages)")
249
  score -= 0.3
250
-
251
  # Check for vague claims
252
  vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
253
  if any(term in summary.lower() for term in vague_terms):
254
  issues.append("Contains vague terms - should use specific numbers")
255
  score -= 0.2
256
-
257
  # Check for absolute claims
258
  absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
259
  for term in absolute_terms:
260
  if re.search(rf'\b{term}\b', summary.lower()):
261
  issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
262
  score -= 0.2
263
-
264
  # Check for evidence markers
265
  if 'consensus' not in summary.lower() and 'majority' not in summary.lower():
266
  issues.append("Missing consensus indicators")
267
  score -= 0.1
268
-
269
  # Check length is substantial
270
  if len(summary) < 500:
271
  issues.append("Summary too brief for thorough analysis")
272
  score -= 0.2
273
-
274
- return max(0.0, score), issues
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  """Check summary for rigor and accuracy"""
243
  issues = []
244
  score = 1.0
245
+
246
  # Check for quantification
247
  if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
248
  issues.append("No quantified findings (must include counts/percentages)")
249
  score -= 0.3
250
+
251
  # Check for vague claims
252
  vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
253
  if any(term in summary.lower() for term in vague_terms):
254
  issues.append("Contains vague terms - should use specific numbers")
255
  score -= 0.2
256
+
257
  # Check for absolute claims
258
  absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
259
  for term in absolute_terms:
260
  if re.search(rf'\b{term}\b', summary.lower()):
261
  issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
262
  score -= 0.2
263
+
264
  # Check for evidence markers
265
  if 'consensus' not in summary.lower() and 'majority' not in summary.lower():
266
  issues.append("Missing consensus indicators")
267
  score -= 0.1
268
+
269
  # Check length is substantial
270
  if len(summary) < 500:
271
  issues.append("Summary too brief for thorough analysis")
272
  score -= 0.2
273
+
274
+ return max(0.0, score), issues
275
+
276
+
277
def verify_consensus_claims(summary: str, valid_results: List[Dict]) -> List[str]:
    """Cross-check consensus claims in *summary* against the actual data.

    Three independent checks are run, in this order:

    1. "N of M" / "N out of M" / "N/M" claims: M must equal
       ``len(valid_results)``. When it does, any "STRONG CONSENSUS",
       "MAJORITY" or "CONSENSUS" label found in the 200 characters preceding
       the claim (or inside the claim text itself) is compared against the
       percentage N/M.
    2. Standalone percentages ("NN%") must not exceed 100.
    3. "Transcript #K" references must satisfy 1 <= K <= len(valid_results).

    Args:
        summary: The generated summary text to inspect.
        valid_results: The results the summary was built from; only its
            length is used here.

    Returns:
        A list of human-readable warning strings (empty when nothing
        suspicious was found). One warning is produced per offending
        occurrence in the text.
    """
    warnings = []
    total = len(valid_results)

    # Consensus claims, e.g. "8 out of 10", "8/10", "8 of 10 participants".
    consensus_pattern = r'(\d+)\s*(?:out of|of|/)\s*(\d+)\s*(?:participants|transcripts|interviews)?'

    # Walk the matches once and take both the numbers and the surrounding
    # context from the same match object. Pairing claims with matches by a
    # substring test would let "1 of 10" pick up the context of "10 of 10"
    # and would repeat warnings for claims that occur more than once.
    for match in re.finditer(consensus_pattern, summary, re.IGNORECASE):
        count = int(match.group(1))
        claimed_total = int(match.group(2))

        # The claimed denominator must be the real transcript count;
        # otherwise the percentage below would be meaningless.
        if claimed_total != total:
            warnings.append(
                f"Claimed total '{claimed_total}' doesn't match actual transcript count '{total}'"
            )
            continue

        percentage = (count / total) * 100 if total > 0 else 0

        # Labels are looked for within 200 chars before the claim, up to
        # the end of the claim itself. Upper-cased once for all checks.
        start_pos = max(0, match.start() - 200)
        context = summary[start_pos:match.end()].upper()

        if "STRONG CONSENSUS" in context and percentage < 80:
            warnings.append(
                f"Claimed 'STRONG CONSENSUS' but {count}/{total} is only {percentage:.0f}% (needs ≥80%)"
            )

        if "MAJORITY" in context and "STRONG" not in context and percentage < 60:
            warnings.append(
                f"Claimed 'MAJORITY' but {count}/{total} is only {percentage:.0f}% (needs ≥60%)"
            )

        if percentage < 40 and ("CONSENSUS" in context or "MAJORITY" in context):
            warnings.append(
                f"Claimed consensus/majority but {count}/{total} is only {percentage:.0f}% (should be labeled as minority/outlier)"
            )

    # Standalone percentage claims. The pattern only captures digits, so a
    # captured value can never be negative; only the upper bound is checked.
    for pct in re.findall(r'(\d+)%', summary):
        if int(pct) > 100:
            warnings.append(f"Invalid percentage: {pct}% (exceeds 100%)")

    # Transcript ID references must fall inside 1..total.
    for id_num in re.findall(r'[Tt]ranscript\s+#?(\d+)', summary):
        ref_id = int(id_num)
        if ref_id < 1 or ref_id > total:
            warnings.append(f"Referenced Transcript #{ref_id} but only {total} transcripts exist")

    return warnings