# TranscriptWriting / validation.py
import re
from typing import Tuple, Dict, List


def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Validate that text extraction was successful.
    """
    if not text or not text.strip():
        return False, "No text extracted"

    # Check for minimum content
    if len(text) < 50:
        return False, f"Extracted text too short ({len(text)} chars)"

    # Check for garbled text indicators
    garbled_patterns = [
        (r'[^\x00-\x7F]{50,}', "Contains large blocks of non-ASCII characters"),
        (r'(.)\1{20,}', "Contains suspicious character repetition"),
        (r'^[\W\d\s]+$', "Contains only symbols/numbers/whitespace"),
    ]
    for pattern, msg in garbled_patterns:
        if re.search(pattern, text):
            return False, msg

    # Check word count
    words = text.split()
    if len(words) < 20:
        return False, f"Too few words ({len(words)})"

    # Calculate ratio of real words (heuristic); words is non-empty here
    potential_words = [w for w in words if re.match(r'^[a-zA-Z]{2,}$', w)]
    word_ratio = len(potential_words) / len(words)
    if word_ratio < 0.3:
        return False, f"Low word ratio ({word_ratio:.2f}) - possible extraction issue"

    return True, f"Valid ({len(words)} words, {len(text)} chars)"


def validate_transcript_quality(
    analyzed_text: str,
    structured_data: Dict,
    interviewee_type: str
) -> Tuple[float, str]:
    """
    Assess quality of an analyzed transcript.

    Returns:
        Tuple of (quality_score [0-1], issues_description)
    """
    score = 1.0
    issues = []

    # Check 1: Length of analysis
    if len(analyzed_text) < 100:
        score -= 0.3
        issues.append("Analysis too brief")
    elif len(analyzed_text) < 300:
        score -= 0.1
        issues.append("Analysis somewhat brief")

    # Check 2: Presence of structured data
    if not structured_data:
        score -= 0.2
        issues.append("No structured data extracted")
    else:
        # Check if structured data has content
        empty_fields = sum(1 for v in structured_data.values() if not v)
        total_fields = len(structured_data)
        if empty_fields == total_fields:
            score -= 0.3
            issues.append("All structured fields empty")
        elif empty_fields > total_fields * 0.7:
            score -= 0.2
            issues.append("Most structured fields empty")

    # Check 3: Type-specific validation
    if interviewee_type == "HCP":
        # Expect medical terminology
        medical_terms = re.findall(
            r'\b(diagnos\w+|prescri\w+|treatment|medication|patient|clinical|therapy)\b',
            analyzed_text,
            re.IGNORECASE
        )
        if len(medical_terms) < 3:
            score -= 0.2
            issues.append("Limited medical terminology for HCP interview")
        # Check for key structured fields
        key_fields = ["diagnoses", "prescriptions", "treatment_rationale"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key HCP data extracted")
    elif interviewee_type == "Patient":
        # Expect patient-centric language
        patient_terms = re.findall(
            r'\b(symptom|feel|concern|experience|treatment|side effect|quality of life)\b',
            analyzed_text,
            re.IGNORECASE
        )
        if len(patient_terms) < 3:
            score -= 0.2
            issues.append("Limited patient-centric content")
        # Check for key structured fields
        key_fields = ["symptoms", "concerns", "treatment_response"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key patient data extracted")

    # Check 4: Error indicators
    error_patterns = [
        r'\[Error\]',
        r'failed to',
        r'could not',
        r'unable to',
        r'timeout'
    ]
    for pattern in error_patterns:
        if re.search(pattern, analyzed_text, re.IGNORECASE):
            score -= 0.3
            issues.append("Contains error messages")
            break
    # Check 5: Repetitive content (potential LLM failure)
    # Filter out empty fragments first; a bare split('.') would inflate the
    # sentence count and skew the repetition ratio
    sentences = [s.strip().lower() for s in analyzed_text.split('.') if s.strip()]
    if len(sentences) > 3:
        unique_sentences = set(sentences)
        repetition_ratio = len(sentences) / len(unique_sentences)
        if repetition_ratio > 1.5:
            score -= 0.2
            issues.append("High content repetition")

    # Ensure score stays in the valid [0, 1] range
    score = max(0.0, min(1.0, score))
    issues_text = "; ".join(issues) if issues else "No issues detected"
    return score, issues_text
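
# Usage sketch (illustrative inputs; the keys follow the lowercase field names
# this function checks for Patient interviews):
#
#     score, detail = validate_transcript_quality(
#         analyzed_text=llm_output,
#         structured_data={"symptoms": ["fatigue"], "concerns": [], "treatment_response": []},
#         interviewee_type="Patient",
#     )
#     # score near 1.0 means no deductions; detail lists any issues found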


def check_data_completeness(csv_rows: List[Dict], interviewee_type: str) -> Dict:
    """
    Analyze completeness of extracted data across all transcripts.
    """
    if not csv_rows:
        return {"error": "No data to check"}

    # Determine key fields based on type
    if interviewee_type == "HCP":
        key_fields = ["Diagnoses", "Prescriptions", "Treatment Strategies"]
    elif interviewee_type == "Patient":
        key_fields = ["Primary Symptoms", "Main Concerns", "Treatment Response"]
    else:
        key_fields = ["Key Insights"]

    completeness = {}
    for field in key_fields:
        if field in csv_rows[0]:  # Only score fields present in the data
            filled_count = sum(1 for row in csv_rows if row.get(field) and row[field].strip())
            completeness[field] = {
                "filled": filled_count,
                "total": len(csv_rows),
                "percentage": filled_count / len(csv_rows) * 100
            }

    # Overall completeness
    total_fields = sum(c["total"] for c in completeness.values())
    filled_fields = sum(c["filled"] for c in completeness.values())
    overall_percentage = (filled_fields / total_fields * 100) if total_fields > 0 else 0

    return {
        "by_field": completeness,
        "overall": {
            "filled": filled_fields,
            "total": total_fields,
            "percentage": overall_percentage
        },
        "quality_grade": (
            "Excellent" if overall_percentage >= 80 else
            "Good" if overall_percentage >= 60 else
            "Fair" if overall_percentage >= 40 else
            "Poor"
        )
    }
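
# Usage sketch (the row below is synthetic; real rows come from the pipeline's
# CSV export, using the capitalized column names checked above):
#
#     report = check_data_completeness(
#         [{"Diagnoses": "T2DM", "Prescriptions": ""}], interviewee_type="HCP"
#     )
#     # Here "Diagnoses" is filled, "Prescriptions" is not, and
#     # "Treatment Strategies" is absent, so the overall percentage is 50
#     # and report["quality_grade"] is "Fair"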


def validate_structured_data_format(data: Dict, interviewee_type: str) -> Tuple[bool, List[str]]:
    """
    Validate that structured data has the expected format.
    """
    issues = []

    if not isinstance(data, dict):
        return False, ["Data is not a dictionary"]

    # Define expected fields by type
    expected_fields = {
        "HCP": ["diagnoses", "prescriptions", "treatment_rationale"],
        "Patient": ["symptoms", "concerns", "treatment_response"],
        "Other": ["key_insights"]
    }
    required = expected_fields.get(interviewee_type, [])

    # Check for expected fields
    missing = [f for f in required if f not in data]
    if missing:
        issues.append(f"Missing expected fields: {', '.join(missing)}")

    # Check field types (should be lists)
    for key, value in data.items():
        if not isinstance(value, list):
            issues.append(f"Field '{key}' should be a list, got {type(value).__name__}")

    # Check for empty lists (guard against an empty dict, which is already
    # covered by the missing-fields check)
    empty_fields = [k for k, v in data.items() if isinstance(v, list) and not v]
    if data and len(empty_fields) == len(data):
        issues.append("All fields are empty lists")

    is_valid = len(issues) == 0
    return is_valid, issues
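
# Usage sketch (illustrative payload; all values must be lists per the check above):
#
#     ok, problems = validate_structured_data_format(
#         {"symptoms": ["fatigue"], "concerns": [], "treatment_response": "none"},
#         interviewee_type="Patient",
#     )
#     # ok is False here: "treatment_response" is a string, not a list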


def validate_summary_quality(summary: str, num_transcripts: int) -> Tuple[float, List[str]]:
    """Check a summary for rigor and accuracy."""
    issues = []
    score = 1.0

    # Check for quantification
    if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
        issues.append("No quantified findings (must include counts/percentages)")
        score -= 0.3

    # Check for vague claims (word-boundary match avoids false hits such as
    # "almost" matching "most" or "something" matching "some")
    vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
    if any(re.search(rf'\b{term}\b', summary.lower()) for term in vague_terms):
        issues.append("Contains vague terms - should use specific numbers")
        score -= 0.2

    # Check for absolute claims
    absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
    for term in absolute_terms:
        if re.search(rf'\b{term}\b', summary.lower()):
            issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
            score -= 0.2

    # Check for evidence markers
    if 'consensus' not in summary.lower() and 'majority' not in summary.lower():
        issues.append("Missing consensus indicators")
        score -= 0.1

    # Check that the summary is substantial
    if len(summary) < 500:
        issues.append("Summary too brief for thorough analysis")
        score -= 0.2

    return max(0.0, score), issues
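

# Minimal smoke test (a sketch with synthetic inputs; the sample strings and
# field values below are illustrative, not taken from real transcripts):
if __name__ == "__main__":
    sample = "The patient reported persistent symptoms. " * 10
    ok, msg = validate_extraction(sample, "sample.txt")
    print(f"extraction: {ok} - {msg}")

    score, detail = validate_transcript_quality(
        analyzed_text=sample,
        structured_data={"symptoms": ["fatigue"], "concerns": [], "treatment_response": []},
        interviewee_type="Patient",
    )
    # The repeated sentence above should trip Check 5 (content repetition)
    print(f"quality: {score:.2f} - {detail}")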