| import re |
| from typing import Tuple, Dict, List |
|
|
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Validate that text extraction was successful.

    Runs a series of cheap sanity checks (non-empty, minimum length,
    garbled-content patterns, word count, alphabetic-word ratio) and
    returns on the first failure.

    Args:
        filename: Name of the source file (currently unused; kept for
            interface compatibility).

    Returns:
        Tuple of (is_valid, human-readable reason/summary).
    """
    if not text or not text.strip():
        return False, "No text extracted"

    char_count = len(text)
    if char_count < 50:
        return False, f"Extracted text too short ({char_count} chars)"

    # Heuristics for garbled extraction output; reason first, pattern second.
    garbled_checks = (
        ("Contains large blocks of non-ASCII characters", r'[^\x00-\x7F]{50,}'),
        ("Contains suspicious character repetition", r'(.)\1{20,}'),
        ("Contains only symbols/numbers/whitespace", r'^[\W\d\s]+$'),
    )
    for reason, pattern in garbled_checks:
        if re.search(pattern, text) is not None:
            return False, reason

    tokens = text.split()
    token_count = len(tokens)
    if token_count < 20:
        return False, f"Too few words ({token_count})"

    # Fraction of tokens that look like real words (2+ ASCII letters).
    alpha_count = sum(1 for t in tokens if re.match(r'^[a-zA-Z]{2,}$', t))
    word_ratio = alpha_count / token_count if tokens else 0

    if word_ratio < 0.3:
        return False, f"Low word ratio ({word_ratio:.2f}) - possible extraction issue"

    return True, f"Valid ({token_count} words, {char_count} chars)"
|
|
|
|
def validate_transcript_quality(
    analyzed_text: str,
    structured_data: Dict,
    interviewee_type: str
) -> Tuple[float, str]:
    """
    Assess quality of analyzed transcript.

    Penalizes brief analyses, missing/empty structured data, lack of
    domain-appropriate terminology (for HCP/Patient interviews), leaked
    error messages, and repetitive content.

    Args:
        analyzed_text: Free-text analysis produced for one transcript.
        structured_data: Mapping of extracted field -> values; may be
            empty or None.
        interviewee_type: "HCP", "Patient", or anything else (other
            values skip the domain-specific checks).

    Returns:
        Tuple of (quality_score [0-1], issues_description)
    """
    score = 1.0
    issues = []

    # Bug fix: tolerate structured_data=None. The old code only checked
    # falsiness for the "no structured data" penalty but still called
    # structured_data.get(...) in the HCP/Patient branches, which raised
    # AttributeError when None was passed.
    data = structured_data or {}

    # --- Narrative length ---
    if len(analyzed_text) < 100:
        score -= 0.3
        issues.append("Analysis too brief")
    elif len(analyzed_text) < 300:
        score -= 0.1
        issues.append("Analysis somewhat brief")

    # --- Structured-data coverage ---
    if not data:
        score -= 0.2
        issues.append("No structured data extracted")
    else:
        empty_fields = sum(1 for v in data.values() if not v)
        total_fields = len(data)

        if empty_fields == total_fields:
            score -= 0.3
            issues.append("All structured fields empty")
        elif empty_fields > total_fields * 0.7:
            score -= 0.2
            issues.append("Most structured fields empty")

    # --- Domain-specific checks (table-driven so both branches stay in sync) ---
    # Each entry: (terminology regex, terminology issue, key fields, data issue).
    domain_checks = {
        "HCP": (
            r'\b(diagnos\w+|prescri\w+|treatment|medication|patient|clinical|therapy)\b',
            "Limited medical terminology for HCP interview",
            ["diagnoses", "prescriptions", "treatment_rationale"],
            "No key HCP data extracted",
        ),
        "Patient": (
            r'\b(symptom|feel|concern|experience|treatment|side effect|quality of life)\b',
            "Limited patient-centric content",
            ["symptoms", "concerns", "treatment_response"],
            "No key patient data extracted",
        ),
    }
    if interviewee_type in domain_checks:
        term_pattern, term_issue, key_fields, data_issue = domain_checks[interviewee_type]

        terms = re.findall(term_pattern, analyzed_text, re.IGNORECASE)
        if len(terms) < 3:
            score -= 0.2
            issues.append(term_issue)

        missing_fields = [f for f in key_fields if not data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append(data_issue)

    # --- Error-message leakage from upstream tooling ---
    error_patterns = [
        r'\[Error\]',
        r'failed to',
        r'could not',
        r'unable to',
        r'timeout'
    ]
    for pattern in error_patterns:
        if re.search(pattern, analyzed_text, re.IGNORECASE):
            score -= 0.3
            issues.append("Contains error messages")
            break  # penalize once, regardless of how many patterns match

    # --- Content repetition: raw sentence count vs. unique sentences ---
    sentences = analyzed_text.split('.')
    if len(sentences) > 3:
        unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
        repetition_ratio = len(sentences) / len(unique_sentences) if unique_sentences else 1

        if repetition_ratio > 1.5:
            score -= 0.2
            issues.append("High content repetition")

    score = max(0.0, min(1.0, score))

    issues_text = "; ".join(issues) if issues else "No issues detected"

    return score, issues_text
|
|
|
|
def check_data_completeness(csv_rows: List[Dict], interviewee_type: str) -> Dict:
    """
    Analyze completeness of extracted data across all transcripts.

    For the key fields relevant to the interviewee type (and present in
    the first row), counts how many rows have a non-blank value, then
    aggregates into an overall percentage and a letter grade.

    Returns:
        Dict with "by_field" per-field stats, "overall" totals, and
        "quality_grade" — or {"error": ...} when there are no rows.
    """
    if not csv_rows:
        return {"error": "No data to check"}

    # Key fields depend on who was interviewed.
    fields_by_type = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Treatment Response"],
    }
    key_fields = fields_by_type.get(interviewee_type, ["Key Insights"])

    row_total = len(csv_rows)
    completeness = {}

    for field in key_fields:
        # Only score fields that actually exist in the data (check row 0).
        if field not in csv_rows[0]:
            continue
        filled = sum(1 for row in csv_rows if row.get(field) and row[field].strip())
        completeness[field] = {
            "filled": filled,
            "total": row_total,
            "percentage": filled / row_total * 100,
        }

    filled_slots = sum(entry["filled"] for entry in completeness.values())
    total_slots = sum(entry["total"] for entry in completeness.values())
    overall_percentage = (filled_slots / total_slots * 100) if total_slots > 0 else 0

    if overall_percentage >= 80:
        grade = "Excellent"
    elif overall_percentage >= 60:
        grade = "Good"
    elif overall_percentage >= 40:
        grade = "Fair"
    else:
        grade = "Poor"

    return {
        "by_field": completeness,
        "overall": {
            "filled": filled_slots,
            "total": total_slots,
            "percentage": overall_percentage
        },
        "quality_grade": grade
    }
|
|
|
|
def validate_structured_data_format(data: Dict, interviewee_type: str) -> Tuple[bool, List[str]]:
    """
    Validate that structured data has expected format.

    Checks that `data` is a dict, that the fields expected for the given
    interviewee type are present, that every value is a list, and that
    the fields are not all empty lists.

    Args:
        data: Parsed structured-extraction output.
        interviewee_type: "HCP", "Patient", or "Other" (unknown types
            require no specific fields).

    Returns:
        Tuple of (is_valid, list of issue descriptions).
    """
    issues = []

    if not isinstance(data, dict):
        return False, ["Data is not a dictionary"]

    # Expected fields per interviewee type.
    expected_fields = {
        "HCP": ["diagnoses", "prescriptions", "treatment_rationale"],
        "Patient": ["symptoms", "concerns", "treatment_response"],
        "Other": ["key_insights"]
    }
    required = expected_fields.get(interviewee_type, [])

    missing = [f for f in required if f not in data]
    if missing:
        issues.append(f"Missing expected fields: {', '.join(missing)}")

    for key, value in data.items():
        if not isinstance(value, list):
            issues.append(f"Field '{key}' should be a list, got {type(value)}")

    # Bug fix: only flag "all empty" when at least one field exists.
    # Previously an empty dict matched (0 == 0) and was mislabeled as
    # "All fields are empty lists" on top of the missing-fields issue.
    empty_fields = [k for k, v in data.items() if isinstance(v, list) and not v]
    if data and len(empty_fields) == len(data):
        issues.append("All fields are empty lists")

    is_valid = len(issues) == 0

    return is_valid, issues
|
|
def validate_summary_quality(summary: str, num_transcripts: int) -> Tuple[float, List[str]]:
    """Check summary for rigor and accuracy.

    Penalizes unquantified findings, vague quantifiers, absolute claims,
    missing consensus language, and overly brief summaries.

    Args:
        summary: The generated cross-transcript summary.
        num_transcripts: Number of transcripts summarized (currently
            unused; kept for interface compatibility).

    Returns:
        Tuple of (score clamped at >= 0.0, list of issue descriptions).
    """
    issues = []
    score = 1.0
    lowered = summary.lower()  # hoisted: reused by several checks below

    # Require at least one count/percentage phrase.
    if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
        issues.append("No quantified findings (must include counts/percentages)")
        score -= 0.3

    # Vague quantifiers. Bug fix: match whole words only — the old
    # substring test flagged e.g. "almost" (contains "most") and
    # "something" (contains "some"). Now consistent with the
    # word-boundary matching used for absolute terms below.
    vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
    if any(re.search(rf'\b{term}\b', lowered) for term in vague_terms):
        issues.append("Contains vague terms - should use specific numbers")
        score -= 0.2

    # Absolute claims: one penalty per distinct term found (as before).
    absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
    for term in absolute_terms:
        if re.search(rf'\b{term}\b', lowered):
            issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
            score -= 0.2

    # Expect some consensus framing in a multi-transcript summary.
    if 'consensus' not in lowered and 'majority' not in lowered:
        issues.append("Missing consensus indicators")
        score -= 0.1

    if len(summary) < 500:
        issues.append("Summary too brief for thorough analysis")
        score -= 0.2

    return max(0.0, score), issues