| import gradio as gr |
| import os |
| from datetime import datetime |
| from typing import List, Dict, Tuple |
| from extractors import extract_docx, extract_pdf, validate_extraction |
| from tagging import tag_speakers_advanced |
| from chunking import chunk_text_semantic |
| from llm import query_llm, extract_structured_data |
| from reporting import generate_enhanced_csv, generate_enhanced_pdf |
| from dashboard import generate_comprehensive_dashboard |
| from validation import validate_transcript_quality, check_data_completeness |
|
|
| |
# Prefer the project's structured logger (with LogContext support); fall back
# to the stdlib logging module when the logger module is not installed.
try:
    from logger import get_logger, LogContext
    logger = get_logger()
except ImportError:
    import logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)

    class LogContext:
        # No-op context manager so `with LogContext(...):` call sites keep working.
        def __init__(self, *args, **kwargs): pass
        def __enter__(self): return self
        def __exit__(self, *args): pass
|
|
# Optional PII redaction support; feature availability is tracked in
# HAS_REDACTION and checked before any redaction call in analyze().
try:
    from redaction import PIIRedactor, redact_quotes, generate_redaction_report
    HAS_REDACTION = True
except ImportError:
    HAS_REDACTION = False
    logger.warning("Redaction module not available - PII masking disabled")
|
|
| |
# Optional production logging; availability tracked in HAS_PRODUCTION_LOGGING.
# NOTE(review): the fallback ProductionLogger / PerformanceMonitor /
# init_session defined later in this file are unconditional, so they shadow
# these imports even when the import succeeds — confirm whether they were
# meant to be guarded by `if not HAS_PRODUCTION_LOGGING:`.
try:
    from production_logger import init_session, ProductionLogger, PerformanceMonitor
    HAS_PRODUCTION_LOGGING = True
except ImportError:
    HAS_PRODUCTION_LOGGING = False
    print("β οΈ Production logging not available - using basic logging")
|
|
| |
| from contextlib import contextmanager |
|
|
class ProductionLogger:
    """Console-only fallback logger mirroring the production_logger interface.

    Exposes itself via ``self.logger`` so call sites can use both
    ``prod_logger.log_*`` helpers and ``prod_logger.logger.info(...)``.
    All output goes to stdout through a single formatting helper.
    """

    def __init__(self, session_id):
        # Session id is only used for the final banner in finalize_session().
        self.session_id = session_id
        self.logger = self

    @staticmethod
    def _say(level, msg):
        # Single choke point for every console line: "[LEVEL] message".
        print(f"[{level}] {msg}")

    def info(self, msg):
        self._say("INFO", msg)

    def warning(self, msg):
        self._say("WARNING", msg)

    def error(self, msg):
        self._say("ERROR", msg)

    def log_warning(self, msg):
        self._say("WARNING", msg)

    def log_transcript_start(self, file_name, file_type, interviewee_type):
        self._say("INFO", f"Processing started: {file_name}")

    def log_transcript_complete(self, file_name, quality_score, word_count, processing_time):
        self._say("INFO", f"Processing complete: {file_name} | Quality: {quality_score:.2f}")

    def log_transcript_error(self, file_name, error_type, error_details):
        self._say("ERROR", f"Processing failed: {file_name} - {error_type}")

    def log_quote_extraction(self, quote_count, top_score, themes):
        self._say("INFO", f"Quote extraction complete: {quote_count} quotes")

    def finalize_session(self):
        """Print the session-complete banner and return an empty summary dict."""
        self._say("INFO", f"Session {self.session_id} complete")
        return {}
|
|
class PerformanceMonitor:
    """Fallback named-timer registry used when production_logger is unavailable.

    Fix vs. original: elapsed time is measured with ``time.perf_counter()``
    (monotonic, high resolution) instead of ``time.time()``, which can jump
    backwards on system clock adjustments and yield negative durations.
    """

    def __init__(self, logger):
        self.logger = logger   # kept for interface parity with the real monitor
        self.timers = {}       # timer name -> start timestamp (perf_counter)

    def start_timer(self, name):
        """Record the start time for *name*, overwriting any previous start."""
        import time
        self.timers[name] = time.perf_counter()

    def end_timer(self, name):
        """Return seconds elapsed since start_timer(name); 0 if never started."""
        import time
        start = self.timers.pop(name, None)
        if start is None:
            return 0
        return time.perf_counter() - start

    @contextmanager
    def measure(self, name):
        """Context manager timing the wrapped block under *name*."""
        self.start_timer(name)
        try:
            yield
        finally:
            self.end_timer(name)
|
|
def init_session(session_id):
    """Fallback session initializer: wrap *session_id* in a ProductionLogger."""
    fallback_logger = ProductionLogger(session_id)
    return fallback_logger
|
|
| |
# Optional quote extraction; availability tracked in HAS_QUOTE_EXTRACTION.
try:
    from quote_extractor import extract_quotes_from_results
    HAS_QUOTE_EXTRACTION = True
except ImportError:
    HAS_QUOTE_EXTRACTION = False
    print("β οΈ Quote extraction not available - reports will not include storytelling quotes")
    def extract_quotes_from_results(results, interviewee_type):
        """Stub function when quote_extractor is not available.

        Returns a dict containing every key the downstream summary stage
        reads ('all_quotes', 'by_theme', 'top_quotes'). The original stub
        only returned 'quotes'/'themes', which caused a KeyError in
        analyze() when the real extractor was missing; the legacy keys are
        kept for backward compatibility.
        """
        return {
            "quotes": [],
            "themes": {},
            "top_quotes": [],
            "all_quotes": [],
            "by_theme": {},
        }
|
|
| |
# Optional enhanced summary validation (consensus verification and summary
# quality scoring); availability tracked in HAS_ENHANCED_VALIDATION.
try:
    from validation import verify_consensus_claims, validate_summary_quality
    HAS_ENHANCED_VALIDATION = True
except ImportError:
    HAS_ENHANCED_VALIDATION = False
    print("β οΈ Enhanced validation functions not available - using basic validation only")
|
|
| |
def load_env_file(filepath='.env'):
    """Manually load environment variables from a .env file.

    Supports the common dotenv conventions: blank lines and ``#`` comments
    are skipped, a leading ``export `` prefix is tolerated, and matching
    single/double quotes around values are stripped. Values are written
    directly into ``os.environ`` (existing variables are overwritten,
    matching the original behavior).

    Args:
        filepath: Path to the .env file (default '.env').

    Returns:
        True if the file existed and was loaded, False otherwise.
    """
    if not os.path.exists(filepath):
        return False
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blanks and comments.
            if not line or line.startswith('#'):
                continue
            # Tolerate shell-style "export KEY=value" lines.
            if line.startswith('export '):
                line = line[len('export '):]
            if '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            # Strip matching surrounding quotes (dotenv convention).
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            if key:
                os.environ[key] = value
    print(f"β Loaded configuration from {filepath}")
    return True
|
|
| |
| |
| |
|
|
| |
# ---------------------------------------------------------------------------
# Startup configuration: load a local .env when present, otherwise rely on the
# hosting platform's environment (HuggingFace Spaces secrets/variables).
# ---------------------------------------------------------------------------
if os.path.exists('.env'):
    load_env_file('.env')
    print("β Loaded .env file (local development mode)")
else:
    print("βΉοΈ No .env file found - using HuggingFace Spaces configuration")

# Defaults for LLM backend configuration (only set when not already present).
os.environ.setdefault("USE_HF_API", "False")
os.environ.setdefault("USE_LMSTUDIO", "False")
# NOTE(review): setdefault combined with os.getenv of the same key is
# equivalent to setdefault("DEBUG_MODE", "False") — the getenv adds nothing.
os.environ.setdefault("DEBUG_MODE", os.getenv("DEBUG_MODE", "False"))
os.environ.setdefault("LLM_BACKEND", "local")
os.environ.setdefault("LLM_TIMEOUT", "120")
os.environ.setdefault("MAX_TOKENS_PER_REQUEST", "1500")
os.environ.setdefault("LLM_TEMPERATURE", "0.7")

print("β Configuration loaded for HuggingFace Spaces")

# Detect a cloud/Spaces deployment: no local .env plus Spaces env markers.
is_hf_spaces = not os.path.exists('.env') and (os.getenv('SPACE_ID') or os.getenv('SYSTEM') == 'spaces')
hf_token = os.getenv("HUGGINGFACE_TOKEN", "")

# NOTE(review): `is_hf_spaces or not os.path.exists('.env')` reduces to
# `not os.path.exists('.env')` since is_hf_spaces already implies it —
# confirm whether a plain SPACE_ID check was intended instead.
if is_hf_spaces or not os.path.exists('.env'):
    if hf_token:
        # Token available: force the hosted HF API backend (local models are
        # too slow on shared cloud hardware).
        print("π Detected cloud/Spaces environment - forcing HF API mode for best performance...")
        os.environ["USE_HF_API"] = "True"
        os.environ["USE_LMSTUDIO"] = "False"
        os.environ["LLM_BACKEND"] = "hf_api"
        os.environ["LLM_TIMEOUT"] = "180"
        print("β HF API mode enabled (local models disabled)")
    else:
        print("β οΈ WARNING: Running on cloud platform without HUGGINGFACE_TOKEN!")
        print(" Local models will likely timeout. Please add HUGGINGFACE_TOKEN in Settings.")
        print(" Get token from: https://huggingface.co/settings/tokens")
        # No token: give slow local models extra timeout headroom.
        os.environ["LLM_TIMEOUT"] = "300"

# Startup banner: echo the effective backend configuration for debugging.
print(f"π TranscriptorAI Enterprise - LLM Backend: {os.getenv('LLM_BACKEND')}")
print(f"π§ USE_HF_API: {os.getenv('USE_HF_API')}")
print(f"π§ USE_LMSTUDIO: {os.getenv('USE_LMSTUDIO')}")
print(f"π§ DEBUG_MODE: {os.getenv('DEBUG_MODE')}")
print(f"π§ LLM_TIMEOUT: {os.getenv('LLM_TIMEOUT')}s")
|
|
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type,
            enable_pii_redaction, redaction_level, progress=gr.Progress()):
    """
    Enhanced analysis pipeline with robust error handling, validation, production logging,
    and optional PII redaction.

    Args:
        files: Uploaded transcript files
        file_type: DOCX or PDF
        user_comments: User analysis instructions
        role_hint: Speaker role mapping
        debug_mode: Enable debug output
        interviewee_type: HCP, Patient, or Other
        enable_pii_redaction: Whether to redact PII from outputs
        redaction_level: strict, moderate, or minimal
        progress: Gradio progress tracker

    Returns:
        Tuple of (report_markdown, csv_path, pdf_path, dashboard_figure);
        the last three are None on fatal errors.
    """
    # Per-run session id used for log correlation and output file naming.
    session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    prod_logger = init_session(session_id)
    perf_monitor = PerformanceMonitor(prod_logger)

    prod_logger.logger.info(f"="*80)
    prod_logger.logger.info(f"NEW ANALYSIS SESSION: {session_id}")
    prod_logger.logger.info(f"Files: {len(files)} | Type: {file_type} | Interviewee: {interviewee_type}")
    prod_logger.logger.info(f"="*80)

    # Propagate the UI toggle to downstream modules that read the env var.
    os.environ["DEBUG_MODE"] = str(debug_mode)

    if not files:
        prod_logger.log_warning("No files uploaded")
        return "Error: No files uploaded", None, None, None

    all_results = []         # per-transcript result dicts (including failures)
    csv_rows = []            # flattened rows for the CSV report
    processing_errors = []   # mix of strings (quality warnings) and dicts (failures)

    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")

    # Interviewee-specific analysis focus and the structured fields to extract.
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})

    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}

Additional Instructions:
{user_comments}
""".strip()

    # 4 per-file stages (extract / validate / tag / chunk+LLM) + summary + reports.
    total_steps = len(files) * 4 + 2
    current_step = 0

    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        prod_logger.log_transcript_start(file_name, file_type, interviewee_type)
        perf_monitor.start_timer(f"transcript_{i+1}_processing")

        try:
            # --- Stage 1: raw text extraction ---
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")

            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1

            # --- Stage 2: extraction sanity check ---
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")

            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1

            # --- Stage 3: speaker tagging ---
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1

            # --- Stage 4: semantic chunking ---
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1

            # --- Per-chunk LLM analysis ---
            transcript_result = []
            structured_data = {}

            for j, chunk in enumerate(chunks):
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")

                result, chunk_data = query_llm(
                    chunk,
                    user_context,
                    interviewee_type,
                    extract_structured=True
                )

                # Defensive: some backends return dicts instead of plain text.
                if not isinstance(result, str):
                    print(f"[Warning] LLM result is not a string (type: {type(result)}), converting...")
                    if isinstance(result, dict):
                        result = str(result.get('content', str(result)))
                    else:
                        result = str(result)

                if result and isinstance(result, str) and len(result.strip()) > 0:
                    transcript_result.append(result)
                else:
                    print(f"[Warning] Skipping empty/invalid result for chunk {j+1}")

                # Merge chunk-level structured data into per-transcript lists.
                # (`or {}` guards against a backend returning None here.)
                for key, value in (chunk_data or {}).items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)

            current_step += 1

            # Final pass: drop (or salvage) any non-strings that slipped through.
            cleaned_results = []
            for idx, item in enumerate(transcript_result):
                if isinstance(item, str):
                    cleaned_results.append(item)
                else:
                    print(f"[Warning] Removing non-string item at index {idx}: {type(item)}")
                    if isinstance(item, dict) and 'content' in item:
                        cleaned_results.append(str(item['content']))

            full_text = "\n\n".join(cleaned_results)

            # Score transcript quality; low scores are recorded but not fatal.
            quality_score, quality_issues = validate_transcript_quality(
                full_text,
                structured_data,
                interviewee_type
            )

            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")

            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })

            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }

            def safe_join(items):
                """Convert all items to strings before joining with '; '."""
                str_items = []
                for item in items:
                    if isinstance(item, str):
                        str_items.append(item)
                    elif isinstance(item, dict):
                        # Known structured shapes: {"name": ...} or
                        # {"condition": ..., "severity": ...}; anything else
                        # falls back to str().
                        if "name" in item:
                            str_items.append(str(item["name"]))
                        elif "condition" in item:
                            cond = item["condition"]
                            if "severity" in item:
                                str_items.append(f"{cond} ({item['severity']})")
                            else:
                                str_items.append(cond)
                        else:
                            str_items.append(str(item))
                    else:
                        str_items.append(str(item))
                return "; ".join(str_items)

            # Interviewee-specific CSV columns.
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": safe_join(structured_data.get("diagnoses", [])),
                    "Prescriptions": safe_join(structured_data.get("prescriptions", [])),
                    "Treatment Strategies": safe_join(structured_data.get("treatment_rationale", [])),
                    "Guidelines Mentioned": safe_join(structured_data.get("guidelines_mentioned", []))
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": safe_join(structured_data.get("symptoms", [])),
                    "Main Concerns": safe_join(structured_data.get("concerns", [])),
                    "Treatment Response": safe_join(structured_data.get("treatment_response", [])),
                    "Side Effects": safe_join(structured_data.get("side_effects", []))
                })
            else:
                csv_row.update({
                    "Key Insights": safe_join(structured_data.get("key_insights", [])),
                    "Recommendations": safe_join(structured_data.get("recommendations", []))
                })

            csv_rows.append(csv_row)

            processing_time = perf_monitor.end_timer(f"transcript_{i+1}_processing")
            prod_logger.log_transcript_complete(file_name, quality_score, len(raw_text.split()), processing_time)

            print(f"[File {i+1}] β Processing complete")

        except Exception as e:
            # Per-file failures are recorded and processing continues with the
            # remaining files.
            import traceback
            error_type = type(e).__name__
            error_details = str(e)
            error_traceback = traceback.format_exc()

            error_msg = f"[{error_type}] {file_name}: {error_details}"
            print(error_msg)

            perf_monitor.end_timer(f"transcript_{i+1}_processing")
            prod_logger.log_transcript_error(file_name, error_type, error_details[:200])

            processing_errors.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "error_type": error_type,
                "error_message": error_details[:200],
                "timestamp": datetime.now().isoformat()
            })

            # Placeholder result keeps transcript numbering stable in reports.
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0,
                "processing_status": "FAILED",
                "error_type": error_type
            })

            csv_rows.append({
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": 0.0,
                "Word Count": 0,
                "Processing Status": "FAILED",
                "Error Type": error_type,
                "Error Message": error_details[:100]
            })

    # ------------------------------------------------------------------
    # Cross-transcript summary, validation, and report generation.
    # ------------------------------------------------------------------
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")

        valid_results = [r for r in all_results if r["quality_score"] > 0]

        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None

        print("[Quotes] Extracting impactful quotes from transcripts...")
        with perf_monitor.measure("quote_extraction"):
            quotes_data = extract_quotes_from_results(valid_results, interviewee_type)

        # Normalize quote keys so a stubbed extractor (which may only return
        # legacy 'quotes'/'themes' keys) cannot cause KeyError below.
        quotes_data.setdefault('all_quotes', quotes_data.get('quotes', []))
        quotes_data.setdefault('by_theme', quotes_data.get('themes', {}))
        quotes_data.setdefault('top_quotes', [])

        top_score = quotes_data['top_quotes'][0]['impact_score'] if quotes_data['top_quotes'] else 0
        themes = list(quotes_data['by_theme'].keys())
        prod_logger.log_quote_extraction(len(quotes_data['all_quotes']), top_score, themes)

        print(f"[Quotes] Extracted {len(quotes_data['all_quotes'])} quotes, top impact score: {top_score:.2f}" if quotes_data['top_quotes'] else "[Quotes] No quotes extracted")

        # Optional PII redaction of quotes and transcript text.
        if enable_pii_redaction and HAS_REDACTION:
            logger.info(f"Applying PII redaction (level: {redaction_level})")

            if quotes_data['all_quotes']:
                quotes_data['all_quotes'] = redact_quotes(quotes_data['all_quotes'], redaction_level)
                quotes_data['top_quotes'] = [q for q in quotes_data['all_quotes'] if q.get('impact_score', 0) > 0]
                quotes_data['top_quotes'].sort(key=lambda x: x['impact_score'], reverse=True)
                quotes_data['top_quotes'] = quotes_data['top_quotes'][:20]

            redactor = PIIRedactor(redaction_level)
            total_redactions = {"total": 0}

            for result in valid_results:
                redacted_text, redaction_report = redactor.redact_text(result['full_text'])
                result['full_text'] = redacted_text
                result['redaction_report'] = redaction_report
                total_redactions['total'] += sum(redaction_report.values())

            # Fix: loguru-style loggers expose .success(); the stdlib fallback
            # logger does not, so fall back to .info() to avoid AttributeError.
            getattr(logger, "success", logger.info)(
                f"Redacted {total_redactions['total']} PII items across {len(valid_results)} transcripts"
            )
        elif enable_pii_redaction and not HAS_REDACTION:
            logger.warning("PII redaction requested but redaction module not available!")

        # Prefer hierarchical summarization for larger batches when available.
        try:
            from summarizer_enhanced import (
                hierarchical_summarize,
                enhance_summary_with_quotes,
                validate_summary_consensus
            )
            use_hierarchical = True
            print("[Summary] Using enhanced hierarchical summarization")
        except ImportError:
            use_hierarchical = False
            print("[Summary] Using standard summarization (hierarchical not available)")

        # Build the single-pass summary prompt (also used as retry base).
        summary_prompt = f"""
CROSS-INTERVIEW SYNTHESIS TASK

SAMPLE: {len(valid_results)} {interviewee_type} transcripts
FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}
"""

        if quotes_data['top_quotes']:
            summary_prompt += f"""

TOP PARTICIPANT QUOTES (use these to bring findings to life):
"""
            # q_num avoids shadowing the file-loop variable `i` above.
            for q_num, quote in enumerate(quotes_data['top_quotes'][:10], 1):
                summary_prompt += f"\n{q_num}. [{quote['theme'].upper()}] (from {quote['transcript_id']})\n \"{quote['text']}\"\n"

        summary_prompt += """

COMPLETE TRANSCRIPT DATA:
"""

        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            summary_prompt += f"{result['full_text'][:2000]}\n"

        summary_prompt += f"""

ANALYSIS REQUIREMENTS:

1. QUANTIFY EVERYTHING:
   - Count participants: "X out of {len(valid_results)} participants mentioned..."
   - Never use vague terms (many/most/some)
   - Calculate percentages where relevant

2. INTEGRATE PARTICIPANT VOICE:
   - Weave in quotes from the "TOP PARTICIPANT QUOTES" section above
   - Use quotes to bring data to life and prove points
   - Format as: "X out of {len(valid_results)} mentioned [finding]. As one {interviewee_type.lower()} described, '[quote]'"
   - Include 3-5 quotes in your narrative

3. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
   - STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
   - MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
   - SPLIT PERSPECTIVES (40-59% = mixed views)
   - MINORITY/OUTLIER (<40% but notable)

4. CROSS-VALIDATE:
   - Check for contradictions between transcripts
   - Note where perspectives diverge and why
   - Flag any quality issues in individual transcripts

5. CITE EVIDENCE:
   - Reference specific transcript numbers
   - Brief supporting details
   - Use participant quotes as proof points
   - Distinguish verified facts from interpretation

OUTPUT FORMAT:
Write 2-3 sentence executive overview WITH a compelling quote, then structure as:

**STRONG CONSENSUS FINDINGS:**
- [Finding with count, supporting quote if available, and business implication]

**MAJORITY FINDINGS:**
- [Finding with count and quote]

**DIVERGENT PERSPECTIVES:**
- [Where views split, with quotes showing both sides if possible]

**NOTABLE OUTLIERS:**
- [Unique but important points, use quote if impactful]

**DATA QUALITY NOTES:**
- [Any gaps or transcript issues]

CRITICAL: Integrate quotes naturally. Use participant voice to make findings memorable and credible.
Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
"""

        print("[Summary] Generating cross-transcript summary...")
        print("[Summary] Note: This may take 30-60 seconds for large datasets")

        try:
            from llm_robust import query_llm_with_timeout

            if use_hierarchical and len(valid_results) > 3:
                # Hierarchical: summarize per-transcript then merge.
                print(f"[Summary] Using hierarchical approach for {len(valid_results)} transcripts")
                summary, summary_data = hierarchical_summarize(
                    valid_results,
                    quotes_data,
                    interviewee_type,
                    interviewee_context,
                    query_llm_with_timeout,
                    user_context
                )

                summary = enhance_summary_with_quotes(summary, quotes_data, max_quotes=6)

                consensus_warnings = validate_summary_consensus(summary, valid_results)
                if consensus_warnings:
                    print(f"[Summary] Consensus validation warnings: {len(consensus_warnings)}")
                    for warning in consensus_warnings[:3]:
                        print(f" - {warning}")
            else:
                print("[Summary] Using standard single-pass summarization")
                summary, summary_data = query_llm_with_timeout(
                    summary_prompt,
                    user_context,
                    interviewee_type,
                    extract_structured=False,
                    is_summary=True,
                    max_timeout=60
                )
        except Exception as e:
            print(f"[Summary] Critical error: {e}")
            print("[Summary] Using emergency fallback...")
            from llm_robust import generate_emergency_summary
            summary, summary_data = generate_emergency_summary(interviewee_type)

        # Defensive: some backends return dicts instead of plain text.
        if not isinstance(summary, str):
            print(f"[Warning] Summary is not a string (type: {type(summary)}), converting...")
            if isinstance(summary, dict):
                summary = str(summary.get('content', str(summary)))
            else:
                summary = str(summary)

        # Summary quality scoring (optional enhanced validation).
        if HAS_ENHANCED_VALIDATION:
            summary_score, summary_issues = validate_summary_quality(
                summary,
                len(valid_results)
            )
        else:
            summary_score = 1.0
            summary_issues = []

        if HAS_ENHANCED_VALIDATION and summary_score < 0.7:
            print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}")
            print("[Summary] Retrying with stricter validation...")

            retry_prompt = summary_prompt + f"""

CRITICAL: Previous attempt failed validation with these issues:
{chr(10).join('- ' + issue for issue in summary_issues)}

MANDATORY CORRECTIONS:
- Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most")
- Include percentages in parentheses
- Cite transcript numbers for every claim
- Minimum length: 500 words
- No absolute terms (all/everyone/never/always) without 100% evidence
"""

            try:
                summary, summary_data = query_llm_with_timeout(
                    retry_prompt,
                    user_context,
                    interviewee_type,
                    extract_structured=False,
                    is_summary=True,
                    max_timeout=60
                )
            except Exception as e:
                print(f"[Summary] Retry also failed: {e}")
                print("[Summary] Using emergency fallback for retry...")
                # Fix: this name was never imported on the success path of the
                # first attempt, which made this handler raise NameError.
                from llm_robust import generate_emergency_summary
                summary, summary_data = generate_emergency_summary(interviewee_type)

            if not isinstance(summary, str):
                print(f"[Warning] Retry summary is not a string (type: {type(summary)}), converting...")
                if isinstance(summary, dict):
                    summary = str(summary.get('content', str(summary)))
                else:
                    summary = str(summary)

            summary_score, summary_issues = validate_summary_quality(summary, len(valid_results))

            if summary_score < 0.7:
                # Still failing: prepend a visible quality warning instead of
                # blocking the whole report.
                warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}]
Validation issues detected: {'; '.join(summary_issues)}
Please review findings carefully and verify against source data.

{'='*60}

"""
                summary = warning_header + summary
                print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})")
            else:
                print(f"[Summary] β Validation passed after retry (score: {summary_score:.2f})")
        else:
            print(f"[Summary] β Validation passed (score: {summary_score:.2f})")

        # Cross-check quantitative consensus claims against the source data.
        if HAS_ENHANCED_VALIDATION:
            consensus_warnings = verify_consensus_claims(summary, valid_results)
            if consensus_warnings:
                print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found")
                consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n"
                summary = summary + consensus_note
            else:
                print("[Summary] β Consensus claims verified")
        else:
            print("[Summary] β οΈ Consensus verification skipped (enhanced validation not available)")

        # --- Report artifacts: CSV, PDF, dashboard ---
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] β Saved to {csv_path}")

        pdf_path = generate_enhanced_pdf(
            summary,
            all_results,
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] β Saved to {pdf_path}")

        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] β Generated")

        # --- Assemble the markdown report shown in the UI ---
        output_text = f"""# Analysis Complete

## Summary of Findings
{summary}

## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {len(processing_errors)}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}

"""

        if processing_errors:
            # processing_errors holds both dicts (hard failures) and plain
            # strings (quality warnings); normalize both for display.
            error_messages = []
            for err in processing_errors:
                if isinstance(err, dict):
                    error_msg = f"{err.get('transcript_id', 'Unknown')} ({err.get('file_name', 'unknown')}): {err.get('error_type', 'Error')} - {err.get('error_message', 'Unknown error')}"
                    error_messages.append(error_msg)
                else:
                    error_messages.append(str(err))
            output_text += f"\n## Processing Errors\n" + "\n".join(f"- {msg}" for msg in error_messages)

        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"

        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"

        progress(1.0, desc="Complete!")

        session_summary = prod_logger.finalize_session()
        prod_logger.logger.info(f"Session logs saved to: logs/session_{session_id}.*")

        return output_text, csv_path, pdf_path, dashboard

    except Exception as e:
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()

        prod_logger.log_transcript_error("SUMMARY_GENERATION", type(e).__name__, str(e))
        prod_logger.finalize_session()

        return error_msg, None, None, None
|
|
def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """
    Wrapper function for Gradio UI to generate narrative reports.

    Args:
        csv_file: CSV output from a prior analysis run (file object or path).
        summary_text: Optional pasted executive summary text.
        interviewee_type: HCP, Patient, or Other.
        report_style: executive, detailed, or presentation.

    Returns:
        Tuple of (status_message, pdf_path, word_path, html_path); the paths
        are None on error.
    """
    import os
    import tempfile

    # Validate the input BEFORE importing the generator, so a missing CSV
    # yields this clear message rather than an unrelated import traceback.
    if csv_file is None:
        return "Error: No CSV file provided. Please run analysis first.", None, None, None

    summary_path = None
    try:
        from narrative_report_generator import generate_narrative_report

        # Persist the pasted summary (if any) to a temp file for the generator.
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                f.write(summary_text)
                summary_path = f.name

        # Backend selection mirrors the app-wide configuration.
        llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"

        pdf_path, word_path, html_path = generate_narrative_report(
            csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
            summary_path=summary_path,
            interviewee_type=interviewee_type,
            report_style=report_style,
            llm_backend=llm_backend
        )

        return (
            f"β Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None
    finally:
        # Always delete the temp summary file — the original only removed it
        # on the success path, leaking it when generation raised.
        if summary_path and os.path.exists(summary_path):
            os.remove(summary_path)
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two workflow tabs (transcript analysis, narrative report) plus a
# help tab. Component handles are bound to the analyze / narrative callbacks.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # π― TranscriptorAI - Enterprise Transcript Analyzer

    Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
    """)

    with gr.Tabs():

        # Tab 1: core transcript analysis pipeline (inputs feed analyze()).
        with gr.TabItem("π Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="π Upload Transcripts",
                        file_types=[".docx", ".pdf"],
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"],
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )

                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions",
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)",
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )

            with gr.Row():
                debug_mode = gr.Checkbox(label="π Enable Debug Mode", value=False)

            # PII redaction controls (consumed by analyze()).
            with gr.Row():
                with gr.Column():
                    enable_pii_redaction = gr.Checkbox(
                        label="π Enable PII Redaction",
                        value=False,
                        info="Mask sensitive information (names, dates, SSN, emails, etc.)"
                    )
                with gr.Column():
                    redaction_level = gr.Radio(
                        ["minimal", "moderate", "strict"],
                        label="Redaction Level",
                        value="moderate",
                        info="minimal=IDs only, moderate=common PII, strict=all PII including names"
                    )

            with gr.Row():
                gr.Markdown("""
                **β οΈ IMPORTANT PRIVACY NOTICE:**
                - If using real patient/healthcare data, ALWAYS enable PII redaction
                - Private HF Spaces are NOT HIPAA-compliant - use de-identified data only
                - For HIPAA compliance, deploy on your own HIPAA-certified infrastructure
                """)

            with gr.Row():
                analyze_btn = gr.Button("π Analyze Transcripts", variant="primary", scale=2)

            with gr.Row():
                output_text = gr.Textbox(label="π Analysis Report", lines=40)

            with gr.Row():
                csv_output = gr.File(label="π₯ Download CSV")
                pdf_output = gr.File(label="π₯ Download PDF")

            with gr.Row():
                dashboard_output = gr.Plot(label="π Dashboard Visualization")

            # Wire the analysis button: inputs/outputs must match analyze()'s
            # signature and 4-tuple return.
            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type,
                        enable_pii_redaction, redaction_level],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )

        # Tab 2: narrative report generation from a prior analysis run.
        with gr.TabItem("π Narrative Report"):
            gr.Markdown("""
            ## Generate Storytelling Report

            Transform your analysis into a narrative report with:
            - Executive summary with key insights
            - Data-driven storytelling
            - Professional formatting (PDF, Word, HTML)
            - Actionable recommendations

            **Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
            """)

            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )

                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
                    generate_narrative_btn = gr.Button("π Generate Narrative Report", variant="primary")

            narrative_status = gr.Textbox(label="Status", lines=5)

            with gr.Row():
                narrative_pdf_output = gr.File(label="π₯ Download PDF Report")
                narrative_word_output = gr.File(label="π₯ Download Word Report")
                narrative_html_output = gr.File(label="π₯ Download HTML Report")

            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )

        # Tab 3: static help content.
        with gr.TabItem("β Help"):
            gr.Markdown("""
            ### Quick Start Guide

            **Step 1: Analyze Transcripts**
            1. Upload your DOCX or PDF files
            2. Select interviewee type (HCP, Patient, or Other)
            3. Add analysis instructions
            4. Click "Analyze Transcripts"
            5. Download CSV, PDF, and view dashboard

            **Step 2: Generate Narrative Report (Optional)**
            1. Go to "Narrative Report" tab
            2. Upload the CSV from Step 1
            3. Optionally paste the summary text
            4. Select report style
            5. Click "Generate Narrative Report"
            6. Download PDF, Word, or HTML versions

            ### Tips
            - **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator
            - **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator
            - **Report Styles**:
              - **Executive**: Best for C-level, investors, decision-makers
              - **Detailed**: Best for researchers, comprehensive analysis
              - **Presentation**: Best for slides, briefings, quick overviews

            ### LLM Configuration
            - Set `USE_LMSTUDIO=True` to use your local LM Studio
            - Set `HUGGINGFACE_TOKEN` to use HF API for faster processing
            - Default: Uses local model (slower but free)

            ### Support
            For issues, check the console output or enable debug mode.
            """)

    gr.Markdown("""
    ---
    **TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
    """)
|
|
# Script entry point: enable request queuing (bounded backlog, API schema
# hidden) and serve the app.
if __name__ == "__main__":
    demo.queue(
        max_size=10,      # cap concurrent queued requests
        api_open=False    # do not expose the REST API schema
    ).launch(
        server_name="0.0.0.0",   # listen on all interfaces (required in containers)
        server_port=7860,        # HuggingFace Spaces conventional port
        show_error=True          # surface tracebacks in the UI
    )
|
|