# app.py — TranscriptorAI (HuggingFace Space, commit 0964dee, 28.7 kB)
# NOTE: hosted-file-view chrome ("raw / history blame") removed from this header.
import gradio as gr
import os
from datetime import datetime
from typing import List, Dict, Tuple
from extractors import extract_docx, extract_pdf, validate_extraction
from tagging import tag_speakers_advanced
from chunking import chunk_text_semantic
from llm import query_llm, extract_structured_data
from reporting import generate_enhanced_csv, generate_enhanced_pdf
from dashboard import generate_comprehensive_dashboard
from validation import validate_transcript_quality, check_data_completeness
from audio_transcriber import transcribe_with_diarization_streaming
# HuggingFace Spaces Configuration
# Force the hosted-API LLM backend with conservative limits for Spaces hardware.
import os  # NOTE(review): redundant — `os` is already imported above
os.environ["LLM_BACKEND"] = "hf_api"
os.environ["LLM_TIMEOUT"] = "25"  # presumably seconds per request — TODO confirm units in llm module
os.environ["MAX_TOKENS_PER_REQUEST"] = "100"
print("πŸš€ Running on HuggingFace Spaces - Optimized Configuration Loaded")
def preprocess_audio(audio_files, num_speakers):
    """Convert uploaded audio files into speaker-tagged transcripts.

    Args:
        audio_files: list of uploaded audio files (Gradio file objects or
            plain path strings); falsy means nothing was uploaded.
        num_speakers: expected number of speakers for diarization.

    Returns:
        (transcript_paths, status): list of generated transcript paths
        (None if nothing succeeded) and a per-file status log string.
    """
    if not audio_files:
        return None, "No audio files provided"
    transcript_paths = []
    status = ""
    for audio in audio_files:
        # Resolve the path OUTSIDE the try so the except handler can always
        # reference `audio_path` (previously it could be unbound on failure).
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        try:
            # BUG FIX: original called `transcribe_with_diarization`, which is
            # never defined — the imported helper (L18) is
            # `transcribe_with_diarization_streaming`, so every file failed
            # with a NameError swallowed by this except.
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
            transcript_paths.append(transcript_path)
            status += f"βœ“ {os.path.basename(audio_path)} β†’ {transcript_path}\n"
        except Exception as e:
            status += f"βœ— {os.path.basename(audio_path)}: {str(e)}\n"
    # Return list of paths for the Gradio file component (None keeps it empty).
    return transcript_paths if transcript_paths else None, status
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
    """
    Enhanced analysis pipeline with robust error handling and validation.

    Per file: extract text -> validate extraction -> tag speakers ->
    semantic chunking -> per-chunk LLM analysis with structured extraction ->
    quality scoring. Then a cross-transcript summary is generated, quality-
    validated (with one retry and an emergency fallback), consensus-checked,
    and rendered to CSV/PDF reports plus a dashboard figure.

    Args:
        files: uploaded transcript files (Gradio file objects with `.name`).
        file_type: "DOCX" or "PDF" — selects the extractor.
        user_comments: free-text analysis instructions forwarded to the LLM.
        role_hint: optional speaker-role mapping hint for tagging.
        debug_mode: exported to the DEBUG_MODE env var for downstream modules.
        interviewee_type: "HCP", "Patient" or "Other" — steers focus/extraction.
        progress: Gradio progress callback.

    Returns:
        (markdown_report, csv_path, pdf_path, dashboard) on success;
        (error_string, None, None, None) on fatal failure.
    """
    os.environ["DEBUG_MODE"] = str(debug_mode)
    if not files:
        return "Error: No files uploaded", None, None, None
    all_results = []
    csv_rows = []
    # NOTE(review): this list accumulates plain strings (low-quality warnings)
    # AND dicts (per-file failures); the report loop below formats both via str().
    processing_errors = []
    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")
    # Enhanced interviewee context: per-type analysis focus plus the structured
    # fields the LLM is asked to extract.
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})
    # Build enhanced user context passed alongside every LLM call.
    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}
Additional Instructions:
{user_comments}
""".strip()
    total_steps = len(files) * 4 + 2  # extraction, validation, tagging, chunking per file + summary + report
    current_step = 0
    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        try:
            # Step 1: Extract text
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")
            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1
            # Step 2: Validate extraction
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")
            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1
            # Step 3: Tag speakers with advanced logic
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1
            # Step 4: Semantic chunking
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1
            # Step 5: LLM Analysis with structured extraction
            transcript_result = []
            structured_data = {}
            for j, chunk in enumerate(chunks):
                # Fractional progress within this file's single analysis step.
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")
                result, chunk_data = query_llm(
                    chunk,
                    user_context,
                    interviewee_type,
                    extract_structured=True
                )
                transcript_result.append(result)
                # Merge structured data: list values are concatenated,
                # scalar values appended, keyed per extraction field.
                for key, value in chunk_data.items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)
            current_step += 1
            # Combine and validate results
            full_text = "\n\n".join(transcript_result)
            # Quality check
            quality_score, quality_issues = validate_transcript_quality(
                full_text,
                structured_data,
                interviewee_type
            )
            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })
            # Enhanced CSV row with structured data
            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }
            # Add interviewee-specific fields
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": "; ".join(structured_data.get("diagnoses", [])),
                    "Prescriptions": "; ".join(structured_data.get("prescriptions", [])),
                    "Treatment Strategies": "; ".join(structured_data.get("treatment_rationale", [])),
                    "Guidelines Mentioned": "; ".join(structured_data.get("guidelines_mentioned", []))
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": "; ".join(structured_data.get("symptoms", [])),
                    "Main Concerns": "; ".join(structured_data.get("concerns", [])),
                    "Treatment Response": "; ".join(structured_data.get("treatment_response", [])),
                    "Side Effects": "; ".join(structured_data.get("side_effects", []))
                })
            else:
                csv_row.update({
                    "Key Insights": "; ".join(structured_data.get("key_insights", [])),
                    "Recommendations": "; ".join(structured_data.get("recommendations", []))
                })
            csv_rows.append(csv_row)
            print(f"[File {i+1}] βœ“ Processing complete")
        except Exception as e:
            # Enhanced error tracking with type and traceback
            import traceback
            error_type = type(e).__name__
            error_details = str(e)
            error_traceback = traceback.format_exc()  # NOTE(review): captured but never used or logged
            error_msg = f"[{error_type}] {file_name}: {error_details}"
            print(error_msg)
            # Store comprehensive error information
            processing_errors.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "error_type": error_type,
                "error_message": error_details[:200],  # Truncate long messages
                "timestamp": datetime.now().isoformat()
            })
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0,
                "processing_status": "FAILED",
                "error_type": error_type
            })
            # Add to CSV with error metadata
            csv_rows.append({
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": 0.0,
                "Word Count": 0,
                "Processing Status": "FAILED",
                "Error Type": error_type,
                "Error Message": error_details[:100]
            })
    # Generate cross-transcript summary
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")
        # Combine successful results
        valid_results = [r for r in all_results if r["quality_score"] > 0]
        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None
        # Build comprehensive summary prompt
        summary_prompt = f"""
CROSS-INTERVIEW SYNTHESIS TASK
SAMPLE: {len(valid_results)} {interviewee_type} transcripts
FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}
COMPLETE TRANSCRIPT DATA:
"""
        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            # Each transcript's contribution is capped at 2000 characters.
            summary_prompt += f"{result['full_text'][:2000]}\n"
        summary_prompt += f"""
ANALYSIS REQUIREMENTS:
1. QUANTIFY EVERYTHING:
- Count participants: "X out of {len(valid_results)} participants mentioned..."
- Never use vague terms (many/most/some)
- Calculate percentages where relevant
2. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
- MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
- SPLIT PERSPECTIVES (40-59% = mixed views)
- MINORITY/OUTLIER (<40% but notable)
3. CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives diverge and why
- Flag any quality issues in individual transcripts
4. CITE EVIDENCE:
- Reference specific transcript numbers
- Brief supporting details
- Distinguish verified facts from interpretation
OUTPUT FORMAT:
Write 2-3 sentence executive overview, then structure as:
**STRONG CONSENSUS FINDINGS:**
- [Finding with count and evidence]
**MAJORITY FINDINGS:**
- [Finding with count]
**DIVERGENT PERSPECTIVES:**
- [Where views split and context]
**NOTABLE OUTLIERS:**
- [Unique but important points]
**DATA QUALITY NOTES:**
- [Any gaps or transcript issues]
Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
"""
        # Use robust LLM with aggressive timeout protection
        print("[Summary] Generating cross-transcript summary...")
        print("[Summary] Note: This may take 30-60 seconds for large datasets")
        try:
            from llm_robust import query_llm_with_timeout
            summary, summary_data = query_llm_with_timeout(
                summary_prompt,
                user_context,
                interviewee_type,
                extract_structured=False,
                is_summary=True,
                max_timeout=60  # 60 second hard timeout
            )
        except Exception as e:
            # Ultimate fallback
            print(f"[Summary] Critical error: {e}")
            print("[Summary] Using emergency fallback...")
            from llm_robust import generate_emergency_summary
            summary, summary_data = generate_emergency_summary(interviewee_type)
        # Validate summary quality and retry if needed
        from validation import validate_summary_quality
        summary_score, summary_issues = validate_summary_quality(
            summary,
            len(valid_results)
        )
        if summary_score < 0.7:  # Quality threshold
            print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}")
            print("[Summary] Retrying with stricter validation...")
            # Retry with enhanced prompt emphasizing validation failures
            retry_prompt = summary_prompt + f"""
CRITICAL: Previous attempt failed validation with these issues:
{chr(10).join('- ' + issue for issue in summary_issues)}
MANDATORY CORRECTIONS:
- Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most")
- Include percentages in parentheses
- Cite transcript numbers for every claim
- Minimum length: 500 words
- No absolute terms (all/everyone/never/always) without 100% evidence
"""
            # NOTE(review): if the initial `from llm_robust import ...` failed,
            # query_llm_with_timeout is unbound here; the resulting NameError
            # is caught below and routed to the emergency fallback.
            try:
                summary, summary_data = query_llm_with_timeout(
                    retry_prompt,
                    user_context,
                    interviewee_type,
                    extract_structured=False,
                    is_summary=True,
                    max_timeout=60  # 60 second hard timeout for retry
                )
            except Exception as e:
                print(f"[Summary] Retry also failed: {e}")
                print("[Summary] Using emergency fallback for retry...")
                summary, summary_data = generate_emergency_summary(interviewee_type)
            # Re-validate
            summary_score, summary_issues = validate_summary_quality(summary, len(valid_results))
            if summary_score < 0.7:
                # Add quality warning to summary header
                warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}]
Validation issues detected: {'; '.join(summary_issues)}
Please review findings carefully and verify against source data.
{'='*60}
"""
                summary = warning_header + summary
                print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})")
            else:
                print(f"[Summary] βœ“ Validation passed after retry (score: {summary_score:.2f})")
        else:
            print(f"[Summary] βœ“ Validation passed (score: {summary_score:.2f})")
        # Verify consensus claims against actual data
        from validation import verify_consensus_claims
        consensus_warnings = verify_consensus_claims(summary, valid_results)
        if consensus_warnings:
            print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found")
            consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n"
            summary = summary + consensus_note
        else:
            print("[Summary] βœ“ Consensus claims verified")
        # Generate enhanced reports
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] βœ“ Saved to {csv_path}")
        pdf_path = generate_enhanced_pdf(
            summary,
            all_results,
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] βœ“ Saved to {pdf_path}")
        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] βœ“ Generated")
        # Compile final output
        output_text = f"""# Analysis Complete
## Summary of Findings
{summary}
## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {len(processing_errors)}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}
"""
        if processing_errors:
            output_text += f"\n## Processing Errors\n" + "\n".join(f"- {err}" for err in processing_errors)
        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"
        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"
        progress(1.0, desc="Complete!")
        return output_text, csv_path, pdf_path, dashboard
    except Exception as e:
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, None, None, None
def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """
    Wrapper function for Gradio UI to generate narrative reports.

    Args:
        csv_file: the CSV produced by the analysis tab (Gradio file or path).
        summary_text: optional pasted executive-summary text.
        interviewee_type: "HCP", "Patient" or "Other".
        report_style: "executive", "detailed" or "presentation".

    Returns:
        (status_message, pdf_path, word_path, html_path); the three paths
        are None on failure.
    """
    # Validate input up front, before importing/doing any heavy work.
    if csv_file is None:
        return "Error: No CSV file provided. Please run analysis first.", None, None, None
    summary_path = None
    try:
        from narrative_report_generator import generate_narrative_report
        import tempfile
        # Persist pasted summary text so the generator can read it from disk.
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                f.write(summary_text)
            summary_path = f.name
        # Determine LLM backend (env toggle mirrors the Help tab instructions).
        llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"
        # Generate narrative report
        pdf_path, word_path, html_path = generate_narrative_report(
            csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
            summary_path=summary_path,
            interviewee_type=interviewee_type,
            report_style=report_style,
            llm_backend=llm_backend
        )
        return (
            f"βœ“ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )
    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None
    finally:
        # BUG FIX: cleanup previously ran only on the success path, leaking the
        # delete=False temp file whenever report generation raised.
        if summary_path and os.path.exists(summary_path):
            os.remove(summary_path)
# Gradio UI: four tabs (audio preprocessing, transcript analysis, narrative
# report, help) wired to the handler functions defined above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # --- App header -----------------------------------------------------
    gr.Markdown("""
# 🎯 TranscriptorAI - Enterprise Transcript Analyzer
Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
""")
    with gr.Tabs():
        # Tab 1: audio -> speaker-tagged transcripts (feeds preprocess_audio)
        with gr.TabItem("🎀 Audio Preprocessing"):
            gr.Markdown("""
Upload audio interviews to auto-transcribe with speaker identification.
Outputs DOCX files ready for analysis.
""")
            with gr.Row():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_types=[".mp3", ".wav", ".m4a", ".flac"],
                    file_count="multiple"
                )
                num_speakers_input = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Number of Speakers"
                )
            transcribe_btn = gr.Button("πŸŽ™οΈ Transcribe Audio", variant="primary")
            transcribe_status = gr.Textbox(label="Status", lines=10)
            transcript_files = gr.File(label="Download Transcripts", file_count="multiple")
            transcribe_btn.click(
                fn=preprocess_audio,
                inputs=[audio_input, num_speakers_input],
                outputs=[transcript_files, transcribe_status]
            )
            gr.Markdown("""
**Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
""")
        # Tab 2: main pipeline (drives analyze())
        with gr.TabItem("πŸ“Š Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="πŸ“ Upload Transcripts",
                        file_types=[".docx", ".pdf"],
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"],
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )
                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions",
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)",
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )
            with gr.Row():
                debug_mode = gr.Checkbox(label="πŸ” Enable Debug Mode", value=False)
                analyze_btn = gr.Button("πŸš€ Analyze Transcripts", variant="primary", scale=2)
            with gr.Row():
                output_text = gr.Textbox(label="πŸ“Š Analysis Report", lines=40)
            with gr.Row():
                csv_output = gr.File(label="πŸ“₯ Download CSV")
                pdf_output = gr.File(label="πŸ“₯ Download PDF")
            with gr.Row():
                dashboard_output = gr.Plot(label="πŸ“ˆ Dashboard Visualization")
            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )
        # Tab 3: turn the analysis CSV into narrative PDF/Word/HTML reports
        with gr.TabItem("πŸ“ Narrative Report"):
            gr.Markdown("""
## Generate Storytelling Report
Transform your analysis into a narrative report with:
- Executive summary with key insights
- Data-driven storytelling
- Professional formatting (PDF, Word, HTML)
- Actionable recommendations
**Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
""")
            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )
                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
            generate_narrative_btn = gr.Button("πŸ“– Generate Narrative Report", variant="primary")
            narrative_status = gr.Textbox(label="Status", lines=5)
            with gr.Row():
                narrative_pdf_output = gr.File(label="πŸ“₯ Download PDF Report")
                narrative_word_output = gr.File(label="πŸ“₯ Download Word Report")
                narrative_html_output = gr.File(label="πŸ“₯ Download HTML Report")
            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )
        # Tab 4: static usage documentation
        with gr.TabItem("❓ Help"):
            gr.Markdown("""
### Quick Start Guide
**Step 1: Analyze Transcripts**
1. Upload your DOCX or PDF files
2. Select interviewee type (HCP, Patient, or Other)
3. Add analysis instructions
4. Click "Analyze Transcripts"
5. Download CSV, PDF, and view dashboard
**Step 2: Generate Narrative Report (Optional)**
1. Go to "Narrative Report" tab
2. Upload the CSV from Step 1
3. Optionally paste the summary text
4. Select report style
5. Click "Generate Narrative Report"
6. Download PDF, Word, or HTML versions
### Tips
- **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator
- **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator
- **Report Styles**:
- **Executive**: Best for C-level, investors, decision-makers
- **Detailed**: Best for researchers, comprehensive analysis
- **Presentation**: Best for slides, briefings, quick overviews
### LLM Configuration
- Set `USE_LMSTUDIO=True` to use your local LM Studio
- Set `HUGGINGFACE_TOKEN` to use HF API for faster processing
- Default: Uses local model (slower but free)
### Support
For issues, check the console output or enable debug mode.
""")
    # --- App footer -----------------------------------------------------
    gr.Markdown("""
---
**TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
""")
if __name__ == "__main__":
demo.queue(
concurrency_count=1,
max_size=10,
api_open=False
).launch(
server_name="0.0.0.0",
server_port=7860,
show_error=True
)