import gradio as gr import os from datetime import datetime from typing import List, Dict, Tuple from extractors import extract_docx, extract_pdf, validate_extraction from tagging import tag_speakers_advanced from chunking import chunk_text_semantic from llm import query_llm, extract_structured_data from reporting import generate_enhanced_csv, generate_enhanced_pdf from dashboard import generate_comprehensive_dashboard from validation import validate_transcript_quality, check_data_completeness from audio_transcriber import transcribe_with_diarization_streaming # HuggingFace Spaces Configuration import os os.environ["LLM_BACKEND"] = "hf_api" os.environ["LLM_TIMEOUT"] = "25" os.environ["MAX_TOKENS_PER_REQUEST"] = "100" print("🚀 Running on HuggingFace Spaces - Optimized Configuration Loaded") def preprocess_audio(audio_files, num_speakers): """Convert audio to transcripts""" if not audio_files: return None, "No audio files provided" transcript_paths = [] status = "" for audio in audio_files: try: # Get the actual file path audio_path = audio.name if hasattr(audio, 'name') else str(audio) transcript_path = transcribe_with_diarization(audio_path, num_speakers) transcript_paths.append(transcript_path) status += f"✓ {os.path.basename(audio_path)} → {transcript_path}\n" except Exception as e: status += f"✗ {os.path.basename(audio_path)}: {str(e)}\n" # Return list of paths for file component return transcript_paths if transcript_paths else None, status def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()): """ Enhanced analysis pipeline with robust error handling and validation """ os.environ["DEBUG_MODE"] = str(debug_mode) if not files: return "Error: No files uploaded", None, None, None all_results = [] csv_rows = [] processing_errors = [] progress(0, desc="Initializing...") print(f"[Start] Processing {len(files)} file(s) as {file_type}") # Enhanced interviewee context interviewee_context = { "HCP": { "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns", "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"] }, "Patient": { "focus": "symptoms, concerns, emotional state, treatment understanding, adherence", "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"] }, "Other": { "focus": "context-dependent insights, relevant observations", "extract": ["key_insights", "context", "recommendations"] } }.get(interviewee_type, {}) # Build enhanced user context user_context = f""" Interviewee Type: {interviewee_type} Analysis Focus: {interviewee_context.get('focus', 'general insights')} Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))} Additional Instructions: {user_comments} """.strip() total_steps = len(files) * 4 + 2 # extraction, validation, tagging, chunking per file + summary + report current_step = 0 for i, file in enumerate(files): file_name = os.path.basename(file.name) try: # Step 1: Extract text progress((current_step / total_steps), desc=f"Extracting {file_name}...") print(f"[File {i+1}/{len(files)}] Extracting: {file_name}") raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file) current_step += 1 # Step 2: Validate extraction progress((current_step / total_steps), desc=f"Validating {file_name}...") is_valid, validation_msg = validate_extraction(raw_text, file_name) if not is_valid: raise ValueError(f"Extraction validation failed: {validation_msg}") print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}") current_step += 1 # Step 3: Tag speakers with advanced logic progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...") tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type) print(f"[File {i+1}] Tagged {len(tagged_text)} characters") current_step += 1 # Step 4: Semantic chunking progress((current_step / total_steps), desc=f"Processing {file_name}...") chunks = chunk_text_semantic(tagged_text, interviewee_type) print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)") current_step += 1 # Step 5: LLM Analysis with structured extraction transcript_result = [] structured_data = {} for j, chunk in enumerate(chunks): chunk_progress = (current_step + (j / len(chunks))) / total_steps progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...") result, chunk_data = query_llm( chunk, user_context, interviewee_type, extract_structured=True ) transcript_result.append(result) # Merge structured data for key, value in chunk_data.items(): if key not in structured_data: structured_data[key] = [] if isinstance(value, list): structured_data[key].extend(value) else: structured_data[key].append(value) current_step += 1 # Combine and validate results full_text = "\n\n".join(transcript_result) # Quality check quality_score, quality_issues = validate_transcript_quality( full_text, structured_data, interviewee_type ) if quality_score < 0.3: print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}") processing_errors.append(f"{file_name}: Low quality - {quality_issues}") all_results.append({ "transcript_id": f"Transcript {i+1}", "file_name": file_name, "full_text": full_text, "structured_data": structured_data, "quality_score": quality_score, "word_count": len(raw_text.split()) }) # Enhanced CSV row with structured data csv_row = { "Transcript ID": f"Transcript {i+1}", "File Name": file_name, "Quality Score": f"{quality_score:.2f}", "Word Count": len(raw_text.split()), } # Add interviewee-specific fields if interviewee_type == "HCP": csv_row.update({ "Diagnoses": "; ".join(structured_data.get("diagnoses", [])), "Prescriptions": "; ".join(structured_data.get("prescriptions", [])), "Treatment Strategies": "; ".join(structured_data.get("treatment_rationale", [])), "Guidelines Mentioned": "; ".join(structured_data.get("guidelines_mentioned", [])) }) elif interviewee_type == "Patient": csv_row.update({ "Primary Symptoms": "; ".join(structured_data.get("symptoms", [])), "Main Concerns": "; ".join(structured_data.get("concerns", [])), "Treatment Response": "; ".join(structured_data.get("treatment_response", [])), "Side Effects": "; ".join(structured_data.get("side_effects", [])) }) else: csv_row.update({ "Key Insights": "; ".join(structured_data.get("key_insights", [])), "Recommendations": "; ".join(structured_data.get("recommendations", [])) }) csv_rows.append(csv_row) print(f"[File {i+1}] ✓ Processing complete") except Exception as e: # Enhanced error tracking with type and traceback import traceback error_type = type(e).__name__ error_details = str(e) error_traceback = traceback.format_exc() error_msg = f"[{error_type}] {file_name}: {error_details}" print(error_msg) # Store comprehensive error information processing_errors.append({ "transcript_id": f"Transcript {i+1}", "file_name": file_name, "error_type": error_type, "error_message": error_details[:200], # Truncate long messages "timestamp": datetime.now().isoformat() }) all_results.append({ "transcript_id": f"Transcript {i+1}", "file_name": file_name, "full_text": error_msg, "structured_data": {}, "quality_score": 0.0, "word_count": 0, "processing_status": "FAILED", "error_type": error_type }) # Add to CSV with error metadata csv_rows.append({ "Transcript ID": f"Transcript {i+1}", "File Name": file_name, "Quality Score": 0.0, "Word Count": 0, "Processing Status": "FAILED", "Error Type": error_type, "Error Message": error_details[:100] }) # Generate cross-transcript summary try: progress(0.9, desc="Generating summary and reports...") print("[Summary] Analyzing trends across transcripts") # Combine successful results valid_results = [r for r in all_results if r["quality_score"] > 0] if not valid_results: return "Error: No transcripts were successfully processed", None, None, None # Build comprehensive summary prompt summary_prompt = f""" CROSS-INTERVIEW SYNTHESIS TASK SAMPLE: {len(valid_results)} {interviewee_type} transcripts FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')} COMPLETE TRANSCRIPT DATA: """ for idx, result in enumerate(valid_results, 1): summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n" summary_prompt += f"{result['full_text'][:2000]}\n" summary_prompt += f""" ANALYSIS REQUIREMENTS: 1. QUANTIFY EVERYTHING: - Count participants: "X out of {len(valid_results)} participants mentioned..." - Never use vague terms (many/most/some) - Calculate percentages where relevant 2. IDENTIFY PATTERNS BY CONSENSUS LEVEL: - STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree) - MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts) - SPLIT PERSPECTIVES (40-59% = mixed views) - MINORITY/OUTLIER (<40% but notable) 3. CROSS-VALIDATE: - Check for contradictions between transcripts - Note where perspectives diverge and why - Flag any quality issues in individual transcripts 4. CITE EVIDENCE: - Reference specific transcript numbers - Brief supporting details - Distinguish verified facts from interpretation OUTPUT FORMAT: Write 2-3 sentence executive overview, then structure as: **STRONG CONSENSUS FINDINGS:** - [Finding with count and evidence] **MAJORITY FINDINGS:** - [Finding with count] **DIVERGENT PERSPECTIVES:** - [Where views split and context] **NOTABLE OUTLIERS:** - [Unique but important points] **DATA QUALITY NOTES:** - [Any gaps or transcript issues] Be specific. Use numbers. Cite transcript IDs. Flag weak evidence. """ # Use robust LLM with aggressive timeout protection print("[Summary] Generating cross-transcript summary...") print("[Summary] Note: This may take 30-60 seconds for large datasets") try: from llm_robust import query_llm_with_timeout summary, summary_data = query_llm_with_timeout( summary_prompt, user_context, interviewee_type, extract_structured=False, is_summary=True, max_timeout=60 # 60 second hard timeout ) except Exception as e: # Ultimate fallback print(f"[Summary] Critical error: {e}") print("[Summary] Using emergency fallback...") from llm_robust import generate_emergency_summary summary, summary_data = generate_emergency_summary(interviewee_type) # Validate summary quality and retry if needed from validation import validate_summary_quality summary_score, summary_issues = validate_summary_quality( summary, len(valid_results) ) if summary_score < 0.7: # Quality threshold print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}") print("[Summary] Retrying with stricter validation...") # Retry with enhanced prompt emphasizing validation failures retry_prompt = summary_prompt + f""" CRITICAL: Previous attempt failed validation with these issues: {chr(10).join('- ' + issue for issue in summary_issues)} MANDATORY CORRECTIONS: - Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most") - Include percentages in parentheses - Cite transcript numbers for every claim - Minimum length: 500 words - No absolute terms (all/everyone/never/always) without 100% evidence """ try: summary, summary_data = query_llm_with_timeout( retry_prompt, user_context, interviewee_type, extract_structured=False, is_summary=True, max_timeout=60 # 60 second hard timeout for retry ) except Exception as e: print(f"[Summary] Retry also failed: {e}") print("[Summary] Using emergency fallback for retry...") summary, summary_data = generate_emergency_summary(interviewee_type) # Re-validate summary_score, summary_issues = validate_summary_quality(summary, len(valid_results)) if summary_score < 0.7: # Add quality warning to summary header warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}] Validation issues detected: {'; '.join(summary_issues)} Please review findings carefully and verify against source data. {'='*60} """ summary = warning_header + summary print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})") else: print(f"[Summary] ✓ Validation passed after retry (score: {summary_score:.2f})") else: print(f"[Summary] ✓ Validation passed (score: {summary_score:.2f})") # Verify consensus claims against actual data from validation import verify_consensus_claims consensus_warnings = verify_consensus_claims(summary, valid_results) if consensus_warnings: print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found") consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n" summary = summary + consensus_note else: print("[Summary] ✓ Consensus claims verified") # Generate enhanced reports csv_path = generate_enhanced_csv(csv_rows, interviewee_type) print(f"[CSV] ✓ Saved to {csv_path}") pdf_path = generate_enhanced_pdf( summary, all_results, interviewee_type, processing_errors ) print(f"[PDF] ✓ Saved to {pdf_path}") dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type) print("[Dashboard] ✓ Generated") # Compile final output output_text = f"""# Analysis Complete ## Summary of Findings {summary} ## Processing Statistics - Total Files: {len(files)} - Successfully Processed: {len(valid_results)} - Failed: {len(processing_errors)} - Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f} """ if processing_errors: output_text += f"\n## Processing Errors\n" + "\n".join(f"- {err}" for err in processing_errors) output_text += "\n\n---\n\n## Individual Transcript Results\n\n" for result in all_results: output_text += f"### {result['transcript_id']} - {result['file_name']}\n" output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n" output_text += result['full_text'] + "\n\n---\n\n" progress(1.0, desc="Complete!") return output_text, csv_path, pdf_path, dashboard except Exception as e: error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}" print(error_msg) import traceback traceback.print_exc() return error_msg, None, None, None def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style): """ Wrapper function for Gradio UI to generate narrative reports """ try: from narrative_report_generator import generate_narrative_report import tempfile import os # Check if CSV file exists if csv_file is None: return "Error: No CSV file provided. Please run analysis first.", None, None, None # Save summary text to temp file if provided summary_path = None if summary_text and summary_text.strip(): with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: f.write(summary_text) summary_path = f.name # Determine LLM backend llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api" # Generate narrative report pdf_path, word_path, html_path = generate_narrative_report( csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file, summary_path=summary_path, interviewee_type=interviewee_type, report_style=report_style, llm_backend=llm_backend ) # Clean up temp file if summary_path and os.path.exists(summary_path): os.remove(summary_path) return ( f"✓ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}", pdf_path, word_path, html_path ) except Exception as e: import traceback error_detail = traceback.format_exc() return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🎯 TranscriptorAI - Enterprise Transcript Analyzer Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis. """) with gr.Tabs(): with gr.TabItem("🎤 Audio Preprocessing"): gr.Markdown(""" Upload audio interviews to auto-transcribe with speaker identification. Outputs DOCX files ready for analysis. """) with gr.Row(): audio_input = gr.File( label="Upload Audio Files", file_types=[".mp3", ".wav", ".m4a", ".flac"], file_count="multiple" ) num_speakers_input = gr.Slider( minimum=1, maximum=5, value=2, step=1, label="Number of Speakers" ) transcribe_btn = gr.Button("🎙️ Transcribe Audio", variant="primary") transcribe_status = gr.Textbox(label="Status", lines=10) transcript_files = gr.File(label="Download Transcripts", file_count="multiple") transcribe_btn.click( fn=preprocess_audio, inputs=[audio_input, num_speakers_input], outputs=[transcript_files, transcribe_status] ) gr.Markdown(""" **Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them. """) with gr.TabItem("📊 Transcript Analysis"): with gr.Row(): with gr.Column(scale=1): files = gr.File( label="📁 Upload Transcripts", file_types=[".docx", ".pdf"], file_count="multiple" ) file_type = gr.Radio( ["DOCX", "PDF"], label="File Type", value="DOCX" ) interviewee_type = gr.Radio( ["HCP", "Patient", "Other"], label="Interviewee Type", value="Patient", info="Select the type of person being interviewed" ) with gr.Column(scale=1): user_comments = gr.Textbox( label="Analysis Instructions", lines=6, placeholder="Enter specific analysis goals, questions to answer, or context...", info="Provide guidance for the AI analyzer" ) role_hint = gr.Textbox( label="Speaker Role Mapping (Optional)", placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor", info="Help identify speakers if needed" ) with gr.Row(): debug_mode = gr.Checkbox(label="🔍 Enable Debug Mode", value=False) analyze_btn = gr.Button("🚀 Analyze Transcripts", variant="primary", scale=2) with gr.Row(): output_text = gr.Textbox(label="📊 Analysis Report", lines=40) with gr.Row(): csv_output = gr.File(label="📥 Download CSV") pdf_output = gr.File(label="📥 Download PDF") with gr.Row(): dashboard_output = gr.Plot(label="📈 Dashboard Visualization") analyze_btn.click( fn=analyze, inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type], outputs=[output_text, csv_output, pdf_output, dashboard_output] ) with gr.TabItem("📝 Narrative Report"): gr.Markdown(""" ## Generate Storytelling Report Transform your analysis into a narrative report with: - Executive summary with key insights - Data-driven storytelling - Professional formatting (PDF, Word, HTML) - Actionable recommendations **Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report. """) with gr.Row(): with gr.Column(): narrative_csv = gr.File( label="CSV Output from Analysis", file_types=[".csv"] ) narrative_summary = gr.Textbox( label="Copy/Paste Summary Text from Analysis (Optional)", lines=10, placeholder="Paste the executive summary text here..." ) with gr.Column(): narrative_interviewee_type = gr.Radio( ["HCP", "Patient", "Other"], label="Interviewee Type", value="Patient" ) narrative_report_style = gr.Radio( ["executive", "detailed", "presentation"], label="Report Style", value="executive", info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready" ) generate_narrative_btn = gr.Button("📖 Generate Narrative Report", variant="primary") narrative_status = gr.Textbox(label="Status", lines=5) with gr.Row(): narrative_pdf_output = gr.File(label="📥 Download PDF Report") narrative_word_output = gr.File(label="📥 Download Word Report") narrative_html_output = gr.File(label="📥 Download HTML Report") generate_narrative_btn.click( fn=generate_narrative_report_ui, inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style], outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output] ) with gr.TabItem("❓ Help"): gr.Markdown(""" ### Quick Start Guide **Step 1: Analyze Transcripts** 1. Upload your DOCX or PDF files 2. Select interviewee type (HCP, Patient, or Other) 3. Add analysis instructions 4. Click "Analyze Transcripts" 5. Download CSV, PDF, and view dashboard **Step 2: Generate Narrative Report (Optional)** 1. Go to "Narrative Report" tab 2. Upload the CSV from Step 1 3. Optionally paste the summary text 4. Select report style 5. Click "Generate Narrative Report" 6. Download PDF, Word, or HTML versions ### Tips - **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator - **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator - **Report Styles**: - **Executive**: Best for C-level, investors, decision-makers - **Detailed**: Best for researchers, comprehensive analysis - **Presentation**: Best for slides, briefings, quick overviews ### LLM Configuration - Set `USE_LMSTUDIO=True` to use your local LM Studio - Set `HUGGINGFACE_TOKEN` to use HF API for faster processing - Default: Uses local model (slower but free) ### Support For issues, check the console output or enable debug mode. """) gr.Markdown(""" --- **TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting """) if __name__ == "__main__": demo.queue( concurrency_count=1, max_size=10, api_open=False ).launch( server_name="0.0.0.0", server_port=7860, show_error=True )