Spaces:

empirenexus
/

TranscriptWriting

Sleeping

File size: 28,732 Bytes

import gradio as gr
import os
from datetime import datetime
from typing import List, Dict, Tuple
from extractors import extract_docx, extract_pdf, validate_extraction
from tagging import tag_speakers_advanced
from chunking import chunk_text_semantic
from llm import query_llm, extract_structured_data
from reporting import generate_enhanced_csv, generate_enhanced_pdf
from dashboard import generate_comprehensive_dashboard
from validation import validate_transcript_quality, check_data_completeness
from audio_transcriber import transcribe_with_diarization_streaming

# Runtime settings for the HuggingFace Spaces deployment, exported as
# environment variables (any value already present is overwritten).
# NOTE(review): these are assigned after the project imports above have run;
# if an imported module reads them at import time it will not see these
# values - confirm against the llm module.
import os
os.environ.update({
    "LLM_BACKEND": "hf_api",
    "LLM_TIMEOUT": "25",
    "MAX_TOKENS_PER_REQUEST": "100",
})
print("🚀 Running on HuggingFace Spaces - Optimized Configuration Loaded")

def preprocess_audio(audio_files, num_speakers):
    """Transcribe uploaded audio files and report a per-file status.

    Args:
        audio_files: Items from the Gradio file component; each is either an
            object exposing a ``name`` path attribute or a plain path string.
            May be ``None`` or empty.
        num_speakers: Expected number of speakers, forwarded to the
            transcriber.

    Returns:
        A ``(paths, status)`` tuple. ``paths`` is the list of transcript
        paths, or ``None`` when no file was transcribed. ``status`` contains
        one "✓"/"✗" line per input file.
    """
    if not audio_files:
        return None, "No audio files provided"

    transcript_paths = []
    status_lines = []

    for audio in audio_files:
        # Resolve the path before the try so the failure branch can always
        # name the file it is reporting on.
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        display_name = os.path.basename(audio_path)

        try:
            # Use the transcriber this module actually imports; the bare
            # `transcribe_with_diarization` name is not defined in this file.
            # NOTE(review): assumes the streaming variant accepts
            # (path, num_speakers) and returns the transcript path - confirm
            # against audio_transcriber.
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
        except Exception as e:
            # Best effort: one bad file must not abort the rest of the batch.
            status_lines.append(f"✗ {display_name}: {str(e)}\n")
        else:
            transcript_paths.append(transcript_path)
            status_lines.append(f"✓ {display_name} → {transcript_path}\n")

    status = "".join(status_lines)

    # Return list of paths for file component (None clears it when empty)
    return transcript_paths if transcript_paths else None, status


def _join_values(structured_data, key):
    """Join the extracted values stored under ``key`` into one "; "-separated string."""
    # str() guards against non-string items (numbers, dicts, None) coming
    # back from the structured extraction, which would break str.join.
    return "; ".join(str(item) for item in structured_data.get(key, []))


def _format_processing_error(err):
    """Render one processing-error entry (failure dict or warning string) as a single line."""
    if isinstance(err, dict):
        return (
            f"{err.get('file_name', 'unknown file')}: "
            f"[{err.get('error_type', 'Error')}] {err.get('error_message', '')}"
        )
    return str(err)


def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
    """
    Run the full transcript analysis pipeline for the Gradio UI.

    Per file: extract text, validate it, tag speakers, chunk, and query the
    LLM per chunk while merging the structured data. Afterwards a
    cross-transcript summary is generated (with one retry and an emergency
    fallback), validated, and CSV/PDF/dashboard outputs are produced.

    Args:
        files: Uploaded transcripts; each item is an object exposing a
            ``name`` path attribute or a plain path string.
        file_type: "DOCX" selects the DOCX extractor; anything else uses the
            PDF extractor.
        user_comments: Free-text analysis instructions added to the LLM context.
        role_hint: Optional speaker-role mapping passed to the speaker tagger.
        debug_mode: Exported as the DEBUG_MODE environment variable; when
            truthy, tracebacks of per-file failures are printed.
        interviewee_type: "HCP", "Patient" or "Other"; selects focus areas
            and the CSV columns.
        progress: Gradio progress tracker.

    Returns:
        ``(report_text, csv_path, pdf_path, dashboard)``. On a fatal error
        the first element is an error message and the rest are ``None``.
    """
    os.environ["DEBUG_MODE"] = str(debug_mode)
    
    if not files:
        return "Error: No files uploaded", None, None, None
    
    all_results = []
    csv_rows = []
    # Holds failure dicts (one per file that raised) and low-quality warning strings
    processing_errors = []
    
    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")
    
    # Enhanced interviewee context
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})
    
    # Build enhanced user context
    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}

Additional Instructions:
{user_comments}
""".strip()
    
    # Five per-file steps (extraction, validation, tagging, chunking, LLM
    # analysis) plus summary and report. Budgeting only four per file let
    # the progress fraction reach or pass 1.0 before the run had finished.
    total_steps = len(files) * 5 + 2
    current_step = 0
    
    for i, file in enumerate(files):
        # Accept both upload objects and plain path strings
        file_path = file.name if hasattr(file, 'name') else str(file)
        file_name = os.path.basename(file_path)
        try:
            # Step 1: Extract text
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")
            
            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1
            
            # Step 2: Validate extraction
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")
            
            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1
            
            # Step 3: Tag speakers with advanced logic
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1
            
            # Step 4: Semantic chunking
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1
            
            # Step 5: LLM Analysis with structured extraction
            transcript_result = []
            structured_data = {}
            
            for j, chunk in enumerate(chunks):
                # Fractional progress within this file's LLM step
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")
                
                result, chunk_data = query_llm(
                    chunk, 
                    user_context, 
                    interviewee_type,
                    extract_structured=True
                )
                
                transcript_result.append(result)
                
                # Merge structured data: every key accumulates a flat list
                for key, value in chunk_data.items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)
            
            current_step += 1
            
            # Combine and validate results
            full_text = "\n\n".join(transcript_result)
            
            # Quality check
            quality_score, quality_issues = validate_transcript_quality(
                full_text, 
                structured_data, 
                interviewee_type
            )
            
            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")
            
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })
            
            # Enhanced CSV row with structured data
            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }
            
            # Add interviewee-specific fields
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": _join_values(structured_data, "diagnoses"),
                    "Prescriptions": _join_values(structured_data, "prescriptions"),
                    "Treatment Strategies": _join_values(structured_data, "treatment_rationale"),
                    "Guidelines Mentioned": _join_values(structured_data, "guidelines_mentioned")
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": _join_values(structured_data, "symptoms"),
                    "Main Concerns": _join_values(structured_data, "concerns"),
                    "Treatment Response": _join_values(structured_data, "treatment_response"),
                    "Side Effects": _join_values(structured_data, "side_effects")
                })
            else:
                csv_row.update({
                    "Key Insights": _join_values(structured_data, "key_insights"),
                    "Recommendations": _join_values(structured_data, "recommendations")
                })
            
            csv_rows.append(csv_row)
            
            print(f"[File {i+1}] ✓ Processing complete")
            
        except Exception as e:
            # Per-file boundary: record the failure and continue with the
            # remaining files instead of aborting the whole batch.
            import traceback
            error_type = type(e).__name__
            error_details = str(e)

            error_msg = f"[{error_type}] {file_name}: {error_details}"
            print(error_msg)
            if debug_mode:
                # Full traceback only on request to keep normal logs short
                print(traceback.format_exc())

            # Store comprehensive error information
            processing_errors.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "error_type": error_type,
                "error_message": error_details[:200],  # Truncate long messages
                "timestamp": datetime.now().isoformat()
            })

            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0,
                "processing_status": "FAILED",
                "error_type": error_type
            })

            # Add to CSV with error metadata
            csv_rows.append({
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": 0.0,
                "Word Count": 0,
                "Processing Status": "FAILED",
                "Error Type": error_type,
                "Error Message": error_details[:100]
            })
    
    # Generate cross-transcript summary
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")
        
        # Combine successful results (failed files carry a score of 0.0)
        valid_results = [r for r in all_results if r["quality_score"] > 0]
        
        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None
        
        # Bind the fallback generator up front so BOTH fallback paths below
        # (first attempt and retry) can use it. Importing it only inside the
        # first except left it unbound when only the retry failed.
        from llm_robust import generate_emergency_summary

        # Build comprehensive summary prompt
        summary_prompt = f"""
    CROSS-INTERVIEW SYNTHESIS TASK
    
    SAMPLE: {len(valid_results)} {interviewee_type} transcripts
    FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}
    
    COMPLETE TRANSCRIPT DATA:
    """
        
        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            # Cap each transcript's contribution to keep the prompt bounded
            summary_prompt += f"{result['full_text'][:2000]}\n"
        
        summary_prompt += f"""
    
    ANALYSIS REQUIREMENTS:
    
    1. QUANTIFY EVERYTHING:
       - Count participants: "X out of {len(valid_results)} participants mentioned..."
       - Never use vague terms (many/most/some)
       - Calculate percentages where relevant
    
    2. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
       - STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
       - MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
       - SPLIT PERSPECTIVES (40-59% = mixed views)
       - MINORITY/OUTLIER (<40% but notable)
    
    3. CROSS-VALIDATE:
       - Check for contradictions between transcripts
       - Note where perspectives diverge and why
       - Flag any quality issues in individual transcripts
    
    4. CITE EVIDENCE:
       - Reference specific transcript numbers
       - Brief supporting details
       - Distinguish verified facts from interpretation
    
    OUTPUT FORMAT:
    Write 2-3 sentence executive overview, then structure as:
    
    **STRONG CONSENSUS FINDINGS:**
    - [Finding with count and evidence]
    
    **MAJORITY FINDINGS:**
    - [Finding with count]
    
    **DIVERGENT PERSPECTIVES:**
    - [Where views split and context]
    
    **NOTABLE OUTLIERS:**
    - [Unique but important points]
    
    **DATA QUALITY NOTES:**
    - [Any gaps or transcript issues]
    
    Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
    """
        
        # Use robust LLM with aggressive timeout protection
        print("[Summary] Generating cross-transcript summary...")
        print("[Summary] Note: This may take 30-60 seconds for large datasets")

        try:
            from llm_robust import query_llm_with_timeout

            summary, summary_data = query_llm_with_timeout(
                summary_prompt,
                user_context,
                interviewee_type,
                extract_structured=False,
                is_summary=True,
                max_timeout=60  # 60 second hard timeout
            )
        except Exception as e:
            # Ultimate fallback
            print(f"[Summary] Critical error: {e}")
            print("[Summary] Using emergency fallback...")
            summary, summary_data = generate_emergency_summary(interviewee_type)

        # Validate summary quality and retry if needed
        from validation import validate_summary_quality
        summary_score, summary_issues = validate_summary_quality(
            summary,
            len(valid_results)
        )

        if summary_score < 0.7:  # Quality threshold
            print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}")
            print("[Summary] Retrying with stricter validation...")

            # Retry with enhanced prompt emphasizing validation failures
            retry_prompt = summary_prompt + f"""

CRITICAL: Previous attempt failed validation with these issues:
{chr(10).join('- ' + issue for issue in summary_issues)}

MANDATORY CORRECTIONS:
- Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most")
- Include percentages in parentheses
- Cite transcript numbers for every claim
- Minimum length: 500 words
- No absolute terms (all/everyone/never/always) without 100% evidence
"""

            try:
                summary, summary_data = query_llm_with_timeout(
                    retry_prompt,
                    user_context,
                    interviewee_type,
                    extract_structured=False,
                    is_summary=True,
                    max_timeout=60  # 60 second hard timeout for retry
                )
            except Exception as e:
                print(f"[Summary] Retry also failed: {e}")
                print("[Summary] Using emergency fallback for retry...")
                summary, summary_data = generate_emergency_summary(interviewee_type)

            # Re-validate
            summary_score, summary_issues = validate_summary_quality(summary, len(valid_results))

            if summary_score < 0.7:
                # Add quality warning to summary header
                warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}]
Validation issues detected: {'; '.join(summary_issues)}
Please review findings carefully and verify against source data.

{'='*60}

"""
                summary = warning_header + summary
                print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})")
            else:
                print(f"[Summary] ✓ Validation passed after retry (score: {summary_score:.2f})")
        else:
            print(f"[Summary] ✓ Validation passed (score: {summary_score:.2f})")

        # Verify consensus claims against actual data
        from validation import verify_consensus_claims
        consensus_warnings = verify_consensus_claims(summary, valid_results)
        if consensus_warnings:
            print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found")
            consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n"
            summary = summary + consensus_note
        else:
            print("[Summary] ✓ Consensus claims verified")

        # Generate enhanced reports
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] ✓ Saved to {csv_path}")
        
        pdf_path = generate_enhanced_pdf(
            summary, 
            all_results, 
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] ✓ Saved to {pdf_path}")
        
        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] ✓ Generated")
        
        # Count real failures only; processing_errors also contains
        # low-quality warnings for files that were processed successfully.
        failed_count = sum(1 for r in all_results if r.get("processing_status") == "FAILED")

        # Compile final output
        output_text = f"""# Analysis Complete
        
## Summary of Findings
{summary}

## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {failed_count}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}

"""
        
        if processing_errors:
            output_text += f"\n## Processing Errors\n" + "\n".join(
                f"- {_format_processing_error(err)}" for err in processing_errors
            )
        
        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"
        
        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"
        
        progress(1.0, desc="Complete!")
        return output_text, csv_path, pdf_path, dashboard
        
    except Exception as e:
        # Top-level boundary for the summary/report stage: surface the error
        # in the UI instead of crashing the Gradio callback.
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, None, None, None

def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """
    Wrapper function for Gradio UI to generate narrative reports.

    Args:
        csv_file: CSV produced by the analysis tab; an object exposing a
            ``name`` path attribute or a plain path string. ``None`` yields
            an error message.
        summary_text: Optional summary text; when non-blank it is written to
            a temporary file and handed to the generator.
        interviewee_type: Interviewee category forwarded to the generator.
        report_style: Report style forwarded to the generator.

    Returns:
        ``(status_message, pdf_path, word_path, html_path)``. On any error
        the three paths are ``None`` and the message describes the failure.
    """
    # Validate input first so a missing CSV is reported directly, without
    # depending on the report generator being importable.
    if csv_file is None:
        return "Error: No CSV file provided. Please run analysis first.", None, None, None

    import os
    import tempfile
    import traceback

    summary_path = None
    try:
        from narrative_report_generator import generate_narrative_report

        # Save summary text to temp file if provided
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                # Record the path before writing so cleanup still happens
                # if the write itself fails.
                summary_path = f.name
                f.write(summary_text)

        # Determine LLM backend
        llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"

        # Generate narrative report
        pdf_path, word_path, html_path = generate_narrative_report(
            csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
            summary_path=summary_path,
            interviewee_type=interviewee_type,
            report_style=report_style,
            llm_backend=llm_backend
        )

        return (
            f"✓ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )

    except Exception as e:
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None

    finally:
        # Clean up the temp file on success AND on failure
        if summary_path and os.path.exists(summary_path):
            try:
                os.remove(summary_path)
            except OSError:
                # Cleanup is best effort; it must not replace the real result
                pass


# Gradio UI definition: four tabs (audio preprocessing, transcript analysis,
# narrative report, help). Component creation order determines the layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎯 TranscriptorAI - Enterprise Transcript Analyzer
    
    Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
    """)
    
    with gr.Tabs():
     
        # Tab 1: audio files -> transcripts via preprocess_audio
        with gr.TabItem("🎤 Audio Preprocessing"):
            gr.Markdown("""
            Upload audio interviews to auto-transcribe with speaker identification.
            Outputs DOCX files ready for analysis.
            """)
            
            with gr.Row():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_types=[".mp3", ".wav", ".m4a", ".flac"],
                    file_count="multiple"
                )
                num_speakers_input = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Number of Speakers"
                )
            
            transcribe_btn = gr.Button("🎙️ Transcribe Audio", variant="primary")
            transcribe_status = gr.Textbox(label="Status", lines=10)
            transcript_files = gr.File(label="Download Transcripts", file_count="multiple")
            
            # Output order matches preprocess_audio's (paths, status) return
            transcribe_btn.click(
                fn=preprocess_audio,
                inputs=[audio_input, num_speakers_input],
                outputs=[transcript_files, transcribe_status]
            )
            
            gr.Markdown("""
            **Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
            """)
        
        
              
        # Tab 2: transcript upload and analysis via analyze
        with gr.TabItem("📊 Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="📁 Upload Transcripts", 
                        file_types=[".docx", ".pdf"], 
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"], 
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"], 
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )
                    
                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions", 
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)", 
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )
            
            with gr.Row():
                debug_mode = gr.Checkbox(label="🔍 Enable Debug Mode", value=False)
                analyze_btn = gr.Button("🚀 Analyze Transcripts", variant="primary", scale=2)
            
            with gr.Row():
                output_text = gr.Textbox(label="📊 Analysis Report", lines=40)
            
            with gr.Row():
                csv_output = gr.File(label="📥 Download CSV")
                pdf_output = gr.File(label="📥 Download PDF")
            
            with gr.Row():
                dashboard_output = gr.Plot(label="📈 Dashboard Visualization")
            
            # Input order must match analyze's positional parameters
            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )
        
       
        # Tab 3: narrative report generation from the analysis CSV
        with gr.TabItem("📝 Narrative Report"):
            gr.Markdown("""
            ## Generate Storytelling Report
            
            Transform your analysis into a narrative report with:
            - Executive summary with key insights
            - Data-driven storytelling
            - Professional formatting (PDF, Word, HTML)
            - Actionable recommendations
            
            **Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
            """)
            
            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )
                
                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
                    generate_narrative_btn = gr.Button("📖 Generate Narrative Report", variant="primary")
            
            narrative_status = gr.Textbox(label="Status", lines=5)
            
            with gr.Row():
                narrative_pdf_output = gr.File(label="📥 Download PDF Report")
                narrative_word_output = gr.File(label="📥 Download Word Report")
                narrative_html_output = gr.File(label="📥 Download HTML Report")
            
            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )
        
       
        # Tab 4: static help text
        with gr.TabItem("❓ Help"):
            gr.Markdown("""
            ### Quick Start Guide
            
            **Step 1: Analyze Transcripts**
            1. Upload your DOCX or PDF files
            2. Select interviewee type (HCP, Patient, or Other)
            3. Add analysis instructions
            4. Click "Analyze Transcripts"
            5. Download CSV, PDF, and view dashboard
            
            **Step 2: Generate Narrative Report (Optional)**
            1. Go to "Narrative Report" tab
            2. Upload the CSV from Step 1
            3. Optionally paste the summary text
            4. Select report style
            5. Click "Generate Narrative Report"
            6. Download PDF, Word, or HTML versions
            
            ### Tips
            - **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator
            - **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator
            - **Report Styles**:
              - **Executive**: Best for C-level, investors, decision-makers
              - **Detailed**: Best for researchers, comprehensive analysis
              - **Presentation**: Best for slides, briefings, quick overviews
            
            ### LLM Configuration
            - Set `USE_LMSTUDIO=True` to use your local LM Studio
            - Set `HUGGINGFACE_TOKEN` to use HF API for faster processing
            - Default: Uses the Hugging Face API backend (`hf_api`)
            
            ### Support
            For issues, check the console output or enable debug mode.
            """)
    
    gr.Markdown("""
    ---
    **TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
    """)

# Launch at module level, after the Blocks context has been closed, rather
# than from inside the `with` body while the layout is still being built.
if __name__ == "__main__":
    # NOTE(review): `concurrency_count` appears to have been dropped from
    # Blocks.queue() in Gradio 4 in favour of `default_concurrency_limit` -
    # confirm against the Gradio version pinned for this Space.
    demo.queue(
        concurrency_count=1,
        max_size=10,
        api_open=False
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )