jmisak commited on
Commit
54c99ad
·
verified ·
1 Parent(s): 769f718

Upload 23 files

Browse files
Files changed (22) hide show
  1. README.md +7 -5
  2. Set-Service +0 -0
  3. app.py +582 -0
  4. audio_transcriber.py +100 -0
  5. audio_transcriber_hf.py +104 -0
  6. chunking.py +236 -0
  7. config.py +283 -0
  8. dashboard.py +340 -0
  9. extractors.py +201 -0
  10. llm.py +383 -0
  11. narrative_report_generator.py +74 -0
  12. outputs/sample.txt +0 -0
  13. report.csv +2 -0
  14. report.pdf +112 -0
  15. report_parser.py +61 -0
  16. reporting.py +239 -0
  17. requirements.txt +41 -0
  18. story_writer.py +55 -0
  19. table_builder.py +51 -0
  20. tagging.py +228 -0
  21. utils.py +404 -0
  22. validation.py +274 -0
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: TranscriptWriting
3
- emoji:
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: StoryTellerTranscript
3
+ emoji: 🌖
4
+ colorFrom: green
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.49.0
8
  app_file: app.py
9
  pinned: false
10
+ license: unknown
11
+ short_description: Audio interviews to final reports
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Set-Service ADDED
File without changes
app.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from typing import List, Dict, Tuple
4
+ from extractors import extract_docx, extract_pdf, validate_extraction
5
+ from tagging import tag_speakers_advanced
6
+ from chunking import chunk_text_semantic
7
+ from llm import query_llm, extract_structured_data
8
+ from reporting import generate_enhanced_csv, generate_enhanced_pdf
9
+ from dashboard import generate_comprehensive_dashboard
10
+ from validation import validate_transcript_quality, check_data_completeness
11
+ from audio_transcriber import transcribe_with_diarization_streaming
12
+
13
def preprocess_audio(audio_files, num_speakers):
    """Transcribe uploaded audio files into DOCX transcripts.

    Args:
        audio_files: list of uploaded file objects (or path strings) from the
            Gradio File component; may be None/empty.
        num_speakers: expected number of speakers, forwarded to the transcriber.

    Returns:
        Tuple of (transcript_paths, status): a list of generated transcript
        paths (None when nothing was produced, which the Gradio File component
        expects), plus a human-readable per-file status log.
    """
    if not audio_files:
        return None, "No audio files provided"

    transcript_paths = []
    status = ""

    for audio in audio_files:
        # Resolve the on-disk path BEFORE the try block so the error branch
        # can always reference it (the original could hit an unbound name if
        # the path resolution itself raised).
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        try:
            # BUG FIX: this module imports transcribe_with_diarization_streaming;
            # the original called the undefined name transcribe_with_diarization,
            # so every file failed with a NameError.
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
            transcript_paths.append(transcript_path)
            status += f"✓ {os.path.basename(audio_path)} → {transcript_path}\n"
        except Exception as e:
            status += f"✗ {os.path.basename(audio_path)}: {str(e)}\n"

    # Return list of paths for the file component (None when empty)
    return transcript_paths if transcript_paths else None, status
34
+
35
+
36
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
    """Run the full multi-transcript analysis pipeline.

    Per file: extract text -> validate extraction -> tag speakers ->
    semantic chunking -> per-chunk LLM analysis with structured-data merge.
    Then a cross-transcript LLM synthesis, followed by CSV/PDF report and
    dashboard generation.

    Args:
        files: uploaded file objects from the Gradio File component.
        file_type: "DOCX" or "PDF" — selects the extraction backend.
        user_comments: free-text analysis instructions from the user.
        role_hint: optional speaker-role mapping hint passed to the tagger.
        debug_mode: mirrored into the DEBUG_MODE environment variable so
            downstream modules can read it.
        interviewee_type: "HCP", "Patient" or "Other" — drives the analysis
            focus and which structured fields are extracted.
        progress: Gradio progress reporter (injected by Gradio at call time).

    Returns:
        (markdown_report, csv_path, pdf_path, dashboard) on success, or
        (error_message, None, None, None) on failure.
    """
    os.environ["DEBUG_MODE"] = str(debug_mode)

    if not files:
        return "Error: No files uploaded", None, None, None

    all_results = []         # one dict per input file (including failures)
    csv_rows = []            # one row per successfully processed file
    processing_errors = []   # NOTE(review): holds both hard failures AND low-quality warnings

    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")

    # Enhanced interviewee context: per-audience focus text plus the list of
    # structured keys the LLM is asked to extract.
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})

    # Build enhanced user context sent alongside every LLM call
    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}

Additional Instructions:
{user_comments}
""".strip()

    total_steps = len(files) * 4 + 2  # extraction, validation, tagging, chunking per file + summary + report
    current_step = 0

    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        try:
            # Step 1: Extract text
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")

            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1

            # Step 2: Validate extraction (raises so the per-file except
            # records the failure and processing continues with other files)
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")

            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1

            # Step 3: Tag speakers with advanced logic
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1

            # Step 4: Semantic chunking
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1

            # Step 5: LLM Analysis with structured extraction
            transcript_result = []
            structured_data = {}  # merged per-file: key -> list of values from all chunks

            for j, chunk in enumerate(chunks):
                # Fractional progress within this file's LLM step
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")

                result, chunk_data = query_llm(
                    chunk,
                    user_context,
                    interviewee_type,
                    extract_structured=True
                )

                transcript_result.append(result)

                # Merge structured data: lists are concatenated, scalars appended
                for key, value in chunk_data.items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)

            current_step += 1

            # Combine and validate results
            full_text = "\n\n".join(transcript_result)

            # Quality check — low scores are logged but do NOT abort the file
            quality_score, quality_issues = validate_transcript_quality(
                full_text,
                structured_data,
                interviewee_type
            )

            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")

            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })

            # Enhanced CSV row with structured data
            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }

            # Add interviewee-specific fields (keys mirror the "extract"
            # lists in interviewee_context above)
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": "; ".join(structured_data.get("diagnoses", [])),
                    "Prescriptions": "; ".join(structured_data.get("prescriptions", [])),
                    "Treatment Strategies": "; ".join(structured_data.get("treatment_rationale", [])),
                    "Guidelines Mentioned": "; ".join(structured_data.get("guidelines_mentioned", []))
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": "; ".join(structured_data.get("symptoms", [])),
                    "Main Concerns": "; ".join(structured_data.get("concerns", [])),
                    "Treatment Response": "; ".join(structured_data.get("treatment_response", [])),
                    "Side Effects": "; ".join(structured_data.get("side_effects", []))
                })
            else:
                csv_row.update({
                    "Key Insights": "; ".join(structured_data.get("key_insights", [])),
                    "Recommendations": "; ".join(structured_data.get("recommendations", []))
                })

            csv_rows.append(csv_row)

            print(f"[File {i+1}] ✓ Processing complete")

        except Exception as e:
            # Per-file failure: record a placeholder result (quality 0.0 so it
            # is excluded from valid_results below) and keep going
            error_msg = f"[Error] {file_name} failed: {str(e)}"
            print(error_msg)
            processing_errors.append(error_msg)
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0
            })

    # Generate cross-transcript summary
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")

        # Combine successful results (failed files have quality_score 0.0)
        valid_results = [r for r in all_results if r["quality_score"] > 0]

        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None

        # Build comprehensive summary prompt
        summary_prompt = f"""
CROSS-INTERVIEW SYNTHESIS TASK

SAMPLE: {len(valid_results)} {interviewee_type} transcripts
FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}

COMPLETE TRANSCRIPT DATA:
"""

        # NOTE(review): each transcript is truncated to 2000 chars here —
        # long transcripts are only partially represented in the synthesis.
        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            summary_prompt += f"{result['full_text'][:2000]}\n"

        summary_prompt += f"""

ANALYSIS REQUIREMENTS:

1. QUANTIFY EVERYTHING:
- Count participants: "X out of {len(valid_results)} participants mentioned..."
- Never use vague terms (many/most/some)
- Calculate percentages where relevant

2. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
- MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
- SPLIT PERSPECTIVES (40-59% = mixed views)
- MINORITY/OUTLIER (<40% but notable)

3. CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives diverge and why
- Flag any quality issues in individual transcripts

4. CITE EVIDENCE:
- Reference specific transcript numbers
- Brief supporting details
- Distinguish verified facts from interpretation

OUTPUT FORMAT:
Write 2-3 sentence executive overview, then structure as:

**STRONG CONSENSUS FINDINGS:**
- [Finding with count and evidence]

**MAJORITY FINDINGS:**
- [Finding with count]

**DIVERGENT PERSPECTIVES:**
- [Where views split and context]

**NOTABLE OUTLIERS:**
- [Unique but important points]

**DATA QUALITY NOTES:**
- [Any gaps or transcript issues]

Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
"""

        # NOTE(review): summary_data is never used afterwards
        summary, summary_data = query_llm(
            summary_prompt,
            user_context,
            interviewee_type,
            extract_structured=False,
            is_summary=True
        )

        print("[Summary] ✓ Generated")

        # Generate enhanced reports
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] ✓ Saved to {csv_path}")

        pdf_path = generate_enhanced_pdf(
            summary,
            all_results,
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] ✓ Saved to {pdf_path}")

        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] ✓ Generated")

        # Compile final markdown output shown in the UI textbox.
        # NOTE(review): "Failed" counts processing_errors, which also contains
        # low-quality warnings — it can exceed the number of failed files.
        output_text = f"""# Analysis Complete

## Summary of Findings
{summary}

## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {len(processing_errors)}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}

"""

        if processing_errors:
            output_text += f"\n## Processing Errors\n" + "\n".join(f"- {err}" for err in processing_errors)

        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"

        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"

        progress(1.0, desc="Complete!")
        return output_text, csv_path, pdf_path, dashboard

    except Exception as e:
        # Fatal: summary/report stage failed; surface the error in the UI
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, None, None, None
338
+
339
def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """Gradio wrapper that turns analysis outputs into narrative reports.

    Args:
        csv_file: uploaded CSV (Gradio file object or plain path) produced by
            the analysis tab; required.
        summary_text: optional executive-summary text pasted by the user;
            written to a temp file for the generator.
        interviewee_type: "HCP", "Patient" or "Other".
        report_style: "executive", "detailed" or "presentation".

    Returns:
        (status_message, pdf_path, word_path, html_path); the three paths are
        None on any error.
    """
    try:
        from narrative_report_generator import generate_narrative_report
        import tempfile
        import os

        # Check if CSV file exists
        if csv_file is None:
            return "Error: No CSV file provided. Please run analysis first.", None, None, None

        # Save summary text to temp file if provided
        summary_path = None
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                f.write(summary_text)
            summary_path = f.name

        try:
            # Determine LLM backend from the environment
            llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"

            # Generate narrative report
            pdf_path, word_path, html_path = generate_narrative_report(
                csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
                summary_path=summary_path,
                interviewee_type=interviewee_type,
                report_style=report_style,
                llm_backend=llm_backend
            )
        finally:
            # BUG FIX: cleanup moved into a finally block — the original only
            # removed the temp summary file on success, leaking it whenever
            # generate_narrative_report raised.
            if summary_path and os.path.exists(summary_path):
                os.remove(summary_path)

        return (
            f"✓ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None
386
+
387
+
388
# Top-level Gradio UI: four tabs wiring the pipeline functions above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎯 TranscriptorAI - Enterprise Transcript Analyzer

    Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
    """)

    with gr.Tabs():

        # --- Tab 1: audio -> DOCX transcripts (preprocess_audio) ---
        with gr.TabItem("🎤 Audio Preprocessing"):
            gr.Markdown("""
            Upload audio interviews to auto-transcribe with speaker identification.
            Outputs DOCX files ready for analysis.
            """)

            with gr.Row():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_types=[".mp3", ".wav", ".m4a", ".flac"],
                    file_count="multiple"
                )
                num_speakers_input = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Number of Speakers"
                )

            transcribe_btn = gr.Button("🎙️ Transcribe Audio", variant="primary")
            transcribe_status = gr.Textbox(label="Status", lines=10)
            transcript_files = gr.File(label="Download Transcripts", file_count="multiple")

            transcribe_btn.click(
                fn=preprocess_audio,
                inputs=[audio_input, num_speakers_input],
                outputs=[transcript_files, transcribe_status]
            )

            gr.Markdown("""
            **Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
            """)

        # --- Tab 2: transcript analysis (analyze) ---
        with gr.TabItem("📊 Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="📁 Upload Transcripts",
                        file_types=[".docx", ".pdf"],
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"],
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )

                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions",
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)",
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )

            with gr.Row():
                debug_mode = gr.Checkbox(label="🔍 Enable Debug Mode", value=False)
                analyze_btn = gr.Button("🚀 Analyze Transcripts", variant="primary", scale=2)

            with gr.Row():
                output_text = gr.Textbox(label="📊 Analysis Report", lines=40)

            with gr.Row():
                csv_output = gr.File(label="📥 Download CSV")
                pdf_output = gr.File(label="📥 Download PDF")

            with gr.Row():
                dashboard_output = gr.Plot(label="📈 Dashboard Visualization")

            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )

        # --- Tab 3: narrative report (generate_narrative_report_ui) ---
        with gr.TabItem("📝 Narrative Report"):
            gr.Markdown("""
            ## Generate Storytelling Report

            Transform your analysis into a narrative report with:
            - Executive summary with key insights
            - Data-driven storytelling
            - Professional formatting (PDF, Word, HTML)
            - Actionable recommendations

            **Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
            """)

            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )

                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
                    generate_narrative_btn = gr.Button("📖 Generate Narrative Report", variant="primary")

            narrative_status = gr.Textbox(label="Status", lines=5)

            with gr.Row():
                narrative_pdf_output = gr.File(label="📥 Download PDF Report")
                narrative_word_output = gr.File(label="📥 Download Word Report")
                narrative_html_output = gr.File(label="📥 Download HTML Report")

            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )

        # --- Tab 4: static help text ---
        with gr.TabItem("❓ Help"):
            gr.Markdown("""
            ### Quick Start Guide

            **Step 1: Analyze Transcripts**
            1. Upload your DOCX or PDF files
            2. Select interviewee type (HCP, Patient, or Other)
            3. Add analysis instructions
            4. Click "Analyze Transcripts"
            5. Download CSV, PDF, and view dashboard

            **Step 2: Generate Narrative Report (Optional)**
            1. Go to "Narrative Report" tab
            2. Upload the CSV from Step 1
            3. Optionally paste the summary text
            4. Select report style
            5. Click "Generate Narrative Report"
            6. Download PDF, Word, or HTML versions

            ### Tips
            - **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator
            - **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator
            - **Report Styles**:
              - **Executive**: Best for C-level, investors, decision-makers
              - **Detailed**: Best for researchers, comprehensive analysis
              - **Presentation**: Best for slides, briefings, quick overviews

            ### LLM Configuration
            - Set `USE_LMSTUDIO=True` to use your local LM Studio
            - Set `HUGGINGFACE_TOKEN` to use HF API for faster processing
            - Default: Uses local model (slower but free)

            ### Support
            For issues, check the console output or enable debug mode.
            """)

    # Footer shown under every tab
    gr.Markdown("""
    ---
    **TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
    """)

if __name__ == "__main__":
    demo.launch()
audio_transcriber.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from faster_whisper import WhisperModel
2
+ from speechbrain.inference import EncoderClassifier
3
+ from sklearn.cluster import AgglomerativeClustering
4
+ from docx import Document
5
+ import torch, torchaudio, numpy as np
6
+
7
def transcribe_with_diarization_streaming(audio_path: str, num_speakers: int = 1):
    """Transcribe an audio file and label speakers, saving a DOCX transcript.

    Pipeline: faster-whisper transcription (segments printed as they stream),
    ECAPA speaker embeddings per segment, agglomerative clustering into
    num_speakers groups, then a DOCX with "Speaker N:" prefixed lines.

    Args:
        audio_path: path to the input audio file.
        num_speakers: expected speaker count; <=1 skips clustering entirely,
            and it is capped at the number of usable segments.

    Returns:
        Path to the generated DOCX transcript
        (``<audio basename>_transcript.docx`` next to the audio file).
    """
    # Device selection with a CUDA sanity check (a visible CUDA device can
    # still be unusable, e.g. driver mismatch) — fall back to CPU/int8.
    try:
        if torch.cuda.is_available():
            device = "cuda"
            compute_type = "float16"
            _ = torch.zeros(1).to(device)  # sanity check
        else:
            raise RuntimeError("No CUDA")
    except Exception:
        print("⚠️ CUDA not usable, falling back to CPU")
        device = "cpu"
        compute_type = "int8"

    print(f"[1/3] Loading Whisper model on {device}...")
    whisper_model = WhisperModel("large-v3", device=device, compute_type=compute_type)
    # BUG FIX: the original had `return whisper_model` here, which made every
    # line below unreachable — the function returned a model object instead of
    # a transcript path. (Also removed the duplicated pre-try device setup.)

    print(f"[2/3] Transcribing...")
    # faster-whisper returns a lazy generator; consuming it below streams
    # partial results to the console as they become ready.
    segments, info = whisper_model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True,
        vad_filter=True,
    )

    segments_list = []
    for seg in segments:
        print(f"[stream] {seg.start:.2f}-{seg.end:.2f}: {seg.text}")
        segments_list.append(seg)

    # Speaker embeddings per transcribed segment
    print(f"[3/3] Extracting speaker embeddings...")
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="models/speaker_embeddings",
        run_opts={"device": device}
    )

    waveform, sample_rate = torchaudio.load(audio_path)
    embeddings, valid_segments = [], []

    for seg in segments_list:
        start_sample = int(seg.start * sample_rate)
        end_sample = int(seg.end * sample_rate)
        if end_sample > start_sample:  # skip zero-length segments
            seg_audio = waveform[:, start_sample:end_sample]
            # ECAPA model expects 16 kHz input
            if sample_rate != 16000:
                seg_audio = torchaudio.transforms.Resample(sample_rate, 16000)(seg_audio)
            with torch.no_grad():
                emb = speaker_model.encode_batch(seg_audio)
            embeddings.append(emb.squeeze().cpu().numpy())
            valid_segments.append(seg)

    # Handle empty or single-speaker case
    if len(embeddings) == 0 or num_speakers <= 1:
        print("Single speaker detected or no embeddings. Skipping clustering.")
        speaker_labels = [0] * len(valid_segments)
        num_speakers = 1
    else:
        # Cannot ask for more clusters than samples
        if num_speakers > len(embeddings):
            num_speakers = len(embeddings)
        clustering = AgglomerativeClustering(n_clusters=num_speakers)
        speaker_labels = clustering.fit_predict(np.array(embeddings))

    # Build transcript document
    doc = Document()
    doc.add_heading('Interview Transcript', 0)
    doc.add_paragraph(f"Detected {num_speakers} speaker(s)")
    doc.add_paragraph("")

    for seg, spk in zip(valid_segments, speaker_labels):
        doc.add_paragraph(f"Speaker {spk+1}: {seg.text.strip()}")

    output_path = audio_path.rsplit('.', 1)[0] + '_transcript.docx'
    doc.save(output_path)
    print(f"✓ Saved transcript: {output_path}")
    return output_path
audio_transcriber_hf.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio transcription with speaker diarization
3
+ """
4
+ from faster_whisper import WhisperModel
5
+ from pyannote.audio import Pipeline
6
+ import torch
7
+ from docx import Document
8
+ import os
9
+
10
def transcribe_with_diarization(audio_path: str, num_speakers: int = 2) -> str:
    """
    Transcribe audio with speaker labels.

    Args:
        audio_path: Path to audio file (mp3, wav, m4a, flac)
        num_speakers: Expected number of speakers (default 2 for interviews)

    Returns:
        Path to generated DOCX transcript
    """
    print(f"[1/3] Transcribing audio...")

    # BUG FIX: device/compute were hard-coded to "cuda"/"float16", crashing on
    # CPU-only hosts. Use the same fallback as audio_transcriber.py.
    if torch.cuda.is_available():
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"
    model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    # Transcribe with timestamps
    segments, info = model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True
    )

    segments_list = list(segments)
    print(f"[2/3] Identifying speakers...")

    # pyannote diarization models are gated on HuggingFace; without a token
    # fall back to the simple alternating-speaker heuristic.
    hf_token = os.getenv("HUGGINGFACE_TOKEN", "")
    if not hf_token:
        print("[Warning] No HF token - using simple alternating speakers")
        return transcribe_simple(segments_list, audio_path)

    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hf_token
    )

    if torch.cuda.is_available():
        diarization.to(torch.device("cuda"))

    # Run diarization
    diarization_result = diarization(audio_path, num_speakers=num_speakers)

    print(f"[3/3] Combining transcription + speakers...")

    # Match each Whisper segment to the speaker active at its start time
    transcript_lines = []
    for segment in segments_list:
        speaker = get_speaker_at_time(diarization_result, segment.start)
        transcript_lines.append(f"{speaker}: {segment.text}")

    # Save to DOCX
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    for line in transcript_lines:
        doc.add_paragraph(line)

    # BUG FIX: the original chained .replace() only covered .mp3/.wav/.m4a;
    # for any other extension (.flac is accepted by the UI) output_path stayed
    # equal to audio_path and doc.save() clobbered the audio file.
    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
    doc.save(output_path)

    print(f"✓ Transcript saved: {output_path}")
    return output_path
80
+
81
+
82
def get_speaker_at_time(diarization_result, timestamp):
    """Return the label of the speaker active at *timestamp*.

    Scans the diarization turns in order and returns "Speaker <label>" for
    the first turn containing the timestamp, or "Speaker Unknown" when no
    turn covers it.
    """
    not_found = object()
    hit = next(
        (
            label
            for turn, _, label in diarization_result.itertracks(yield_label=True)
            if turn.start <= timestamp <= turn.end
        ),
        not_found,
    )
    if hit is not_found:
        return "Speaker Unknown"
    return f"Speaker {hit}"
88
+
89
+
90
def transcribe_simple(segments_list, audio_path):
    """Fallback transcript writer used when no diarization is available.

    Alternates between "Speaker 1" and "Speaker 2" using a crude heuristic:
    the speaker toggles after any segment whose no_speech_prob exceeds 0.5.

    Args:
        segments_list: materialized Whisper segments (need a .text attribute).
        audio_path: source audio path; the transcript is written next to it.

    Returns:
        Path to the saved DOCX transcript.
    """
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    current_speaker = 1
    for segment in segments_list:
        doc.add_paragraph(f"Speaker {current_speaker}: {segment.text}")
        # Heuristic: treat a likely-silence segment as a turn boundary
        if hasattr(segment, 'no_speech_prob') and segment.no_speech_prob > 0.5:
            current_speaker = 3 - current_speaker  # Toggle between 1 and 2

    # BUG FIX: the original only replaced '.mp3' — for .wav/.m4a/.flac inputs
    # output_path stayed equal to audio_path and doc.save() overwrote the
    # source audio file. Strip whatever extension is present instead.
    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
    doc.save(output_path)
    return output_path
chunking.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ import re
3
+ from typing import List
4
+ from nltk.tokenize.punkt import PunktSentenceTokenizer
5
+
6
def chunk_text(text, max_tokens=3000):
    """Legacy entry point kept for backwards compatibility.

    Delegates to the semantic chunker with the generic "Other" profile.
    """
    return chunk_text_semantic(text, "Other", max_tokens)
9
+
10
+
11
# Lazily-created tiktoken encoder, shared across calls (see count_tokens).
_TOKEN_ENCODER = None


def count_tokens(text: str) -> int:
    """Count tokens in `text` using tiktoken's cl100k_base encoding.

    Improvement: the original called ``tiktoken.get_encoding`` on every
    invocation; the encoder is now built once and cached at module level.
    If tiktoken is unavailable or fails, falls back to a rough word-based
    estimate (~1.3 tokens per word), as before.
    """
    global _TOKEN_ENCODER
    try:
        if _TOKEN_ENCODER is None:
            _TOKEN_ENCODER = tiktoken.get_encoding("cl100k_base")
        return len(_TOKEN_ENCODER.encode(text))
    except Exception:
        # Fallback to word-based estimate
        return int(len(text.split()) * 1.3)
19
+
20
+
21
+ def split_into_sentences(text: str) -> List[str]:
22
+ """Split text into sentences with improved handling"""
23
+ try:
24
+ tokenizer = PunktSentenceTokenizer()
25
+ sentences = tokenizer.tokenize(text)
26
+ return sentences
27
+ except Exception:
28
+ # Fallback to simple split
29
+ return [s.strip() + '.' for s in text.split('.') if s.strip()]
30
+
31
+
32
def find_topic_boundaries(text: str, interviewee_type: str) -> List[int]:
    """
    Locate likely topic-change positions for smarter chunking.

    Returns a sorted list of character offsets, always including 0 and
    len(text). Positions within 100 characters of an already-recorded
    boundary are skipped to avoid clustering.

    Note: `interviewee_type` is accepted for interface compatibility but
    is not currently consulted.
    """
    topic_patterns = (
        r'\n\n+',  # paragraph breaks
        r'\[Interviewer\].*?(next|another|different|moving on|let\'s talk about)',
        r'\[Interviewer\].*?\?.*?\n.*?\[(?:Doctor|Patient|Respondent)\]',  # Q&A pairs
    )

    boundaries = [0]  # always anchor at the start
    for pattern in topic_patterns:
        for hit in re.finditer(pattern, text, re.IGNORECASE):
            position = hit.start()
            # Only record positions far enough from every known boundary.
            if all(abs(position - known) > 100 for known in boundaries):
                boundaries.append(position)

    boundaries.append(len(text))  # always anchor at the end
    boundaries.sort()
    return boundaries
59
+
60
+
61
def extract_speaker_segments(text: str) -> List[dict]:
    """
    Parse "[Speaker] utterance" spans out of a tagged transcript.

    Returns one dict per non-empty utterance with keys "speaker",
    "content", "start_pos" (character offset of the tag), and "tokens"
    (token count of the utterance).
    """
    segments = []
    for match in re.finditer(r'\[([^\]]+)\]\s*([^\[]*)', text, re.DOTALL):
        speaker = match.group(1).strip()
        utterance = match.group(2).strip()
        if not utterance:
            continue  # skip tags with no spoken content
        segments.append({
            "speaker": speaker,
            "content": utterance,
            "start_pos": match.start(),
            "tokens": count_tokens(utterance),
        })
    return segments
81
+
82
+
83
def chunk_text_semantic(
    text: str,
    interviewee_type: str = "Other",
    max_tokens: int = 3000,
    overlap_tokens: int = 150
) -> List[str]:
    """
    Advanced chunking that respects:
    1. Speaker boundaries (don't split mid-sentence)
    2. Topic boundaries (keep related Q&A together)
    3. Token limits for LLM context
    4. Overlap for context continuity

    Args:
        text: Transcript text, ideally with "[Speaker]" tags.
        interviewee_type: Accepted for interface compatibility; not used
            in the current implementation.
        max_tokens: Hard token budget per chunk.
        overlap_tokens: Approximate token budget of trailing segments
            carried into the next chunk for context.

    Returns:
        List of chunk strings; falls back to [text] if nothing was built.
    """

    # Check if text has speaker tags
    has_tags = bool(re.search(r'\[[^\]]+\]', text))

    if not has_tags:
        # Fallback to sentence-based chunking
        return chunk_by_sentences(text, max_tokens, overlap_tokens)

    # Extract speaker segments
    segments = extract_speaker_segments(text)

    if not segments:
        # Tag regex matched but no usable segments were parsed.
        return chunk_by_sentences(text, max_tokens, overlap_tokens)

    # Group segments into chunks
    chunks = []
    current_chunk_segments = []
    current_tokens = 0

    i = 0
    while i < len(segments):
        segment = segments[i]
        segment_tokens = segment["tokens"]

        # If single segment exceeds max_tokens, split it.
        # NOTE: the oversized segment bypasses the running chunk entirely,
        # so its sub-chunks are emitted out of band (no overlap with the
        # chunk currently being accumulated).
        if segment_tokens > max_tokens:
            # Split long segment by sentences
            sub_chunks = chunk_by_sentences(
                f"[{segment['speaker']}] {segment['content']}",
                max_tokens,
                overlap_tokens
            )
            chunks.extend(sub_chunks)
            i += 1
            continue

        # Check if adding this segment would exceed limit
        if current_tokens + segment_tokens > max_tokens and current_chunk_segments:
            # Finalize current chunk.
            # NOTE(review): this local `chunk_text` shadows the module-level
            # chunk_text() function within this scope — harmless here, but
            # confusing; consider renaming in a future change.
            chunk_text = "\n\n".join([
                f"[{s['speaker']}] {s['content']}"
                for s in current_chunk_segments
            ])
            chunks.append(chunk_text)

            # Start new chunk with overlap: keep the last few segments
            # (up to ~overlap_tokens) for context continuity.
            overlap_segments = []
            overlap_token_count = 0

            for seg in reversed(current_chunk_segments):
                if overlap_token_count + seg["tokens"] < overlap_tokens:
                    overlap_segments.insert(0, seg)
                    overlap_token_count += seg["tokens"]
                else:
                    # Stop at the first segment that would bust the budget.
                    break

            current_chunk_segments = overlap_segments
            current_tokens = overlap_token_count

        # Add segment to current chunk
        current_chunk_segments.append(segment)
        current_tokens += segment_tokens
        i += 1

    # Add final (possibly partial) chunk
    if current_chunk_segments:
        chunk_text = "\n\n".join([
            f"[{s['speaker']}] {s['content']}"
            for s in current_chunk_segments
        ])
        chunks.append(chunk_text)

    return chunks if chunks else [text]
170
+
171
+
172
def chunk_by_sentences(
    text: str,
    max_tokens: int = 3000,
    overlap_tokens: int = 150
) -> List[str]:
    """
    Sentence-based fallback chunker.

    Packs sentences into chunks of at most `max_tokens`, carrying roughly
    `overlap_tokens` worth of trailing sentences into the next chunk for
    context continuity. Returns [text] if no chunks were produced.
    """
    chunks = []
    buffer = []          # sentences of the chunk being built
    buffer_tokens = 0    # running token count of `buffer`

    for sentence in split_into_sentences(text):
        tokens = count_tokens(sentence)

        if buffer and buffer_tokens + tokens > max_tokens:
            # Close out the current chunk.
            chunks.append(" ".join(buffer))

            # Seed the next chunk with trailing sentences as overlap.
            carried = []
            carried_tokens = 0
            for prev in reversed(buffer):
                prev_tokens = count_tokens(prev)
                if carried_tokens + prev_tokens >= overlap_tokens:
                    break
                carried.insert(0, prev)
                carried_tokens += prev_tokens

            buffer = carried
            buffer_tokens = carried_tokens

        buffer.append(sentence)
        buffer_tokens += tokens

    # Flush the final (possibly partial) chunk.
    if buffer:
        chunks.append(" ".join(buffer))

    return chunks if chunks else [text]
217
+
218
+
219
def analyze_chunk_quality(chunks: List[str], token_limit: int = 3000) -> dict:
    """
    Summarize chunk sizes for debugging.

    Args:
        chunks: List of chunk strings to analyze.
        token_limit: Threshold used for the "chunks_over_limit" count.
            Previously hard-coded to 3000; the default preserves the old
            behavior while letting callers match their actual chunk budget.

    Returns:
        Dict with count/avg/min/max/total token stats and the number of
        chunks exceeding `token_limit`, or {"error": ...} when `chunks`
        is empty.
    """
    if not chunks:
        return {"error": "No chunks"}

    token_counts = [count_tokens(chunk) for chunk in chunks]

    return {
        "num_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "total_tokens": sum(token_counts),
        "chunks_over_limit": sum(1 for t in token_counts if t > token_limit)
    }
config.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any
3
+
4
# ============================================================================
# LLM CONFIGURATION
# ============================================================================
# All settings below are overridable via environment variables; the second
# argument to os.getenv is the default used when the variable is unset.

# Choose LLM backend: "hf_api" (recommended), "local", or "openai"
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Configuration
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

# Local Model Configuration
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.getenv("DEVICE", "auto")  # "auto", "cpu", "cuda", "mps"

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# LLM Parameters
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
# NOTE(review): presumably seconds — confirm against the LLM client code.
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds (scores are on a 0-1 scale)
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [".docx", ".pdf"]
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Ensure output directory exists (side effect at import time)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Cache extracted text to avoid re-processing
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

# Parallel processing
ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))

# ============================================================================
# SYSTEM PROMPTS
# ============================================================================
# BASE_SYSTEM_PROMPT is shared; the HCP and Patient prompts append their
# role-specific instructions to it via string concatenation below.

BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""

# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

DASHBOARD_CONFIG = {
    "figure_size": (14, 10),
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
188
+
189
+ # ============================================================================
190
+ # HELPER FUNCTIONS
191
+ # ============================================================================
192
+
193
def get_config() -> Dict[str, Any]:
    """Assemble the module's settings into a nested dictionary."""
    active_model = HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL

    llm_settings = {
        "backend": LLM_BACKEND,
        "model": active_model,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    file_settings = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_settings = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }

    return {
        "llm": llm_settings,
        "chunking": {"max_tokens": MAX_CHUNK_TOKENS, "overlap": OVERLAP_TOKENS},
        "quality": {"min_score": MIN_QUALITY_SCORE, "min_words": MIN_WORD_COUNT},
        "files": file_settings,
        "output": output_settings,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
225
+
226
+
227
def print_config():
    """Print the current configuration to stdout.

    Bug fix: the original assumed every top-level value from get_config()
    was a dict and called .items() on it, which raised AttributeError for
    the scalar entries ("debug", "caching", "parallel"). Scalar sections
    are now printed directly.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            print(f"  {settings}")
    print("=" * 60)
238
+
239
+
240
def validate_config() -> bool:
    """Validate configuration settings.

    Prints any problems found and returns False; returns True when the
    configuration looks sane.

    Fixes vs. the original:
    - bare ``except:`` replaced with ``except OSError`` (the bare form also
      swallowed KeyboardInterrupt/SystemExit);
    - ``os.makedirs(..., exist_ok=True)`` replaces the exists-check +
      create pair, removing the race between check and creation.
    """
    issues = []

    # Remote backends need their credentials set.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Output directory must exist or be creatable.
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    except OSError:
        issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity-check numeric settings.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    return True
272
+
273
+
274
+ # ============================================================================
275
+ # INITIALIZATION
276
+ # ============================================================================
277
+
278
if __name__ == "__main__":
    # Print the active settings, then report whether they validate.
    print_config()
    if validate_config():
        print("\n✓ Configuration valid")
    else:
        print("\n✗ Configuration has issues")
dashboard.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import matplotlib.patches as mpatches
3
+ import pandas as pd
4
+ import numpy as np
5
+ from collections import Counter
6
+ from typing import List, Dict
7
+ import re
8
+
9
def generate_dashboard(data):
    """Legacy entry point kept for backwards compatibility.

    Delegates to the comprehensive dashboard with the "Other" profile.
    """
    return generate_comprehensive_dashboard(data, "Other")
12
+
13
+
14
def extract_items_from_field(data: List[Dict], field_name: str) -> List[str]:
    """Collect semicolon-separated items of `field_name` across all rows.

    Non-string and empty values are skipped; items are stripped of
    surrounding whitespace and empty pieces dropped. Row order is kept.
    """
    collected = []
    for record in data:
        raw = record.get(field_name, "")
        if not raw or not isinstance(raw, str):
            continue
        collected.extend(item.strip() for item in raw.split(';') if item.strip())
    return collected
24
+
25
+
26
def generate_comprehensive_dashboard(
    data: List[Dict],
    interviewee_type: str
) -> plt.Figure:
    """
    Build the dashboard figure appropriate to the interviewee type.

    Returns a placeholder figure with a message when `data` is empty;
    otherwise dispatches to the HCP, Patient, or general dashboard
    builder and applies tight_layout.
    """
    if not data:
        # Nothing to plot — show an explanatory placeholder instead.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, 'No data available for visualization',
                ha='center', va='center', fontsize=14)
        ax.axis('off')
        return fig

    df = pd.DataFrame(data)

    builders = {
        "HCP": create_hcp_dashboard,
        "Patient": create_patient_dashboard,
    }
    fig = builders.get(interviewee_type, create_general_dashboard)(df)

    plt.tight_layout()
    return fig
54
+
55
+
56
def create_hcp_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 dashboard for HCP interviews.

    Panels: quality-score histogram, top diagnoses, top prescriptions,
    and per-transcript word counts. Each panel is drawn only when its
    source column exists and yields usable data; otherwise it is left
    blank.
    """

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Healthcare Professional Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution (histogram with mean marker)
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        # Coerce to numeric and drop unparseable entries.
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Top Diagnoses (horizontal bars, most frequent first)
    ax2 = axes[0, 1]
    if 'Diagnoses' in df.columns:
        diagnoses = extract_items_from_field(df.to_dict('records'), 'Diagnoses')
        if diagnoses:
            diagnosis_counts = Counter(diagnoses)
            top_diagnoses = dict(diagnosis_counts.most_common(8))

            if top_diagnoses:
                labels = list(top_diagnoses.keys())
                # Truncate long labels so they fit the axis.
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_diagnoses.values())

                bars = ax2.barh(labels, values, color='#2ecc71', edgecolor='black')
                ax2.set_xlabel('Frequency')
                ax2.set_title('Most Common Diagnoses')
                ax2.invert_yaxis()  # most frequent at the top

                # Add value labels at the end of each bar.
                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax2.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 3. Prescription Analysis (same layout as diagnoses)
    ax3 = axes[1, 0]
    if 'Prescriptions' in df.columns:
        prescriptions = extract_items_from_field(df.to_dict('records'), 'Prescriptions')
        if prescriptions:
            rx_counts = Counter(prescriptions)
            top_rx = dict(rx_counts.most_common(8))

            if top_rx:
                labels = list(top_rx.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_rx.values())

                bars = ax3.barh(labels, values, color='#e74c3c', edgecolor='black')
                ax3.set_xlabel('Frequency')
                ax3.set_title('Most Mentioned Prescriptions')
                ax3.invert_yaxis()

                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax3.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 4. Word Count by Transcript (bar per transcript with average line)
    ax4 = axes[1, 1]
    if 'Word Count' in df.columns and 'Transcript ID' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        # NOTE(review): positional slice assumes dropped rows were at the
        # tail; IDs can mis-align with counts if dropna removed earlier
        # rows — verify against real data.
        transcript_ids = df['Transcript ID'][:len(word_counts)]

        if len(word_counts) > 0:
            bars = ax4.bar(range(len(word_counts)), word_counts, color='#9b59b6',
                           edgecolor='black', alpha=0.7)
            ax4.set_xlabel('Transcript')
            ax4.set_ylabel('Word Count')
            ax4.set_title('Interview Length by Transcript')
            ax4.set_xticks(range(len(word_counts)))
            ax4.set_xticklabels(transcript_ids, rotation=45, ha='right')
            ax4.grid(axis='y', alpha=0.3)

            # Add mean line
            ax4.axhline(word_counts.mean(), color='red', linestyle='--',
                        label=f'Average: {int(word_counts.mean())}')
            ax4.legend()

    return fig
146
+
147
+
148
def create_patient_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 dashboard for Patient interviews.

    Panels: quality-score histogram, top symptoms, concern distribution
    (pie), and reported side effects. Panels are drawn only when their
    source columns exist and yield usable data.
    """

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Patient Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution (histogram with mean marker)
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Top Symptoms (horizontal bars, most frequent first)
    ax2 = axes[0, 1]
    if 'Primary Symptoms' in df.columns:
        symptoms = extract_items_from_field(df.to_dict('records'), 'Primary Symptoms')
        if symptoms:
            symptom_counts = Counter(symptoms)
            top_symptoms = dict(symptom_counts.most_common(8))

            if top_symptoms:
                labels = list(top_symptoms.keys())
                # Truncate long labels so they fit the axis.
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_symptoms.values())

                bars = ax2.barh(labels, values, color='#e67e22', edgecolor='black')
                ax2.set_xlabel('Frequency')
                ax2.set_title('Most Common Symptoms')
                ax2.invert_yaxis()

                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax2.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 3. Patient Concerns (pie chart of the top 6)
    ax3 = axes[1, 0]
    if 'Main Concerns' in df.columns:
        concerns = extract_items_from_field(df.to_dict('records'), 'Main Concerns')
        if concerns:
            concern_counts = Counter(concerns)
            top_concerns = dict(concern_counts.most_common(6))

            if top_concerns:
                # Create word cloud style pie chart
                labels = list(top_concerns.keys())
                labels = [label[:25] + '...' if len(label) > 25 else label for label in labels]
                sizes = list(top_concerns.values())
                colors_list = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#f9ca24', '#6c5ce7', '#a29bfe']

                ax3.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
                        colors=colors_list[:len(sizes)])
                ax3.set_title('Distribution of Patient Concerns')

    # 4. Side Effects (bars, or a placeholder message when none reported)
    ax4 = axes[1, 1]
    if 'Side Effects' in df.columns:
        side_effects = extract_items_from_field(df.to_dict('records'), 'Side Effects')
        if side_effects:
            se_counts = Counter(side_effects)
            top_se = dict(se_counts.most_common(6))

            if top_se:
                labels = list(top_se.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_se.values())

                bars = ax4.barh(labels, values, color='#e74c3c', edgecolor='black')
                ax4.set_xlabel('Frequency')
                ax4.set_title('Reported Side Effects')
                ax4.invert_yaxis()

                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax4.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)
        else:
            # Column present but no side effects extracted — say so.
            ax4.text(0.5, 0.5, 'No side effects reported',
                     ha='center', va='center', transform=ax4.transAxes, fontsize=12)
            ax4.axis('off')

    return fig
238
+
239
+
240
def create_general_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 general-purpose dashboard.

    Panels: quality-score histogram, word-count histogram, quality
    category counts, and a summary-statistics table. Panels are drawn
    only when their source columns exist and yield usable data.
    """

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('General Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution (histogram with mean marker)
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Word Count Distribution
    ax2 = axes[0, 1]
    if 'Word Count' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        if len(word_counts) > 0:
            ax2.hist(word_counts, bins=15, color='#2ecc71', edgecolor='black', alpha=0.7)
            ax2.set_xlabel('Word Count')
            ax2.set_ylabel('Frequency')
            ax2.set_title('Interview Length Distribution')
            ax2.grid(axis='y', alpha=0.3)

    # 3. Processing Summary: counts per quality band.
    # Note the band edges: 0.8 itself falls into "Good" (<=0.8), while
    # "Excellent" is strictly >0.8 — together the four bands partition
    # the score range.
    ax3 = axes[1, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()

        categories = ['Excellent\n(>0.8)', 'Good\n(0.6-0.8)', 'Fair\n(0.4-0.6)', 'Poor\n(<0.4)']
        counts = [
            sum(quality_scores > 0.8),
            sum((quality_scores >= 0.6) & (quality_scores <= 0.8)),
            sum((quality_scores >= 0.4) & (quality_scores < 0.6)),
            sum(quality_scores < 0.4)
        ]

        colors_list = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c']
        bars = ax3.bar(categories, counts, color=colors_list, edgecolor='black', alpha=0.7)
        ax3.set_ylabel('Number of Transcripts')
        ax3.set_title('Quality Score Categories')
        ax3.grid(axis='y', alpha=0.3)

        # Add value labels above non-empty bars.
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax3.text(bar.get_x() + bar.get_width()/2., height,
                         f'{int(height)}', ha='center', va='bottom', fontsize=10)

    # 4. Summary Statistics Table (rendered as a matplotlib table)
    ax4 = axes[1, 1]
    ax4.axis('off')

    stats_data = []
    if 'Transcript ID' in df.columns:
        stats_data.append(['Total Transcripts', str(len(df))])

    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            stats_data.append(['Avg Quality Score', f"{quality_scores.mean():.2f}"])
            stats_data.append(['Min Quality Score', f"{quality_scores.min():.2f}"])
            stats_data.append(['Max Quality Score', f"{quality_scores.max():.2f}"])

    if 'Word Count' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        if len(word_counts) > 0:
            stats_data.append(['Avg Word Count', f"{int(word_counts.mean()):,}"])
            stats_data.append(['Total Words', f"{int(word_counts.sum()):,}"])

    if stats_data:
        table = ax4.table(cellText=stats_data, cellLoc='left',
                          colWidths=[0.5, 0.3], loc='center',
                          colLabels=['Metric', 'Value'])
        table.auto_set_font_size(False)
        table.set_fontsize(11)
        table.scale(1, 2)

        # Style the table: dark header row, zebra-striped body rows.
        for i in range(len(stats_data) + 1):
            if i == 0:
                table[(i, 0)].set_facecolor('#34495e')
                table[(i, 1)].set_facecolor('#34495e')
                table[(i, 0)].set_text_props(weight='bold', color='white')
                table[(i, 1)].set_text_props(weight='bold', color='white')
            else:
                if i % 2 == 0:
                    table[(i, 0)].set_facecolor('#ecf0f1')
                    table[(i, 1)].set_facecolor('#ecf0f1')

        ax4.set_title('Summary Statistics', fontsize=12, fontweight='bold', pad=20)

    return fig
extractors.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docx import Document
2
+ import pdfplumber
3
+ import re
4
+ from typing import Tuple
5
+ import os
6
+
7
def extract_docx(file_obj) -> str:
    """
    Pull readable text out of a DOCX file, including table contents.

    Paragraph text is gathered in document order, then each table row is
    flattened into a single "cell | cell | ..." line so tabular content
    is not silently dropped.  The combined text is normalised via
    clean_extracted_text().  On failure a human-readable error string is
    returned instead of raising.
    """
    try:
        document = Document(file_obj)

        # Non-empty paragraphs only, in document order.
        pieces = [p.text.strip() for p in document.paragraphs if p.text.strip()]

        # Flatten table rows into pipe-separated lines.
        for tbl in document.tables:
            for row in tbl.rows:
                cells = [c.text.strip() for c in row.cells if c.text.strip()]
                if cells:
                    pieces.append(" | ".join(cells))

        # Normalise whitespace, quotes, etc. before handing back.
        return clean_extracted_text("\n\n".join(pieces))

    except Exception as e:
        error_msg = f"[DOCX Extraction Error] {str(e)}"
        print(error_msg)
        return f"Error extracting DOCX: {str(e)}"
43
+
44
+
45
def extract_pdf(file_obj) -> str:
    """
    Extract text from PDF with multiple strategies and enhanced error handling.

    Per page, tries plain extraction first, then layout-aware extraction,
    then tighter x/y character-grouping tolerances; any result shorter
    than 50 characters is treated as a failed attempt.  Pages are joined
    with "--- Page N ---" markers and the result is passed through
    clean_extracted_text().  Returns an error string (never raises) when
    nothing could be extracted.
    """
    try:
        extracted_pages = []

        with pdfplumber.open(file_obj) as pdf:
            # Track extraction success so we can warn about partial results.
            successful_pages = 0
            total_pages = len(pdf.pages)

            for page_num, page in enumerate(pdf.pages, 1):
                try:
                    # Strategy 1: Standard text extraction
                    page_text = page.extract_text()

                    # Strategy 2: If standard fails (< 50 chars), retry
                    # preserving the physical page layout.
                    if not page_text or len(page_text.strip()) < 50:
                        page_text = page.extract_text(layout=True)

                    # Strategy 3: If still poor, tighten the character
                    # grouping tolerances.
                    if not page_text or len(page_text.strip()) < 50:
                        page_text = page.extract_text(
                            x_tolerance=2,
                            y_tolerance=2
                        )

                    if page_text and page_text.strip():
                        # Clean and add page marker
                        clean_text = page_text.strip()
                        extracted_pages.append(f"--- Page {page_num} ---\n{clean_text}")
                        successful_pages += 1
                    else:
                        print(f"[PDF Warning] Page {page_num} yielded no text")

                except Exception as page_error:
                    # A bad page should not abort the whole document.
                    print(f"[PDF Warning] Error on page {page_num}: {page_error}")
                    continue

        if successful_pages == 0:
            return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."

        # Warn when fewer than half of the pages produced text.
        if successful_pages < total_pages * 0.5:
            print(f"[PDF Warning] Only {successful_pages}/{total_pages} pages extracted successfully")

        full_text = "\n\n".join(extracted_pages)

        # Clean up the extracted text
        full_text = clean_extracted_text(full_text)

        return full_text

    except Exception as e:
        error_msg = f"[PDF Extraction Error] {str(e)}"
        print(error_msg)
        return f"Error extracting PDF: {str(e)}"
102
+
103
+
104
def clean_extracted_text(text: str) -> str:
    """
    Clean up common issues in extracted text.

    - collapses runs of blank lines and repeated spaces
    - strips lone page numbers and "Page X of Y" headers/footers
    - normalises curly quotes and dashes to ASCII equivalents
    - removes zero-width characters and BOMs

    Args:
        text: raw text produced by a DOCX/PDF extractor.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Remove excessive whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)

    # Remove page numbers that appear alone on lines
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove common headers/footers patterns
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)

    # Normalise typographic punctuation to ASCII.  Explicit \uXXXX
    # escapes are used deliberately: spelling the characters literally
    # is fragile under re-encoding, and a mis-encoded literal such as
    # text.replace('', "'") would insert an apostrophe between every
    # character of the text (str.replace with an empty needle matches
    # at every position).
    text = text.translate(str.maketrans({
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark (curly apostrophe)
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2013': '-',   # en dash
        '\u2014': '-',   # em dash
    }))

    # Remove zero-width characters
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return text.strip()
131
+
132
+
133
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Sanity-check the quality of extracted text.

    Args:
        text: the extracted text to validate.
        filename: name of the source file (currently unused; kept for
            the caller-facing interface).

    Returns:
        (ok, message) where ok is True when the text passes all checks
        and message describes the pass/fail reason.
    """
    # Empty extraction.
    if not text or not text.strip():
        return False, "No text extracted"

    # Too little raw content to be a real transcript.
    if len(text) < 100:
        return False, f"Extracted text too short ({len(text)} characters)"

    # Extractor error strings begin with "Error" or a bracketed tag.
    if text.startswith("Error") or text.startswith("["):
        return False, "Extraction error detected"

    # Require a minimum number of words.  (The gibberish-ratio and
    # average-word-length heuristics were intentionally disabled
    # upstream, so only the word-count check remains.)
    words = text.split()
    if len(words) < 50:
        return False, f"Too few words ({len(words)})"

    return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
168
+
169
+
170
def detect_file_encoding(file_path: str) -> str:
    """
    Best-effort detection of a text file's encoding.

    Uses chardet when it is installed; falls back to 'utf-8' when
    chardet is unavailable, the file cannot be read, or detection is
    inconclusive (chardet reports encoding=None for empty or
    undecidable input - the original code would have returned that
    None to callers expecting a string).

    Args:
        file_path: path of the file to inspect.

    Returns:
        An encoding name such as 'utf-8' or 'ascii'; never None.
    """
    try:
        import chardet
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        # Guard against encoding=None from chardet.
        return result.get('encoding') or 'utf-8'
    except Exception:
        # Narrowed from a bare except; ImportError and OSError both
        # land here and share the same fallback.
        return 'utf-8'  # Default fallback
182
+
183
+
184
def extract_text_file(file_obj) -> str:
    """
    Read a plain-text file object, trying UTF-8 first and then common
    legacy encodings.

    The original implementation tried utf-8 -> latin-1 -> cp1252, but
    latin-1 maps every byte and can never raise, so the cp1252 branch
    was unreachable.  cp1252 is now tried before latin-1 so Windows
    punctuation (curly quotes, dashes) decodes correctly, with latin-1
    kept last as the never-failing catch-all.

    Args:
        file_obj: a binary file-like object supporting read().

    Returns:
        The decoded text, or an "Error reading text file: ..." string.
    """
    try:
        # Read once; avoids relying on seek() between attempts.
        raw = file_obj.read()
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        # Unreachable (latin-1 always succeeds), kept as a safety net.
        return raw.decode('latin-1', errors='replace')
    except Exception as e:
        return f"Error reading text file: {str(e)}"
llm.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ from typing import Tuple, Dict, List
5
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as ThreadTimeout
6
+
7
+
8
+ # Option 1: Use Hugging Face Inference API (recommended for better quality)
9
+ # Option 2: Use larger local model
10
+ # Option 3: Use OpenAI/Anthropic API if available
11
+
12
+ DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
13
+ USE_HF_API = os.getenv("USE_HF_API", "False").lower() == "true" # Set default to False
14
+ HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
15
+
16
+ #if HF_TOKEN:
17
+ # huggingface_hub import login
18
+ # login(token=HF_TOKEN)
19
def log(msg):
    """Print *msg* with an [LLM Debug] prefix when DEBUG_MODE is enabled."""
    if not DEBUG_MODE:
        return
    print(f"[LLM Debug] {msg}")
22
+
23
+
24
def get_system_prompt(interviewee_type: str, is_summary: bool = False) -> str:
    """Generate context-aware system prompts.

    Args:
        interviewee_type: "HCP", "Patient", or any other value for the
            generic analysis prompt.
        is_summary: when True, return the cross-interview synthesis
            prompt (takes precedence over interviewee_type).

    Returns:
        The full system-prompt string for the LLM.
    """

    # Shared preamble for every prompt variant.
    base_prompt = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
"""

    # Summary mode overrides the per-interviewee branches below.
    if is_summary:
        return base_prompt + """
CROSS-INTERVIEW SYNTHESIS & VALIDATION TASK:

You are analyzing multiple transcripts. Extract verified patterns and flag inconsistencies.

STEP 1 - PATTERN IDENTIFICATION:
For each theme, count occurrences across transcripts:
- How many participants mentioned X? (e.g., "7 out of 10 participants")
- Calculate percentages when relevant
- What's the range of perspectives?

STEP 2 - CLASSIFY BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ agreement): Findings most participants agree on
- MAJORITY VIEW (60-79%): Significant but not universal agreement
- SPLIT PERSPECTIVES (40-59%): Where views diverge
- OUTLIERS (<40%): Unique but noteworthy perspectives

STEP 3 - CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives differ and why
- Flag quality issues (brief transcripts, vague responses)

STEP 4 - CITE EVIDENCE:
- Reference specific transcript numbers
- Include brief supporting quotes/details
- Distinguish fact from interpretation

OUTPUT FORMAT:
Start with 2-3 sentence executive overview, then:

**STRONG CONSENSUS FINDINGS:**
[List with counts and evidence]

**MAJORITY FINDINGS:**
[List with counts]

**DIVERGENT PERSPECTIVES:**
[Where participants disagreed and context]

**NOTABLE OUTLIERS:**
[Unique but important points]

**QUALITY NOTES:**
[Any gaps or transcript issues]

CRITICAL RULES:
- NEVER use vague terms like "many," "most," "some" - always use specific numbers
- ALWAYS cite transcript numbers for claims
- FLAG weak evidence explicitly
- Separate facts from interpretations
- NO JSON output - write in clear narrative prose
"""

    if interviewee_type == "HCP":
        return base_prompt + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

    elif interviewee_type == "Patient":
        return base_prompt + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

    else:
        # Fallback prompt for any other interviewee type.
        return base_prompt + """
General Interview Analysis Focus:
- Main themes and topics discussed
- Key insights and observations
- Recommendations or suggestions
- Contextual factors
- Areas of emphasis or concern

Extract and structure relevant information based on interview content.
"""
144
+
145
+
146
def build_extraction_template(interviewee_type: str) -> str:
    """Return the JSON skeleton the LLM is asked to fill in.

    "HCP" and "Patient" get tailored field sets; any other value gets
    the generic template.  The returned string is valid JSON and is
    embedded verbatim into the prompt.
    """
    hcp_template = """{
"diagnoses": ["condition 1", "condition 2"],
"prescriptions": ["medication (dose, frequency, indication)"],
"treatment_rationale": ["reason for treatment choice"],
"guidelines_mentioned": ["guideline or study name"],
"clinical_decisions": ["key clinical decision with reasoning"],
"barriers": ["barrier to treatment"],
"key_insights": ["notable clinical insight"]
}"""

    patient_template = """{
"symptoms": ["symptom (severity, duration)"],
"concerns": ["patient concern or question"],
"treatments_current": ["current treatment"],
"treatments_past": ["past treatment with outcome"],
"treatment_response": ["description of how treatment is working"],
"side_effects": ["side effect experienced"],
"quality_of_life": ["impact on daily life"],
"adherence_factors": ["factor affecting medication adherence"]
}"""

    generic_template = """{
"key_insights": ["main insight or finding"],
"themes": ["recurring theme"],
"recommendations": ["recommendation or suggestion"],
"context": ["important contextual information"]
}"""

    if interviewee_type == "HCP":
        return hcp_template
    if interviewee_type == "Patient":
        return patient_template
    return generic_template
179
+
180
+
181
def parse_structured_response(text: str, interviewee_type: str) -> Dict:
    """
    Extract structured data from an LLM response.

    Prefers a JSON object embedded in the response; falls back to
    regex extraction of labelled lines ("diagnosis: ...", etc.) when no
    parseable JSON is found.

    Args:
        text: raw LLM output.
        interviewee_type: "HCP" or "Patient" select tailored fallback
            patterns; any other value yields an empty fallback dict.

    Returns:
        Dict mapping field names to lists of extracted strings (or
        whatever structure the embedded JSON contained).
    """
    # Try to find a JSON block (pattern tolerates one level of nesting).
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL)

    if json_match:
        try:
            data = json.loads(json_match.group())
            log(f"Successfully extracted JSON: {data}")
            return data
        except json.JSONDecodeError:
            log("Failed to parse JSON from response")

    # Fallback: Extract from text using patterns
    data = {}

    if interviewee_type == "HCP":
        # Extract diagnoses
        diag_pattern = r'(?:diagnos[ei]s|condition):\s*([^\n]+)'
        data["diagnoses"] = re.findall(diag_pattern, text, re.IGNORECASE)

        # Extract prescriptions
        rx_pattern = r'(?:prescri[bp]\w*|medication):\s*([^\n]+)'
        data["prescriptions"] = re.findall(rx_pattern, text, re.IGNORECASE)

        # Extract treatment rationale
        treat_pattern = r'(?:treatment|therapy|rationale):\s*([^\n]+)'
        data["treatment_rationale"] = re.findall(treat_pattern, text, re.IGNORECASE)

    elif interviewee_type == "Patient":
        # Extract symptoms
        symptom_pattern = r'(?:symptom|complaint|experienc\w*):\s*([^\n]+)'
        data["symptoms"] = re.findall(symptom_pattern, text, re.IGNORECASE)

        # Extract concerns
        concern_pattern = r'(?:concern|worry|question|anxious):\s*([^\n]+)'
        data["concerns"] = re.findall(concern_pattern, text, re.IGNORECASE)

        # Extract side effects
        se_pattern = r'(?:side effect|adverse|reaction):\s*([^\n]+)'
        data["side_effects"] = re.findall(se_pattern, text, re.IGNORECASE)

    # Strip whitespace and deduplicate.  dict.fromkeys preserves
    # first-seen order, so output is deterministic across runs
    # (list(set(...)) reordered items arbitrarily per process).
    for key in data:
        cleaned = [item.strip() for item in data[key] if item.strip()]
        data[key] = list(dict.fromkeys(cleaned))

    log(f"Extracted data from text: {data}")
    return data
230
+
231
+
232
def query_llm_hf_api(prompt: str, max_tokens: int = 500) -> str:
    """Query the Hugging Face Inference API (chat-completion endpoint).

    Sends *prompt* as the user turn with a fixed analyzer system
    message to microsoft/Phi-3-mini-4k-instruct.  Returns the stripped
    assistant reply, or an "[Error] ..." string on any failure (the
    full traceback is logged and echoed to the console).
    """
    try:
        from huggingface_hub import InferenceClient

        chat = [
            {"role": "system", "content": "You are an expert transcript analyzer. Provide detailed, structured analysis."},
            {"role": "user", "content": prompt},
        ]

        client = InferenceClient(token=HF_TOKEN)
        reply = client.chat_completion(
            messages=chat,
            model="microsoft/Phi-3-mini-4k-instruct",
            max_tokens=max_tokens,
            temperature=0.3,
        )
        return reply.choices[0].message.content.strip()

    except Exception as e:
        import traceback
        full_error = traceback.format_exc()
        log(f"HF API error: {e}\n{full_error}")
        print(f"[HF API Full Error]\n{full_error}")  # Print to console
        return f"[Error] HF API failed: {e}"
260
+
261
+
262
def query_llm_local(prompt: str, max_tokens: int = 500) -> str:
    """
    Run generation on a locally loaded FLAN-T5-XXL model.

    The tokenizer and model are loaded lazily on first call and cached
    as function attributes, so subsequent calls reuse them.  The prompt
    is truncated to the model's 512-token encoder window.

    Args:
        prompt: full prompt text.
        max_tokens: maximum number of new tokens to generate.

    Returns:
        The generated text, or an "[Error] ..." string on failure.
    """
    try:
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        import torch

        if not hasattr(query_llm_local, 'model'):
            log("Loading local model on L4...")
            query_llm_local.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
            query_llm_local.model = AutoModelForSeq2SeqLM.from_pretrained(
                "google/flan-t5-xxl",
                torch_dtype=torch.float16,
                device_map="auto"
            )

        # Tokenize and truncate to 512 tokens
        inputs = query_llm_local.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        # Follow the model's actual placement instead of hard-coding
        # "cuda": with device_map="auto" the model may live on CPU (or
        # be sharded), and .to("cuda") crashed on GPU-less hosts.
        inputs = inputs.to(query_llm_local.model.device)

        outputs = query_llm_local.model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False
        )

        response = query_llm_local.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        log(f"Local model error: {e}")
        return f"[Error] Local model failed: {e}"
297
+
298
+
299
def query_llm(
    chunk: str,
    user_context: str,
    interviewee_type: str,
    extract_structured: bool = False,
    is_summary: bool = False,
    timeout: int = 120
) -> Tuple[str, Dict]:
    """
    Main LLM query function with structured extraction.

    Builds the system+user prompt for one transcript chunk, routes it
    to a backend (LM Studio env flag -> HF Inference API -> local
    model), enforces a wall-clock timeout via a worker thread, and
    optionally parses structured JSON out of the response.

    Args:
        chunk: transcript segment to analyze.
        user_context: free-form user instructions inserted into the prompt.
        interviewee_type: "HCP" / "Patient" / other; selects the prompt
            and extraction template.
        extract_structured: when True, also request and parse a JSON block.
        is_summary: when True, use the cross-interview synthesis prompt.
        timeout: seconds to wait for generation before giving up.

    Returns:
        Tuple of (response_text, structured_data_dict); on error the
        text is an "[Error] ..." string and the dict is empty.
    """

    system_prompt = get_system_prompt(interviewee_type, is_summary)
    extraction_template = build_extraction_template(interviewee_type) if extract_structured else ""

    # Build comprehensive prompt
    full_prompt = f"""{system_prompt}

User Instructions:
{user_context}

Transcript Segment to Analyze:
{chunk}

"""

    if extract_structured:
        full_prompt += f"""
IMPORTANT: Provide your analysis in two parts:
1. A clear narrative summary (3-5 sentences)
2. Structured data in this exact JSON format:
{extraction_template}

Be specific and include relevant details (dosages, durations, severity levels, etc.)
"""

    # Truncate if needed: only the transcript chunk is trimmed; the
    # fixed prompt parts are rebuilt around the shortened chunk.
    max_prompt_length = 6000  # Increased from 2000
    if len(full_prompt) > max_prompt_length:
        chunk_limit = max_prompt_length - len(system_prompt) - len(user_context) - len(extraction_template) - 500
        chunk = chunk[:chunk_limit]
        full_prompt = f"{system_prompt}\n\nUser Instructions:\n{user_context}\n\nTranscript Segment:\n{chunk}\n\n"
        if extract_structured:
            full_prompt += f"Provide analysis and structured JSON: {extraction_template}"
        log(f"Prompt truncated to {len(full_prompt)} characters")

    def generate():
        # Backend priority: LM Studio env flag, then HF API, then local.
        # NOTE(review): query_llm_lmstudio is not defined in this
        # module's visible code - confirm it exists before enabling
        # USE_LMSTUDIO, otherwise this raises NameError.
        if os.getenv("USE_LMSTUDIO", "False").lower() == "true":
            return query_llm_lmstudio(full_prompt, max_tokens=600)
        elif USE_HF_API and HF_TOKEN:
            return query_llm_hf_api(full_prompt, max_tokens=600)
        else:
            return query_llm_local(full_prompt, max_tokens=600)

    # Execute with timeout
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(generate)
        try:
            response = future.result(timeout=timeout)
            log(f"LLM response received ({len(response)} chars)")

            # Extract structured data if requested
            structured_data = {}
            if extract_structured:
                structured_data = parse_structured_response(response, interviewee_type)

            return response, structured_data

        except ThreadTimeout:
            log("LLM generation timed out")
            return "[Error] LLM generation timed out.", {}
        except Exception as e:
            log(f"LLM generation failed: {e}")
            return f"[Error] LLM generation failed: {e}", {}
376
+
377
+
378
def extract_structured_data(text: str, interviewee_type: str) -> Dict:
    """
    Standalone function to extract structured data from existing text.

    Thin wrapper around parse_structured_response(); useful for
    post-processing analyses that were generated without structured
    extraction enabled.

    Args:
        text: previously generated analysis text.
        interviewee_type: "HCP" / "Patient" / other (selects fallback
            extraction patterns).

    Returns:
        Dict of extracted fields (see parse_structured_response).
    """
    return parse_structured_response(text, interviewee_type)
narrative_report_generator.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from typing import Tuple
4
+ from docx import Document
5
+ from docx.shared import Inches
6
+ from reportlab.lib.pagesizes import letter
7
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
8
+ from reportlab.lib.styles import getSampleStyleSheet
9
+ from report_parser import parse_transcriptor_output
10
+ from table_builder import build_all_tables
11
+ from story_writer import generate_narrative
12
+
13
def generate_narrative_report(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient",
                              report_style: str = "executive", llm_backend: str = "lmstudio",
                              output_dir: str = "./outputs") -> Tuple[str, str, str]:
    """
    End-to-end pipeline: parse the transcriptor CSV, build summary
    tables, generate the LLM narrative, and render PDF/Word/HTML files
    into *output_dir* with a timestamped base name.

    Returns:
        (pdf_path, word_path, html_path) of the generated reports.
    """
    print("[1/4] Parsing...")
    parsed_data = parse_transcriptor_output(csv_path, summary_path, interviewee_type)

    print("[2/4] Building tables...")
    tables = build_all_tables(parsed_data)

    print("[3/4] Generating narrative (1-2 min)...")
    narrative = generate_narrative(parsed_data, tables, report_style, llm_backend)

    print("[4/4] Creating outputs...")
    os.makedirs(output_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base = f"{output_dir}/narrative_report_{stamp}"
    pdf, word, html = f"{base}.pdf", f"{base}.docx", f"{base}.html"

    create_pdf(narrative, tables, parsed_data, pdf)
    create_word(narrative, tables, parsed_data, word)
    create_html(narrative, tables, parsed_data, html)

    print(f"Done!\nPDF: {pdf}\nWord: {word}\nHTML: {html}")
    return pdf, word, html
40
+
41
def create_pdf(narrative, tables, data, path):
    """Render the narrative as a simple single-flow PDF via ReportLab."""
    styles = getSampleStyleSheet()
    flowables = [
        Paragraph("Narrative Research Report", styles['Title']),
        Spacer(1, 0.3 * 72),
    ]

    for block in narrative.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        # Escape the two characters ReportLab's mini-XML chokes on.
        safe = block.replace('&', '&amp;').replace('<', '&lt;')
        flowables.append(Paragraph(safe, styles['BodyText']))
        flowables.append(Spacer(1, 0.1 * 72))

    SimpleDocTemplate(path, pagesize=letter).build(flowables)
55
+
56
def create_word(narrative, tables, data, path):
    """Write the narrative to a .docx with one paragraph per section."""
    document = Document()
    document.add_heading('Narrative Research Report', 0)
    for block in (s.strip() for s in narrative.split('\n\n')):
        if block:
            document.add_paragraph(block)
    document.save(path)
63
+
64
def create_html(narrative, tables, data, path):
    """
    Write the narrative as a standalone HTML page.

    Section text is HTML-escaped so characters such as '<' and '&' in
    the narrative cannot break the markup or inject tags (the original
    interpolated raw text into the page).  The file is written as
    UTF-8 explicitly, with a matching meta charset, so output does not
    depend on the platform's default encoding.
    """
    import html as html_mod

    page = """<!DOCTYPE html><html><head><meta charset="utf-8"><style>
body{font-family:Arial;max-width:900px;margin:40px auto;padding:20px;line-height:1.6}
h1{color:#2c3e50;text-align:center}
</style></head><body><h1>Narrative Research Report</h1>"""
    for section in narrative.split('\n\n'):
        if section.strip():
            page += f"<p>{html_mod.escape(section.strip())}</p>"
    page += "</body></html>"
    with open(path, 'w', encoding='utf-8') as f:
        f.write(page)
outputs/sample.txt ADDED
File without changes
report.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Transcript ID,File Name,Quality Score,Word Count,Key Insights,Recommendations
2
+ Transcript 1,570_24_July30_2pmET_TDredacted.docx,1.00,6812,"Interviewee is an independent researcher with expertise in healthcare topics but not a medical professional.; Focus on sharing medical materials and seeking opinions.; The interviewee expresses concern about potential TV ads for treatment of cramps and mentions the importance of managing cramps; Respondent diagnosed with fibe years ago, currently on whole bunch of medication including Vimpat and Gabapentin. Experienced side effects from one called Xcopr and had to switch.; The respondent emphasizes the importance of early intervention in mental health; The respondent confirms prior discussions and shows willingness to engage; The interviewee emphasizes the importance of clear communication and understanding in medical contexts; The interviewee expresses comfort with the application but lacks confidence in their ability to manage it independently; The interviewee emphasizes the importance of minimizing complications in medical treatment; The interviewee is focusing on the main idea of motivating participants to collaborate; The interviewee emphasizes the importance of full disclosure about a four-year period, highlighting potential gaps in knowledge and the need for transparency.; The interviewee expresses uncertainty about the TV ad's effectiveness and uniqueness; TV ad version compatibility and user preference for consistent format; The interviewee's ability to communicate clearly improved over time; The interviewee expresses a strong preference for personalized medical advice and emphasizes the importance of understanding individual differences in treatment responses.; The respondent values comparison with past experiences but emphasizes the importance of maintaining a strong foundation; The interviewee values personalized care and emphasizes the importance of having compassionate caregivers.; The interviewee expresses uncertainty about their own reactions compared to others, noting increased emotion 
and concern; Lack of alignment with current context; preference for happiness and movement forward; The respondent's condition involves deep-seated issues requiring careful consideration; The respondent emphasizes the importance of providing reliable pharmaceutical products with clear instructions; The interviewee expressed difficulty in transitioning from a previous role and uncertainty about their current responsibilities; The respondent found the ad motivating and engaging; The respondent finds the ad relevant but lacks specific details about its content; The respondent appreciates the detailed analysis but expresses concern about the specific recommendations; The respondent's behavior during the crisis was notably calm and composed, contrasting with the expected heightened emotional response.; Positive rapport between interviewer and respondent",Engage in discussions for mutual benefit; Clarify roles and expectations; Further investigation into specific side effects or problems with particular medical treatments; Consider alternative treatments for managing side effects; Consider implementing potential TV ads for mental health awareness; Encourage further detailed exploration of ideas; Ensure all parties involved have a clear understanding of the medication and its administration; Encourage gradual independence; Provide additional support; Further testing to identify specific causes; Encourage open communication and shared goals; Ensure thorough disclosure of all relevant medical history to avoid misunderstandings.; Conduct further research on similar ads to gauge effectiveness; Maintain consistent TV ad formats across platforms; Encourage continued practice to enhance communication; Encourage healthcare providers to tailor treatments based on patient specifics; Focus on building a solid foundation; Focus on hiring caregivers with strong interpersonal skills and a history of providing personalized care.; Further exploration of emotional triggers; Assessment of 
comparative emotional responses; Encourage alignment with current realities while maintaining focus on happiness; Consider a comprehensive treatment plan addressing underlying issues; Ensure all pharmaceutical products come with detailed usage guidelines; Clarify job expectations and provide support during the transition; Consider enhancing ad content to maintain motivation; Provide more detailed information about the ad; Clarify and validate specific recommendations; Encourage further training in crisis management techniques; Maintain positive communication
report.pdf ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %PDF-1.4
2
+ %���� ReportLab Generated PDF document http://www.reportlab.com
3
+ 1 0 obj
4
+ <<
5
+ /F1 2 0 R /F2 3 0 R
6
+ >>
7
+ endobj
8
+ 2 0 obj
9
+ <<
10
+ /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
11
+ >>
12
+ endobj
13
+ 3 0 obj
14
+ <<
15
+ /BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
16
+ >>
17
+ endobj
18
+ 4 0 obj
19
+ <<
20
+ /Contents 10 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
21
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
22
+ >> /Rotate 0 /Trans <<
23
+
24
+ >>
25
+ /Type /Page
26
+ >>
27
+ endobj
28
+ 5 0 obj
29
+ <<
30
+ /Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
31
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
32
+ >> /Rotate 0 /Trans <<
33
+
34
+ >>
35
+ /Type /Page
36
+ >>
37
+ endobj
38
+ 6 0 obj
39
+ <<
40
+ /Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
41
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
42
+ >> /Rotate 0 /Trans <<
43
+
44
+ >>
45
+ /Type /Page
46
+ >>
47
+ endobj
48
+ 7 0 obj
49
+ <<
50
+ /PageMode /UseNone /Pages 9 0 R /Type /Catalog
51
+ >>
52
+ endobj
53
+ 8 0 obj
54
+ <<
55
+ /Author (\(anonymous\)) /CreationDate (D:20251005104519-04'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251005104519-04'00') /Producer (ReportLab PDF Library - www.reportlab.com)
56
+ /Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
57
+ >>
58
+ endobj
59
+ 9 0 obj
60
+ <<
61
+ /Count 3 /Kids [ 4 0 R 5 0 R 6 0 R ] /Type /Pages
62
+ >>
63
+ endobj
64
+ 10 0 obj
65
+ <<
66
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1291
67
+ >>
68
+ stream
69
+ Gat=*;0/3r&:W67f[3@JN(a6QP5OGhDOO7f1bip>9RT)?koAXFh-jIRoY5VE5ZkH.a6n0tn?VrY\)6TpMBJ_0aFDAOi9))/qB^_AL:i0f#nk>:_-&ttCD%)J=U9qAS]MX8@C40$\:6C%#m!I>p+aD6"<9e!0Q8!flm`LNGY*tf!fh:G)Cpe^*Lo$^g0&"G'=QW)+!JCNqun&^q@f]doQQ'kQDHme`")18&CP[WgPPDL:6QCqX!EQ<).1j3[:Hu\HW/TpP/&g0(khG`<Oh6;iaa+T(D-([S.69S7AL)['b<VOe=FE*iOY*H3oDkRBS=:_A7&7'mfi_<8sG'WS.F'49/I3W1PeD-+WLZW,Uo8_Fs$9p@Thgoh_Yhf[DFL>jLZ2"9l.W@i&58?a'cXROh#Y0K%oGBB2]XtU?AB9B,#LF8Vu#a>ug:MGGFu+s3[G>m>qEGi*6%_9nL><Cl9iIKR@'[o2Q3k?;H.e[c?H!ON_3a=2DGVL4_(j@gNZnqPOiKQcWX2S%UDu"rlCbD7!'ONGa%]cT`ZFFcZ=fiNB!kF)TOKRa@];#K(%eF*fU"k<bDYI@%&(dFeuOcDbT4o<,6MfcB;QJ4[%;K&p(DbSO]+Ila(\U8L@QNV`7mds3STXmH.['0)7H>73:]+S`>?)[gVo6TbH:kD/D_;9bQCC'<)!X4rp:_*Y,4B0eKR!fo5f.^tOl:*5*(Kom=s:k7%q,q_(HcqON9`mDlnc@ISSUbg@)j-3NFBUf7=BoS`qMJFK+`W?<fj&(+JMJFJebOeDDTZ0;i>"!""\/H;=Th"H!@;%S.Y(\=W>oHSL2LtP/\X1ia7dTY!4)d0A`*tj=_][0tPS\I8WM*-th.T"1RCp+!fMS>1ENNlX;<pqF+31oq1ffgS>S%g-</W%+TdJ1=m*Nqp<>LIU>*]@%mSem)-"R_UA$N:Fm_0>lT?;a>R>46/_MCAhakEbLnTs^s"'+daZ-cQIThq%YB?-O;['d=PKOd!:%l`9a3PPM3YZZlAh?2tGd(,/d]qgu_W>o_5ap2:KF_WM@*4A;88gV<Ri\hsG,t_o5@C/L&b1a^5OX^m2)EbH!o5+EqouZn])IK+4>U4V?I;P+lS`us9%*dlaiPZ%->[1N2KAdo>I_jj4K2d#?!e3mNhO.XDrkqQEUEn;ih]B&ek8?tl9(mOZIEKQ(.H]`9=33hYhWpU9'gAkJEV@>RF.,$tbAh@nR#Dsp6c4=1RVIU:c=\$Pk120g8PMO8\o0Z6i-EuTAse%%Ip)Nd-6Z^c%mN8b0_l5SQHu6~>endstream
70
+ endobj
71
+ 11 0 obj
72
+ <<
73
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1910
74
+ >>
75
+ stream
76
+ GauHK?#SIU'Re<2\F<ETUoG\iVJf\&%NAHA?+k-k:o#p:>APT_,q5bKYlDF);O6Fs8Y)t#!Z6qkh/i%qkC,+Y(k7DBj<`4JF*d[,Td6K1P*/[!UA<,W4[8gcQF&"e0hmiD%c^]V9[iOqP+m%T9SQK'gh/Yp]fLe>UpCMG-1%Ln9YB2LZ[.'H/<be/BWbKL\<aP!mj_@SN&9G=qub3YFjR>^[g,-c9r!Pdi9p2.&t["G1`fu#.3+gOdchU/:#)';a-+`Q@0kLIf(18_-T%F72sKPgVl&)TnZ=FIk;Vu_/Mb>JaJ)AuchG#?bhuPXh"hc_a2&/(o;+rO>EX?99aKUm\+A2\/#;"J)m6Pmf#-_m:>g>C4Gi(mWV\Z(E^Ja;Db=[7>W%dp^goDc\X4[Wn'EF(iEba/8MiJkP4:5d*iYFpA]T`"@@*0*E>5>@Cksq(deWe7mlqXhd9ea*-oO#i13/Xa1O)B=%">:Zctb(JKck)c<@"V=e,g_W,qY'`Rcd#dXgC*FcaD,#)P!oX(Tai$B3LBs[H*kc@un37/60h*UJbWcZ[uqoQGOK<T\='u6%]u22hp7YadJIFW]O?=GU6MO*6Q>.n^7:BO:!Y1B^kG!Mfh4f\L9lPk5$.n[UD/>2m"g;MLmFEAel^V9P]+'\/&>7-.q_->*Nd@<ZWHK"p)5a$n[aAZE_au/OUnNdQ4bu"ThnG8?5(1?lXmU[c[E_>TjiSm=&*oAsG6VE/7XhT>`OuH3mk;dThLR0YX2::</!2oJ[r@%s<JD3fMK3Z5\e#iV@tgh1J=hkDlm>g->f3@R(<1j?b't$O+f)b!LDF50_stb6#"N$YE$Z8[[n5j-l6U@+G5:dpO1g=Y">,N8c\$<3__7\3CJKWNV`Q!Sk'G/6dH-iItQT]F!!&p[&raa,<Dk="pp`$Y/9J#RlhG1.])`8kqif+CMa5&tja1!MMgqKd]Zo[[P#GhEXD5@<=U7r1f\eQ+YWo!l)ol#NqJ,Y>RI)d%6f.%ZjF_en>a>dFE4,<LVTnWqo/O>(pRR8[bX`6)f,&Q?"-^K=XQf=6bB^UG\(*Amjm^7mYRtWHd9$<!^34(b_?%dJ)4SA%g*8?jg!s/useUZ]P=SanG>:e>`Y++OK?7p$Z/9g:c@Zj:&[g%e,\Fd9T1IKfG13>T0[8`]'(=4nUqe@O7-ZhpcuI?gM,r]^lpl'i7SX?e1r!XO,(t7:Z$CpE\@A*"EQ/f?02<:4toG>BOd-4$d)gnKaPD;-/'($I;N"f4b)c\sD2e9U'\FN;QTolSQs,e.m:7kXTM]3)WnabInujGB<c3+)5F(RAP.N?"U8NCu3ok[X%85i*G<Rf!-P5d_#h.DD<[mo]1Y]Eja)#>=M@A<Nh[0JSpH1Wdgi=m&4VZmWR+[/]S-E.5dQ5XX,sl%sbpLjQb;Rkb8%tN\-K97t^<C_3#Vq/;CHa:b_[q;Dk=igF(ajmH9G_0")-P-=tRW.OQbp>W(7&-d-ZC*Ac`GJ5k0fVFZK=d.Me0g+I,7@3sDlC]#:4HT"YjGVV@l!@OZ7lYO@t>=BU5C0mR9aYDUNgXb+tjpb:b%?C<o+a4U,V=eff3&Ki^s6H8Iejt9Hi-nnA\,<E[dpUeHbR'10K#%egMD\;(1UMDXL9g8%,(smT-D2$a`&qF1M8e-":4`;"EFeKZN'd*&GYR58jS+Qo(A\+LpC5^!CIS]rpFCum_HTL8;j<LTa%)KMl0tr>mIAK\Y#<C'AMX5D<U-7][m[^5+\kg(>o-_6h*%`Fr1aRX-Z^6Gr_3`g$Ib1jUqb`[&Aq+tr:KGn`TFDroT9='.oiO5h^EpXAj7#"'[%kko^IcW*`1Ou^T998ghg\Zc\8_&8uo@()H`:slgK-]Lj#ql^:H@Ri1'6u8qsJTN"\T6Bnm"cr<&F-Ab5~>endstream
77
+ endobj
78
+ 12 0 obj
79
+ <<
80
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 876
81
+ >>
82
+ stream
83
+ Gau0A9lJKG&A@7.N-m*'R+V3a*'p\lU'Dt$&>I'gCiAXq27r1-'_1'BlSumY9,.c,83'@UhqNGFm3I6#J!05(."HLZ]Y7&^8KfT,=i:]h5/BZGq(YJ;cIE>(lP2@"/Pam]Eq'@nF>6SRCP]lP\cc54(ap9+(JDj!*]VgT#M?BKlg#L&2#MshQ?M/]G-\3^Mr)=;.'!WLp1-+F[R<B\WR/aKEo=i2W\N)"('LK7-+-_C:?iIF;,&VGH>0`rKGTa+5kYi9']iT&_-9708GhW:'I(OpZtI&J$V83Q=3$)IMLpMJ)q*de6Oc/?,sKU;H%.qa?ac8g/9l2!&9n:4B,Fe<^bmBJ+U&-f*P(TA_XQbRI<YDF8;1;0d5p\6kSR?(^o"@@72:=`K*nsgW,nPbA[8BAIQ+Ba`q[?n*KKU8&&P^C)q^5S0RNee57`0LS]Bp0q;Zc^gXf4s`'!kY6q@Q:@5WY.HmGLSU!uA1lp>.5*phs*d!@-OKA(BobTNK2Fj6X_o^jO@rFr$Zs6k`_:^F1!\U:*)Be>X@abKGD7B'>^EU"N`'ml-3Bs<'QpBSRh=.O]U\8Z+@`MI)dYF]]IRJ-=&?!>SSn%11*]'_=Z)qThOFofQJ\Urpj[3N'*OHh/b6H)W@@-_VLP$DH0*]r.lZ`PZ0pV@AnRTSdjnC+_B*=%/&fYnBV*pECmUS`QcGY=<DhUpZVkl0D0+<MqB$M@(R7N<5US(1T[R^T<;o.qYt,*lutE)r%a_s>=%OZ>'WfBSm-j/_5lKo7%mBKT0MXGmcU9,gfJfF;A7ed15:AB7soeD4/g(CG'jO!>rqe,3@P>I$(^C#_;Rob;/X]O,a+GSHd5k%1jX5G9IQ4A5EKR+qd3C*R]~>endstream
84
+ endobj
85
+ xref
86
+ 0 13
87
+ 0000000000 65535 f
88
+ 0000000073 00000 n
89
+ 0000000114 00000 n
90
+ 0000000221 00000 n
91
+ 0000000333 00000 n
92
+ 0000000527 00000 n
93
+ 0000000721 00000 n
94
+ 0000000915 00000 n
95
+ 0000000983 00000 n
96
+ 0000001266 00000 n
97
+ 0000001337 00000 n
98
+ 0000002720 00000 n
99
+ 0000004722 00000 n
100
+ trailer
101
+ <<
102
+ /ID
103
+ [<6812a7b40f4b5abfbec04669e48f4c7d><6812a7b40f4b5abfbec04669e48f4c7d>]
104
+ % ReportLab generated PDF document -- digest (http://www.reportlab.com)
105
+
106
+ /Info 8 0 R
107
+ /Root 7 0 R
108
+ /Size 13
109
+ >>
110
+ startxref
111
+ 5689
112
+ %%EOF
report_parser.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from typing import Dict, List, Tuple
4
+ from collections import Counter
5
+
6
def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load a transcript-report CSV and derive summary metadata.

    Returns the raw DataFrame plus a dict holding the transcript count,
    average quality score, average word count, and transcript-ID list.
    Each derived field is None/empty when its source column is absent.
    """
    df = pd.read_csv(csv_path)

    has_quality = "Quality Score" in df
    has_words = "Word Count" in df
    has_ids = "Transcript ID" in df

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": df["Quality Score"].astype(float).mean() if has_quality else None,
        "avg_word_count": df["Word Count"].astype(int).mean() if has_words else None,
        "transcript_ids": df["Transcript ID"].tolist() if has_ids else [],
    }
    return df, metadata
15
+
16
def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Tally the most frequent semicolon-separated items per theme column.

    Which columns are inspected depends on the interviewee type; every
    present column maps to its top-10 items as {"item", "count"} dicts.
    """
    column_map = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Side Effects"],
    }
    theme_columns = column_map.get(interviewee_type, ["Key Insights"])

    themes: Dict[str, List] = {}
    for col in theme_columns:
        if col not in df.columns:
            continue
        # Split each cell on ';' and keep non-empty trimmed pieces.
        items = [
            piece.strip()
            for val in df[col].dropna()
            if isinstance(val, str)
            for piece in val.split(';')
            if piece.strip()
        ]
        themes[col] = [
            {"item": item, "count": count}
            for item, count in Counter(items).most_common(10)
        ]
    return themes
34
+
35
def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Summarize quality-score tier counts and word-count aggregates.

    Tiers: excellent > 0.8, good [0.6, 0.8], fair [0.4, 0.6), poor < 0.4.
    A section is omitted entirely when its source column is missing.
    """
    stats: Dict = {}

    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        tiers = {
            "excellent_count": scores > 0.8,
            "good_count": (scores >= 0.6) & (scores <= 0.8),
            "fair_count": (scores >= 0.4) & (scores < 0.6),
            "poor_count": scores < 0.4,
        }
        quality = {"mean": scores.mean()}
        for tier_name, mask in tiers.items():
            quality[tier_name] = sum(mask)
        stats["quality"] = quality

    if "Word Count" in df.columns:
        words = df["Word Count"].astype(int)
        stats["word_count"] = {"mean": int(words.mean()), "total": int(words.sum())}

    return stats
50
+
51
def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    """Bundle the parsed CSV, metadata, themes, and statistics in one dict.

    ``summary_path`` is accepted for interface compatibility but is not
    read by the current implementation.
    """
    df, metadata = parse_csv_output(csv_path)
    return {
        "dataframe": df,
        "metadata": metadata,
        "themes": extract_key_themes(df, interviewee_type),
        "statistics": calculate_statistics(df),
        "interviewee_type": interviewee_type,
    }
reporting.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from reportlab.lib.pagesizes import letter, A4
3
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
4
+ from reportlab.lib.units import inch
5
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
6
+ from reportlab.lib import colors
7
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT
8
+ from datetime import datetime
9
+ from typing import List, Dict
10
+ import os
11
+
12
def generate_csv(data, path="report.csv"):
    """Legacy entry point.

    Kept for backwards compatibility; simply delegates to
    generate_enhanced_csv with the generic "Other" interviewee type.
    """
    return generate_enhanced_csv(data, "Other", path)
15
+
16
+
17
def generate_enhanced_csv(
    data: List[Dict],
    interviewee_type: str,
    path: str = "report.csv"
) -> str:
    """Write analysis rows to a UTF-8 (BOM) CSV with priority columns first.

    An empty ``data`` list produces a header-only file with
    "Transcript ID"/"Status" columns. Returns the output path.
    ``interviewee_type`` is accepted for interface parity (currently unused).
    """
    if not data:
        # Header-only placeholder so downstream readers still find a file.
        pd.DataFrame(columns=["Transcript ID", "Status"]).to_csv(path, index=False)
        return path

    df = pd.DataFrame(data)

    # Put identifying/metric columns up front, keeping the rest in order.
    priority_cols = ["Transcript ID", "File Name", "Quality Score", "Word Count"]
    leading = [c for c in priority_cols if c in df.columns]
    trailing = [c for c in df.columns if c not in priority_cols]

    # utf-8-sig adds a BOM so Excel detects the encoding correctly.
    df[leading + trailing].to_csv(path, index=False, encoding='utf-8-sig')
    return path
46
+
47
+
48
def generate_pdf(summary, details, path="report.pdf"):
    """Legacy entry point: wrap raw detail text as a single-transcript result.

    Builds a stub result dict around ``details`` and delegates to
    generate_enhanced_pdf with no processing errors.
    """
    stub_result = {
        "transcript_id": "Transcript 1",
        "file_name": "analysis.txt",
        "full_text": details,
        "quality_score": 0.8,
        "word_count": len(details.split()),
    }
    return generate_enhanced_pdf(summary, [stub_result], "Other", [], path)
59
+
60
+
61
def generate_enhanced_pdf(
    summary: str,
    results: List[Dict],
    interviewee_type: str,
    processing_errors: List[str],
    path: str = "report.pdf"
) -> str:
    """
    Generate a professional PDF report with proper formatting.

    Args:
        summary: Executive-summary text; '\\n\\n' separates paragraphs.
        results: Per-transcript dicts with transcript_id, file_name,
            full_text, quality_score and word_count keys (missing keys
            fall back to safe defaults instead of raising).
        interviewee_type: "HCP" / "Patient" / "Other" label for the cover.
        processing_errors: Human-readable error strings; rendered on their
            own page when non-empty.
        path: Output file path; returned to the caller.

    Falls back to a minimal error PDF if the full build raises.
    """

    def _escape(raw: str) -> str:
        # Paragraph() parses mini-XML, so &, < and > must be entity-escaped
        # (ampersand first so it doesn't re-escape the other entities).
        return raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    doc = SimpleDocTemplate(
        path,
        pagesize=letter,
        rightMargin=0.75*inch,
        leftMargin=0.75*inch,
        topMargin=0.75*inch,
        bottomMargin=0.75*inch
    )

    # Container for the 'Flowable' objects
    story = []
    styles = getSampleStyleSheet()

    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=24,
        textColor=colors.HexColor('#1a1a1a'),
        spaceAfter=30,
        alignment=TA_CENTER,
        fontName='Helvetica-Bold'
    )
    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=16,
        textColor=colors.HexColor('#2c3e50'),
        spaceAfter=12,
        spaceBefore=20,
        fontName='Helvetica-Bold'
    )
    subheading_style = ParagraphStyle(
        'CustomSubheading',
        parent=styles['Heading3'],
        fontSize=13,
        textColor=colors.HexColor('#34495e'),
        spaceAfter=8,
        spaceBefore=12,
        fontName='Helvetica-Bold'
    )
    body_style = ParagraphStyle(
        'CustomBody',
        parent=styles['BodyText'],
        fontSize=11,
        leading=14,
        textColor=colors.HexColor('#2c3e50'),
        alignment=TA_LEFT
    )

    # --- Title page ---
    story.append(Paragraph("Transcript Analysis Report", title_style))
    story.append(Spacer(1, 0.2*inch))

    metadata = [
        ["Report Generated:", datetime.now().strftime("%B %d, %Y at %I:%M %p")],
        ["Interviewee Type:", interviewee_type],
        ["Total Transcripts:", str(len(results))],
        ["Successfully Processed:", str(sum(1 for r in results if r.get("quality_score", 0) > 0))]
    ]
    metadata_table = Table(metadata, colWidths=[2*inch, 4*inch])
    metadata_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#ecf0f1')),
        ('TEXTCOLOR', (0, 0), (-1, -1), colors.HexColor('#2c3e50')),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#bdc3c7'))
    ]))
    story.append(metadata_table)
    story.append(Spacer(1, 0.3*inch))

    # --- Executive summary ---
    story.append(Paragraph("Executive Summary", heading_style))
    story.append(Spacer(1, 0.1*inch))
    for para in summary.split('\n\n'):
        if para.strip():
            story.append(Paragraph(_escape(para.strip()), body_style))
            story.append(Spacer(1, 0.1*inch))

    # --- Processing errors (only when some occurred) ---
    if processing_errors:
        story.append(PageBreak())
        story.append(Paragraph("Processing Issues", heading_style))
        story.append(Spacer(1, 0.1*inch))
        for error in processing_errors:
            story.append(Paragraph(f"• {_escape(error)}", body_style))
            story.append(Spacer(1, 0.05*inch))

    # --- Individual transcript details ---
    story.append(PageBreak())
    story.append(Paragraph("Detailed Transcript Analysis", heading_style))
    story.append(Spacer(1, 0.2*inch))

    for idx, result in enumerate(results):
        transcript_title = f"{result.get('transcript_id', 'Unknown')} - {result.get('file_name', '')}"
        story.append(Paragraph(transcript_title, subheading_style))

        stats_data = [
            ["Quality Score:", f"{result.get('quality_score', 0):.2f}/1.00"],
            ["Word Count:", f"{result.get('word_count', 0):,}"]
        ]
        stats_table = Table(stats_data, colWidths=[1.5*inch, 2*inch])
        stats_table.setStyle(TableStyle([
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
            ('TOPPADDING', (0, 0), (-1, -1), 4),
        ]))
        story.append(stats_table)
        story.append(Spacer(1, 0.1*inch))

        # Cap chunk count and length so one transcript can't balloon the PDF.
        for chunk in result.get('full_text', '').split('\n\n')[:10]:
            if chunk.strip():
                clean_chunk = _escape(chunk.strip())
                if len(clean_chunk) > 1000:
                    clean_chunk = clean_chunk[:1000] + "..."
                story.append(Paragraph(clean_chunk, body_style))
                story.append(Spacer(1, 0.1*inch))

        story.append(Spacer(1, 0.2*inch))

        # BUG FIX: the original used `if result != results[-1]`, which compares
        # dicts by value — a transcript equal in content to the last one would
        # wrongly skip its page break. Compare positions instead.
        if idx < len(results) - 1:
            story.append(PageBreak())

    # Build PDF, falling back to a minimal error document on failure.
    try:
        doc.build(story)
        return path
    except Exception as e:
        print(f"[PDF Error] Failed to generate PDF: {e}")
        simple_doc = SimpleDocTemplate(path, pagesize=letter)
        simple_story = [
            Paragraph("Transcript Analysis Report", title_style),
            Paragraph(f"Error generating full report: {str(e)}", body_style),
            Paragraph(summary, body_style)
        ]
        simple_doc.build(simple_story)
        return path
requirements.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core frameworks
2
+ gradio>=4.0.0
3
+ transformers>=4.35.0
4
+ torch>=2.0.0
5
+
6
+ # NLP and text processing
7
+ nltk>=3.8.0
8
+ tiktoken>=0.5.0
9
+
10
+ # Document processing
11
+ python-docx>=1.1.0
12
+ pdfplumber>=0.10.0
13
+
14
+ # Data processing and analysis
15
+ pandas>=2.0.0
16
+ numpy>=1.24.0
17
+
18
+ # Visualization
19
+ matplotlib>=3.7.0
20
+ seaborn>=0.12.0
21
+
22
+ # PDF generation
23
+ reportlab>=4.0.0
24
+
25
+ # API integrations
26
+ huggingface_hub>=0.19.0
27
+
28
+ # Utilities
29
+ chardet>=5.0.0
30
+ python-dateutil>=2.8.0
31
+
32
+ # Optional but recommended
33
+ accelerate>=0.24.0
34
+ sentencepiece>=0.1.99
35
+ protobuf>=4.24.0
36
+
37
+ # Audio transcription
38
+ faster-whisper>=0.10.0
39
+ torchaudio>=2.0.0
40
+ speechbrain>=0.5.16
41
+ scikit-learn>=1.3.0 # For clustering speaker embeddings
story_writer.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from typing import Dict
4
+
5
def format_table_for_llm(df: pd.DataFrame, name: str) -> str:
    """Render a DataFrame as a labelled text block, or a no-data placeholder."""
    if df.empty:
        return f"[{name}: No data]\n"
    return f"\n{name}:\n{df.to_string()}\n"
7
+
8
def build_narrative_prompt(parsed_data: Dict, tables: Dict, style: str) -> str:
    """Compose the LLM prompt for the executive research narrative.

    ``style`` and the parsed statistics are read for interface parity but
    do not currently alter the generated prompt.
    """
    metadata = parsed_data["metadata"]
    stats = parsed_data["statistics"]
    interviewee_type = parsed_data["interviewee_type"]

    rendered_tables = [format_table_for_llm(df, name) for name, df in tables.items()]
    tables_text = "\n".join(rendered_tables)

    return f"""Write an executive research report for {metadata['total_transcripts']} {interviewee_type.lower()} interviews.

DATA TABLES:
{tables_text}

STRUCTURE:
1. EXECUTIVE OVERVIEW (2-3 paragraphs): Context, sample, high-level findings
2. KEY FINDINGS (3-5 sections): Each with narrative + data + significance
3. PATTERNS & THEMES (2 paragraphs): Cross-cutting insights
4. RECOMMENDATIONS (3-5 bullets): Actionable next steps

Write professionally. Quantify everything. Be specific. Lead with insights."""
27
+
28
def call_lmstudio(prompt: str) -> str:
    """POST the prompt to a local LM Studio server.

    Server base URL comes from the LM_STUDIO_URL env var. Returns the
    completion text, or an "[Error: ...]" tag on any failure.
    """
    import requests
    base = os.getenv("LM_STUDIO_URL", "http://192.168.1.245:1234")
    payload = {
        "messages": [
            {"role": "system", "content": "You are an expert research report writer."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 2000,
        "temperature": 0.7,
    }
    try:
        resp = requests.post(f"{base}/v1/chat/completions", json=payload, timeout=180)
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[Error: {e}]"
40
+
41
def call_hf_api(prompt: str) -> str:
    """Run the prompt through the HF Inference API (Mixtral 8x7B).

    Token comes from the HUGGINGFACE_TOKEN env var. Returns the generated
    text, or an "[Error: ...]" tag on any failure.
    """
    from huggingface_hub import InferenceClient
    try:
        client = InferenceClient(token=os.getenv("HUGGINGFACE_TOKEN", ""))
        return client.text_generation(
            prompt,
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            max_new_tokens=2000,
            temperature=0.7,
        )
    except Exception as e:
        return f"[Error: {e}]"
49
+
50
def generate_narrative(parsed_data: Dict, tables: Dict, style: str, llm_backend: str) -> str:
    """Build the report prompt and dispatch it to the selected LLM backend.

    ``llm_backend`` == "lmstudio" routes to the local server; anything
    else goes to the Hugging Face Inference API.
    """
    prompt = build_narrative_prompt(parsed_data, tables, style)
    backend = call_lmstudio if llm_backend == "lmstudio" else call_hf_api
    return backend(prompt)
table_builder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import Dict
3
+ from collections import Counter
4
+
5
def build_participant_profile_table(metadata: Dict) -> pd.DataFrame:
    """Build the two-column participant-profile summary table.

    BUG FIX: parse_csv_output stores None for avg_quality_score /
    avg_word_count when the source column is missing; formatting None with
    ``:.2f`` raised TypeError. Missing or None averages now render as 0.
    """
    avg_quality = metadata.get("avg_quality_score") or 0
    avg_words = metadata.get("avg_word_count") or 0
    return pd.DataFrame({
        "Metric": ["Total Participants", "Avg Quality Score", "Avg Words"],
        "Value": [
            metadata.get("total_transcripts", 0),
            f"{avg_quality:.2f}",
            f"{avg_words:,.0f}",
        ],
    })
14
+
15
def build_quality_distribution_table(stats: Dict) -> pd.DataFrame:
    """Turn quality tier counts into a table with percentage shares.

    Returns an empty frame when no quality stats exist. BUG FIX: when every
    tier count was zero the percentage column divided by zero and produced
    NaN; it is now 0.0 for all tiers.
    """
    if "quality" not in stats:
        return pd.DataFrame()
    q = stats["quality"]
    df = pd.DataFrame({
        "Quality Tier": ["Excellent (>0.8)", "Good (0.6-0.8)", "Fair (0.4-0.6)", "Poor (<0.4)"],
        "Count": [q.get("excellent_count", 0), q.get("good_count", 0),
                  q.get("fair_count", 0), q.get("poor_count", 0)],
    })
    total = df["Count"].sum()
    if total > 0:
        df["Percentage"] = (df["Count"] / total * 100).round(1)
    else:
        df["Percentage"] = 0.0
    return df
26
+
27
def build_frequency_table(themes: Dict) -> pd.DataFrame:
    """Flatten per-theme top items into a Category/Item/Frequency table."""
    rows = [
        {"Category": theme_name, "Item": entry["item"], "Frequency": entry["count"]}
        for theme_name, entries in themes.items()
        for entry in entries[:10]
    ]
    return pd.DataFrame(rows) if rows else pd.DataFrame()
33
+
34
def build_all_tables(parsed_data: Dict) -> Dict[str, pd.DataFrame]:
    """Build every report table from the parsed transcript data.

    Always includes the participant profile; the quality distribution and
    theme frequency tables are added only when they contain data.

    Fix: the original also extracted parsed_data["dataframe"] into an
    unused local, which has been removed.
    """
    tables = {
        "participant_profile": build_participant_profile_table(parsed_data["metadata"]),
    }

    quality_table = build_quality_distribution_table(parsed_data["statistics"])
    if not quality_table.empty:
        tables["quality_distribution"] = quality_table

    freq_table = build_frequency_table(parsed_data["themes"])
    if not freq_table.empty:
        tables["theme_frequency"] = freq_table

    return tables
tagging.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Tuple
3
+ from collections import Counter
4
+
5
def detect_speaker_patterns(text: str) -> dict:
    """Detect which speaker-labelling convention a transcript uses.

    Counts line-anchored matches for the colon ("Speaker 1: ..."), bracket
    ("[Interviewer] ..."), dash ("- Doctor: ...") and Q/A conventions,
    picks the most frequent, and reports the unique speaker labels found.
    """
    colon = re.findall(r'^([A-Z][a-z\s]+\d*):\s', text, re.MULTILINE)
    bracket = re.findall(r'^\[([^\]]+)\]\s', text, re.MULTILINE)
    dash = re.findall(r'^-\s*([A-Z][a-z\s]+):\s', text, re.MULTILINE)
    has_qa = bool(re.search(r'^(Q|A):\s', text, re.MULTILINE))

    pattern_counts = {
        "colon_based": len(colon),
        "bracket_based": len(bracket),
        "dash_based": len(dash),
        "q_a_based": 1 if has_qa else 0,
    }

    if any(pattern_counts.values()):
        most_common = max(pattern_counts, key=pattern_counts.get)
    else:
        most_common = None

    if most_common == "q_a_based":
        speakers = ["Q", "A"]
    elif most_common is None:
        speakers = []
    else:
        found = {"colon_based": colon, "bracket_based": bracket, "dash_based": dash}[most_common]
        speakers = list(set(found))

    return {
        "pattern_type": most_common,
        "speakers_found": speakers,
        "speaker_count": len(speakers),
        "has_structure": most_common is not None,
    }
39
+
40
+
41
def classify_speaker_role(text: str, speaker_label: str, interviewee_type: str) -> str:
    """
    Classify one utterance as interviewer, interviewee, or neutral.

    Scores the text against question, clinical, patient-experience, and
    closing-phrase pattern sets, then applies thresholds chosen by the
    ``interviewee_type`` ("HCP", "Patient", or anything else for generic).

    Args:
        text: The utterance to classify.
        speaker_label: Original transcript label (kept for interface
            compatibility; not used by the heuristics).
        interviewee_type: Selects the decision thresholds.

    BUG FIX: the patient-experience patterns contained uppercase "I" but
    were matched against lowercased text, so they could never fire and
    patient utterances fell through to Interviewer/Unknown. The patterns
    are now lowercase.
    """
    text_lower = text.lower()

    def _score(patterns):
        # One point per pattern family that matches anywhere in the text.
        return sum(1 for p in patterns if re.search(p, text_lower))

    # Question patterns (likely interviewer)
    question_score = _score([
        r'\?$',
        r'^(what|how|why|when|where|who|can you|could you|would you|do you|have you)',
        r'(tell me|explain|describe|walk me through)',
        r'(your thoughts|your experience|your perspective)',
    ])

    # Medical/clinical patterns
    clinical_score = _score([
        r'\b(prescribe|prescription|rx|medication|drug|dose|dosage|mg|ml)\b',
        r'\b(diagnos[ei]s|diagnosed|condition|disease|disorder)\b',
        r'\b(treatment|therapy|intervention|protocol)\b',
        r'\b(patient|case|clinical|medical|symptom)\b',
        r'\b(efficacy|effectiveness|outcome|response|adverse)\b',
        r'\b(guideline|recommendation|standard of care|first-line)\b',
    ])

    # Patient experience patterns (lowercased to match text_lower)
    patient_score = _score([
        r"\b(i feel|i felt|i'm experiencing|i have)\b",
        r'\b(my symptoms|my condition|my pain|my treatment)\b',
        r"\b(it hurts|it bothers|it helps|it doesn't work)\b",
        r'\b(i tried|i take|i stopped|i started)\b',
        r'\b(doctor told me|doctor said|doctor prescribed)\b',
    ])

    # Neutral/closing patterns
    neutral_score = _score([
        r'\b(thank you|thanks|appreciate|goodbye|bye|closing)\b',
        r"\b(that concludes|that's all|we're done)\b",
    ])

    # Short closing pleasantries are neutral regardless of speaker.
    if neutral_score > 0 and len(text.split()) < 15:
        return "Neutral"

    if interviewee_type == "HCP":
        # In HCP interviews, heavy clinical language marks the doctor.
        if clinical_score >= 3:
            return "Doctor"
        if question_score >= 2:
            return "Interviewer"
        if clinical_score >= 1:
            return "Doctor"
        return "Unknown"

    if interviewee_type == "Patient":
        # In patient interviews, first-person experience marks the patient.
        if patient_score >= 2:
            return "Patient"
        if question_score >= 2:
            return "Interviewer"
        if clinical_score >= 2:
            return "Interviewer"  # likely interviewer explaining medical info
        if patient_score >= 1:
            return "Patient"
        return "Unknown"

    # Generic classification for any other interview type.
    if question_score >= 2:
        return "Interviewer"
    if clinical_score >= 2:
        return "Respondent"
    return "Unknown"
125
+
126
+
127
def parse_existing_tags(text: str, pattern_info: dict) -> List[Tuple[str, str]]:
    """Split labelled transcript text into (speaker, utterance) pairs.

    Uses the pattern type reported by detect_speaker_patterns. Text with no
    recognized structure comes back as a single ("Unknown", text) segment.

    BUG FIX: the dash_based convention ("- Doctor: text") was detected by
    detect_speaker_patterns but never parsed here, so dash-labelled
    transcripts fell into the Unknown branch; it now has a split pattern.
    """
    pattern_type = pattern_info["pattern_type"]

    # One line-anchored split regex per labelling convention.
    split_patterns = {
        "colon_based": r'^([A-Z][a-z\s]+\d*):\s',
        "bracket_based": r'^\[([^\]]+)\]\s',
        "dash_based": r'^-\s*([A-Z][a-z\s]+):\s',
        "q_a_based": r'^([QA]):\s',
    }

    if pattern_type not in split_patterns:
        # No clear pattern - treat as single block
        return [("Unknown", text)]

    parts = re.split(split_patterns[pattern_type], text, flags=re.MULTILINE)
    segments = []
    # re.split with one capture group yields [pre, label, content, label, ...].
    for i in range(1, len(parts), 2):
        if i + 1 >= len(parts):
            continue
        if pattern_type == "q_a_based":
            speaker = "Interviewer" if parts[i] == "Q" else "Respondent"
        else:
            speaker = parts[i].strip()
        content = parts[i + 1].strip()
        if content:
            segments.append((speaker, content))
    return segments
168
+
169
+
170
def tag_speakers_advanced(text: str, role_hint: str = "", interviewee_type: str = "Other") -> str:
    """Tag every utterance in a transcript with a [Role] prefix.

    Detects the labelling convention, applies user-supplied role hints
    (e.g. "Speaker 1 = Interviewer, Speaker 2 = Doctor"), and
    auto-classifies any remaining speakers from their content.
    """
    pattern_info = detect_speaker_patterns(text)

    # Optional "original = mapped" hints, keyed case-insensitively.
    role_mapping = {}
    if role_hint:
        for original, mapped in re.findall(r'([^,=]+)\s*=\s*([^,=]+)', role_hint):
            role_mapping[original.strip().lower()] = mapped.strip()

    if pattern_info["has_structure"]:
        segments = parse_existing_tags(text, pattern_info)
    else:
        # No recognizable labels: treat each non-empty line as a segment.
        segments = [("Unknown", line.strip()) for line in text.split('\n') if line.strip()]

    tagged_lines = []
    for speaker_label, content in segments:
        hint_key = speaker_label.lower()
        if hint_key in role_mapping:
            role = role_mapping[hint_key]
        else:
            role = classify_speaker_role(content, speaker_label, interviewee_type)
        tagged_lines.append(f"[{role}] {content}")

    return "\n\n".join(tagged_lines)
210
+
211
+
212
def analyze_speaker_distribution(tagged_text: str) -> dict:
    """Count how often each [Role] tag opens a line — QC for tagging output.

    Returns total segment count, the number of unique roles, the raw
    distribution, and each role's percentage share (empty when no tags).
    """
    tags = re.findall(r'^\[([^\]]+)\]', tagged_text, re.MULTILINE)
    counts = Counter(tags)
    total = len(tags)

    if total > 0:
        percentages = {role: occurrences / total * 100 for role, occurrences in counts.items()}
    else:
        percentages = {}

    return {
        "total_segments": total,
        "unique_speakers": len(counts),
        "distribution": dict(counts),
        "percentages": percentages,
    }
utils.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for TranscriptorAI
3
+ """
4
+
5
+ import os
6
+ import json
7
+ import hashlib
8
+ import pickle
9
+ from datetime import datetime
10
+ from typing import Any, Dict, List, Optional
11
+ from pathlib import Path
12
+ import logging
13
+
14
+ # ============================================================================
15
+ # LOGGING SETUP
16
+ # ============================================================================
17
+
18
def setup_logging(log_file: str = "transcript_analysis.log", level: str = "INFO"):
    """Configure root logging to a file and the console; return this module's logger.

    Args:
        log_file: Path of the log file to write to.
        level: Logging level name (case-insensitive), e.g. "DEBUG".
    """
    handlers = [logging.FileHandler(log_file), logging.StreamHandler()]
    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=handlers,
    )
    return logging.getLogger(__name__)

# Module-wide logger, configured once on import.
logger = setup_logging()
31
+
32
+ # ============================================================================
33
+ # CACHING UTILITIES
34
+ # ============================================================================
35
+
36
def get_file_hash(file_path: str) -> str:
    """Return the MD5 hex digest of a file, reading it in 64 KiB chunks."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
45
+
46
+
47
def cache_result(key: str, data: Any, cache_dir: str = "./.cache") -> bool:
    """Pickle ``data`` to <cache_dir>/<key>.pkl; return True on success."""
    try:
        os.makedirs(cache_dir, exist_ok=True)
        target = os.path.join(cache_dir, f"{key}.pkl")
        with open(target, 'wb') as fh:
            pickle.dump(data, fh)
        logger.debug(f"Cached result for key: {key}")
        return True
    except Exception as e:
        logger.error(f"Failed to cache result: {e}")
        return False
61
+
62
+
63
def load_cached_result(key: str, cache_dir: str = "./.cache") -> Optional[Any]:
    """Unpickle a cached value; None when absent, older than 7 days, or unreadable.

    NOTE: uses pickle, so the cache directory must be trusted.
    """
    try:
        cache_file = os.path.join(cache_dir, f"{key}.pkl")
        if not os.path.exists(cache_file):
            return None

        # Expire entries older than one week.
        age_seconds = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if age_seconds > 7 * 24 * 3600:
            logger.debug(f"Cache expired for key: {key}")
            return None

        with open(cache_file, 'rb') as fh:
            data = pickle.load(fh)
        logger.debug(f"Loaded cached result for key: {key}")
        return data
    except Exception as e:
        logger.error(f"Failed to load cached result: {e}")
        return None
85
+
86
+
87
def clear_cache(cache_dir: str = "./.cache"):
    """Delete every file in the cache directory (missing directory is a no-op)."""
    try:
        if os.path.exists(cache_dir):
            for entry in os.listdir(cache_dir):
                os.remove(os.path.join(cache_dir, entry))
            logger.info(f"Cleared cache directory: {cache_dir}")
    except Exception as e:
        logger.error(f"Failed to clear cache: {e}")
97
+
98
+
99
+ # ============================================================================
100
+ # FILE UTILITIES
101
+ # ============================================================================
102
+
103
def ensure_directory(path: str) -> str:
    """Create ``path`` (and any parents) if needed; return it unchanged."""
    Path(path).mkdir(parents=True, exist_ok=True)
    return path
107
+
108
+
109
def get_unique_filename(base_path: str, extension: str = "") -> str:
    """Insert a YYYYMMDD_HHMMSS timestamp before the extension to avoid clashes.

    ``extension`` overrides the extension taken from ``base_path`` when given.
    """
    stem, original_ext = os.path.splitext(base_path)
    suffix = extension if extension else original_ext
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{stem}_{stamp}{suffix}"
115
+
116
+
117
def get_file_size_mb(file_path: str) -> float:
    """Return the size of ``file_path`` in mebibytes."""
    size_bytes = os.path.getsize(file_path)
    return size_bytes / (1024 * 1024)
120
+
121
+
122
def validate_file(file_path: str, max_size_mb: int = 50, allowed_extensions: List[str] = None) -> tuple:
    """Validate existence, size cap, and extension; return (ok, reason).

    ``allowed_extensions`` defaults to ['.docx', '.pdf'] when None.
    """
    allowed = allowed_extensions if allowed_extensions is not None else ['.docx', '.pdf']

    if not os.path.exists(file_path):
        return False, "File does not exist"

    if get_file_size_mb(file_path) > max_size_mb:
        return False, f"File exceeds {max_size_mb}MB limit"

    ext = os.path.splitext(file_path)[1].lower()
    if ext not in allowed:
        return False, f"File type {ext} not supported"

    return True, "Valid"
138
+
139
+
140
+ # ============================================================================
141
+ # DATA PROCESSING UTILITIES
142
+ # ============================================================================
143
+
144
def sanitize_text(text: str) -> str:
    """Strip null bytes and collapse all whitespace runs to single spaces."""
    without_nulls = text.replace('\x00', '')
    return ' '.join(without_nulls.split()).strip()
153
+
154
+
155
def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Clip ``text`` to ``max_length`` characters, ending in ``suffix`` when clipped."""
    if len(text) > max_length:
        return text[:max_length - len(suffix)] + suffix
    return text
160
+
161
+
162
def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """Return the ``top_n`` most frequent 3+-letter words, minus basic stop words."""
    from collections import Counter
    import re

    stop_words = {
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'with',
        'this', 'that', 'from', 'they', 'have', 'has', 'was', 'were'
    }

    # Lowercase tokenization, keeping only alphabetic words of length >= 3.
    tokens = (w for w in re.findall(r'\b[a-z]{3,}\b', text.lower()) if w not in stop_words)
    return [word for word, _ in Counter(tokens).most_common(top_n)]
181
+
182
+
183
+ # ============================================================================
184
+ # STATISTICS UTILITIES
185
+ # ============================================================================
186
+
187
def calculate_statistics(values: List[float]) -> Dict[str, float]:
    """Return mean/median/std/min/max/count for ``values`` (empty dict if none)."""
    if not values:
        return {}

    import numpy as np

    metrics = {
        "mean": np.mean,
        "median": np.median,
        "std": np.std,
        "min": np.min,
        "max": np.max,
    }
    summary = {name: fn(values) for name, fn in metrics.items()}
    summary["count"] = len(values)
    return summary
202
+
203
+
204
def calculate_percentile(values: List[float], percentile: int) -> float:
    """Calculate the given percentile of *values*.

    Returns a plain Python float, matching the declared return type (the
    previous version leaked a numpy scalar).  Raises if *values* is empty,
    as numpy does.
    """
    import numpy as np
    return float(np.percentile(values, percentile))
208
+
209
+
210
+ # ============================================================================
211
+ # JSON UTILITIES
212
+ # ============================================================================
213
+
214
def save_json(data: Dict, filepath: str, pretty: bool = True) -> bool:
    """Save data as a JSON file; returns True on success, False on failure."""
    try:
        # indent=None reproduces json.dump's compact default formatting.
        indent = 2 if pretty else None
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=indent, ensure_ascii=False)
        logger.debug(f"Saved JSON to: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to save JSON: {e}")
        return False
227
+
228
+
229
def load_json(filepath: str) -> Optional[Dict]:
    """Load a JSON file; returns the parsed data, or None on any failure."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        logger.debug(f"Loaded JSON from: {filepath}")
        return payload
    except Exception as e:
        logger.error(f"Failed to load JSON: {e}")
        return None
239
+
240
+
241
+ # ============================================================================
242
+ # PROGRESS TRACKING
243
+ # ============================================================================
244
+
245
class ProgressTracker:
    """Simple progress tracker for long operations.

    Renders an in-place text progress bar (carriage-return overwrite)
    with percentage, counts, and a naive ETA estimate.
    """

    def __init__(self, total: int, description: str = "Processing"):
        # Public state: callers may read total/current/description.
        self.total = total
        self.current = 0
        self.description = description
        self.start_time = datetime.now()

    def update(self, n: int = 1):
        """Advance progress by *n* steps (clamped to total) and redraw."""
        self.current = min(self.total, self.current + n)
        self._print_progress()

    def _print_progress(self):
        """Draw the bar; emits a final newline once complete."""
        if self.total > 0:
            percentage = (self.current / self.total) * 100
            filled = int(40 * self.current / self.total)
        else:
            percentage = 0
            filled = 0
        bar = '█' * filled + '-' * (40 - filled)

        # ETA: linear extrapolation from elapsed time per completed step.
        elapsed = (datetime.now() - self.start_time).total_seconds()
        eta = (elapsed / self.current * (self.total - self.current)) if self.current > 0 else 0

        print(f'\r{self.description}: |{bar}| {percentage:.1f}% ({self.current}/{self.total}) ETA: {eta:.0f}s', end='')

        if self.current >= self.total:
            print()  # New line when complete
273
+
274
+
275
+ # ============================================================================
276
+ # ERROR HANDLING UTILITIES
277
+ # ============================================================================
278
+
279
def safe_execute(func, *args, default=None, error_msg="Operation failed", **kwargs):
    """Safely execute a function with error handling.

    Returns func(*args, **kwargs); on any exception the error is logged
    with *error_msg* and *default* is returned instead of raising.
    """
    try:
        result = func(*args, **kwargs)
    except Exception as e:
        logger.error(f"{error_msg}: {e}")
        return default
    return result
286
+
287
+
288
+ # ============================================================================
289
+ # TEXT COMPARISON UTILITIES
290
+ # ============================================================================
291
+
292
def calculate_similarity(text1: str, text2: str) -> float:
    """Calculate a simple similarity score between two texts.

    Uses Jaccard similarity over lowercase word sets:
    |intersection| / |union|, in [0, 1].
    """
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    # Either side empty => no basis for comparison.
    if not tokens_a or not tokens_b:
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined) if combined else 0.0
304
+
305
+
306
+ # ============================================================================
307
+ # BATCH PROCESSING UTILITIES
308
+ # ============================================================================
309
+
310
def batch_items(items: List, batch_size: int) -> List[List]:
    """Split a list into consecutive batches of at most *batch_size*."""
    starts = range(0, len(items), batch_size)
    return [items[start:start + batch_size] for start in starts]
313
+
314
+
315
def parallel_process(func, items: List, max_workers: int = 4):
    """Process items in parallel with a thread pool.

    Results are returned in the SAME order as *items*.  The previous
    implementation collected results via as_completed(), so output order
    depended on task completion timing and could not be aligned with the
    inputs.  Items whose call raises are logged and yield None.

    Args:
        func: Callable applied to each item.
        items: Input items.
        max_workers: Maximum number of worker threads.

    Returns:
        List of results (None for failed items), aligned with *items*.
    """
    from concurrent.futures import ThreadPoolExecutor

    results = [None] * len(items)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(func, item) for item in items]
        for idx, future in enumerate(futures):
            try:
                results[idx] = future.result()
            except Exception as e:
                logger.error(f"Parallel processing error: {e}")
                # Slot stays None so callers can map failures back to inputs.

    return results
331
+
332
+
333
+ # ============================================================================
334
+ # EXPORT UTILITIES
335
+ # ============================================================================
336
+
337
def export_to_excel(data: Dict[str, List[Dict]], filepath: str) -> bool:
    """Export multiple row-lists to an Excel workbook, one sheet per key.

    Returns True on success, False on any failure (logged).
    """
    try:
        import pandas as pd

        # Each dict key becomes a sheet; each row-list becomes a DataFrame.
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            for sheet_name, rows in data.items():
                pd.DataFrame(rows).to_excel(writer, sheet_name=sheet_name, index=False)

        logger.info(f"Exported to Excel: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to export to Excel: {e}")
        return False
352
+
353
+
354
+ # ============================================================================
355
+ # VALIDATION UTILITIES
356
+ # ============================================================================
357
+
358
def is_valid_email(email: str) -> bool:
    """Basic email validation (single local@domain.tld shape, no RFC edge cases)."""
    import re

    email_re = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    return email_re.match(email) is not None
363
+
364
+
365
def is_valid_url(url: str) -> bool:
    """Basic URL validation: http(s) scheme, no whitespace or angle brackets."""
    import re

    return re.match(r'^https?://[^\s<>"]+$', url) is not None
370
+
371
+
372
+ # ============================================================================
373
+ # MAIN (FOR TESTING)
374
+ # ============================================================================
375
+
376
if __name__ == "__main__":
    # Lightweight smoke tests for the helpers in this module.
    import time

    print("Testing utilities...")

    # File-system helper
    out_dir = ensure_directory("./test_output")
    print(f"Created test directory: {out_dir}")

    # JSON round-trip
    payload = {"key": "value", "number": 42}
    save_json(payload, "./test_output/test.json")
    reloaded = load_json("./test_output/test.json")
    assert reloaded == payload, "JSON save/load failed"
    print("✓ JSON operations work")

    # Statistics helper
    sample = [1, 2, 3, 4, 5]
    stats = calculate_statistics(sample)
    print(f"✓ Statistics: {stats}")

    # Progress tracker (visual check)
    bar = ProgressTracker(10, "Test")
    for _ in range(10):
        time.sleep(0.1)
        bar.update()
    print("✓ Progress tracker works")

    print("\n✓ All utility tests passed!")
validation.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Tuple, Dict, List
3
+
4
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Validate that text extraction was successful.

    Runs cheap heuristics (length, garbled-text patterns, word count and
    real-word ratio) and returns (is_valid, human-readable reason).
    *filename* is accepted for interface compatibility but not used here.
    """
    if not text or not text.strip():
        return False, "No text extracted"

    # Minimum raw length before deeper checks are worthwhile.
    if len(text) < 50:
        return False, f"Extracted text too short ({len(text)} chars)"

    # Patterns that usually indicate a garbled/broken extraction.
    for pattern, msg in (
        (r'[^\x00-\x7F]{50,}', "Contains large blocks of non-ASCII characters"),
        (r'(.)\1{20,}', "Contains suspicious character repetition"),
        (r'^[\W\d\s]+$', "Contains only symbols/numbers/whitespace"),
    ):
        if re.search(pattern, text):
            return False, msg

    words = text.split()
    if len(words) < 20:
        return False, f"Too few words ({len(words)})"

    # Share of tokens that look like plain alphabetic words (heuristic).
    real_words = [w for w in words if re.match(r'^[a-zA-Z]{2,}$', w)]
    word_ratio = len(real_words) / len(words) if words else 0

    if word_ratio < 0.3:
        return False, f"Low word ratio ({word_ratio:.2f}) - possible extraction issue"

    return True, f"Valid ({len(words)} words, {len(text)} chars)"
40
+
41
+
42
def validate_transcript_quality(
    analyzed_text: str,
    structured_data: Dict,
    interviewee_type: str
) -> Tuple[float, str]:
    """
    Assess quality of analyzed transcript.

    Starts at a perfect score of 1.0 and subtracts fixed penalties for:
    brevity, missing/empty structured data, missing type-specific
    terminology and key fields (HCP vs Patient), embedded error
    messages, and highly repetitive content.

    Args:
        analyzed_text: Narrative analysis text for one transcript.
        structured_data: Extracted fields; values are treated as
            populated when truthy.
        interviewee_type: "HCP", "Patient", or anything else (other
            values skip the type-specific checks).

    Returns:
        Tuple of (quality_score [0-1], issues_description)
    """
    score = 1.0
    issues = []

    # Check 1: Length of analysis
    if len(analyzed_text) < 100:
        score -= 0.3
        issues.append("Analysis too brief")
    elif len(analyzed_text) < 300:
        score -= 0.1
        issues.append("Analysis somewhat brief")

    # Check 2: Presence of structured data
    if not structured_data:
        score -= 0.2
        issues.append("No structured data extracted")
    else:
        empty_fields = sum(1 for v in structured_data.values() if not v)
        total_fields = len(structured_data)

        if empty_fields == total_fields:
            score -= 0.3
            issues.append("All structured fields empty")
        elif empty_fields > total_fields * 0.7:
            score -= 0.2
            issues.append("Most structured fields empty")

    # Check 3: Type-specific validation
    if interviewee_type == "HCP":
        # Expect medical terminology in an HCP analysis
        medical_terms = re.findall(
            r'\b(diagnos\w+|prescri\w+|treatment|medication|patient|clinical|therapy)\b',
            analyzed_text,
            re.IGNORECASE
        )
        if len(medical_terms) < 3:
            score -= 0.2
            issues.append("Limited medical terminology for HCP interview")

        # All key HCP fields missing/empty => nothing useful extracted
        key_fields = ["diagnoses", "prescriptions", "treatment_rationale"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key HCP data extracted")

    elif interviewee_type == "Patient":
        # Expect patient-centric language
        patient_terms = re.findall(
            r'\b(symptom|feel|concern|experience|treatment|side effect|quality of life)\b',
            analyzed_text,
            re.IGNORECASE
        )
        if len(patient_terms) < 3:
            score -= 0.2
            issues.append("Limited patient-centric content")

        key_fields = ["symptoms", "concerns", "treatment_response"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key patient data extracted")

    # Check 4: Error indicators left behind by the pipeline/LLM
    error_patterns = [
        r'\[Error\]',
        r'failed to',
        r'could not',
        r'unable to',
        r'timeout'
    ]
    for pattern in error_patterns:
        if re.search(pattern, analyzed_text, re.IGNORECASE):
            score -= 0.3
            issues.append("Contains error messages")
            break

    # Check 5: Repetitive content (potential LLM failure).
    # BUGFIX: count only non-empty sentences.  The old code compared the
    # raw split('.') count (which includes empty trailing fragments)
    # against the unique NON-empty count, inflating the ratio and
    # falsely flagging "High content repetition".
    sentences = [s.strip() for s in analyzed_text.split('.') if s.strip()]
    if len(sentences) > 3:
        unique_sentences = set(s.lower() for s in sentences)
        repetition_ratio = len(sentences) / len(unique_sentences) if unique_sentences else 1

        if repetition_ratio > 1.5:
            score -= 0.2
            issues.append("High content repetition")

    # Clamp score into [0, 1]
    score = max(0.0, min(1.0, score))

    issues_text = "; ".join(issues) if issues else "No issues detected"

    return score, issues_text
153
+
154
+
155
def check_data_completeness(csv_rows: List[Dict], interviewee_type: str) -> Dict:
    """
    Analyze completeness of extracted data across all transcripts.

    Returns per-field fill counts, an overall fill percentage, and a
    coarse letter grade (Excellent/Good/Fair/Poor).
    """
    if not csv_rows:
        return {"error": "No data to check"}

    # Key fields to audit depend on the interviewee type.
    type_fields = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Treatment Response"],
    }
    key_fields = type_fields.get(interviewee_type, ["Key Insights"])

    completeness = {}
    row_count = len(csv_rows)

    for field in key_fields:
        # Only audit fields that actually exist in the data.
        if field not in csv_rows[0]:
            continue
        filled = sum(1 for row in csv_rows if row.get(field) and row[field].strip())
        completeness[field] = {
            "filled": filled,
            "total": row_count,
            "percentage": (filled / row_count * 100) if csv_rows else 0
        }

    # Aggregate across all audited fields.
    filled_fields = sum(item["filled"] for item in completeness.values())
    total_fields = sum(item["total"] for item in completeness.values())
    overall_percentage = (filled_fields / total_fields * 100) if total_fields > 0 else 0

    if overall_percentage >= 80:
        grade = "Excellent"
    elif overall_percentage >= 60:
        grade = "Good"
    elif overall_percentage >= 40:
        grade = "Fair"
    else:
        grade = "Poor"

    return {
        "by_field": completeness,
        "overall": {
            "filled": filled_fields,
            "total": total_fields,
            "percentage": overall_percentage
        },
        "quality_grade": grade
    }
201
+
202
+
203
def validate_structured_data_format(data: Dict, interviewee_type: str) -> Tuple[bool, List[str]]:
    """
    Validate that structured data has the expected format.

    Checks: data is a dict, the type-specific required fields are
    present, every value is a list, and not every list is empty.

    Args:
        data: Extracted structured data (field name -> list of values).
        interviewee_type: "HCP", "Patient", or "Other" (unknown types
            require no specific fields).

    Returns:
        (is_valid, list of issue descriptions); is_valid is True iff no
        issues were found.
    """
    issues = []

    if not isinstance(data, dict):
        return False, ["Data is not a dictionary"]

    # Required fields per interviewee type
    expected_fields = {
        "HCP": ["diagnoses", "prescriptions", "treatment_rationale"],
        "Patient": ["symptoms", "concerns", "treatment_response"],
        "Other": ["key_insights"]
    }
    required = expected_fields.get(interviewee_type, [])

    missing = [f for f in required if f not in data]
    if missing:
        issues.append(f"Missing expected fields: {', '.join(missing)}")

    # Every field value should be a list
    for key, value in data.items():
        if not isinstance(value, list):
            issues.append(f"Field '{key}' should be a list, got {type(value)}")

    # Flag the degenerate "everything empty" case.  BUGFIX: guard against
    # an empty dict, which previously satisfied len([]) == len({}) and was
    # wrongly reported as "All fields are empty lists" on top of the
    # missing-fields issue.
    empty_fields = [k for k, v in data.items() if isinstance(v, list) and not v]
    if data and len(empty_fields) == len(data):
        issues.append("All fields are empty lists")

    is_valid = len(issues) == 0

    return is_valid, issues
240
+
241
def validate_summary_quality(summary: str, num_transcripts: int) -> Tuple[float, List[str]]:
    """Check summary for rigor and accuracy.

    Penalizes missing quantification, vague or absolute language, missing
    consensus indicators, and overly short summaries.

    Args:
        summary: Cross-transcript summary text to assess.
        num_transcripts: Number of source transcripts (currently unused;
            kept for interface compatibility — TODO wire into
            quantification checks).

    Returns:
        (score clamped to >= 0.0, list of issue descriptions)
    """
    issues = []
    score = 1.0
    lowered = summary.lower()

    # Check for quantification (counts or percentages)
    if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
        issues.append("No quantified findings (must include counts/percentages)")
        score -= 0.3

    # Check for vague claims.  BUGFIX: match whole words only — the old
    # substring test misfired on e.g. "almost" (contains "most") and
    # "something" (contains "some").
    vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
    if any(re.search(rf'\b{term}\b', lowered) for term in vague_terms):
        issues.append("Contains vague terms - should use specific numbers")
        score -= 0.2

    # Check for absolute claims (0.2 penalty per distinct term found)
    absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
    for term in absolute_terms:
        if re.search(rf'\b{term}\b', lowered):
            issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
            score -= 0.2

    # Check for evidence markers
    if 'consensus' not in lowered and 'majority' not in lowered:
        issues.append("Missing consensus indicators")
        score -= 0.1

    # Check length is substantial
    if len(summary) < 500:
        issues.append("Summary too brief for thorough analysis")
        score -= 0.2

    return max(0.0, score), issues