# app.py — TranscriptorAI (HuggingFace Space, commit 0964dee, 28.7 kB)
# NOTE: hosted-file-view chrome ("raw / history blame") removed from this header.
import gradio as gr
import os
from datetime import datetime
from typing import List, Dict, Tuple
from extractors import extract_docx, extract_pdf, validate_extraction
from tagging import tag_speakers_advanced
from chunking import chunk_text_semantic
from llm import query_llm, extract_structured_data
from reporting import generate_enhanced_csv, generate_enhanced_pdf
from dashboard import generate_comprehensive_dashboard
from validation import validate_transcript_quality, check_data_completeness
from audio_transcriber import transcribe_with_diarization_streaming
# HuggingFace Spaces Configuration
# Force the hosted-API LLM backend with conservative limits for Spaces hardware.
import os  # NOTE(review): redundant — `os` is already imported above
os.environ["LLM_BACKEND"] = "hf_api"
os.environ["LLM_TIMEOUT"] = "25"  # presumably seconds per request — TODO confirm units in llm module
os.environ["MAX_TOKENS_PER_REQUEST"] = "100"
print("πŸš€ Running on HuggingFace Spaces - Optimized Configuration Loaded")
def preprocess_audio(audio_files, num_speakers):
    """Convert uploaded audio files into speaker-tagged transcripts.

    Args:
        audio_files: list of uploaded audio files (Gradio file objects or
            plain path strings); falsy means nothing was uploaded.
        num_speakers: expected number of speakers for diarization.

    Returns:
        (transcript_paths, status): list of generated transcript paths
        (None if nothing succeeded) and a per-file status log string.
    """
    if not audio_files:
        return None, "No audio files provided"
    transcript_paths = []
    status = ""
    for audio in audio_files:
        # Resolve the path OUTSIDE the try so the except handler can always
        # reference `audio_path` (previously it could be unbound on failure).
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        try:
            # BUG FIX: original called `transcribe_with_diarization`, which is
            # never defined — the imported helper (L18) is
            # `transcribe_with_diarization_streaming`, so every file failed
            # with a NameError swallowed by this except.
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
            transcript_paths.append(transcript_path)
            status += f"βœ“ {os.path.basename(audio_path)} β†’ {transcript_path}\n"
        except Exception as e:
            status += f"βœ— {os.path.basename(audio_path)}: {str(e)}\n"
    # Return list of paths for the Gradio file component (None keeps it empty).
    return transcript_paths if transcript_paths else None, status
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
    """
    Enhanced analysis pipeline with robust error handling and validation.

    Per file: extract text -> validate extraction -> tag speakers ->
    semantic chunking -> per-chunk LLM analysis with structured extraction ->
    quality scoring. Then a cross-transcript summary is generated, quality-
    validated (with one retry and an emergency fallback), consensus-checked,
    and rendered to CSV/PDF reports plus a dashboard figure.

    Args:
        files: uploaded transcript files (Gradio file objects with `.name`).
        file_type: "DOCX" or "PDF" — selects the extractor.
        user_comments: free-text analysis instructions forwarded to the LLM.
        role_hint: optional speaker-role mapping hint for tagging.
        debug_mode: exported to the DEBUG_MODE env var for downstream modules.
        interviewee_type: "HCP", "Patient" or "Other" — steers focus/extraction.
        progress: Gradio progress callback.

    Returns:
        (markdown_report, csv_path, pdf_path, dashboard) on success;
        (error_string, None, None, None) on fatal failure.
    """
    os.environ["DEBUG_MODE"] = str(debug_mode)
    if not files:
        return "Error: No files uploaded", None, None, None
    all_results = []
    csv_rows = []
    # NOTE(review): this list accumulates plain strings (low-quality warnings)
    # AND dicts (per-file failures); the report loop below formats both via str().
    processing_errors = []
    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")
    # Enhanced interviewee context: per-type analysis focus plus the structured
    # fields the LLM is asked to extract.
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})
    # Build enhanced user context passed alongside every LLM call.
    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}
Additional Instructions:
{user_comments}
""".strip()
    total_steps = len(files) * 4 + 2  # extraction, validation, tagging, chunking per file + summary + report
    current_step = 0
    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        try:
            # Step 1: Extract text
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")
            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1
            # Step 2: Validate extraction
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")
            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1
            # Step 3: Tag speakers with advanced logic
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1
            # Step 4: Semantic chunking
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1
            # Step 5: LLM Analysis with structured extraction
            transcript_result = []
            structured_data = {}
            for j, chunk in enumerate(chunks):
                # Fractional progress within this file's single analysis step.
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")
                result, chunk_data = query_llm(
                    chunk,
                    user_context,
                    interviewee_type,
                    extract_structured=True
                )
                transcript_result.append(result)
                # Merge structured data: list values are concatenated,
                # scalar values appended, keyed per extraction field.
                for key, value in chunk_data.items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)
            current_step += 1
            # Combine and validate results
            full_text = "\n\n".join(transcript_result)
            # Quality check
            quality_score, quality_issues = validate_transcript_quality(
                full_text,
                structured_data,
                interviewee_type
            )
            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })
            # Enhanced CSV row with structured data
            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }
            # Add interviewee-specific fields
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": "; ".join(structured_data.get("diagnoses", [])),
                    "Prescriptions": "; ".join(structured_data.get("prescriptions", [])),
                    "Treatment Strategies": "; ".join(structured_data.get("treatment_rationale", [])),
                    "Guidelines Mentioned": "; ".join(structured_data.get("guidelines_mentioned", []))
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": "; ".join(structured_data.get("symptoms", [])),
                    "Main Concerns": "; ".join(structured_data.get("concerns", [])),
                    "Treatment Response": "; ".join(structured_data.get("treatment_response", [])),
                    "Side Effects": "; ".join(structured_data.get("side_effects", []))
                })
            else:
                csv_row.update({
                    "Key Insights": "; ".join(structured_data.get("key_insights", [])),
                    "Recommendations": "; ".join(structured_data.get("recommendations", []))
                })
            csv_rows.append(csv_row)
            print(f"[File {i+1}] βœ“ Processing complete")
        except Exception as e:
            # Enhanced error tracking with type and traceback
            import traceback
            error_type = type(e).__name__
            error_details = str(e)
            error_traceback = traceback.format_exc()  # NOTE(review): captured but never used or logged
            error_msg = f"[{error_type}] {file_name}: {error_details}"
            print(error_msg)
            # Store comprehensive error information
            processing_errors.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "error_type": error_type,
                "error_message": error_details[:200],  # Truncate long messages
                "timestamp": datetime.now().isoformat()
            })
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0,
                "processing_status": "FAILED",
                "error_type": error_type
            })
            # Add to CSV with error metadata
            csv_rows.append({
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": 0.0,
                "Word Count": 0,
                "Processing Status": "FAILED",
                "Error Type": error_type,
                "Error Message": error_details[:100]
            })
    # Generate cross-transcript summary
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")
        # Combine successful results
        valid_results = [r for r in all_results if r["quality_score"] > 0]
        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None
        # Build comprehensive summary prompt
        summary_prompt = f"""
CROSS-INTERVIEW SYNTHESIS TASK
SAMPLE: {len(valid_results)} {interviewee_type} transcripts
FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}
COMPLETE TRANSCRIPT DATA:
"""
        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            # Each transcript's contribution is capped at 2000 characters.
            summary_prompt += f"{result['full_text'][:2000]}\n"
        summary_prompt += f"""
ANALYSIS REQUIREMENTS:
1. QUANTIFY EVERYTHING:
- Count participants: "X out of {len(valid_results)} participants mentioned..."
- Never use vague terms (many/most/some)
- Calculate percentages where relevant
2. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
- MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
- SPLIT PERSPECTIVES (40-59% = mixed views)
- MINORITY/OUTLIER (<40% but notable)
3. CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives diverge and why
- Flag any quality issues in individual transcripts
4. CITE EVIDENCE:
- Reference specific transcript numbers
- Brief supporting details
- Distinguish verified facts from interpretation
OUTPUT FORMAT:
Write 2-3 sentence executive overview, then structure as:
**STRONG CONSENSUS FINDINGS:**
- [Finding with count and evidence]
**MAJORITY FINDINGS:**
- [Finding with count]
**DIVERGENT PERSPECTIVES:**
- [Where views split and context]
**NOTABLE OUTLIERS:**
- [Unique but important points]
**DATA QUALITY NOTES:**
- [Any gaps or transcript issues]
Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
"""
        # Use robust LLM with aggressive timeout protection
        print("[Summary] Generating cross-transcript summary...")
        print("[Summary] Note: This may take 30-60 seconds for large datasets")
        try:
            from llm_robust import query_llm_with_timeout
            summary, summary_data = query_llm_with_timeout(
                summary_prompt,
                user_context,
                interviewee_type,
                extract_structured=False,
                is_summary=True,
                max_timeout=60  # 60 second hard timeout
            )
        except Exception as e:
            # Ultimate fallback
            print(f"[Summary] Critical error: {e}")
            print("[Summary] Using emergency fallback...")
            from llm_robust import generate_emergency_summary
            summary, summary_data = generate_emergency_summary(interviewee_type)
        # Validate summary quality and retry if needed
        from validation import validate_summary_quality
        summary_score, summary_issues = validate_summary_quality(
            summary,
            len(valid_results)
        )
        if summary_score < 0.7:  # Quality threshold
            print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}")
            print("[Summary] Retrying with stricter validation...")
            # Retry with enhanced prompt emphasizing validation failures
            retry_prompt = summary_prompt + f"""
CRITICAL: Previous attempt failed validation with these issues:
{chr(10).join('- ' + issue for issue in summary_issues)}
MANDATORY CORRECTIONS:
- Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most")
- Include percentages in parentheses
- Cite transcript numbers for every claim
- Minimum length: 500 words
- No absolute terms (all/everyone/never/always) without 100% evidence
"""
            # NOTE(review): if the initial `from llm_robust import ...` failed,
            # query_llm_with_timeout is unbound here; the resulting NameError
            # is caught below and routed to the emergency fallback.
            try:
                summary, summary_data = query_llm_with_timeout(
                    retry_prompt,
                    user_context,
                    interviewee_type,
                    extract_structured=False,
                    is_summary=True,
                    max_timeout=60  # 60 second hard timeout for retry
                )
            except Exception as e:
                print(f"[Summary] Retry also failed: {e}")
                print("[Summary] Using emergency fallback for retry...")
                summary, summary_data = generate_emergency_summary(interviewee_type)
            # Re-validate
            summary_score, summary_issues = validate_summary_quality(summary, len(valid_results))
            if summary_score < 0.7:
                # Add quality warning to summary header
                warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}]
Validation issues detected: {'; '.join(summary_issues)}
Please review findings carefully and verify against source data.
{'='*60}
"""
                summary = warning_header + summary
                print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})")
            else:
                print(f"[Summary] βœ“ Validation passed after retry (score: {summary_score:.2f})")
        else:
            print(f"[Summary] βœ“ Validation passed (score: {summary_score:.2f})")
        # Verify consensus claims against actual data
        from validation import verify_consensus_claims
        consensus_warnings = verify_consensus_claims(summary, valid_results)
        if consensus_warnings:
            print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found")
            consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n"
            summary = summary + consensus_note
        else:
            print("[Summary] βœ“ Consensus claims verified")
        # Generate enhanced reports
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] βœ“ Saved to {csv_path}")
        pdf_path = generate_enhanced_pdf(
            summary,
            all_results,
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] βœ“ Saved to {pdf_path}")
        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] βœ“ Generated")
        # Compile final output
        output_text = f"""# Analysis Complete
## Summary of Findings
{summary}
## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {len(processing_errors)}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}
"""
        if processing_errors:
            output_text += f"\n## Processing Errors\n" + "\n".join(f"- {err}" for err in processing_errors)
        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"
        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"
        progress(1.0, desc="Complete!")
        return output_text, csv_path, pdf_path, dashboard
    except Exception as e:
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, None, None, None
def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """
    Wrapper function for Gradio UI to generate narrative reports.

    Args:
        csv_file: the CSV produced by the analysis tab (Gradio file or path).
        summary_text: optional pasted executive-summary text.
        interviewee_type: "HCP", "Patient" or "Other".
        report_style: "executive", "detailed" or "presentation".

    Returns:
        (status_message, pdf_path, word_path, html_path); the three paths
        are None on failure.
    """
    # Validate input up front, before importing/doing any heavy work.
    if csv_file is None:
        return "Error: No CSV file provided. Please run analysis first.", None, None, None
    summary_path = None
    try:
        from narrative_report_generator import generate_narrative_report
        import tempfile
        # Persist pasted summary text so the generator can read it from disk.
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                f.write(summary_text)
            summary_path = f.name
        # Determine LLM backend (env toggle mirrors the Help tab instructions).
        llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"
        # Generate narrative report
        pdf_path, word_path, html_path = generate_narrative_report(
            csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
            summary_path=summary_path,
            interviewee_type=interviewee_type,
            report_style=report_style,
            llm_backend=llm_backend
        )
        return (
            f"βœ“ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )
    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None
    finally:
        # BUG FIX: cleanup previously ran only on the success path, leaking the
        # delete=False temp file whenever report generation raised.
        if summary_path and os.path.exists(summary_path):
            os.remove(summary_path)
# Gradio UI: four tabs (audio preprocessing, transcript analysis, narrative
# report, help) wired to the handler functions defined above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # --- App header -----------------------------------------------------
    gr.Markdown("""
# 🎯 TranscriptorAI - Enterprise Transcript Analyzer
Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
""")
    with gr.Tabs():
        # Tab 1: audio -> speaker-tagged transcripts (feeds preprocess_audio)
        with gr.TabItem("🎀 Audio Preprocessing"):
            gr.Markdown("""
Upload audio interviews to auto-transcribe with speaker identification.
Outputs DOCX files ready for analysis.
""")
            with gr.Row():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_types=[".mp3", ".wav", ".m4a", ".flac"],
                    file_count="multiple"
                )
                num_speakers_input = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Number of Speakers"
                )
            transcribe_btn = gr.Button("πŸŽ™οΈ Transcribe Audio", variant="primary")
            transcribe_status = gr.Textbox(label="Status", lines=10)
            transcript_files = gr.File(label="Download Transcripts", file_count="multiple")
            transcribe_btn.click(
                fn=preprocess_audio,
                inputs=[audio_input, num_speakers_input],
                outputs=[transcript_files, transcribe_status]
            )
            gr.Markdown("""
**Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
""")
        # Tab 2: main pipeline (drives analyze())
        with gr.TabItem("πŸ“Š Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="πŸ“ Upload Transcripts",
                        file_types=[".docx", ".pdf"],
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"],
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )
                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions",
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)",
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )
            with gr.Row():
                debug_mode = gr.Checkbox(label="πŸ” Enable Debug Mode", value=False)
                analyze_btn = gr.Button("πŸš€ Analyze Transcripts", variant="primary", scale=2)
            with gr.Row():
                output_text = gr.Textbox(label="πŸ“Š Analysis Report", lines=40)
            with gr.Row():
                csv_output = gr.File(label="πŸ“₯ Download CSV")
                pdf_output = gr.File(label="πŸ“₯ Download PDF")
            with gr.Row():
                dashboard_output = gr.Plot(label="πŸ“ˆ Dashboard Visualization")
            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )
        # Tab 3: turn the analysis CSV into narrative PDF/Word/HTML reports
        with gr.TabItem("πŸ“ Narrative Report"):
            gr.Markdown("""
## Generate Storytelling Report
Transform your analysis into a narrative report with:
- Executive summary with key insights
- Data-driven storytelling
- Professional formatting (PDF, Word, HTML)
- Actionable recommendations
**Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
""")
            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )
                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
            generate_narrative_btn = gr.Button("πŸ“– Generate Narrative Report", variant="primary")
            narrative_status = gr.Textbox(label="Status", lines=5)
            with gr.Row():
                narrative_pdf_output = gr.File(label="πŸ“₯ Download PDF Report")
                narrative_word_output = gr.File(label="πŸ“₯ Download Word Report")
                narrative_html_output = gr.File(label="πŸ“₯ Download HTML Report")
            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )
        # Tab 4: static usage documentation
        with gr.TabItem("❓ Help"):
            gr.Markdown("""
### Quick Start Guide
**Step 1: Analyze Transcripts**
1. Upload your DOCX or PDF files
2. Select interviewee type (HCP, Patient, or Other)
3. Add analysis instructions
4. Click "Analyze Transcripts"
5. Download CSV, PDF, and view dashboard
**Step 2: Generate Narrative Report (Optional)**
1. Go to "Narrative Report" tab
2. Upload the CSV from Step 1
3. Optionally paste the summary text
4. Select report style
5. Click "Generate Narrative Report"
6. Download PDF, Word, or HTML versions
### Tips
- **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator
- **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator
- **Report Styles**:
- **Executive**: Best for C-level, investors, decision-makers
- **Detailed**: Best for researchers, comprehensive analysis
- **Presentation**: Best for slides, briefings, quick overviews
### LLM Configuration
- Set `USE_LMSTUDIO=True` to use your local LM Studio
- Set `HUGGINGFACE_TOKEN` to use HF API for faster processing
- Default: Uses local model (slower but free)
### Support
For issues, check the console output or enable debug mode.
""")
    # --- App footer -----------------------------------------------------
    gr.Markdown("""
---
**TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
""")
if __name__ == "__main__":
demo.queue(
concurrency_count=1,
max_size=10,
api_open=False
).launch(
server_name="0.0.0.0",
server_port=7860,
show_error=True
)