Upload 23 files

- README.md +7 -5
- Set-Service +0 -0
- app.py +582 -0
- audio_transcriber.py +100 -0
- audio_transcriber_hf.py +104 -0
- chunking.py +236 -0
- config.py +283 -0
- dashboard.py +340 -0
- extractors.py +201 -0
- llm.py +383 -0
- narrative_report_generator.py +74 -0
- outputs/sample.txt +0 -0
- report.csv +2 -0
- report.pdf +112 -0
- report_parser.py +61 -0
- reporting.py +239 -0
- requirements.txt +41 -0
- story_writer.py +55 -0
- table_builder.py +51 -0
- tagging.py +228 -0
- utils.py +404 -0
- validation.py +274 -0
README.md
CHANGED
@@ -1,12 +1,14 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: StoryTellerTranscript
+emoji: 🌖
+colorFrom: green
+colorTo: gray
 sdk: gradio
-sdk_version: 5.49.
+sdk_version: 5.49.0
 app_file: app.py
 pinned: false
+license: unknown
+short_description: Audio interviews to final reports
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Set-Service
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,582 @@
import gradio as gr
import os
from typing import List, Dict, Tuple
from extractors import extract_docx, extract_pdf, validate_extraction
from tagging import tag_speakers_advanced
from chunking import chunk_text_semantic
from llm import query_llm, extract_structured_data
from reporting import generate_enhanced_csv, generate_enhanced_pdf
from dashboard import generate_comprehensive_dashboard
from validation import validate_transcript_quality, check_data_completeness
from audio_transcriber import transcribe_with_diarization_streaming

def preprocess_audio(audio_files, num_speakers):
    """Convert audio to transcripts"""
    if not audio_files:
        return None, "No audio files provided"

    transcript_paths = []
    status = ""

    for audio in audio_files:
        # Get the actual file path (resolved before the try block so the
        # except handler can still reference it)
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        try:
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
            transcript_paths.append(transcript_path)
            status += f"✓ {os.path.basename(audio_path)} → {transcript_path}\n"
        except Exception as e:
            status += f"✗ {os.path.basename(audio_path)}: {str(e)}\n"

    # Return list of paths for file component
    return transcript_paths if transcript_paths else None, status


def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
    """
    Enhanced analysis pipeline with robust error handling and validation
    """
    os.environ["DEBUG_MODE"] = str(debug_mode)

    if not files:
        return "Error: No files uploaded", None, None, None

    all_results = []
    csv_rows = []
    processing_errors = []

    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")

    # Enhanced interviewee context
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})

    # Build enhanced user context
    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}

Additional Instructions:
{user_comments}
""".strip()

    total_steps = len(files) * 4 + 2  # extraction, validation, tagging, chunking per file + summary + report
    current_step = 0

    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        try:
            # Step 1: Extract text
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")

            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1

            # Step 2: Validate extraction
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")

            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1

            # Step 3: Tag speakers with advanced logic
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1

            # Step 4: Semantic chunking
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1

            # Step 5: LLM Analysis with structured extraction
            transcript_result = []
            structured_data = {}

            for j, chunk in enumerate(chunks):
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")

                result, chunk_data = query_llm(
                    chunk,
                    user_context,
                    interviewee_type,
                    extract_structured=True
                )

                transcript_result.append(result)

                # Merge structured data
                for key, value in chunk_data.items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)

            current_step += 1

            # Combine and validate results
            full_text = "\n\n".join(transcript_result)

            # Quality check
            quality_score, quality_issues = validate_transcript_quality(
                full_text,
                structured_data,
                interviewee_type
            )

            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")

            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })

            # Enhanced CSV row with structured data
            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }

            # Add interviewee-specific fields
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": "; ".join(structured_data.get("diagnoses", [])),
                    "Prescriptions": "; ".join(structured_data.get("prescriptions", [])),
                    "Treatment Strategies": "; ".join(structured_data.get("treatment_rationale", [])),
                    "Guidelines Mentioned": "; ".join(structured_data.get("guidelines_mentioned", []))
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": "; ".join(structured_data.get("symptoms", [])),
                    "Main Concerns": "; ".join(structured_data.get("concerns", [])),
                    "Treatment Response": "; ".join(structured_data.get("treatment_response", [])),
                    "Side Effects": "; ".join(structured_data.get("side_effects", []))
                })
            else:
                csv_row.update({
                    "Key Insights": "; ".join(structured_data.get("key_insights", [])),
                    "Recommendations": "; ".join(structured_data.get("recommendations", []))
                })

            csv_rows.append(csv_row)

            print(f"[File {i+1}] ✓ Processing complete")

        except Exception as e:
            error_msg = f"[Error] {file_name} failed: {str(e)}"
            print(error_msg)
            processing_errors.append(error_msg)
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0
            })

    # Generate cross-transcript summary
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")

        # Combine successful results
        valid_results = [r for r in all_results if r["quality_score"] > 0]

        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None

        # Build comprehensive summary prompt
        summary_prompt = f"""
CROSS-INTERVIEW SYNTHESIS TASK

SAMPLE: {len(valid_results)} {interviewee_type} transcripts
FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}

COMPLETE TRANSCRIPT DATA:
"""

        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            summary_prompt += f"{result['full_text'][:2000]}\n"

        summary_prompt += f"""

ANALYSIS REQUIREMENTS:

1. QUANTIFY EVERYTHING:
   - Count participants: "X out of {len(valid_results)} participants mentioned..."
   - Never use vague terms (many/most/some)
   - Calculate percentages where relevant

2. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
   - STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
   - MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
   - SPLIT PERSPECTIVES (40-59% = mixed views)
   - MINORITY/OUTLIER (<40% but notable)

3. CROSS-VALIDATE:
   - Check for contradictions between transcripts
   - Note where perspectives diverge and why
   - Flag any quality issues in individual transcripts

4. CITE EVIDENCE:
   - Reference specific transcript numbers
   - Brief supporting details
   - Distinguish verified facts from interpretation

OUTPUT FORMAT:
Write 2-3 sentence executive overview, then structure as:

**STRONG CONSENSUS FINDINGS:**
- [Finding with count and evidence]

**MAJORITY FINDINGS:**
- [Finding with count]

**DIVERGENT PERSPECTIVES:**
- [Where views split and context]

**NOTABLE OUTLIERS:**
- [Unique but important points]

**DATA QUALITY NOTES:**
- [Any gaps or transcript issues]

Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
"""

        summary, summary_data = query_llm(
            summary_prompt,
            user_context,
            interviewee_type,
            extract_structured=False,
            is_summary=True
        )

        print("[Summary] ✓ Generated")

        # Generate enhanced reports
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] ✓ Saved to {csv_path}")

        pdf_path = generate_enhanced_pdf(
            summary,
            all_results,
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] ✓ Saved to {pdf_path}")

        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] ✓ Generated")

        # Compile final output
        output_text = f"""# Analysis Complete

## Summary of Findings
{summary}

## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {len(processing_errors)}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}

"""

        if processing_errors:
            output_text += f"\n## Processing Errors\n" + "\n".join(f"- {err}" for err in processing_errors)

        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"

        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"

        progress(1.0, desc="Complete!")
        return output_text, csv_path, pdf_path, dashboard

    except Exception as e:
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, None, None, None

def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """
    Wrapper function for Gradio UI to generate narrative reports
    """
    try:
        from narrative_report_generator import generate_narrative_report
        import tempfile
        import os

        # Check if CSV file exists
        if csv_file is None:
            return "Error: No CSV file provided. Please run analysis first.", None, None, None

        # Save summary text to temp file if provided
        summary_path = None
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                f.write(summary_text)
                summary_path = f.name

        # Determine LLM backend
        llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"

        # Generate narrative report
        pdf_path, word_path, html_path = generate_narrative_report(
            csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
            summary_path=summary_path,
            interviewee_type=interviewee_type,
            report_style=report_style,
            llm_backend=llm_backend
        )

        # Clean up temp file
        if summary_path and os.path.exists(summary_path):
            os.remove(summary_path)

        return (
            f"✓ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎯 TranscriptorAI - Enterprise Transcript Analyzer

    Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
    """)

    with gr.Tabs():

        with gr.TabItem("🎤 Audio Preprocessing"):
            gr.Markdown("""
            Upload audio interviews to auto-transcribe with speaker identification.
            Outputs DOCX files ready for analysis.
            """)

            with gr.Row():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_types=[".mp3", ".wav", ".m4a", ".flac"],
                    file_count="multiple"
                )
                num_speakers_input = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Number of Speakers"
                )

            transcribe_btn = gr.Button("🎙️ Transcribe Audio", variant="primary")
            transcribe_status = gr.Textbox(label="Status", lines=10)
            transcript_files = gr.File(label="Download Transcripts", file_count="multiple")

            transcribe_btn.click(
                fn=preprocess_audio,
                inputs=[audio_input, num_speakers_input],
                outputs=[transcript_files, transcribe_status]
            )

            gr.Markdown("""
            **Next:** Download transcripts, then go to the "Transcript Analysis" tab to analyze them.
            """)

        with gr.TabItem("📊 Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="📁 Upload Transcripts",
                        file_types=[".docx", ".pdf"],
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"],
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )

                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions",
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)",
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )

            with gr.Row():
                debug_mode = gr.Checkbox(label="🔍 Enable Debug Mode", value=False)
                analyze_btn = gr.Button("🚀 Analyze Transcripts", variant="primary", scale=2)

            with gr.Row():
                output_text = gr.Textbox(label="📊 Analysis Report", lines=40)

            with gr.Row():
                csv_output = gr.File(label="📥 Download CSV")
                pdf_output = gr.File(label="📥 Download PDF")

            with gr.Row():
                dashboard_output = gr.Plot(label="📈 Dashboard Visualization")

            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )

        with gr.TabItem("📝 Narrative Report"):
            gr.Markdown("""
            ## Generate Storytelling Report

            Transform your analysis into a narrative report with:
            - Executive summary with key insights
            - Data-driven storytelling
            - Professional formatting (PDF, Word, HTML)
            - Actionable recommendations

            **Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
            """)

            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )

                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
                    generate_narrative_btn = gr.Button("📖 Generate Narrative Report", variant="primary")

            narrative_status = gr.Textbox(label="Status", lines=5)

            with gr.Row():
                narrative_pdf_output = gr.File(label="📥 Download PDF Report")
                narrative_word_output = gr.File(label="📥 Download Word Report")
                narrative_html_output = gr.File(label="📥 Download HTML Report")

            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )

        with gr.TabItem("❓ Help"):
            gr.Markdown("""
            ### Quick Start Guide

            **Step 1: Analyze Transcripts**
            1. Upload your DOCX or PDF files
            2. Select interviewee type (HCP, Patient, or Other)
            3. Add analysis instructions
            4. Click "Analyze Transcripts"
            5. Download CSV, PDF, and view dashboard

            **Step 2: Generate Narrative Report (Optional)**
            1. Go to the "Narrative Report" tab
            2. Upload the CSV from Step 1
            3. Optionally paste the summary text
            4. Select report style
            5. Click "Generate Narrative Report"
            6. Download PDF, Word, or HTML versions

            ### Tips
            - **CSV Upload**: Download the CSV from analysis, then upload it to the narrative report generator
            - **Summary Text**: Copy from the "Analysis Report" textbox and paste into the narrative generator
            - **Report Styles**:
              - **Executive**: Best for C-level, investors, decision-makers
              - **Detailed**: Best for researchers, comprehensive analysis
              - **Presentation**: Best for slides, briefings, quick overviews

            ### LLM Configuration
            - Set `USE_LMSTUDIO=True` to use your local LM Studio
            - Set `HUGGINGFACE_TOKEN` to use the HF API for faster processing
            - Default: Uses local model (slower but free)

            ### Support
            For issues, check the console output or enable debug mode.
            """)

    gr.Markdown("""
    ---
    **TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
    """)

if __name__ == "__main__":
    demo.launch()
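For local runs outside the Space, the LLM backend can be selected through the environment variables that config.py reads at import time. A minimal sketch, assuming the repo's modules are importable; the token value and the server settings are placeholders, not part of this commit:

# Minimal local-run sketch (hypothetical values; the token is a placeholder).
import os

os.environ["LLM_BACKEND"] = "hf_api"        # read by config.py at import time
os.environ["HUGGINGFACE_TOKEN"] = "hf_xxx"  # placeholder, not a real credential

from app import demo  # importing app builds the gr.Blocks UI defined above

demo.launch(server_name="0.0.0.0", server_port=7860)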
audio_transcriber.py
ADDED
@@ -0,0 +1,100 @@
from faster_whisper import WhisperModel
from speechbrain.inference import EncoderClassifier
from sklearn.cluster import AgglomerativeClustering
from docx import Document
import torch, torchaudio, numpy as np

def transcribe_with_diarization_streaming(audio_path: str, num_speakers: int = 1):
    """
    Streaming transcription with diarization support.
    - Streams partial transcripts as segments become ready.
    - Handles single-speaker fallback.
    """

    # Device fallback: prefer CUDA, drop to CPU int8 if it is unavailable or unusable
    print("[1/3] Loading Whisper model...")
    try:
        if torch.cuda.is_available():
            device = "cuda"
            compute_type = "float16"
            _ = torch.zeros(1).to(device)  # sanity check
        else:
            raise RuntimeError("No CUDA")
    except Exception:
        print("⚠️ CUDA not usable, falling back to CPU")
        device = "cpu"
        compute_type = "int8"

    whisper_model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    print("[2/3] Transcribing...")
    # Streaming generator: segments are yielded as they are decoded
    segments, info = whisper_model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True,
        vad_filter=True,
    )

    segments_list = []
    for seg in segments:
        print(f"[stream] {seg.start:.2f}-{seg.end:.2f}: {seg.text}")
        segments_list.append(seg)

    # Speaker embeddings
    print("[3/3] Extracting speaker embeddings...")
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="models/speaker_embeddings",
        run_opts={"device": device}
    )

    waveform, sample_rate = torchaudio.load(audio_path)
    embeddings, valid_segments = [], []

    for seg in segments_list:
        start_sample = int(seg.start * sample_rate)
        end_sample = int(seg.end * sample_rate)
        if end_sample > start_sample:
            seg_audio = waveform[:, start_sample:end_sample]
            if sample_rate != 16000:
                seg_audio = torchaudio.transforms.Resample(sample_rate, 16000)(seg_audio)
            with torch.no_grad():
                emb = speaker_model.encode_batch(seg_audio)
            embeddings.append(emb.squeeze().cpu().numpy())
            valid_segments.append(seg)

    # Handle empty or single-speaker case
    if len(embeddings) == 0 or num_speakers <= 1:
        print("Single speaker detected or no embeddings. Skipping clustering.")
        speaker_labels = [0] * len(valid_segments)
        num_speakers = 1
    else:
        if num_speakers > len(embeddings):
            num_speakers = len(embeddings)
        clustering = AgglomerativeClustering(n_clusters=num_speakers)
        speaker_labels = clustering.fit_predict(np.array(embeddings))

    # Build transcript
    doc = Document()
    doc.add_heading('Interview Transcript', 0)
    doc.add_paragraph(f"Detected {num_speakers} speaker(s)")
    doc.add_paragraph("")

    for seg, spk in zip(valid_segments, speaker_labels):
        doc.add_paragraph(f"Speaker {spk+1}: {seg.text.strip()}")

    output_path = audio_path.rsplit('.', 1)[0] + '_transcript.docx'
    doc.save(output_path)
    print(f"✓ Saved transcript: {output_path}")
    return output_path
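A minimal usage sketch for this module; the audio filename is hypothetical, and the first call downloads the Whisper large-v3 weights:

# Hypothetical usage: transcribe one two-speaker interview to DOCX.
from audio_transcriber import transcribe_with_diarization_streaming

docx_path = transcribe_with_diarization_streaming("interview_01.mp3", num_speakers=2)
print(f"Transcript written to {docx_path}")  # e.g. interview_01_transcript.docx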
audio_transcriber_hf.py
ADDED
@@ -0,0 +1,104 @@
"""
Audio transcription with speaker diarization
"""
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline
import torch
from docx import Document
import os

def transcribe_with_diarization(audio_path: str, num_speakers: int = 2) -> str:
    """
    Transcribe audio with speaker labels

    Args:
        audio_path: Path to audio file (mp3, wav, m4a)
        num_speakers: Expected number of speakers (default 2 for interviews)

    Returns:
        Path to generated DOCX transcript
    """
    print("[1/3] Transcribing audio...")

    # Load Whisper model (fall back to CPU int8 when CUDA is unavailable)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    # Transcribe with timestamps
    segments, info = model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True
    )

    segments_list = list(segments)
    print("[2/3] Identifying speakers...")

    # Load diarization pipeline
    # Note: Requires HuggingFace token for pyannote models
    hf_token = os.getenv("HUGGINGFACE_TOKEN", "")
    if not hf_token:
        print("[Warning] No HF token - using simple alternating speakers")
        return transcribe_simple(segments_list, audio_path)

    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hf_token
    )

    if torch.cuda.is_available():
        diarization.to(torch.device("cuda"))

    # Run diarization
    diarization_result = diarization(audio_path, num_speakers=num_speakers)

    print("[3/3] Combining transcription + speakers...")

    # Match segments to speakers
    transcript_lines = []
    for segment in segments_list:
        start = segment.start
        end = segment.end
        text = segment.text

        # Find speaker at this timestamp
        speaker = get_speaker_at_time(diarization_result, start)
        transcript_lines.append(f"{speaker}: {text}")

    # Save to DOCX
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    for line in transcript_lines:
        doc.add_paragraph(line)

    # Derive the output name from the input name, whatever the extension
    output_path = audio_path.rsplit('.', 1)[0] + '_transcript.docx'
    doc.save(output_path)

    print(f"✓ Transcript saved: {output_path}")
    return output_path


def get_speaker_at_time(diarization_result, timestamp):
    """Find which speaker is talking at given timestamp"""
    for turn, _, speaker in diarization_result.itertracks(yield_label=True):
        if turn.start <= timestamp <= turn.end:
            return f"Speaker {speaker}"
    return "Speaker Unknown"


def transcribe_simple(segments_list, audio_path):
    """Fallback: alternating speakers without diarization"""
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    current_speaker = 1
    for segment in segments_list:
        doc.add_paragraph(f"Speaker {current_speaker}: {segment.text}")
        # Simple heuristic: toggle speakers after segments Whisper flags as likely non-speech
        if hasattr(segment, 'no_speech_prob') and segment.no_speech_prob > 0.5:
            current_speaker = 3 - current_speaker  # Toggle between 1 and 2

    output_path = audio_path.rsplit('.', 1)[0] + '_transcript.docx'
    doc.save(output_path)
    return output_path
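A small usage sketch for this pyannote-based variant; the filename is a placeholder, and pyannote/speaker-diarization-3.1 is a gated model, so HUGGINGFACE_TOKEN must belong to an account that has accepted its terms. Without a token, the function drops to the alternating-speaker fallback:

# Hypothetical usage of the HF-diarization path and its no-token fallback.
import os
from audio_transcriber_hf import transcribe_with_diarization

os.environ.pop("HUGGINGFACE_TOKEN", None)  # no token -> transcribe_simple() fallback
path = transcribe_with_diarization("panel_discussion.wav", num_speakers=3)
print(path)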
chunking.py
ADDED
@@ -0,0 +1,236 @@
import tiktoken
import re
from typing import List
from nltk.tokenize.punkt import PunktSentenceTokenizer

def chunk_text(text, max_tokens=3000):
    """Legacy function - kept for backwards compatibility"""
    return chunk_text_semantic(text, "Other", max_tokens)


def count_tokens(text: str) -> int:
    """Count tokens using tiktoken"""
    try:
        enc = tiktoken.get_encoding("cl100k_base")
        return len(enc.encode(text))
    except Exception:
        # Fallback to word-based estimate
        return int(len(text.split()) * 1.3)


def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences with improved handling"""
    try:
        tokenizer = PunktSentenceTokenizer()
        sentences = tokenizer.tokenize(text)
        return sentences
    except Exception:
        # Fallback to simple split
        return [s.strip() + '.' for s in text.split('.') if s.strip()]


def find_topic_boundaries(text: str, interviewee_type: str) -> List[int]:
    """
    Identify topic boundaries in the text for smarter chunking
    Returns list of character positions where topics likely change
    """

    boundaries = [0]  # Start position

    # Topic change indicators
    topic_patterns = [
        r'\n\n+',  # Paragraph breaks
        r'\[Interviewer\].*?(next|another|different|moving on|let\'s talk about)',
        r'\[Interviewer\].*?\?.*?\n.*?\[(?:Doctor|Patient|Respondent)\]',  # Q&A pairs
    ]

    # Find all topic boundaries
    for pattern in topic_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            pos = match.start()
            # Only add if not too close to existing boundary
            if all(abs(pos - b) > 100 for b in boundaries):
                boundaries.append(pos)

    boundaries.append(len(text))  # End position
    boundaries.sort()

    return boundaries


def extract_speaker_segments(text: str) -> List[dict]:
    """
    Extract segments with speaker labels and content
    """

    pattern = r'\[([^\]]+)\]\s*([^\[]*)'
    segments = []

    for match in re.finditer(pattern, text, re.DOTALL):
        speaker = match.group(1).strip()
        content = match.group(2).strip()
        if content:
            segments.append({
                "speaker": speaker,
                "content": content,
                "start_pos": match.start(),
                "tokens": count_tokens(content)
            })

    return segments


def chunk_text_semantic(
    text: str,
    interviewee_type: str = "Other",
    max_tokens: int = 3000,
    overlap_tokens: int = 150
) -> List[str]:
    """
    Advanced chunking that respects:
    1. Speaker boundaries (don't split mid-sentence)
    2. Topic boundaries (keep related Q&A together)
    3. Token limits for LLM context
    4. Overlap for context continuity
    """

    # Check if text has speaker tags
    has_tags = bool(re.search(r'\[[^\]]+\]', text))

    if not has_tags:
        # Fallback to sentence-based chunking
        return chunk_by_sentences(text, max_tokens, overlap_tokens)

    # Extract speaker segments
    segments = extract_speaker_segments(text)

    if not segments:
        return chunk_by_sentences(text, max_tokens, overlap_tokens)

    # Group segments into chunks
    chunks = []
    current_chunk_segments = []
    current_tokens = 0

    i = 0
    while i < len(segments):
        segment = segments[i]
        segment_tokens = segment["tokens"]

        # If single segment exceeds max_tokens, split it
        if segment_tokens > max_tokens:
            # Split long segment by sentences
            sub_chunks = chunk_by_sentences(
                f"[{segment['speaker']}] {segment['content']}",
                max_tokens,
                overlap_tokens
            )
            chunks.extend(sub_chunks)
            i += 1
            continue

        # Check if adding this segment would exceed limit
        if current_tokens + segment_tokens > max_tokens and current_chunk_segments:
            # Finalize current chunk (chunk_str avoids shadowing the chunk_text function above)
            chunk_str = "\n\n".join([
                f"[{s['speaker']}] {s['content']}"
                for s in current_chunk_segments
            ])
            chunks.append(chunk_str)

            # Start new chunk with overlap:
            # keep the last few segments for context
            overlap_segments = []
            overlap_token_count = 0

            for seg in reversed(current_chunk_segments):
                if overlap_token_count + seg["tokens"] < overlap_tokens:
                    overlap_segments.insert(0, seg)
                    overlap_token_count += seg["tokens"]
                else:
                    break

            current_chunk_segments = overlap_segments
            current_tokens = overlap_token_count

        # Add segment to current chunk
        current_chunk_segments.append(segment)
        current_tokens += segment_tokens
        i += 1

    # Add final chunk
    if current_chunk_segments:
        chunk_str = "\n\n".join([
            f"[{s['speaker']}] {s['content']}"
            for s in current_chunk_segments
        ])
        chunks.append(chunk_str)

    return chunks if chunks else [text]


def chunk_by_sentences(
    text: str,
    max_tokens: int = 3000,
    overlap_tokens: int = 150
) -> List[str]:
    """
    Fallback chunking method based on sentences
    """

    sentences = split_into_sentences(text)

    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        if current_tokens + sentence_tokens > max_tokens and current_chunk:
            # Finalize current chunk
            chunks.append(" ".join(current_chunk))

            # Create overlap
            overlap_sents = []
            overlap_token_count = 0

            for sent in reversed(current_chunk):
                sent_tokens = count_tokens(sent)
                if overlap_token_count + sent_tokens < overlap_tokens:
                    overlap_sents.insert(0, sent)
                    overlap_token_count += sent_tokens
                else:
                    break

            current_chunk = overlap_sents
            current_tokens = overlap_token_count

        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    # Add final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks if chunks else [text]


def analyze_chunk_quality(chunks: List[str]) -> dict:
    """
    Analyze chunking quality for debugging
    """

    if not chunks:
        return {"error": "No chunks"}

    token_counts = [count_tokens(chunk) for chunk in chunks]

    return {
        "num_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "total_tokens": sum(token_counts),
        "chunks_over_limit": sum(1 for t in token_counts if t > 3000)
    }
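A quick sketch of the chunking API on a tiny tagged snippet; the transcript text is invented for illustration:

# Exercise semantic chunking plus the debug helper on a two-turn snippet.
from chunking import chunk_text_semantic, analyze_chunk_quality

tagged = (
    "[Interviewer] How have you been since the dose change?\n\n"
    "[Patient] Less fatigue overall, but the nausea is back in the mornings."
)
chunks = chunk_text_semantic(tagged, interviewee_type="Patient", max_tokens=3000)
print(len(chunks), analyze_chunk_quality(chunks))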
config.py
ADDED
@@ -0,0 +1,283 @@
import os
from typing import Dict, Any

# ============================================================================
# LLM CONFIGURATION
# ============================================================================

# Choose LLM backend: "hf_api" (recommended), "local", or "openai"
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Configuration
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

# Local Model Configuration
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.getenv("DEVICE", "auto")  # "auto", "cpu", "cuda", "mps"

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# LLM Parameters
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [".docx", ".pdf"]
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Cache extracted text to avoid re-processing
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

# Parallel processing
ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))

# ============================================================================
# SYSTEM PROMPTS
# ============================================================================

BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""

# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

DASHBOARD_CONFIG = {
    "figure_size": (14, 10),
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def get_config() -> Dict[str, Any]:
    """Return all configuration as a dictionary"""
    return {
        "llm": {
            "backend": LLM_BACKEND,
            "model": HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL,
            "max_tokens": MAX_TOKENS_PER_REQUEST,
            "temperature": LLM_TEMPERATURE,
            "timeout": LLM_TIMEOUT
        },
        "chunking": {
            "max_tokens": MAX_CHUNK_TOKENS,
            "overlap": OVERLAP_TOKENS
        },
        "quality": {
            "min_score": MIN_QUALITY_SCORE,
            "min_words": MIN_WORD_COUNT
        },
        "files": {
            "max_size_mb": MAX_FILE_SIZE_MB,
            "max_per_batch": MAX_FILES_PER_BATCH,
            "supported": SUPPORTED_FORMATS
        },
        "output": {
            "directory": OUTPUT_DIR,
            "csv": CSV_FILENAME,
            "pdf": PDF_FILENAME
        },
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING
    }


def print_config():
    """Print current configuration"""
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        # Top-level entries such as "debug" are plain values, not dicts
        if isinstance(settings, dict):
            print(f"\n{section.upper()}:")
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            print(f"\n{section.upper()}: {settings}")
    print("=" * 60)


def validate_config() -> bool:
    """Validate configuration settings"""
    issues = []

    # Check LLM configuration
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
|
| 246 |
+
issues.append("HF API selected but HUGGINGFACE_TOKEN not set")
|
| 247 |
+
|
| 248 |
+
if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
|
| 249 |
+
issues.append("OpenAI selected but OPENAI_API_KEY not set")
|
| 250 |
+
|
| 251 |
+
# Check paths exist
|
| 252 |
+
if not os.path.exists(OUTPUT_DIR):
|
| 253 |
+
try:
|
| 254 |
+
os.makedirs(OUTPUT_DIR)
|
| 255 |
+
except:
|
| 256 |
+
issues.append(f"Cannot create output directory: {OUTPUT_DIR}")
|
| 257 |
+
|
| 258 |
+
# Check reasonable values
|
| 259 |
+
if MAX_CHUNK_TOKENS < 500:
|
| 260 |
+
issues.append("MAX_CHUNK_TOKENS too small (< 500)")
|
| 261 |
+
|
| 262 |
+
if MAX_TOKENS_PER_REQUEST < 100:
|
| 263 |
+
issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")
|
| 264 |
+
|
| 265 |
+
if issues:
|
| 266 |
+
print("Configuration Issues:")
|
| 267 |
+
for issue in issues:
|
| 268 |
+
print(f" - {issue}")
|
| 269 |
+
return False
|
| 270 |
+
|
| 271 |
+
return True
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# ============================================================================
|
| 275 |
+
# INITIALIZATION
|
| 276 |
+
# ============================================================================
|
| 277 |
+
|
| 278 |
+
if __name__ == "__main__":
|
| 279 |
+
print_config()
|
| 280 |
+
if validate_config():
|
| 281 |
+
print("\n✓ Configuration valid")
|
| 282 |
+
else:
|
| 283 |
+
print("\n✗ Configuration has issues")
|
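Because every setting above is read from the environment at import time, a deployment can override the defaults without editing config.py. A minimal sketch (illustrative only; it assumes config.py is importable from the working directory):

import os

# Set overrides before config.py is imported; the os.getenv() calls run at import time
os.environ["ENABLE_CACHING"] = "False"
os.environ["MAX_WORKERS"] = "8"

import config

if config.validate_config():
    print(config.get_config()["llm"])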
dashboard.py
ADDED
@@ -0,0 +1,340 @@
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd
import numpy as np
from collections import Counter
from typing import List, Dict
import re

def generate_dashboard(data):
    """Legacy function - kept for backwards compatibility"""
    return generate_comprehensive_dashboard(data, "Other")


def extract_items_from_field(data: List[Dict], field_name: str) -> List[str]:
    """Extract and split items from semicolon-separated field"""
    items = []
    for row in data:
        value = row.get(field_name, "")
        if value and isinstance(value, str):
            # Split by semicolon and clean
            parts = [p.strip() for p in value.split(';') if p.strip()]
            items.extend(parts)
    return items
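# Usage sketch (illustrative, not part of the uploaded module): this helper is
# the glue between the semicolon-joined CSV fields (see report.csv below) and
# the Counter-based rankings in the chart builders that follow.
#
#     >>> rows = [{"Side Effects": "dizziness; nausea"}, {"Side Effects": "nausea"}]
#     >>> sorted(Counter(extract_items_from_field(rows, "Side Effects")).items())
#     [('dizziness', 1), ('nausea', 2)]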

def generate_comprehensive_dashboard(
    data: List[Dict],
    interviewee_type: str
) -> plt.Figure:
    """
    Generate comprehensive dashboard with multiple visualizations
    """

    if not data or len(data) == 0:
        # Return empty figure with message
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, 'No data available for visualization',
                ha='center', va='center', fontsize=14)
        ax.axis('off')
        return fig

    df = pd.DataFrame(data)

    # Select the dashboard builder based on interviewee type
    if interviewee_type == "HCP":
        fig = create_hcp_dashboard(df)
    elif interviewee_type == "Patient":
        fig = create_patient_dashboard(df)
    else:
        fig = create_general_dashboard(df)

    plt.tight_layout()
    return fig


def create_hcp_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create dashboard for HCP interviews"""

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Healthcare Professional Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Top Diagnoses
    ax2 = axes[0, 1]
    if 'Diagnoses' in df.columns:
        diagnoses = extract_items_from_field(df.to_dict('records'), 'Diagnoses')
        if diagnoses:
            diagnosis_counts = Counter(diagnoses)
            top_diagnoses = dict(diagnosis_counts.most_common(8))

            if top_diagnoses:
                labels = list(top_diagnoses.keys())
                # Truncate long labels
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_diagnoses.values())

                bars = ax2.barh(labels, values, color='#2ecc71', edgecolor='black')
                ax2.set_xlabel('Frequency')
                ax2.set_title('Most Common Diagnoses')
                ax2.invert_yaxis()

                # Add value labels
                for bar in bars:
                    width = bar.get_width()
                    ax2.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 3. Prescription Analysis
    ax3 = axes[1, 0]
    if 'Prescriptions' in df.columns:
        prescriptions = extract_items_from_field(df.to_dict('records'), 'Prescriptions')
        if prescriptions:
            rx_counts = Counter(prescriptions)
            top_rx = dict(rx_counts.most_common(8))

            if top_rx:
                labels = list(top_rx.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_rx.values())

                bars = ax3.barh(labels, values, color='#e74c3c', edgecolor='black')
                ax3.set_xlabel('Frequency')
                ax3.set_title('Most Mentioned Prescriptions')
                ax3.invert_yaxis()

                for bar in bars:
                    width = bar.get_width()
                    ax3.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 4. Word Count by Transcript
    ax4 = axes[1, 1]
    if 'Word Count' in df.columns and 'Transcript ID' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        transcript_ids = df['Transcript ID'][:len(word_counts)]

        if len(word_counts) > 0:
            ax4.bar(range(len(word_counts)), word_counts, color='#9b59b6',
                    edgecolor='black', alpha=0.7)
            ax4.set_xlabel('Transcript')
            ax4.set_ylabel('Word Count')
            ax4.set_title('Interview Length by Transcript')
            ax4.set_xticks(range(len(word_counts)))
            ax4.set_xticklabels(transcript_ids, rotation=45, ha='right')
            ax4.grid(axis='y', alpha=0.3)

            # Add mean line
            ax4.axhline(word_counts.mean(), color='red', linestyle='--',
                        label=f'Average: {int(word_counts.mean())}')
            ax4.legend()

    return fig


def create_patient_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create dashboard for Patient interviews"""

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Patient Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Top Symptoms
    ax2 = axes[0, 1]
    if 'Primary Symptoms' in df.columns:
        symptoms = extract_items_from_field(df.to_dict('records'), 'Primary Symptoms')
        if symptoms:
            symptom_counts = Counter(symptoms)
            top_symptoms = dict(symptom_counts.most_common(8))

            if top_symptoms:
                labels = list(top_symptoms.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_symptoms.values())

                bars = ax2.barh(labels, values, color='#e67e22', edgecolor='black')
                ax2.set_xlabel('Frequency')
                ax2.set_title('Most Common Symptoms')
                ax2.invert_yaxis()

                for bar in bars:
                    width = bar.get_width()
                    ax2.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 3. Patient Concerns
    ax3 = axes[1, 0]
    if 'Main Concerns' in df.columns:
        concerns = extract_items_from_field(df.to_dict('records'), 'Main Concerns')
        if concerns:
            concern_counts = Counter(concerns)
            top_concerns = dict(concern_counts.most_common(6))

            if top_concerns:
                # Pie chart of the concern distribution
                labels = list(top_concerns.keys())
                labels = [label[:25] + '...' if len(label) > 25 else label for label in labels]
                sizes = list(top_concerns.values())
                colors_list = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#f9ca24', '#6c5ce7', '#a29bfe']

                ax3.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
                        colors=colors_list[:len(sizes)])
                ax3.set_title('Distribution of Patient Concerns')

    # 4. Side Effects
    ax4 = axes[1, 1]
    if 'Side Effects' in df.columns:
        side_effects = extract_items_from_field(df.to_dict('records'), 'Side Effects')
        if side_effects:
            se_counts = Counter(side_effects)
            top_se = dict(se_counts.most_common(6))

            if top_se:
                labels = list(top_se.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_se.values())

                bars = ax4.barh(labels, values, color='#e74c3c', edgecolor='black')
                ax4.set_xlabel('Frequency')
                ax4.set_title('Reported Side Effects')
                ax4.invert_yaxis()

                for bar in bars:
                    width = bar.get_width()
                    ax4.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)
        else:
            ax4.text(0.5, 0.5, 'No side effects reported',
                     ha='center', va='center', transform=ax4.transAxes, fontsize=12)
            ax4.axis('off')

    return fig


def create_general_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create general dashboard"""

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('General Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Word Count Distribution
    ax2 = axes[0, 1]
    if 'Word Count' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        if len(word_counts) > 0:
            ax2.hist(word_counts, bins=15, color='#2ecc71', edgecolor='black', alpha=0.7)
            ax2.set_xlabel('Word Count')
            ax2.set_ylabel('Frequency')
            ax2.set_title('Interview Length Distribution')
            ax2.grid(axis='y', alpha=0.3)

    # 3. Quality Score Categories
    ax3 = axes[1, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()

        categories = ['Excellent\n(>0.8)', 'Good\n(0.6-0.8)', 'Fair\n(0.4-0.6)', 'Poor\n(<0.4)']
        counts = [
            sum(quality_scores > 0.8),
            sum((quality_scores >= 0.6) & (quality_scores <= 0.8)),
            sum((quality_scores >= 0.4) & (quality_scores < 0.6)),
            sum(quality_scores < 0.4)
        ]

        colors_list = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c']
        bars = ax3.bar(categories, counts, color=colors_list, edgecolor='black', alpha=0.7)
        ax3.set_ylabel('Number of Transcripts')
        ax3.set_title('Quality Score Categories')
        ax3.grid(axis='y', alpha=0.3)

        # Add value labels
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax3.text(bar.get_x() + bar.get_width()/2., height,
                         f'{int(height)}', ha='center', va='bottom', fontsize=10)

    # 4. Summary Statistics Table
    ax4 = axes[1, 1]
    ax4.axis('off')

    stats_data = []
    if 'Transcript ID' in df.columns:
        stats_data.append(['Total Transcripts', str(len(df))])

    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            stats_data.append(['Avg Quality Score', f"{quality_scores.mean():.2f}"])
            stats_data.append(['Min Quality Score', f"{quality_scores.min():.2f}"])
            stats_data.append(['Max Quality Score', f"{quality_scores.max():.2f}"])

    if 'Word Count' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        if len(word_counts) > 0:
            stats_data.append(['Avg Word Count', f"{int(word_counts.mean()):,}"])
            stats_data.append(['Total Words', f"{int(word_counts.sum()):,}"])

    if stats_data:
        table = ax4.table(cellText=stats_data, cellLoc='left',
                          colWidths=[0.5, 0.3], loc='center',
                          colLabels=['Metric', 'Value'])
        table.auto_set_font_size(False)
        table.set_fontsize(11)
        table.scale(1, 2)

        # Style the table (row 0 is the header row)
        for i in range(len(stats_data) + 1):
            if i == 0:
                table[(i, 0)].set_facecolor('#34495e')
                table[(i, 1)].set_facecolor('#34495e')
                table[(i, 0)].set_text_props(weight='bold', color='white')
                table[(i, 1)].set_text_props(weight='bold', color='white')
            elif i % 2 == 0:
                table[(i, 0)].set_facecolor('#ecf0f1')
                table[(i, 1)].set_facecolor('#ecf0f1')

    ax4.set_title('Summary Statistics', fontsize=12, fontweight='bold', pad=20)

    return fig
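A quick way to exercise these builders end to end is to pass generate_comprehensive_dashboard a few rows shaped like the report CSV. A minimal sketch (the column names mirror the checks above; the sample values are invented for illustration):

from dashboard import generate_comprehensive_dashboard

rows = [
    {"Transcript ID": "T1", "Quality Score": "0.92", "Word Count": "6812",
     "Primary Symptoms": "fatigue; cramps", "Main Concerns": "side effects",
     "Side Effects": "dizziness"},
    {"Transcript ID": "T2", "Quality Score": "0.74", "Word Count": "4120",
     "Primary Symptoms": "cramps", "Main Concerns": "cost; side effects",
     "Side Effects": "nausea"},
]

fig = generate_comprehensive_dashboard(rows, "Patient")
fig.savefig("patient_dashboard.png", dpi=100)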
extractors.py
ADDED
@@ -0,0 +1,201 @@
from docx import Document
import pdfplumber
import re
from typing import Tuple
import os

def extract_docx(file_obj) -> str:
    """
    Extract text from DOCX with enhanced error handling and formatting preservation
    """
    try:
        doc = Document(file_obj)

        # Extract non-empty paragraphs
        paragraphs = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                paragraphs.append(text)

        # Also extract text from tables
        for table in doc.tables:
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if cell_text:
                        row_text.append(cell_text)
                if row_text:
                    paragraphs.append(" | ".join(row_text))

        extracted_text = "\n\n".join(paragraphs)

        # Clean up common issues
        extracted_text = clean_extracted_text(extracted_text)

        return extracted_text

    except Exception as e:
        error_msg = f"[DOCX Extraction Error] {str(e)}"
        print(error_msg)
        return f"Error extracting DOCX: {str(e)}"


def extract_pdf(file_obj) -> str:
    """
    Extract text from PDF with multiple strategies and enhanced error handling
    """
    try:
        extracted_pages = []

        with pdfplumber.open(file_obj) as pdf:
            # Track extraction success
            successful_pages = 0
            total_pages = len(pdf.pages)

            for page_num, page in enumerate(pdf.pages, 1):
                try:
                    # Strategy 1: Standard text extraction
                    page_text = page.extract_text()

                    # Strategy 2: If standard fails, try layout-aware extraction
                    if not page_text or len(page_text.strip()) < 50:
                        page_text = page.extract_text(layout=True)

                    # Strategy 3: If still poor, try tighter tolerances
                    if not page_text or len(page_text.strip()) < 50:
                        page_text = page.extract_text(
                            x_tolerance=2,
                            y_tolerance=2
                        )

                    if page_text and page_text.strip():
                        # Clean and add page marker
                        clean_text = page_text.strip()
                        extracted_pages.append(f"--- Page {page_num} ---\n{clean_text}")
                        successful_pages += 1
                    else:
                        print(f"[PDF Warning] Page {page_num} yielded no text")

                except Exception as page_error:
                    print(f"[PDF Warning] Error on page {page_num}: {page_error}")
                    continue

        if successful_pages == 0:
            return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."

        if successful_pages < total_pages * 0.5:
            print(f"[PDF Warning] Only {successful_pages}/{total_pages} pages extracted successfully")

        full_text = "\n\n".join(extracted_pages)

        # Clean up the extracted text
        full_text = clean_extracted_text(full_text)

        return full_text

    except Exception as e:
        error_msg = f"[PDF Extraction Error] {str(e)}"
        print(error_msg)
        return f"Error extracting PDF: {str(e)}"


def clean_extracted_text(text: str) -> str:
    """
    Clean up common issues in extracted text
    """
    # Remove excessive whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)

    # Remove page numbers that appear alone on lines
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove common header/footer patterns
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)

    # Fix common OCR/export issues: normalize typographic punctuation
    text = text.replace('\u2019', "'")   # right single quote (curly apostrophe)
    text = text.replace('\u2018', "'")   # left single quote
    text = text.replace('\u201c', '"')   # left curly double quote
    text = text.replace('\u201d', '"')   # right curly double quote
    text = text.replace('\u2013', '-')   # en dash
    text = text.replace('\u2014', '-')   # em dash

    # Remove zero-width characters
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return text.strip()
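# Usage sketch (illustrative; assumes the Unicode normalizations above):
#
#     >>> clean_extracted_text("It\u2019s a \u201cquoted\u201d phrase \u2013 here.\u200b")
#     'It\'s a "quoted" phrase - here.'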

def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Validate extracted text quality
    """
    # Check if text is empty
    if not text or not text.strip():
        return False, "No text extracted"

    # Check for minimum length
    if len(text) < 100:
        return False, f"Extracted text too short ({len(text)} characters)"

    # Check for error messages
    if text.startswith("Error") or text.startswith("["):
        return False, "Extraction error detected"

    # Gibberish check (share of readable characters) - currently disabled
    # alphanumeric = sum(c.isalnum() or c.isspace() for c in text)
    # ratio = alphanumeric / len(text) if text else 0
    # if ratio < 0.2:
    #     return False, f"Text appears garbled (only {ratio*100:.1f}% readable)"

    # Check word count
    words = text.split()
    if len(words) < 50:
        return False, f"Too few words ({len(words)})"

    # Average word-length check (catches binary junk) - currently disabled
    # avg_word_length = sum(len(w) for w in words) / len(words) if words else 0
    # if avg_word_length < 2 or avg_word_length > 20:
    #     return False, f"Unusual average word length ({avg_word_length:.1f})"

    # All checks passed
    return True, f"Valid extraction: {len(words)} words, {len(text)} characters"


def detect_file_encoding(file_path: str) -> str:
    """
    Detect file encoding for text files
    """
    try:
        import chardet
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding'] or 'utf-8'
    except Exception:
        return 'utf-8'  # Default fallback


def extract_text_file(file_obj) -> str:
    """
    Extract from plain text file with encoding fallbacks
    """
    try:
        # Try UTF-8 first
        try:
            return file_obj.read().decode('utf-8')
        except UnicodeDecodeError:
            # Try cp1252 next; latin-1 goes last because it accepts any byte
            # sequence and would otherwise make the cp1252 branch unreachable
            file_obj.seek(0)
            try:
                return file_obj.read().decode('cp1252')
            except UnicodeDecodeError:
                file_obj.seek(0)
                return file_obj.read().decode('latin-1')
    except Exception as e:
        return f"Error reading text file: {str(e)}"
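A minimal sketch of how extraction and validation chain together for a single upload (the file path is a made-up placeholder):

from extractors import extract_pdf, validate_extraction

with open("interview_01.pdf", "rb") as f:   # hypothetical input file
    text = extract_pdf(f)

ok, message = validate_extraction(text, "interview_01.pdf")
print(message)  # e.g. "Valid extraction: 5231 words, 31240 characters"
if not ok:
    print("Skipping file:", message)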
llm.py
ADDED
@@ -0,0 +1,383 @@
import os
import json
import re
from typing import Tuple, Dict, List
from concurrent.futures import ThreadPoolExecutor, TimeoutError as ThreadTimeout


# Option 1: Use Hugging Face Inference API (recommended for better quality)
# Option 2: Use larger local model
# Option 3: Use OpenAI/Anthropic API if available

DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
USE_HF_API = os.getenv("USE_HF_API", "False").lower() == "true"  # defaults to False
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")

# if HF_TOKEN:
#     from huggingface_hub import login
#     login(token=HF_TOKEN)

def log(msg):
    if DEBUG_MODE:
        print(f"[LLM Debug] {msg}")


def get_system_prompt(interviewee_type: str, is_summary: bool = False) -> str:
    """Generate context-aware system prompts"""

    base_prompt = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
"""

    if is_summary:
        return base_prompt + """
CROSS-INTERVIEW SYNTHESIS & VALIDATION TASK:

You are analyzing multiple transcripts. Extract verified patterns and flag inconsistencies.

STEP 1 - PATTERN IDENTIFICATION:
For each theme, count occurrences across transcripts:
- How many participants mentioned X? (e.g., "7 out of 10 participants")
- Calculate percentages when relevant
- What's the range of perspectives?

STEP 2 - CLASSIFY BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ agreement): Findings most participants agree on
- MAJORITY VIEW (60-79%): Significant but not universal agreement
- SPLIT PERSPECTIVES (40-59%): Where views diverge
- OUTLIERS (<40%): Unique but noteworthy perspectives

STEP 3 - CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives differ and why
- Flag quality issues (brief transcripts, vague responses)

STEP 4 - CITE EVIDENCE:
- Reference specific transcript numbers
- Include brief supporting quotes/details
- Distinguish fact from interpretation

OUTPUT FORMAT:
Start with a 2-3 sentence executive overview, then:

**STRONG CONSENSUS FINDINGS:**
[List with counts and evidence]

**MAJORITY FINDINGS:**
[List with counts]

**DIVERGENT PERSPECTIVES:**
[Where participants disagreed and context]

**NOTABLE OUTLIERS:**
[Unique but important points]

**QUALITY NOTES:**
[Any gaps or transcript issues]

CRITICAL RULES:
- NEVER use vague terms like "many," "most," "some" - always use specific numbers
- ALWAYS cite transcript numbers for claims
- FLAG weak evidence explicitly
- Separate facts from interpretations
- NO JSON output - write in clear narrative prose
"""

    if interviewee_type == "HCP":
        return base_prompt + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

    elif interviewee_type == "Patient":
        return base_prompt + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

    else:
        return base_prompt + """
General Interview Analysis Focus:
- Main themes and topics discussed
- Key insights and observations
- Recommendations or suggestions
- Contextual factors
- Areas of emphasis or concern

Extract and structure relevant information based on interview content.
"""


def build_extraction_template(interviewee_type: str) -> str:
    """Create JSON template for structured data extraction"""

    if interviewee_type == "HCP":
        return """{
  "diagnoses": ["condition 1", "condition 2"],
  "prescriptions": ["medication (dose, frequency, indication)"],
  "treatment_rationale": ["reason for treatment choice"],
  "guidelines_mentioned": ["guideline or study name"],
  "clinical_decisions": ["key clinical decision with reasoning"],
  "barriers": ["barrier to treatment"],
  "key_insights": ["notable clinical insight"]
}"""

    elif interviewee_type == "Patient":
        return """{
  "symptoms": ["symptom (severity, duration)"],
  "concerns": ["patient concern or question"],
  "treatments_current": ["current treatment"],
  "treatments_past": ["past treatment with outcome"],
  "treatment_response": ["description of how treatment is working"],
  "side_effects": ["side effect experienced"],
  "quality_of_life": ["impact on daily life"],
  "adherence_factors": ["factor affecting medication adherence"]
}"""

    else:
        return """{
  "key_insights": ["main insight or finding"],
  "themes": ["recurring theme"],
  "recommendations": ["recommendation or suggestion"],
  "context": ["important contextual information"]
}"""


def parse_structured_response(text: str, interviewee_type: str) -> Dict:
    """Extract structured data from an LLM response"""

    # Try to find a JSON block (handles one level of nesting)
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL)

    if json_match:
        try:
            data = json.loads(json_match.group())
            log(f"Successfully extracted JSON: {data}")
            return data
        except json.JSONDecodeError:
            log("Failed to parse JSON from response")

    # Fallback: extract from plain text using regex patterns
    data = {}

    if interviewee_type == "HCP":
        # Extract diagnoses
        diag_pattern = r'(?:diagnos[ei]s|condition):\s*([^\n]+)'
        data["diagnoses"] = re.findall(diag_pattern, text, re.IGNORECASE)

        # Extract prescriptions
        rx_pattern = r'(?:prescri[bp]\w*|medication):\s*([^\n]+)'
        data["prescriptions"] = re.findall(rx_pattern, text, re.IGNORECASE)

        # Extract treatment rationale
        treat_pattern = r'(?:treatment|therapy|rationale):\s*([^\n]+)'
        data["treatment_rationale"] = re.findall(treat_pattern, text, re.IGNORECASE)

    elif interviewee_type == "Patient":
        # Extract symptoms
        symptom_pattern = r'(?:symptom|complaint|experienc\w*):\s*([^\n]+)'
        data["symptoms"] = re.findall(symptom_pattern, text, re.IGNORECASE)

        # Extract concerns
        concern_pattern = r'(?:concern|worry|question|anxious):\s*([^\n]+)'
        data["concerns"] = re.findall(concern_pattern, text, re.IGNORECASE)

        # Extract side effects
        se_pattern = r'(?:side effect|adverse|reaction):\s*([^\n]+)'
        data["side_effects"] = re.findall(se_pattern, text, re.IGNORECASE)

    # Clean and deduplicate
    for key in data:
        data[key] = list(set(item.strip() for item in data[key] if item.strip()))

    log(f"Extracted data from text: {data}")
    return data


def query_llm_hf_api(prompt: str, max_tokens: int = 500) -> str:
    """Use the Hugging Face Inference API for better quality"""
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=HF_TOKEN)

        # Use chat completions
        messages = [
            {"role": "system", "content": "You are an expert transcript analyzer. Provide detailed, structured analysis."},
            {"role": "user", "content": prompt}
        ]

        response = client.chat_completion(
            messages=messages,
            model="microsoft/Phi-3-mini-4k-instruct",
            max_tokens=max_tokens,
            temperature=0.3
        )

        return response.choices[0].message.content.strip()

    except Exception as e:
        import traceback
        full_error = traceback.format_exc()
        log(f"HF API error: {e}\n{full_error}")
        print(f"[HF API Full Error]\n{full_error}")  # Print to console
        return f"[Error] HF API failed: {e}"


def query_llm_local(prompt: str, max_tokens: int = 500) -> str:
    """Local model optimized for an L4 GPU"""
    try:
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        import torch

        # Lazy-load the model once and cache it on the function object
        if not hasattr(query_llm_local, 'model'):
            log("Loading local model on L4...")
            query_llm_local.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
            query_llm_local.model = AutoModelForSeq2SeqLM.from_pretrained(
                "google/flan-t5-xxl",
                torch_dtype=torch.float16,
                device_map="auto"
            )

        # Tokenize and truncate to 512 tokens
        inputs = query_llm_local.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to("cuda")

        outputs = query_llm_local.model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False
        )

        response = query_llm_local.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        log(f"Local model error: {e}")
        return f"[Error] Local model failed: {e}"


def query_llm(
    chunk: str,
    user_context: str,
    interviewee_type: str,
    extract_structured: bool = False,
    is_summary: bool = False,
    timeout: int = 120
) -> Tuple[str, Dict]:
    """
    Main LLM query function with structured extraction

    Returns:
        Tuple of (response_text, structured_data_dict)
    """

    system_prompt = get_system_prompt(interviewee_type, is_summary)
    extraction_template = build_extraction_template(interviewee_type) if extract_structured else ""

    # Build comprehensive prompt
    full_prompt = f"""{system_prompt}

User Instructions:
{user_context}

Transcript Segment to Analyze:
{chunk}

"""

    if extract_structured:
        full_prompt += f"""
IMPORTANT: Provide your analysis in two parts:
1. A clear narrative summary (3-5 sentences)
2. Structured data in this exact JSON format:
{extraction_template}

Be specific and include relevant details (dosages, durations, severity levels, etc.)
"""

    # Truncate if needed (limit raised from 2000 to 6000 characters)
    max_prompt_length = 6000
    if len(full_prompt) > max_prompt_length:
        chunk_limit = max_prompt_length - len(system_prompt) - len(user_context) - len(extraction_template) - 500
        chunk = chunk[:chunk_limit]
        full_prompt = f"{system_prompt}\n\nUser Instructions:\n{user_context}\n\nTranscript Segment:\n{chunk}\n\n"
        if extract_structured:
            full_prompt += f"Provide analysis and structured JSON: {extraction_template}"
        log(f"Prompt truncated to {len(full_prompt)} characters")

    def generate():
        if os.getenv("USE_LMSTUDIO", "False").lower() == "true":
            # query_llm_lmstudio is expected to be provided elsewhere in the
            # deployment; it is not defined in this module
            return query_llm_lmstudio(full_prompt, max_tokens=600)
        elif USE_HF_API and HF_TOKEN:
            return query_llm_hf_api(full_prompt, max_tokens=600)
        else:
            return query_llm_local(full_prompt, max_tokens=600)

    # Execute with timeout
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(generate)
        try:
            response = future.result(timeout=timeout)
            log(f"LLM response received ({len(response)} chars)")

            # Extract structured data if requested
            structured_data = {}
            if extract_structured:
                structured_data = parse_structured_response(response, interviewee_type)

            return response, structured_data

        except ThreadTimeout:
            log("LLM generation timed out")
            return "[Error] LLM generation timed out.", {}
        except Exception as e:
            log(f"LLM generation failed: {e}")
            return f"[Error] LLM generation failed: {e}", {}


def extract_structured_data(text: str, interviewee_type: str) -> Dict:
    """
    Standalone function to extract structured data from existing text.
    Useful for post-processing.
    """
    return parse_structured_response(text, interviewee_type)
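A minimal call-site sketch for query_llm (the chunk text and instructions are invented for illustration; the backend is selected by the USE_LMSTUDIO / USE_HF_API flags above):

from llm import query_llm

chunk = "Interviewer: How long have you had the cramps? Respondent: About five years..."
response, data = query_llm(
    chunk=chunk,
    user_context="Summarize symptoms and treatments.",
    interviewee_type="Patient",
    extract_structured=True,
    timeout=120,
)
print(response)
print(data.get("symptoms", []))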
narrative_report_generator.py
ADDED
@@ -0,0 +1,74 @@
import os
from datetime import datetime
from typing import Tuple
from docx import Document
from docx.shared import Inches
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
from report_parser import parse_transcriptor_output
from table_builder import build_all_tables
from story_writer import generate_narrative

def generate_narrative_report(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient",
                              report_style: str = "executive", llm_backend: str = "lmstudio",
                              output_dir: str = "./outputs") -> Tuple[str, str, str]:
    print("[1/4] Parsing...")
    parsed_data = parse_transcriptor_output(csv_path, summary_path, interviewee_type)

    print("[2/4] Building tables...")
    tables = build_all_tables(parsed_data)

    print("[3/4] Generating narrative (1-2 min)...")
    narrative = generate_narrative(parsed_data, tables, report_style, llm_backend)

    print("[4/4] Creating outputs...")
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base = f"{output_dir}/narrative_report_{timestamp}"

    pdf = f"{base}.pdf"
    word = f"{base}.docx"
    html = f"{base}.html"

    create_pdf(narrative, tables, parsed_data, pdf)
    create_word(narrative, tables, parsed_data, word)
    create_html(narrative, tables, parsed_data, html)

    print(f"Done!\nPDF: {pdf}\nWord: {word}\nHTML: {html}")
    return pdf, word, html

def create_pdf(narrative, tables, data, path):
    doc = SimpleDocTemplate(path, pagesize=letter)
    story = []
    styles = getSampleStyleSheet()

    story.append(Paragraph("Narrative Research Report", styles['Title']))
    story.append(Spacer(1, 0.3*72))

    for section in narrative.split('\n\n'):
        if section.strip():
            # Escape characters that ReportLab's mini-HTML parser would choke on
            safe = section.strip().replace('&', '&amp;').replace('<', '&lt;')
            story.append(Paragraph(safe, styles['BodyText']))
            story.append(Spacer(1, 0.1*72))

    doc.build(story)

def create_word(narrative, tables, data, path):
    doc = Document()
    doc.add_heading('Narrative Research Report', 0)
    for section in narrative.split('\n\n'):
        if section.strip():
            doc.add_paragraph(section.strip())
    doc.save(path)

def create_html(narrative, tables, data, path):
    html = f"""<!DOCTYPE html><html><head><style>
body{{font-family:Arial;max-width:900px;margin:40px auto;padding:20px;line-height:1.6}}
h1{{color:#2c3e50;text-align:center}}
</style></head><body><h1>Narrative Research Report</h1>"""
    for section in narrative.split('\n\n'):
        if section.strip():
            html += f"<p>{section.strip()}</p>"
    html += "</body></html>"
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)
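A minimal end-to-end sketch using a CSV shaped like report.csv below (the paths are illustrative; summary_path is optional):

from narrative_report_generator import generate_narrative_report

pdf_path, docx_path, html_path = generate_narrative_report(
    csv_path="report.csv",
    interviewee_type="Patient",
    report_style="executive",
    llm_backend="lmstudio",
    output_dir="./outputs",
)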
outputs/sample.txt
ADDED
File without changes
report.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Transcript ID,File Name,Quality Score,Word Count,Key Insights,Recommendations
|
| 2 |
+
Transcript 1,570_24_July30_2pmET_TDredacted.docx,1.00,6812,"Interviewee is an independent researcher with expertise in healthcare topics but not a medical professional.; Focus on sharing medical materials and seeking opinions.; The interviewee expresses concern about potential TV ads for treatment of cramps and mentions the importance of managing cramps; Respondent diagnosed with fibe years ago, currently on whole bunch of medication including Vimpat and Gabapentin. Experienced side effects from one called Xcopr and had to switch.; The respondent emphasizes the importance of early intervention in mental health; The respondent confirms prior discussions and shows willingness to engage; The interviewee emphasizes the importance of clear communication and understanding in medical contexts; The interviewee expresses comfort with the application but lacks confidence in their ability to manage it independently; The interviewee emphasizes the importance of minimizing complications in medical treatment; The interviewee is focusing on the main idea of motivating participants to collaborate; The interviewee emphasizes the importance of full disclosure about a four-year period, highlighting potential gaps in knowledge and the need for transparency.; The interviewee expresses uncertainty about the TV ad's effectiveness and uniqueness; TV ad version compatibility and user preference for consistent format; The interviewee's ability to communicate clearly improved over time; The interviewee expresses a strong preference for personalized medical advice and emphasizes the importance of understanding individual differences in treatment responses.; The respondent values comparison with past experiences but emphasizes the importance of maintaining a strong foundation; The interviewee values personalized care and emphasizes the importance of having compassionate caregivers.; The interviewee expresses uncertainty about their own reactions compared to others, noting increased emotion and concern; Lack of alignment with current context; preference for happiness and movement forward; The respondent's condition involves deep-seated issues requiring careful consideration; The respondent emphasizes the importance of providing reliable pharmaceutical products with clear instructions; The interviewee expressed difficulty in transitioning from a previous role and uncertainty about their current responsibilities; The respondent found the ad motivating and engaging; The respondent finds the ad relevant but lacks specific details about its content; The respondent appreciates the detailed analysis but expresses concern about the specific recommendations; The respondent's behavior during the crisis was notably calm and composed, contrasting with the expected heightened emotional response.; Positive rapport between interviewer and respondent",Engage in discussions for mutual benefit; Clarify roles and expectations; Further investigation into specific side effects or problems with particular medical treatments; Consider alternative treatments for managing side effects; Consider implementing potential TV ads for mental health awareness; Encourage further detailed exploration of ideas; Ensure all parties involved have a clear understanding of the medication and its administration; Encourage gradual independence; Provide additional support; Further testing to identify specific causes; Encourage open communication and shared goals; Ensure thorough disclosure of all relevant medical history to avoid misunderstandings.; Conduct 
further research on similar ads to gauge effectiveness; Maintain consistent TV ad formats across platforms; Encourage continued practice to enhance communication; Encourage healthcare providers to tailor treatments based on patient specifics; Focus on building a solid foundation; Focus on hiring caregivers with strong interpersonal skills and a history of providing personalized care.; Further exploration of emotional triggers; Assessment of comparative emotional responses; Encourage alignment with current realities while maintaining focus on happiness; Consider a comprehensive treatment plan addressing underlying issues; Ensure all pharmaceutical products come with detailed usage guidelines; Clarify job expectations and provide support during the transition; Consider enhancing ad content to maintain motivation; Provide more detailed information about the ad; Clarify and validate specific recommendations; Encourage further training in crisis management techniques; Maintain positive communication
report.pdf
ADDED
@@ -0,0 +1,112 @@
[Binary file: a ReportLab-generated PDF (3 letter-size pages, Helvetica/Helvetica-Bold Type1 fonts, created 2025-10-05); its compressed object streams are omitted here.]
report_parser.py
ADDED
@@ -0,0 +1,61 @@
import pandas as pd
import re
from typing import Dict, List, Tuple
from collections import Counter

def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    df = pd.read_csv(csv_path)
    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": df["Quality Score"].astype(float).mean() if "Quality Score" in df else None,
        "avg_word_count": df["Word Count"].astype(int).mean() if "Word Count" in df else None,
        "transcript_ids": df["Transcript ID"].tolist() if "Transcript ID" in df else []
    }
    return df, metadata

def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    themes = {}
    if interviewee_type == "HCP":
        theme_columns = ["Diagnoses", "Prescriptions", "Treatment Strategies"]
    elif interviewee_type == "Patient":
        theme_columns = ["Primary Symptoms", "Main Concerns", "Side Effects"]
    else:
        theme_columns = ["Key Insights"]

    for col in theme_columns:
        if col in df.columns:
            all_items = []
            for val in df[col].dropna():
                if isinstance(val, str):
                    all_items.extend([i.strip() for i in val.split(';') if i.strip()])
            theme_counts = Counter(all_items)
            themes[col] = [{"item": k, "count": v} for k, v in theme_counts.most_common(10)]
    return themes

def calculate_statistics(df: pd.DataFrame) -> Dict:
    stats = {}
    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        stats["quality"] = {
            "mean": scores.mean(),
            "excellent_count": sum(scores > 0.8),
            "good_count": sum((scores >= 0.6) & (scores <= 0.8)),
            "fair_count": sum((scores >= 0.4) & (scores < 0.6)),
            "poor_count": sum(scores < 0.4)
        }
    if "Word Count" in df.columns:
        words = df["Word Count"].astype(int)
        stats["word_count"] = {"mean": int(words.mean()), "total": int(words.sum())}
    return stats

def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    df, metadata = parse_csv_output(csv_path)
    themes = extract_key_themes(df, interviewee_type)
    stats = calculate_statistics(df)
    return {
        "dataframe": df,
        "metadata": metadata,
        "themes": themes,
        "statistics": stats,
        "interviewee_type": interviewee_type
    }
reporting.py
ADDED
@@ -0,0 +1,239 @@
import pandas as pd
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib import colors
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from datetime import datetime
from typing import List, Dict
import os

def generate_csv(data, path="report.csv"):
    """Legacy function - kept for backwards compatibility"""
    return generate_enhanced_csv(data, "Other", path)


def generate_enhanced_csv(
    data: List[Dict],
    interviewee_type: str,
    path: str = "report.csv"
) -> str:
    """
    Generate enhanced CSV with proper formatting and data validation
    """

    if not data:
        # Create empty CSV with headers
        df = pd.DataFrame(columns=["Transcript ID", "Status"])
        df.to_csv(path, index=False)
        return path

    # Create DataFrame
    df = pd.DataFrame(data)

    # Reorder columns for better readability
    priority_cols = ["Transcript ID", "File Name", "Quality Score", "Word Count"]
    other_cols = [col for col in df.columns if col not in priority_cols]
    ordered_cols = [col for col in priority_cols if col in df.columns] + other_cols

    df = df[ordered_cols]

    # Save with proper encoding
    df.to_csv(path, index=False, encoding='utf-8-sig')

    return path


def generate_pdf(summary, details, path="report.pdf"):
    """Legacy function - kept for backwards compatibility"""
    # Create minimal results structure
    results = [{
        "transcript_id": "Transcript 1",
        "file_name": "analysis.txt",
        "full_text": details,
        "quality_score": 0.8,
        "word_count": len(details.split())
    }]
    return generate_enhanced_pdf(summary, results, "Other", [], path)


def generate_enhanced_pdf(
    summary: str,
    results: List[Dict],
    interviewee_type: str,
    processing_errors: List[str],
    path: str = "report.pdf"
) -> str:
    """
    Generate professional PDF report with proper formatting
    """

    # Create document
    doc = SimpleDocTemplate(
        path,
        pagesize=letter,
        rightMargin=0.75*inch,
        leftMargin=0.75*inch,
        topMargin=0.75*inch,
        bottomMargin=0.75*inch
    )

    # Container for the 'Flowable' objects
    story = []

    # Define styles
    styles = getSampleStyleSheet()

    # Custom styles
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=24,
        textColor=colors.HexColor('#1a1a1a'),
        spaceAfter=30,
        alignment=TA_CENTER,
        fontName='Helvetica-Bold'
    )

    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=16,
        textColor=colors.HexColor('#2c3e50'),
        spaceAfter=12,
        spaceBefore=20,
        fontName='Helvetica-Bold'
    )

    subheading_style = ParagraphStyle(
        'CustomSubheading',
        parent=styles['Heading3'],
        fontSize=13,
        textColor=colors.HexColor('#34495e'),
        spaceAfter=8,
        spaceBefore=12,
        fontName='Helvetica-Bold'
    )

    body_style = ParagraphStyle(
        'CustomBody',
        parent=styles['BodyText'],
        fontSize=11,
        leading=14,
        textColor=colors.HexColor('#2c3e50'),
        alignment=TA_LEFT
    )

    # Title page
    story.append(Paragraph("Transcript Analysis Report", title_style))
    story.append(Spacer(1, 0.2*inch))

    # Metadata table
    metadata = [
        ["Report Generated:", datetime.now().strftime("%B %d, %Y at %I:%M %p")],
        ["Interviewee Type:", interviewee_type],
        ["Total Transcripts:", str(len(results))],
        ["Successfully Processed:", str(sum(1 for r in results if r.get("quality_score", 0) > 0))]
    ]

    metadata_table = Table(metadata, colWidths=[2*inch, 4*inch])
    metadata_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#ecf0f1')),
        ('TEXTCOLOR', (0, 0), (-1, -1), colors.HexColor('#2c3e50')),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#bdc3c7'))
    ]))

    story.append(metadata_table)
    story.append(Spacer(1, 0.3*inch))

    # Executive Summary
    story.append(Paragraph("Executive Summary", heading_style))
    story.append(Spacer(1, 0.1*inch))

    # Split summary into paragraphs
    summary_paragraphs = summary.split('\n\n')
    for para in summary_paragraphs:
        if para.strip():
            # Clean up text for PDF (escape XML special characters for ReportLab)
            clean_para = para.strip().replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            story.append(Paragraph(clean_para, body_style))
            story.append(Spacer(1, 0.1*inch))

    # Processing errors section (if any)
    if processing_errors:
        story.append(PageBreak())
        story.append(Paragraph("Processing Issues", heading_style))
        story.append(Spacer(1, 0.1*inch))

        for error in processing_errors:
            clean_error = error.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            story.append(Paragraph(f"• {clean_error}", body_style))
            story.append(Spacer(1, 0.05*inch))

    # Individual transcript details
    story.append(PageBreak())
    story.append(Paragraph("Detailed Transcript Analysis", heading_style))
    story.append(Spacer(1, 0.2*inch))

    for result in results:
        # Transcript header
        transcript_title = f"{result['transcript_id']} - {result['file_name']}"
        story.append(Paragraph(transcript_title, subheading_style))

        # Stats
        stats_data = [
            ["Quality Score:", f"{result['quality_score']:.2f}/1.00"],
            ["Word Count:", f"{result['word_count']:,}"]
        ]

        stats_table = Table(stats_data, colWidths=[1.5*inch, 2*inch])
        stats_table.setStyle(TableStyle([
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
            ('TOPPADDING', (0, 0), (-1, -1), 4),
        ]))

        story.append(stats_table)
        story.append(Spacer(1, 0.1*inch))

        # Analysis text
        text = result['full_text']

        # Split into manageable chunks and clean
        chunks = text.split('\n\n')
        for chunk in chunks[:10]:  # Limit to prevent overly long PDFs
            if chunk.strip():
                clean_chunk = chunk.strip().replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                # Limit paragraph length
                if len(clean_chunk) > 1000:
                    clean_chunk = clean_chunk[:1000] + "..."
                story.append(Paragraph(clean_chunk, body_style))
                story.append(Spacer(1, 0.1*inch))

        story.append(Spacer(1, 0.2*inch))

        # Page break between transcripts (except last)
        if result != results[-1]:
            story.append(PageBreak())

    # Build PDF
    try:
        doc.build(story)
        return path
    except Exception as e:
        print(f"[PDF Error] Failed to generate PDF: {e}")
        # Create a minimal fallback PDF
        simple_doc = SimpleDocTemplate(path, pagesize=letter)
        simple_story = [
            Paragraph("Transcript Analysis Report", title_style),
            Paragraph(f"Error generating full report: {str(e)}", body_style),
            Paragraph(summary, body_style)
        ]
        simple_doc.build(simple_story)
        return path
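A hedged sketch of how the two enhanced generators might be called; the sample records below are invented for illustration and simply mirror the keys the functions read:

from reporting import generate_enhanced_csv, generate_enhanced_pdf

results = [{
    "transcript_id": "Transcript 1",   # hypothetical record, not from the commit
    "file_name": "interview.docx",
    "full_text": "Key insight one.\n\nKey insight two.",
    "quality_score": 0.85,
    "word_count": 1200,
}]
rows = [{"Transcript ID": "Transcript 1", "File Name": "interview.docx",
         "Quality Score": 0.85, "Word Count": 1200}]

generate_enhanced_csv(rows, "Patient", "report.csv")
generate_enhanced_pdf("Overall summary paragraph.", results, "Patient", [], "report.pdf")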
requirements.txt
ADDED
@@ -0,0 +1,41 @@
# Core frameworks
gradio>=4.0.0
transformers>=4.35.0
torch>=2.0.0

# NLP and text processing
nltk>=3.8.0
tiktoken>=0.5.0

# Document processing
python-docx>=1.1.0
pdfplumber>=0.10.0

# Data processing and analysis
pandas>=2.0.0
numpy>=1.24.0

# Visualization
matplotlib>=3.7.0
seaborn>=0.12.0

# PDF generation
reportlab>=4.0.0

# API integrations
huggingface_hub>=0.19.0

# Utilities
chardet>=5.0.0
python-dateutil>=2.8.0

# Optional but recommended
accelerate>=0.24.0
sentencepiece>=0.1.99
protobuf>=4.24.0

# Audio transcription
faster-whisper>=0.10.0
torchaudio>=2.0.0
speechbrain>=0.5.16
scikit-learn>=1.3.0  # For clustering speaker embeddings
story_writer.py
ADDED
@@ -0,0 +1,55 @@
import os
import pandas as pd
from typing import Dict

def format_table_for_llm(df: pd.DataFrame, name: str) -> str:
    return f"\n{name}:\n{df.to_string()}\n" if not df.empty else f"[{name}: No data]\n"

def build_narrative_prompt(parsed_data: Dict, tables: Dict, style: str) -> str:
    metadata = parsed_data["metadata"]
    stats = parsed_data["statistics"]
    interviewee_type = parsed_data["interviewee_type"]

    tables_text = "\n".join([format_table_for_llm(df, name) for name, df in tables.items()])

    return f"""Write an executive research report for {metadata['total_transcripts']} {interviewee_type.lower()} interviews.

DATA TABLES:
{tables_text}

STRUCTURE:
1. EXECUTIVE OVERVIEW (2-3 paragraphs): Context, sample, high-level findings
2. KEY FINDINGS (3-5 sections): Each with narrative + data + significance
3. PATTERNS & THEMES (2 paragraphs): Cross-cutting insights
4. RECOMMENDATIONS (3-5 bullets): Actionable next steps

Write professionally. Quantify everything. Be specific. Lead with insights."""

def call_lmstudio(prompt: str) -> str:
    import requests
    url = os.getenv("LM_STUDIO_URL", "http://192.168.1.245:1234")
    try:
        r = requests.post(f"{url}/v1/chat/completions", json={
            "messages": [{"role": "system", "content": "You are an expert research report writer."},
                         {"role": "user", "content": prompt}],
            "max_tokens": 2000, "temperature": 0.7
        }, timeout=180)
        return r.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[Error: {e}]"

def call_hf_api(prompt: str) -> str:
    from huggingface_hub import InferenceClient
    try:
        client = InferenceClient(token=os.getenv("HUGGINGFACE_TOKEN", ""))
        return client.text_generation(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1",
                                      max_new_tokens=2000, temperature=0.7)
    except Exception as e:
        return f"[Error: {e}]"

def generate_narrative(parsed_data: Dict, tables: Dict, style: str, llm_backend: str) -> str:
    prompt = build_narrative_prompt(parsed_data, tables, style)
    if llm_backend == "lmstudio":
        return call_lmstudio(prompt)
    else:
        return call_hf_api(prompt)
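A sketch of driving the narrative step end to end, assuming the companion modules in this commit (table_builder appears below); any llm_backend value other than "lmstudio" falls through to the Hugging Face path:

from report_parser import parse_transcriptor_output
from table_builder import build_all_tables
from story_writer import generate_narrative

parsed = parse_transcriptor_output("report.csv", interviewee_type="Patient")
tables = build_all_tables(parsed)
report = generate_narrative(parsed, tables, style="executive", llm_backend="lmstudio")
print(report[:500])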
table_builder.py
ADDED
@@ -0,0 +1,51 @@
import pandas as pd
from typing import Dict
from collections import Counter

def build_participant_profile_table(metadata: Dict) -> pd.DataFrame:
    return pd.DataFrame({
        "Metric": ["Total Participants", "Avg Quality Score", "Avg Words"],
        "Value": [
            metadata.get("total_transcripts", 0),
            # "or 0" guards against None values left by parse_csv_output
            f"{metadata.get('avg_quality_score') or 0:.2f}",
            f"{metadata.get('avg_word_count') or 0:,.0f}"
        ]
    })

def build_quality_distribution_table(stats: Dict) -> pd.DataFrame:
    if "quality" not in stats:
        return pd.DataFrame()
    q = stats["quality"]
    df = pd.DataFrame({
        "Quality Tier": ["Excellent (>0.8)", "Good (0.6-0.8)", "Fair (0.4-0.6)", "Poor (<0.4)"],
        "Count": [q.get("excellent_count", 0), q.get("good_count", 0),
                  q.get("fair_count", 0), q.get("poor_count", 0)]
    })
    df["Percentage"] = (df["Count"] / df["Count"].sum() * 100).round(1)
    return df

def build_frequency_table(themes: Dict) -> pd.DataFrame:
    rows = []
    for theme_name, items in themes.items():
        for item in items[:10]:
            rows.append({"Category": theme_name, "Item": item["item"], "Frequency": item["count"]})
    return pd.DataFrame(rows) if rows else pd.DataFrame()

def build_all_tables(parsed_data: Dict) -> Dict[str, pd.DataFrame]:
    tables = {}
    df = parsed_data["dataframe"]
    metadata = parsed_data["metadata"]
    themes = parsed_data["themes"]
    stats = parsed_data["statistics"]

    tables["participant_profile"] = build_participant_profile_table(metadata)

    quality_table = build_quality_distribution_table(stats)
    if not quality_table.empty:
        tables["quality_distribution"] = quality_table

    freq_table = build_frequency_table(themes)
    if not freq_table.empty:
        tables["theme_frequency"] = freq_table

    return tables
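A quick sketch of what build_all_tables yields; keys appear only when the underlying data exists, and the input path is illustrative:

from report_parser import parse_transcriptor_output
from table_builder import build_all_tables

tables = build_all_tables(parse_transcriptor_output("report.csv", interviewee_type="HCP"))
for name, df in tables.items():
    print(f"--- {name} ({len(df)} rows) ---")
    print(df.to_string(index=False))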
tagging.py
ADDED
@@ -0,0 +1,228 @@
import re
from typing import List, Tuple
from collections import Counter

def detect_speaker_patterns(text: str) -> dict:
    """Analyze text to detect speaker patterns and labeling conventions"""

    patterns = {
        "colon_based": re.findall(r'^([A-Z][a-z\s]+\d*):\s', text, re.MULTILINE),  # "Speaker 1: text"
        "bracket_based": re.findall(r'^\[([^\]]+)\]\s', text, re.MULTILINE),  # "[Interviewer] text"
        "dash_based": re.findall(r'^-\s*([A-Z][a-z\s]+):\s', text, re.MULTILINE),  # "- Doctor: text"
        "q_a_based": bool(re.search(r'^(Q|A):\s', text, re.MULTILINE)),  # "Q: / A:"
    }

    # Determine most likely pattern
    pattern_counts = {k: len(v) for k, v in patterns.items() if k != "q_a_based"}
    pattern_counts["q_a_based"] = 1 if patterns["q_a_based"] else 0

    most_common = max(pattern_counts, key=pattern_counts.get) if any(pattern_counts.values()) else None

    # Extract unique speakers
    if most_common == "colon_based":
        speakers = list(set(patterns["colon_based"]))
    elif most_common == "bracket_based":
        speakers = list(set(patterns["bracket_based"]))
    elif most_common == "dash_based":
        speakers = list(set(patterns["dash_based"]))
    elif most_common == "q_a_based":
        speakers = ["Q", "A"]
    else:
        speakers = []

    return {
        "pattern_type": most_common,
        "speakers_found": speakers,
        "speaker_count": len(speakers),
        "has_structure": most_common is not None
    }


def classify_speaker_role(text: str, speaker_label: str, interviewee_type: str) -> str:
    """
    Use advanced heuristics to classify speaker role
    """

    text_lower = text.lower()

    # Question patterns (likely interviewer)
    question_patterns = [
        r'\?$',
        r'^(what|how|why|when|where|who|can you|could you|would you|do you|have you)',
        r'(tell me|explain|describe|walk me through)',
        r'(your thoughts|your experience|your perspective)'
    ]

    question_score = sum(1 for p in question_patterns if re.search(p, text_lower))

    # Medical/clinical patterns
    clinical_patterns = [
        r'\b(prescribe|prescription|rx|medication|drug|dose|dosage|mg|ml)\b',
        r'\b(diagnos[ei]s|diagnosed|condition|disease|disorder)\b',
        r'\b(treatment|therapy|intervention|protocol)\b',
        r'\b(patient|case|clinical|medical|symptom)\b',
        r'\b(efficacy|effectiveness|outcome|response|adverse)\b',
        r'\b(guideline|recommendation|standard of care|first-line)\b'
    ]

    clinical_score = sum(1 for p in clinical_patterns if re.search(p, text_lower))

    # Patient experience patterns
    patient_patterns = [
        r'\b(I feel|I felt|I\'m experiencing|I have)\b',
        r'\b(my symptoms|my condition|my pain|my treatment)\b',
        r'\b(it hurts|it bothers|it helps|it doesn\'t work)\b',
        r'\b(I tried|I take|I stopped|I started)\b',
        r'\b(doctor told me|doctor said|doctor prescribed)\b'
    ]

    patient_score = sum(1 for p in patient_patterns if re.search(p, text_lower))

    # Neutral/closing patterns
    neutral_patterns = [
        r'\b(thank you|thanks|appreciate|goodbye|bye|closing)\b',
        r'\b(that concludes|that\'s all|we\'re done)\b'
    ]

    neutral_score = sum(1 for p in neutral_patterns if re.search(p, text_lower))

    # Decision logic based on interviewee type
    if neutral_score > 0 and len(text.split()) < 15:
        return "Neutral"

    if interviewee_type == "HCP":
        # In HCP interviews, high clinical language = interviewee (doctor)
        if clinical_score >= 3:
            return "Doctor"
        elif question_score >= 2:
            return "Interviewer"
        elif clinical_score >= 1:
            return "Doctor"
        else:
            return "Unknown"

    elif interviewee_type == "Patient":
        # In patient interviews, patient experience language = interviewee
        if patient_score >= 2:
            return "Patient"
        elif question_score >= 2:
            return "Interviewer"
        elif clinical_score >= 2:
            return "Interviewer"  # Likely interviewer explaining medical info
        elif patient_score >= 1:
            return "Patient"
        else:
            return "Unknown"

    else:
        # General classification
        if question_score >= 2:
            return "Interviewer"
        elif clinical_score >= 2:
            return "Respondent"
        else:
            return "Unknown"


def parse_existing_tags(text: str, pattern_info: dict) -> List[Tuple[str, str]]:
    """Parse text with existing speaker tags"""

    pattern_type = pattern_info["pattern_type"]
    segments = []

    if pattern_type == "colon_based":
        # "Speaker 1: text"
        parts = re.split(r'^([A-Z][a-z\s]+\d*):\s', text, flags=re.MULTILINE)
        for i in range(1, len(parts), 2):
            if i + 1 < len(parts):
                speaker = parts[i].strip()
                content = parts[i + 1].strip()
                if content:
                    segments.append((speaker, content))

    elif pattern_type == "bracket_based":
        # "[Speaker] text"
        parts = re.split(r'^\[([^\]]+)\]\s', text, flags=re.MULTILINE)
        for i in range(1, len(parts), 2):
            if i + 1 < len(parts):
                speaker = parts[i].strip()
                content = parts[i + 1].strip()
                if content:
                    segments.append((speaker, content))

    elif pattern_type == "dash_based":
        # "- Doctor: text"
        parts = re.split(r'^-\s*([A-Z][a-z\s]+):\s', text, flags=re.MULTILINE)
        for i in range(1, len(parts), 2):
            if i + 1 < len(parts):
                speaker = parts[i].strip()
                content = parts[i + 1].strip()
                if content:
                    segments.append((speaker, content))

    elif pattern_type == "q_a_based":
        # "Q: / A:"
        parts = re.split(r'^([QA]):\s', text, flags=re.MULTILINE)
        for i in range(1, len(parts), 2):
            if i + 1 < len(parts):
                speaker = "Interviewer" if parts[i] == "Q" else "Respondent"
                content = parts[i + 1].strip()
                if content:
                    segments.append((speaker, content))

    else:
        # No clear pattern - treat as single block
        segments.append(("Unknown", text))

    return segments


def tag_speakers_advanced(text: str, role_hint: str = "", interviewee_type: str = "Other") -> str:
    """
    Advanced speaker tagging with pattern detection and role classification
    """

    # Step 1: Detect existing structure
    pattern_info = detect_speaker_patterns(text)

    # Step 2: Parse role hints if provided
    role_mapping = {}
    if role_hint:
        # Parse hints like "Speaker 1 = Interviewer, Speaker 2 = Doctor"
        hint_parts = re.findall(r'([^,=]+)\s*=\s*([^,=]+)', role_hint)
        for original, mapped in hint_parts:
            role_mapping[original.strip().lower()] = mapped.strip()

    # Step 3: Parse segments
    if pattern_info["has_structure"]:
        segments = parse_existing_tags(text, pattern_info)
    else:
        # No clear structure - split by paragraphs/lines
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        segments = [("Unknown", line) for line in lines]

    # Step 4: Classify and tag each segment
    tagged_segments = []

    for speaker_label, content in segments:
        # Apply role mapping if available
        speaker_key = speaker_label.lower()
        if speaker_key in role_mapping:
            final_role = role_mapping[speaker_key]
        else:
            # Auto-classify based on content
            final_role = classify_speaker_role(content, speaker_label, interviewee_type)

        # Format the tagged line
        tagged_segments.append(f"[{final_role}] {content}")

    return "\n\n".join(tagged_segments)


def analyze_speaker_distribution(tagged_text: str) -> dict:
    """
    Analyze the distribution of speakers in tagged text
    Useful for quality control
    """

    speakers = re.findall(r'^\[([^\]]+)\]', tagged_text, re.MULTILINE)
    distribution = Counter(speakers)

    total = len(speakers)

    return {
        "total_segments": total,
        "unique_speakers": len(distribution),
        "distribution": dict(distribution),
        "percentages": {k: (v / total * 100) for k, v in distribution.items()} if total > 0 else {}
    }
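A minimal sketch of the tagging flow on an invented colon-labelled snippet; the role hint maps raw labels to final roles before the content heuristics run:

from tagging import tag_speakers_advanced, analyze_speaker_distribution

sample = (
    "Speaker 1: How has the medication been working for you?\n"
    "Speaker 2: I started it last year and my symptoms improved."
)
tagged = tag_speakers_advanced(sample,
                               role_hint="Speaker 1 = Interviewer, Speaker 2 = Patient",
                               interviewee_type="Patient")
print(tagged)                                # "[Interviewer] ..." / "[Patient] ..."
print(analyze_speaker_distribution(tagged))  # segment counts and percentages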
utils.py
ADDED
@@ -0,0 +1,404 @@
"""
Utility functions for TranscriptorAI
"""

import os
import json
import hashlib
import pickle
from datetime import datetime
from typing import Any, Dict, List, Optional
from pathlib import Path
import logging

# ============================================================================
# LOGGING SETUP
# ============================================================================

def setup_logging(log_file: str = "transcript_analysis.log", level: str = "INFO"):
    """Setup logging configuration"""
    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)

logger = setup_logging()

# ============================================================================
# CACHING UTILITIES
# ============================================================================

def get_file_hash(file_path: str) -> str:
    """Generate hash for a file for caching purposes"""
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        buf = f.read(65536)  # Read in 64kb chunks
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(65536)
    return hasher.hexdigest()


def cache_result(key: str, data: Any, cache_dir: str = "./.cache") -> bool:
    """Cache a result to disk"""
    try:
        os.makedirs(cache_dir, exist_ok=True)
        cache_file = os.path.join(cache_dir, f"{key}.pkl")

        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)

        logger.debug(f"Cached result for key: {key}")
        return True
    except Exception as e:
        logger.error(f"Failed to cache result: {e}")
        return False


def load_cached_result(key: str, cache_dir: str = "./.cache") -> Optional[Any]:
    """Load a cached result from disk"""
    try:
        cache_file = os.path.join(cache_dir, f"{key}.pkl")

        if not os.path.exists(cache_file):
            return None

        # Check if cache is less than 7 days old
        file_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if file_age > 7 * 24 * 3600:  # 7 days
            logger.debug(f"Cache expired for key: {key}")
            return None

        with open(cache_file, 'rb') as f:
            data = pickle.load(f)

        logger.debug(f"Loaded cached result for key: {key}")
        return data
    except Exception as e:
        logger.error(f"Failed to load cached result: {e}")
        return None


def clear_cache(cache_dir: str = "./.cache"):
    """Clear all cached files"""
    try:
        if os.path.exists(cache_dir):
            for file in os.listdir(cache_dir):
                file_path = os.path.join(cache_dir, file)
                os.remove(file_path)
            logger.info(f"Cleared cache directory: {cache_dir}")
    except Exception as e:
        logger.error(f"Failed to clear cache: {e}")


# ============================================================================
# FILE UTILITIES
# ============================================================================

def ensure_directory(path: str) -> str:
    """Ensure directory exists, create if not"""
    os.makedirs(path, exist_ok=True)
    return path


def get_unique_filename(base_path: str, extension: str = "") -> str:
    """Generate unique filename by adding timestamp"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base = os.path.splitext(base_path)[0]
    ext = extension or os.path.splitext(base_path)[1]
    return f"{base}_{timestamp}{ext}"


def get_file_size_mb(file_path: str) -> float:
    """Get file size in MB"""
    return os.path.getsize(file_path) / (1024 * 1024)


def validate_file(file_path: str, max_size_mb: int = 50, allowed_extensions: List[str] = None) -> tuple:
    """Validate file exists, size, and extension"""
    if allowed_extensions is None:
        allowed_extensions = ['.docx', '.pdf']

    if not os.path.exists(file_path):
        return False, "File does not exist"

    if get_file_size_mb(file_path) > max_size_mb:
        return False, f"File exceeds {max_size_mb}MB limit"

    ext = os.path.splitext(file_path)[1].lower()
    if ext not in allowed_extensions:
        return False, f"File type {ext} not supported"

    return True, "Valid"


# ============================================================================
# DATA PROCESSING UTILITIES
# ============================================================================

def sanitize_text(text: str) -> str:
    """Sanitize text for safe processing"""
    # Remove null bytes
    text = text.replace('\x00', '')

    # Normalize whitespace
    text = ' '.join(text.split())

    return text.strip()


def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to max length with suffix"""
    if len(text) <= max_length:
        return text
    return text[:max_length - len(suffix)] + suffix


def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """Extract top N keywords from text (simple frequency-based)"""
    from collections import Counter
    import re

    # Simple tokenization
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())

    # Remove common stop words
    stop_words = {
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'with',
        'this', 'that', 'from', 'they', 'have', 'has', 'was', 'were'
    }

    words = [w for w in words if w not in stop_words]

    # Count and return top N
    counter = Counter(words)
    return [word for word, count in counter.most_common(top_n)]


# ============================================================================
# STATISTICS UTILITIES
# ============================================================================

def calculate_statistics(values: List[float]) -> Dict[str, float]:
    """Calculate basic statistics for a list of values"""
    if not values:
        return {}

    import numpy as np

    return {
        "mean": np.mean(values),
        "median": np.median(values),
        "std": np.std(values),
        "min": np.min(values),
        "max": np.max(values),
        "count": len(values)
    }


def calculate_percentile(values: List[float], percentile: int) -> float:
    """Calculate percentile of values"""
    import numpy as np
    return np.percentile(values, percentile)


# ============================================================================
# JSON UTILITIES
# ============================================================================

def save_json(data: Dict, filepath: str, pretty: bool = True) -> bool:
    """Save data as JSON file"""
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            if pretty:
                json.dump(data, f, indent=2, ensure_ascii=False)
            else:
                json.dump(data, f, ensure_ascii=False)
        logger.debug(f"Saved JSON to: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to save JSON: {e}")
        return False


def load_json(filepath: str) -> Optional[Dict]:
    """Load JSON file"""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.debug(f"Loaded JSON from: {filepath}")
        return data
    except Exception as e:
        logger.error(f"Failed to load JSON: {e}")
        return None


# ============================================================================
# PROGRESS TRACKING
# ============================================================================

class ProgressTracker:
    """Simple progress tracker for long operations"""

    def __init__(self, total: int, description: str = "Processing"):
        self.total = total
        self.current = 0
        self.description = description
        self.start_time = datetime.now()

    def update(self, n: int = 1):
        """Update progress"""
        self.current = min(self.current + n, self.total)
        self._print_progress()

    def _print_progress(self):
        """Print progress bar"""
        percentage = (self.current / self.total) * 100 if self.total > 0 else 0
        bar_length = 40
        filled = int(bar_length * self.current / self.total) if self.total > 0 else 0
        bar = '█' * filled + '-' * (bar_length - filled)

        elapsed = (datetime.now() - self.start_time).total_seconds()
        eta = (elapsed / self.current * (self.total - self.current)) if self.current > 0 else 0

        print(f'\r{self.description}: |{bar}| {percentage:.1f}% ({self.current}/{self.total}) ETA: {eta:.0f}s', end='')

        if self.current >= self.total:
            print()  # New line when complete


# ============================================================================
# ERROR HANDLING UTILITIES
# ============================================================================

def safe_execute(func, *args, default=None, error_msg="Operation failed", **kwargs):
    """Safely execute a function with error handling"""
    try:
        return func(*args, **kwargs)
    except Exception as e:
        logger.error(f"{error_msg}: {e}")
        return default


# ============================================================================
# TEXT COMPARISON UTILITIES
# ============================================================================

def calculate_similarity(text1: str, text2: str) -> float:
    """Calculate simple similarity score between two texts"""
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    if not words1 or not words2:
        return 0.0

    intersection = words1.intersection(words2)
    union = words1.union(words2)

    return len(intersection) / len(union) if union else 0.0


# ============================================================================
# BATCH PROCESSING UTILITIES
# ============================================================================

def batch_items(items: List, batch_size: int) -> List[List]:
    """Split list into batches"""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]


def parallel_process(func, items: List, max_workers: int = 4):
    """Process items in parallel"""
    from concurrent.futures import ThreadPoolExecutor, as_completed

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(func, item) for item in items]
        for future in as_completed(futures):
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                logger.error(f"Parallel processing error: {e}")
                results.append(None)

    return results


# ============================================================================
# EXPORT UTILITIES
# ============================================================================

def export_to_excel(data: Dict[str, List[Dict]], filepath: str) -> bool:
    """Export multiple dataframes to Excel with sheets"""
    try:
        import pandas as pd

        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            for sheet_name, rows in data.items():
                df = pd.DataFrame(rows)
                df.to_excel(writer, sheet_name=sheet_name, index=False)

        logger.info(f"Exported to Excel: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to export to Excel: {e}")
        return False


# ============================================================================
# VALIDATION UTILITIES
# ============================================================================

def is_valid_email(email: str) -> bool:
    """Basic email validation"""
    import re
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))


def is_valid_url(url: str) -> bool:
    """Basic URL validation"""
    import re
    pattern = r'^https?://[^\s<>"]+$'
    return bool(re.match(pattern, url))


# ============================================================================
# MAIN (FOR TESTING)
# ============================================================================

if __name__ == "__main__":
    # Test utilities
    print("Testing utilities...")

    # Test file operations
    test_dir = ensure_directory("./test_output")
    print(f"Created test directory: {test_dir}")

    # Test JSON operations
    test_data = {"key": "value", "number": 42}
    save_json(test_data, "./test_output/test.json")
    loaded = load_json("./test_output/test.json")
    assert loaded == test_data, "JSON save/load failed"
    print("✓ JSON operations work")

    # Test statistics
    test_values = [1, 2, 3, 4, 5]
    stats = calculate_statistics(test_values)
    print(f"✓ Statistics: {stats}")

    # Test progress tracker
    tracker = ProgressTracker(10, "Test")
    for i in range(10):
        import time
        time.sleep(0.1)
        tracker.update()
    print("✓ Progress tracker works")

    print("\n✓ All utility tests passed!")
validation.py
ADDED
@@ -0,0 +1,274 @@
import re
from typing import Tuple, Dict, List


def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Validate that text extraction was successful
    """

    if not text or not text.strip():
        return False, "No text extracted"

    # Check for minimum content
    if len(text) < 50:
        return False, f"Extracted text too short ({len(text)} chars)"

    # Check for garbled text indicators
    garbled_patterns = [
        (r'[^\x00-\x7F]{50,}', "Contains large blocks of non-ASCII characters"),
        (r'(.)\1{20,}', "Contains suspicious character repetition"),
        (r'^[\W\d\s]+$', "Contains only symbols/numbers/whitespace")
    ]

    for pattern, msg in garbled_patterns:
        if re.search(pattern, text):
            return False, msg

    # Check word count
    words = text.split()
    if len(words) < 20:
        return False, f"Too few words ({len(words)})"

    # Calculate ratio of real words (heuristic)
    potential_words = [w for w in words if re.match(r'^[a-zA-Z]{2,}$', w)]
    word_ratio = len(potential_words) / len(words) if words else 0

    if word_ratio < 0.3:
        return False, f"Low word ratio ({word_ratio:.2f}) - possible extraction issue"

    return True, f"Valid ({len(words)} words, {len(text)} chars)"
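
# Usage sketch (hypothetical file name and transcript text):
#   ok, detail = validate_extraction(raw_text, "interview_01.pdf")
#   if not ok:
#       print(f"Skipping interview_01.pdf: {detail}")
# The 0.3 word-ratio threshold is a heuristic tuned for English prose.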


def validate_transcript_quality(
    analyzed_text: str,
    structured_data: Dict,
    interviewee_type: str
) -> Tuple[float, str]:
    """
    Assess quality of analyzed transcript

    Returns:
        Tuple of (quality_score [0-1], issues_description)
    """

    score = 1.0
    issues = []

    # Check 1: Length of analysis
    if len(analyzed_text) < 100:
        score -= 0.3
        issues.append("Analysis too brief")
    elif len(analyzed_text) < 300:
        score -= 0.1
        issues.append("Analysis somewhat brief")

    # Check 2: Presence of structured data
    if not structured_data:
        score -= 0.2
        issues.append("No structured data extracted")
    else:
        # Check if structured data has content
        empty_fields = sum(1 for v in structured_data.values() if not v)
        total_fields = len(structured_data)

        if empty_fields == total_fields:
            score -= 0.3
            issues.append("All structured fields empty")
        elif empty_fields > total_fields * 0.7:
            score -= 0.2
            issues.append("Most structured fields empty")

    # Check 3: Type-specific validation
    if interviewee_type == "HCP":
        # Expect medical terminology
        medical_terms = re.findall(
            r'\b(diagnos\w+|prescri\w+|treatment|medication|patient|clinical|therapy)\b',
            analyzed_text,
            re.IGNORECASE
        )

        if len(medical_terms) < 3:
            score -= 0.2
            issues.append("Limited medical terminology for HCP interview")

        # Check for key structured fields
        key_fields = ["diagnoses", "prescriptions", "treatment_rationale"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]

        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key HCP data extracted")

    elif interviewee_type == "Patient":
        # Expect patient-centric language
        patient_terms = re.findall(
            r'\b(symptom|feel|concern|experience|treatment|side effect|quality of life)\b',
            analyzed_text,
            re.IGNORECASE
        )

        if len(patient_terms) < 3:
            score -= 0.2
            issues.append("Limited patient-centric content")

        # Check for key structured fields
        key_fields = ["symptoms", "concerns", "treatment_response"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]

        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key patient data extracted")

    # Check 4: Error indicators
    error_patterns = [
        r'\[Error\]',
        r'failed to',
        r'could not',
        r'unable to',
        r'timeout'
    ]

    for pattern in error_patterns:
        if re.search(pattern, analyzed_text, re.IGNORECASE):
            score -= 0.3
            issues.append("Contains error messages")
            break

    # Check 5: Repetitive content (potential LLM failure)
    sentences = analyzed_text.split('.')
    if len(sentences) > 3:
        unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
        repetition_ratio = len(sentences) / len(unique_sentences) if unique_sentences else 1

        if repetition_ratio > 1.5:
            score -= 0.2
            issues.append("High content repetition")

    # Ensure score is in valid range
    score = max(0.0, min(1.0, score))

    issues_text = "; ".join(issues) if issues else "No issues detected"

    return score, issues_text
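
# Usage sketch (hypothetical values): flag low-confidence analyses for review.
#   score, issues = validate_transcript_quality(analysis_text, extracted, "HCP")
#   if score < 0.6:
#       print(f"Low-quality analysis ({score:.2f}): {issues}")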


def check_data_completeness(csv_rows: List[Dict], interviewee_type: str) -> Dict:
    """
    Analyze completeness of extracted data across all transcripts
    """

    if not csv_rows:
        return {"error": "No data to check"}

    # Determine key fields based on type
    if interviewee_type == "HCP":
        key_fields = ["Diagnoses", "Prescriptions", "Treatment Strategies"]
    elif interviewee_type == "Patient":
        key_fields = ["Primary Symptoms", "Main Concerns", "Treatment Response"]
    else:
        key_fields = ["Key Insights"]

    completeness = {}

    for field in key_fields:
        if field in csv_rows[0]:  # Check if field exists
            filled_count = sum(1 for row in csv_rows if row.get(field) and row[field].strip())
            completeness[field] = {
                "filled": filled_count,
                "total": len(csv_rows),
                "percentage": (filled_count / len(csv_rows) * 100) if csv_rows else 0
            }

    # Overall completeness
    total_fields = sum(c["total"] for c in completeness.values())
    filled_fields = sum(c["filled"] for c in completeness.values())
    overall_percentage = (filled_fields / total_fields * 100) if total_fields > 0 else 0

    return {
        "by_field": completeness,
        "overall": {
            "filled": filled_fields,
            "total": total_fields,
            "percentage": overall_percentage
        },
        "quality_grade": (
            "Excellent" if overall_percentage >= 80 else
            "Good" if overall_percentage >= 60 else
            "Fair" if overall_percentage >= 40 else
            "Poor"
        )
    }
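
# Usage sketch (hypothetical rows): per-field fill rates plus an overall grade
# bucketed from the combined percentage (>=80 Excellent, >=60 Good, >=40 Fair,
# else Poor).
#   report = check_data_completeness(csv_rows, "Patient")
#   print(report["quality_grade"], report["overall"]["percentage"])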


def validate_structured_data_format(data: Dict, interviewee_type: str) -> Tuple[bool, List[str]]:
    """
    Validate that structured data has expected format
    """

    issues = []

    if not isinstance(data, dict):
        return False, ["Data is not a dictionary"]

    # Define expected fields by type
    expected_fields = {
        "HCP": ["diagnoses", "prescriptions", "treatment_rationale"],
        "Patient": ["symptoms", "concerns", "treatment_response"],
        "Other": ["key_insights"]
    }

    required = expected_fields.get(interviewee_type, [])

    # Check for expected fields
    missing = [f for f in required if f not in data]
    if missing:
        issues.append(f"Missing expected fields: {', '.join(missing)}")

    # Check field types (should be lists)
    for key, value in data.items():
        if not isinstance(value, list):
            issues.append(f"Field '{key}' should be a list, got {type(value).__name__}")

    # Check for empty lists (guard against flagging an empty dict)
    empty_fields = [k for k, v in data.items() if isinstance(v, list) and not v]
    if data and len(empty_fields) == len(data):
        issues.append("All fields are empty lists")

    is_valid = len(issues) == 0

    return is_valid, issues
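
# Usage sketch (hypothetical payload): every value is expected to be a list.
#   ok, problems = validate_structured_data_format(
#       {"symptoms": ["fatigue"], "concerns": [], "treatment_response": "good"},
#       "Patient",
#   )
#   # ok is False: "treatment_response" is a str, not a list.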


def validate_summary_quality(summary: str, num_transcripts: int) -> Tuple[float, List[str]]:
    """Check summary for rigor and accuracy"""
    issues = []
    score = 1.0

    # Check for quantification
    if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
        issues.append("No quantified findings (must include counts/percentages)")
        score -= 0.3

    # Check for vague claims (word-bounded to avoid matching e.g. "utmost")
    vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
    if any(re.search(rf'\b{term}\b', summary.lower()) for term in vague_terms):
        issues.append("Contains vague terms - should use specific numbers")
        score -= 0.2

    # Check for absolute claims
    absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
    for term in absolute_terms:
        if re.search(rf'\b{term}\b', summary.lower()):
            issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
            score -= 0.2

    # Check for evidence markers
    if 'consensus' not in summary.lower() and 'majority' not in summary.lower():
        issues.append("Missing consensus indicators")
        score -= 0.1

    # Check length is substantial
    if len(summary) < 500:
        issues.append("Summary too brief for thorough analysis")
        score -= 0.2

    return max(0.0, score), issues
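
# Usage sketch (hypothetical summary string): penalties stack, so a short
# summary with vague wording can fail several checks at once. Note that
# num_transcripts is accepted but not currently used by these checks.
#   score, issues = validate_summary_quality(summary_text, num_transcripts=12)
#   if score < 0.7:
#       for issue in issues:
#           print(f"Summary issue: {issue}")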