jmisak commited on
Commit
54c99ad
·
verified ·
1 Parent(s): 769f718

Upload 23 files

Browse files
Files changed (22) hide show
  1. README.md +7 -5
  2. Set-Service +0 -0
  3. app.py +582 -0
  4. audio_transcriber.py +100 -0
  5. audio_transcriber_hf.py +104 -0
  6. chunking.py +236 -0
  7. config.py +283 -0
  8. dashboard.py +340 -0
  9. extractors.py +201 -0
  10. llm.py +383 -0
  11. narrative_report_generator.py +74 -0
  12. outputs/sample.txt +0 -0
  13. report.csv +2 -0
  14. report.pdf +112 -0
  15. report_parser.py +61 -0
  16. reporting.py +239 -0
  17. requirements.txt +41 -0
  18. story_writer.py +55 -0
  19. table_builder.py +51 -0
  20. tagging.py +228 -0
  21. utils.py +404 -0
  22. validation.py +274 -0
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: TranscriptWriting
3
- emoji:
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: StoryTellerTranscript
3
+ emoji: 🌖
4
+ colorFrom: green
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.49.0
8
  app_file: app.py
9
  pinned: false
10
+ license: unknown
11
+ short_description: Audio interviews to final reports
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Set-Service ADDED
File without changes
app.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from typing import List, Dict, Tuple
4
+ from extractors import extract_docx, extract_pdf, validate_extraction
5
+ from tagging import tag_speakers_advanced
6
+ from chunking import chunk_text_semantic
7
+ from llm import query_llm, extract_structured_data
8
+ from reporting import generate_enhanced_csv, generate_enhanced_pdf
9
+ from dashboard import generate_comprehensive_dashboard
10
+ from validation import validate_transcript_quality, check_data_completeness
11
+ from audio_transcriber import transcribe_with_diarization_streaming
12
+
13
def preprocess_audio(audio_files, num_speakers):
    """Transcribe uploaded audio files into DOCX transcripts.

    Args:
        audio_files: list of uploaded file objects (or path strings) from the
            Gradio File component; may be None/empty.
        num_speakers: expected number of speakers, forwarded to the transcriber.

    Returns:
        Tuple of (transcript_paths, status): a list of generated transcript
        paths (None when nothing was produced, which the Gradio File component
        expects), plus a human-readable per-file status log.
    """
    if not audio_files:
        return None, "No audio files provided"

    transcript_paths = []
    status = ""

    for audio in audio_files:
        # Resolve the on-disk path BEFORE the try block so the error branch
        # can always reference it (the original could hit an unbound name if
        # the path resolution itself raised).
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        try:
            # BUG FIX: this module imports transcribe_with_diarization_streaming;
            # the original called the undefined name transcribe_with_diarization,
            # so every file failed with a NameError.
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
            transcript_paths.append(transcript_path)
            status += f"✓ {os.path.basename(audio_path)} → {transcript_path}\n"
        except Exception as e:
            status += f"✗ {os.path.basename(audio_path)}: {str(e)}\n"

    # Return list of paths for the file component (None when empty)
    return transcript_paths if transcript_paths else None, status
34
+
35
+
36
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
    """Run the full multi-transcript analysis pipeline.

    Per file: extract text -> validate extraction -> tag speakers ->
    semantic chunking -> per-chunk LLM analysis with structured-data merge.
    Then a cross-transcript LLM synthesis, followed by CSV/PDF report and
    dashboard generation.

    Args:
        files: uploaded file objects from the Gradio File component.
        file_type: "DOCX" or "PDF" — selects the extraction backend.
        user_comments: free-text analysis instructions from the user.
        role_hint: optional speaker-role mapping hint passed to the tagger.
        debug_mode: mirrored into the DEBUG_MODE environment variable so
            downstream modules can read it.
        interviewee_type: "HCP", "Patient" or "Other" — drives the analysis
            focus and which structured fields are extracted.
        progress: Gradio progress reporter (injected by Gradio at call time).

    Returns:
        (markdown_report, csv_path, pdf_path, dashboard) on success, or
        (error_message, None, None, None) on failure.
    """
    os.environ["DEBUG_MODE"] = str(debug_mode)

    if not files:
        return "Error: No files uploaded", None, None, None

    all_results = []         # one dict per input file (including failures)
    csv_rows = []            # one row per successfully processed file
    processing_errors = []   # NOTE(review): holds both hard failures AND low-quality warnings

    progress(0, desc="Initializing...")
    print(f"[Start] Processing {len(files)} file(s) as {file_type}")

    # Enhanced interviewee context: per-audience focus text plus the list of
    # structured keys the LLM is asked to extract.
    interviewee_context = {
        "HCP": {
            "focus": "clinical reasoning, peer communication, medical expertise, prescribing patterns",
            "extract": ["diagnoses", "treatment_rationale", "clinical_decisions", "prescriptions", "guidelines_mentioned"]
        },
        "Patient": {
            "focus": "symptoms, concerns, emotional state, treatment understanding, adherence",
            "extract": ["symptoms", "concerns", "treatment_response", "quality_of_life", "side_effects"]
        },
        "Other": {
            "focus": "context-dependent insights, relevant observations",
            "extract": ["key_insights", "context", "recommendations"]
        }
    }.get(interviewee_type, {})

    # Build enhanced user context sent alongside every LLM call
    user_context = f"""
Interviewee Type: {interviewee_type}
Analysis Focus: {interviewee_context.get('focus', 'general insights')}
Key Data Points to Extract: {', '.join(interviewee_context.get('extract', []))}

Additional Instructions:
{user_comments}
""".strip()

    total_steps = len(files) * 4 + 2  # extraction, validation, tagging, chunking per file + summary + report
    current_step = 0

    for i, file in enumerate(files):
        file_name = os.path.basename(file.name)
        try:
            # Step 1: Extract text
            progress((current_step / total_steps), desc=f"Extracting {file_name}...")
            print(f"[File {i+1}/{len(files)}] Extracting: {file_name}")

            raw_text = extract_docx(file) if file_type == "DOCX" else extract_pdf(file)
            current_step += 1

            # Step 2: Validate extraction (raises so the per-file except
            # records the failure and processing continues with other files)
            progress((current_step / total_steps), desc=f"Validating {file_name}...")
            is_valid, validation_msg = validate_extraction(raw_text, file_name)
            if not is_valid:
                raise ValueError(f"Extraction validation failed: {validation_msg}")

            print(f"[File {i+1}] Extracted {len(raw_text)} characters - Valid: {validation_msg}")
            current_step += 1

            # Step 3: Tag speakers with advanced logic
            progress((current_step / total_steps), desc=f"Analyzing speakers in {file_name}...")
            tagged_text = tag_speakers_advanced(raw_text, role_hint, interviewee_type)
            print(f"[File {i+1}] Tagged {len(tagged_text)} characters")
            current_step += 1

            # Step 4: Semantic chunking
            progress((current_step / total_steps), desc=f"Processing {file_name}...")
            chunks = chunk_text_semantic(tagged_text, interviewee_type)
            print(f"[File {i+1}] Created {len(chunks)} semantic chunk(s)")
            current_step += 1

            # Step 5: LLM Analysis with structured extraction
            transcript_result = []
            structured_data = {}  # merged per-file: key -> list of values from all chunks

            for j, chunk in enumerate(chunks):
                # Fractional progress within this file's LLM step
                chunk_progress = (current_step + (j / len(chunks))) / total_steps
                progress(chunk_progress, desc=f"Analyzing {file_name} ({j+1}/{len(chunks)})...")

                result, chunk_data = query_llm(
                    chunk,
                    user_context,
                    interviewee_type,
                    extract_structured=True
                )

                transcript_result.append(result)

                # Merge structured data: lists are concatenated, scalars appended
                for key, value in chunk_data.items():
                    if key not in structured_data:
                        structured_data[key] = []
                    if isinstance(value, list):
                        structured_data[key].extend(value)
                    else:
                        structured_data[key].append(value)

            current_step += 1

            # Combine and validate results
            full_text = "\n\n".join(transcript_result)

            # Quality check — low scores are logged but do NOT abort the file
            quality_score, quality_issues = validate_transcript_quality(
                full_text,
                structured_data,
                interviewee_type
            )

            if quality_score < 0.3:
                print(f"[Warning] Low quality score ({quality_score:.2f}) for {file_name}: {quality_issues}")
                processing_errors.append(f"{file_name}: Low quality - {quality_issues}")

            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": full_text,
                "structured_data": structured_data,
                "quality_score": quality_score,
                "word_count": len(raw_text.split())
            })

            # Enhanced CSV row with structured data
            csv_row = {
                "Transcript ID": f"Transcript {i+1}",
                "File Name": file_name,
                "Quality Score": f"{quality_score:.2f}",
                "Word Count": len(raw_text.split()),
            }

            # Add interviewee-specific fields (keys mirror the "extract"
            # lists in interviewee_context above)
            if interviewee_type == "HCP":
                csv_row.update({
                    "Diagnoses": "; ".join(structured_data.get("diagnoses", [])),
                    "Prescriptions": "; ".join(structured_data.get("prescriptions", [])),
                    "Treatment Strategies": "; ".join(structured_data.get("treatment_rationale", [])),
                    "Guidelines Mentioned": "; ".join(structured_data.get("guidelines_mentioned", []))
                })
            elif interviewee_type == "Patient":
                csv_row.update({
                    "Primary Symptoms": "; ".join(structured_data.get("symptoms", [])),
                    "Main Concerns": "; ".join(structured_data.get("concerns", [])),
                    "Treatment Response": "; ".join(structured_data.get("treatment_response", [])),
                    "Side Effects": "; ".join(structured_data.get("side_effects", []))
                })
            else:
                csv_row.update({
                    "Key Insights": "; ".join(structured_data.get("key_insights", [])),
                    "Recommendations": "; ".join(structured_data.get("recommendations", []))
                })

            csv_rows.append(csv_row)

            print(f"[File {i+1}] ✓ Processing complete")

        except Exception as e:
            # Per-file failure: record a placeholder result (quality 0.0 so it
            # is excluded from valid_results below) and keep going
            error_msg = f"[Error] {file_name} failed: {str(e)}"
            print(error_msg)
            processing_errors.append(error_msg)
            all_results.append({
                "transcript_id": f"Transcript {i+1}",
                "file_name": file_name,
                "full_text": error_msg,
                "structured_data": {},
                "quality_score": 0.0,
                "word_count": 0
            })

    # Generate cross-transcript summary
    try:
        progress(0.9, desc="Generating summary and reports...")
        print("[Summary] Analyzing trends across transcripts")

        # Combine successful results (failed files have quality_score 0.0)
        valid_results = [r for r in all_results if r["quality_score"] > 0]

        if not valid_results:
            return "Error: No transcripts were successfully processed", None, None, None

        # Build comprehensive summary prompt
        summary_prompt = f"""
CROSS-INTERVIEW SYNTHESIS TASK

SAMPLE: {len(valid_results)} {interviewee_type} transcripts
FOCUS AREAS: {interviewee_context.get('focus', 'general patterns')}

COMPLETE TRANSCRIPT DATA:
"""

        # NOTE(review): each transcript is truncated to 2000 chars here —
        # long transcripts are only partially represented in the synthesis.
        for idx, result in enumerate(valid_results, 1):
            summary_prompt += f"\n{'='*60}\nTRANSCRIPT {idx}/{len(valid_results)}: {result['file_name']}\n{'='*60}\n"
            summary_prompt += f"{result['full_text'][:2000]}\n"

        summary_prompt += f"""

ANALYSIS REQUIREMENTS:

1. QUANTIFY EVERYTHING:
- Count participants: "X out of {len(valid_results)} participants mentioned..."
- Never use vague terms (many/most/some)
- Calculate percentages where relevant

2. IDENTIFY PATTERNS BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ = {int(len(valid_results)*0.8)}+ transcripts agree)
- MAJORITY VIEW (60-79% = {int(len(valid_results)*0.6)}-{int(len(valid_results)*0.79)} transcripts)
- SPLIT PERSPECTIVES (40-59% = mixed views)
- MINORITY/OUTLIER (<40% but notable)

3. CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives diverge and why
- Flag any quality issues in individual transcripts

4. CITE EVIDENCE:
- Reference specific transcript numbers
- Brief supporting details
- Distinguish verified facts from interpretation

OUTPUT FORMAT:
Write 2-3 sentence executive overview, then structure as:

**STRONG CONSENSUS FINDINGS:**
- [Finding with count and evidence]

**MAJORITY FINDINGS:**
- [Finding with count]

**DIVERGENT PERSPECTIVES:**
- [Where views split and context]

**NOTABLE OUTLIERS:**
- [Unique but important points]

**DATA QUALITY NOTES:**
- [Any gaps or transcript issues]

Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
"""

        # NOTE(review): summary_data is never used afterwards
        summary, summary_data = query_llm(
            summary_prompt,
            user_context,
            interviewee_type,
            extract_structured=False,
            is_summary=True
        )

        print("[Summary] ✓ Generated")

        # Generate enhanced reports
        csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
        print(f"[CSV] ✓ Saved to {csv_path}")

        pdf_path = generate_enhanced_pdf(
            summary,
            all_results,
            interviewee_type,
            processing_errors
        )
        print(f"[PDF] ✓ Saved to {pdf_path}")

        dashboard = generate_comprehensive_dashboard(csv_rows, interviewee_type)
        print("[Dashboard] ✓ Generated")

        # Compile final markdown output shown in the UI textbox.
        # NOTE(review): "Failed" counts processing_errors, which also contains
        # low-quality warnings — it can exceed the number of failed files.
        output_text = f"""# Analysis Complete

## Summary of Findings
{summary}

## Processing Statistics
- Total Files: {len(files)}
- Successfully Processed: {len(valid_results)}
- Failed: {len(processing_errors)}
- Average Quality Score: {sum(r['quality_score'] for r in valid_results) / len(valid_results):.2f}

"""

        if processing_errors:
            output_text += f"\n## Processing Errors\n" + "\n".join(f"- {err}" for err in processing_errors)

        output_text += "\n\n---\n\n## Individual Transcript Results\n\n"

        for result in all_results:
            output_text += f"### {result['transcript_id']} - {result['file_name']}\n"
            output_text += f"Quality Score: {result['quality_score']:.2f} | Words: {result['word_count']}\n\n"
            output_text += result['full_text'] + "\n\n---\n\n"

        progress(1.0, desc="Complete!")
        return output_text, csv_path, pdf_path, dashboard

    except Exception as e:
        # Fatal: summary/report stage failed; surface the error in the UI
        error_msg = f"[Fatal Error] Summary or report generation failed: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, None, None, None
338
+
339
def generate_narrative_report_ui(csv_file, summary_text, interviewee_type, report_style):
    """Gradio wrapper that turns analysis outputs into narrative reports.

    Args:
        csv_file: uploaded CSV (Gradio file object or plain path) produced by
            the analysis tab; required.
        summary_text: optional executive-summary text pasted by the user;
            written to a temp file for the generator.
        interviewee_type: "HCP", "Patient" or "Other".
        report_style: "executive", "detailed" or "presentation".

    Returns:
        (status_message, pdf_path, word_path, html_path); the three paths are
        None on any error.
    """
    try:
        from narrative_report_generator import generate_narrative_report
        import tempfile
        import os

        # Check if CSV file exists
        if csv_file is None:
            return "Error: No CSV file provided. Please run analysis first.", None, None, None

        # Save summary text to temp file if provided
        summary_path = None
        if summary_text and summary_text.strip():
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
                f.write(summary_text)
            summary_path = f.name

        try:
            # Determine LLM backend from the environment
            llm_backend = "lmstudio" if os.getenv("USE_LMSTUDIO", "False").lower() == "true" else "hf_api"

            # Generate narrative report
            pdf_path, word_path, html_path = generate_narrative_report(
                csv_path=csv_file.name if hasattr(csv_file, 'name') else csv_file,
                summary_path=summary_path,
                interviewee_type=interviewee_type,
                report_style=report_style,
                llm_backend=llm_backend
            )
        finally:
            # BUG FIX: cleanup moved into a finally block — the original only
            # removed the temp summary file on success, leaking it whenever
            # generate_narrative_report raised.
            if summary_path and os.path.exists(summary_path):
                os.remove(summary_path)

        return (
            f"✓ Narrative reports generated successfully!\n\nPDF: {pdf_path}\nWord: {word_path}\nHTML: {html_path}",
            pdf_path,
            word_path,
            html_path
        )

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return f"Error generating narrative report: {str(e)}\n\n{error_detail}", None, None, None
386
+
387
+
388
# Top-level Gradio UI: four tabs wiring the pipeline functions above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎯 TranscriptorAI - Enterprise Transcript Analyzer

    Upload multiple transcripts and generate comprehensive, structured insights with advanced AI analysis.
    """)

    with gr.Tabs():

        # --- Tab 1: audio -> DOCX transcripts (preprocess_audio) ---
        with gr.TabItem("🎤 Audio Preprocessing"):
            gr.Markdown("""
            Upload audio interviews to auto-transcribe with speaker identification.
            Outputs DOCX files ready for analysis.
            """)

            with gr.Row():
                audio_input = gr.File(
                    label="Upload Audio Files",
                    file_types=[".mp3", ".wav", ".m4a", ".flac"],
                    file_count="multiple"
                )
                num_speakers_input = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Number of Speakers"
                )

            transcribe_btn = gr.Button("🎙️ Transcribe Audio", variant="primary")
            transcribe_status = gr.Textbox(label="Status", lines=10)
            transcript_files = gr.File(label="Download Transcripts", file_count="multiple")

            transcribe_btn.click(
                fn=preprocess_audio,
                inputs=[audio_input, num_speakers_input],
                outputs=[transcript_files, transcribe_status]
            )

            gr.Markdown("""
            **Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
            """)

        # --- Tab 2: transcript analysis (analyze) ---
        with gr.TabItem("📊 Transcript Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    files = gr.File(
                        label="📁 Upload Transcripts",
                        file_types=[".docx", ".pdf"],
                        file_count="multiple"
                    )
                    file_type = gr.Radio(
                        ["DOCX", "PDF"],
                        label="File Type",
                        value="DOCX"
                    )
                    interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient",
                        info="Select the type of person being interviewed"
                    )

                with gr.Column(scale=1):
                    user_comments = gr.Textbox(
                        label="Analysis Instructions",
                        lines=6,
                        placeholder="Enter specific analysis goals, questions to answer, or context...",
                        info="Provide guidance for the AI analyzer"
                    )
                    role_hint = gr.Textbox(
                        label="Speaker Role Mapping (Optional)",
                        placeholder="e.g., Speaker 1 = Interviewer, Speaker 2 = Doctor",
                        info="Help identify speakers if needed"
                    )

            with gr.Row():
                debug_mode = gr.Checkbox(label="🔍 Enable Debug Mode", value=False)
                analyze_btn = gr.Button("🚀 Analyze Transcripts", variant="primary", scale=2)

            with gr.Row():
                output_text = gr.Textbox(label="📊 Analysis Report", lines=40)

            with gr.Row():
                csv_output = gr.File(label="📥 Download CSV")
                pdf_output = gr.File(label="📥 Download PDF")

            with gr.Row():
                dashboard_output = gr.Plot(label="📈 Dashboard Visualization")

            analyze_btn.click(
                fn=analyze,
                inputs=[files, file_type, user_comments, role_hint, debug_mode, interviewee_type],
                outputs=[output_text, csv_output, pdf_output, dashboard_output]
            )

        # --- Tab 3: narrative report (generate_narrative_report_ui) ---
        with gr.TabItem("📝 Narrative Report"):
            gr.Markdown("""
            ## Generate Storytelling Report

            Transform your analysis into a narrative report with:
            - Executive summary with key insights
            - Data-driven storytelling
            - Professional formatting (PDF, Word, HTML)
            - Actionable recommendations

            **Instructions:** First run the analysis in the previous tab, then use the outputs here to generate a narrative report.
            """)

            with gr.Row():
                with gr.Column():
                    narrative_csv = gr.File(
                        label="CSV Output from Analysis",
                        file_types=[".csv"]
                    )
                    narrative_summary = gr.Textbox(
                        label="Copy/Paste Summary Text from Analysis (Optional)",
                        lines=10,
                        placeholder="Paste the executive summary text here..."
                    )

                with gr.Column():
                    narrative_interviewee_type = gr.Radio(
                        ["HCP", "Patient", "Other"],
                        label="Interviewee Type",
                        value="Patient"
                    )
                    narrative_report_style = gr.Radio(
                        ["executive", "detailed", "presentation"],
                        label="Report Style",
                        value="executive",
                        info="Executive = concise C-level report, Detailed = thorough analysis, Presentation = slide-ready"
                    )
                    generate_narrative_btn = gr.Button("📖 Generate Narrative Report", variant="primary")

            narrative_status = gr.Textbox(label="Status", lines=5)

            with gr.Row():
                narrative_pdf_output = gr.File(label="📥 Download PDF Report")
                narrative_word_output = gr.File(label="📥 Download Word Report")
                narrative_html_output = gr.File(label="📥 Download HTML Report")

            generate_narrative_btn.click(
                fn=generate_narrative_report_ui,
                inputs=[narrative_csv, narrative_summary, narrative_interviewee_type, narrative_report_style],
                outputs=[narrative_status, narrative_pdf_output, narrative_word_output, narrative_html_output]
            )

        # --- Tab 4: static help text ---
        with gr.TabItem("❓ Help"):
            gr.Markdown("""
            ### Quick Start Guide

            **Step 1: Analyze Transcripts**
            1. Upload your DOCX or PDF files
            2. Select interviewee type (HCP, Patient, or Other)
            3. Add analysis instructions
            4. Click "Analyze Transcripts"
            5. Download CSV, PDF, and view dashboard

            **Step 2: Generate Narrative Report (Optional)**
            1. Go to "Narrative Report" tab
            2. Upload the CSV from Step 1
            3. Optionally paste the summary text
            4. Select report style
            5. Click "Generate Narrative Report"
            6. Download PDF, Word, or HTML versions

            ### Tips
            - **CSV Upload**: Download the CSV from analysis, then upload it to narrative report generator
            - **Summary Text**: Copy from the "Analysis Report" textbox and paste into narrative generator
            - **Report Styles**:
              - **Executive**: Best for C-level, investors, decision-makers
              - **Detailed**: Best for researchers, comprehensive analysis
              - **Presentation**: Best for slides, briefings, quick overviews

            ### LLM Configuration
            - Set `USE_LMSTUDIO=True` to use your local LM Studio
            - Set `HUGGINGFACE_TOKEN` to use HF API for faster processing
            - Default: Uses local model (slower but free)

            ### Support
            For issues, check the console output or enable debug mode.
            """)

    # Footer shown under every tab
    gr.Markdown("""
    ---
    **TranscriptorAI** | Enterprise-grade transcript analysis with narrative reporting
    """)

if __name__ == "__main__":
    demo.launch()
audio_transcriber.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from faster_whisper import WhisperModel
2
+ from speechbrain.inference import EncoderClassifier
3
+ from sklearn.cluster import AgglomerativeClustering
4
+ from docx import Document
5
+ import torch, torchaudio, numpy as np
6
+
7
def transcribe_with_diarization_streaming(audio_path: str, num_speakers: int = 1):
    """Transcribe an audio file and label speakers, saving a DOCX transcript.

    Pipeline: faster-whisper transcription (segments printed as they stream),
    ECAPA speaker embeddings per segment, agglomerative clustering into
    num_speakers groups, then a DOCX with "Speaker N:" prefixed lines.

    Args:
        audio_path: path to the input audio file.
        num_speakers: expected speaker count; <=1 skips clustering entirely,
            and it is capped at the number of usable segments.

    Returns:
        Path to the generated DOCX transcript
        (``<audio basename>_transcript.docx`` next to the audio file).
    """
    # Device selection with a CUDA sanity check (a visible CUDA device can
    # still be unusable, e.g. driver mismatch) — fall back to CPU/int8.
    try:
        if torch.cuda.is_available():
            device = "cuda"
            compute_type = "float16"
            _ = torch.zeros(1).to(device)  # sanity check
        else:
            raise RuntimeError("No CUDA")
    except Exception:
        print("⚠️ CUDA not usable, falling back to CPU")
        device = "cpu"
        compute_type = "int8"

    print(f"[1/3] Loading Whisper model on {device}...")
    whisper_model = WhisperModel("large-v3", device=device, compute_type=compute_type)
    # BUG FIX: the original had `return whisper_model` here, which made every
    # line below unreachable — the function returned a model object instead of
    # a transcript path. (Also removed the duplicated pre-try device setup.)

    print(f"[2/3] Transcribing...")
    # faster-whisper returns a lazy generator; consuming it below streams
    # partial results to the console as they become ready.
    segments, info = whisper_model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True,
        vad_filter=True,
    )

    segments_list = []
    for seg in segments:
        print(f"[stream] {seg.start:.2f}-{seg.end:.2f}: {seg.text}")
        segments_list.append(seg)

    # Speaker embeddings per transcribed segment
    print(f"[3/3] Extracting speaker embeddings...")
    speaker_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="models/speaker_embeddings",
        run_opts={"device": device}
    )

    waveform, sample_rate = torchaudio.load(audio_path)
    embeddings, valid_segments = [], []

    for seg in segments_list:
        start_sample = int(seg.start * sample_rate)
        end_sample = int(seg.end * sample_rate)
        if end_sample > start_sample:  # skip zero-length segments
            seg_audio = waveform[:, start_sample:end_sample]
            # ECAPA model expects 16 kHz input
            if sample_rate != 16000:
                seg_audio = torchaudio.transforms.Resample(sample_rate, 16000)(seg_audio)
            with torch.no_grad():
                emb = speaker_model.encode_batch(seg_audio)
            embeddings.append(emb.squeeze().cpu().numpy())
            valid_segments.append(seg)

    # Handle empty or single-speaker case
    if len(embeddings) == 0 or num_speakers <= 1:
        print("Single speaker detected or no embeddings. Skipping clustering.")
        speaker_labels = [0] * len(valid_segments)
        num_speakers = 1
    else:
        # Cannot ask for more clusters than samples
        if num_speakers > len(embeddings):
            num_speakers = len(embeddings)
        clustering = AgglomerativeClustering(n_clusters=num_speakers)
        speaker_labels = clustering.fit_predict(np.array(embeddings))

    # Build transcript document
    doc = Document()
    doc.add_heading('Interview Transcript', 0)
    doc.add_paragraph(f"Detected {num_speakers} speaker(s)")
    doc.add_paragraph("")

    for seg, spk in zip(valid_segments, speaker_labels):
        doc.add_paragraph(f"Speaker {spk+1}: {seg.text.strip()}")

    output_path = audio_path.rsplit('.', 1)[0] + '_transcript.docx'
    doc.save(output_path)
    print(f"✓ Saved transcript: {output_path}")
    return output_path
audio_transcriber_hf.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio transcription with speaker diarization
3
+ """
4
+ from faster_whisper import WhisperModel
5
+ from pyannote.audio import Pipeline
6
+ import torch
7
+ from docx import Document
8
+ import os
9
+
10
def transcribe_with_diarization(audio_path: str, num_speakers: int = 2) -> str:
    """
    Transcribe audio with speaker labels.

    Args:
        audio_path: Path to audio file (mp3, wav, m4a, flac)
        num_speakers: Expected number of speakers (default 2 for interviews)

    Returns:
        Path to generated DOCX transcript
    """
    print(f"[1/3] Transcribing audio...")

    # BUG FIX: device/compute were hard-coded to "cuda"/"float16", crashing on
    # CPU-only hosts. Use the same fallback as audio_transcriber.py.
    if torch.cuda.is_available():
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"
    model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    # Transcribe with timestamps
    segments, info = model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True
    )

    segments_list = list(segments)
    print(f"[2/3] Identifying speakers...")

    # pyannote diarization models are gated on HuggingFace; without a token
    # fall back to the simple alternating-speaker heuristic.
    hf_token = os.getenv("HUGGINGFACE_TOKEN", "")
    if not hf_token:
        print("[Warning] No HF token - using simple alternating speakers")
        return transcribe_simple(segments_list, audio_path)

    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hf_token
    )

    if torch.cuda.is_available():
        diarization.to(torch.device("cuda"))

    # Run diarization
    diarization_result = diarization(audio_path, num_speakers=num_speakers)

    print(f"[3/3] Combining transcription + speakers...")

    # Match each Whisper segment to the speaker active at its start time
    transcript_lines = []
    for segment in segments_list:
        speaker = get_speaker_at_time(diarization_result, segment.start)
        transcript_lines.append(f"{speaker}: {segment.text}")

    # Save to DOCX
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    for line in transcript_lines:
        doc.add_paragraph(line)

    # BUG FIX: the original chained .replace() only covered .mp3/.wav/.m4a;
    # for any other extension (.flac is accepted by the UI) output_path stayed
    # equal to audio_path and doc.save() clobbered the audio file.
    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
    doc.save(output_path)

    print(f"✓ Transcript saved: {output_path}")
    return output_path
80
+
81
+
82
def get_speaker_at_time(diarization_result, timestamp):
    """Return the label of the speaker active at *timestamp*.

    Scans the diarization turns in order and returns "Speaker <label>" for
    the first turn containing the timestamp, or "Speaker Unknown" when no
    turn covers it.
    """
    not_found = object()
    hit = next(
        (
            label
            for turn, _, label in diarization_result.itertracks(yield_label=True)
            if turn.start <= timestamp <= turn.end
        ),
        not_found,
    )
    if hit is not_found:
        return "Speaker Unknown"
    return f"Speaker {hit}"
88
+
89
+
90
def transcribe_simple(segments_list, audio_path):
    """Fallback transcript writer used when no diarization is available.

    Alternates between "Speaker 1" and "Speaker 2" using a crude heuristic:
    the speaker toggles after any segment whose no_speech_prob exceeds 0.5.

    Args:
        segments_list: materialized Whisper segments (need a .text attribute).
        audio_path: source audio path; the transcript is written next to it.

    Returns:
        Path to the saved DOCX transcript.
    """
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    current_speaker = 1
    for segment in segments_list:
        doc.add_paragraph(f"Speaker {current_speaker}: {segment.text}")
        # Heuristic: treat a likely-silence segment as a turn boundary
        if hasattr(segment, 'no_speech_prob') and segment.no_speech_prob > 0.5:
            current_speaker = 3 - current_speaker  # Toggle between 1 and 2

    # BUG FIX: the original only replaced '.mp3' — for .wav/.m4a/.flac inputs
    # output_path stayed equal to audio_path and doc.save() overwrote the
    # source audio file. Strip whatever extension is present instead.
    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
    doc.save(output_path)
    return output_path
chunking.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ import re
3
+ from typing import List
4
+ from nltk.tokenize.punkt import PunktSentenceTokenizer
5
+
6
def chunk_text(text, max_tokens=3000):
    """Legacy entry point kept for backwards compatibility.

    Delegates to the semantic chunker with the generic "Other" profile.
    """
    return chunk_text_semantic(text, "Other", max_tokens)
9
+
10
+
11
# Lazily-created tiktoken encoder, shared across calls (see count_tokens).
_TOKEN_ENCODER = None


def count_tokens(text: str) -> int:
    """Count tokens in `text` using tiktoken's cl100k_base encoding.

    Improvement: the original called ``tiktoken.get_encoding`` on every
    invocation; the encoder is now built once and cached at module level.
    If tiktoken is unavailable or fails, falls back to a rough word-based
    estimate (~1.3 tokens per word), as before.
    """
    global _TOKEN_ENCODER
    try:
        if _TOKEN_ENCODER is None:
            _TOKEN_ENCODER = tiktoken.get_encoding("cl100k_base")
        return len(_TOKEN_ENCODER.encode(text))
    except Exception:
        # Fallback to word-based estimate
        return int(len(text.split()) * 1.3)
19
+
20
+
21
+ def split_into_sentences(text: str) -> List[str]:
22
+ """Split text into sentences with improved handling"""
23
+ try:
24
+ tokenizer = PunktSentenceTokenizer()
25
+ sentences = tokenizer.tokenize(text)
26
+ return sentences
27
+ except Exception:
28
+ # Fallback to simple split
29
+ return [s.strip() + '.' for s in text.split('.') if s.strip()]
30
+
31
+
32
def find_topic_boundaries(text: str, interviewee_type: str) -> List[int]:
    """
    Locate likely topic-change positions for smarter chunking.

    Returns a sorted list of character offsets, always including 0 and
    len(text). Positions within 100 characters of an already-recorded
    boundary are skipped to avoid clustering.

    Note: `interviewee_type` is accepted for interface compatibility but
    is not currently consulted.
    """
    topic_patterns = (
        r'\n\n+',  # paragraph breaks
        r'\[Interviewer\].*?(next|another|different|moving on|let\'s talk about)',
        r'\[Interviewer\].*?\?.*?\n.*?\[(?:Doctor|Patient|Respondent)\]',  # Q&A pairs
    )

    boundaries = [0]  # always anchor at the start
    for pattern in topic_patterns:
        for hit in re.finditer(pattern, text, re.IGNORECASE):
            position = hit.start()
            # Only record positions far enough from every known boundary.
            if all(abs(position - known) > 100 for known in boundaries):
                boundaries.append(position)

    boundaries.append(len(text))  # always anchor at the end
    boundaries.sort()
    return boundaries
59
+
60
+
61
def extract_speaker_segments(text: str) -> List[dict]:
    """
    Parse "[Speaker] utterance" spans out of a tagged transcript.

    Returns one dict per non-empty utterance with keys "speaker",
    "content", "start_pos" (character offset of the tag), and "tokens"
    (token count of the utterance).
    """
    segments = []
    for match in re.finditer(r'\[([^\]]+)\]\s*([^\[]*)', text, re.DOTALL):
        speaker = match.group(1).strip()
        utterance = match.group(2).strip()
        if not utterance:
            continue  # skip tags with no spoken content
        segments.append({
            "speaker": speaker,
            "content": utterance,
            "start_pos": match.start(),
            "tokens": count_tokens(utterance),
        })
    return segments
81
+
82
+
83
def chunk_text_semantic(
    text: str,
    interviewee_type: str = "Other",
    max_tokens: int = 3000,
    overlap_tokens: int = 150
) -> List[str]:
    """
    Advanced chunking that respects:
    1. Speaker boundaries (don't split mid-sentence)
    2. Topic boundaries (keep related Q&A together)
    3. Token limits for LLM context
    4. Overlap for context continuity

    Args:
        text: Transcript text, ideally with "[Speaker]" tags.
        interviewee_type: Accepted for interface compatibility; not used
            in the current implementation.
        max_tokens: Hard token budget per chunk.
        overlap_tokens: Approximate token budget of trailing segments
            carried into the next chunk for context.

    Returns:
        List of chunk strings; falls back to [text] if nothing was built.
    """

    # Check if text has speaker tags
    has_tags = bool(re.search(r'\[[^\]]+\]', text))

    if not has_tags:
        # Fallback to sentence-based chunking
        return chunk_by_sentences(text, max_tokens, overlap_tokens)

    # Extract speaker segments
    segments = extract_speaker_segments(text)

    if not segments:
        # Tag regex matched but no usable segments were parsed.
        return chunk_by_sentences(text, max_tokens, overlap_tokens)

    # Group segments into chunks
    chunks = []
    current_chunk_segments = []
    current_tokens = 0

    i = 0
    while i < len(segments):
        segment = segments[i]
        segment_tokens = segment["tokens"]

        # If single segment exceeds max_tokens, split it.
        # NOTE: the oversized segment bypasses the running chunk entirely,
        # so its sub-chunks are emitted out of band (no overlap with the
        # chunk currently being accumulated).
        if segment_tokens > max_tokens:
            # Split long segment by sentences
            sub_chunks = chunk_by_sentences(
                f"[{segment['speaker']}] {segment['content']}",
                max_tokens,
                overlap_tokens
            )
            chunks.extend(sub_chunks)
            i += 1
            continue

        # Check if adding this segment would exceed limit
        if current_tokens + segment_tokens > max_tokens and current_chunk_segments:
            # Finalize current chunk.
            # NOTE(review): this local `chunk_text` shadows the module-level
            # chunk_text() function within this scope — harmless here, but
            # confusing; consider renaming in a future change.
            chunk_text = "\n\n".join([
                f"[{s['speaker']}] {s['content']}"
                for s in current_chunk_segments
            ])
            chunks.append(chunk_text)

            # Start new chunk with overlap: keep the last few segments
            # (up to ~overlap_tokens) for context continuity.
            overlap_segments = []
            overlap_token_count = 0

            for seg in reversed(current_chunk_segments):
                if overlap_token_count + seg["tokens"] < overlap_tokens:
                    overlap_segments.insert(0, seg)
                    overlap_token_count += seg["tokens"]
                else:
                    # Stop at the first segment that would bust the budget.
                    break

            current_chunk_segments = overlap_segments
            current_tokens = overlap_token_count

        # Add segment to current chunk
        current_chunk_segments.append(segment)
        current_tokens += segment_tokens
        i += 1

    # Add final (possibly partial) chunk
    if current_chunk_segments:
        chunk_text = "\n\n".join([
            f"[{s['speaker']}] {s['content']}"
            for s in current_chunk_segments
        ])
        chunks.append(chunk_text)

    return chunks if chunks else [text]
170
+
171
+
172
def chunk_by_sentences(
    text: str,
    max_tokens: int = 3000,
    overlap_tokens: int = 150
) -> List[str]:
    """
    Sentence-based fallback chunker.

    Packs sentences into chunks of at most `max_tokens`, carrying roughly
    `overlap_tokens` worth of trailing sentences into the next chunk for
    context continuity. Returns [text] if no chunks were produced.
    """
    chunks = []
    buffer = []          # sentences of the chunk being built
    buffer_tokens = 0    # running token count of `buffer`

    for sentence in split_into_sentences(text):
        tokens = count_tokens(sentence)

        if buffer and buffer_tokens + tokens > max_tokens:
            # Close out the current chunk.
            chunks.append(" ".join(buffer))

            # Seed the next chunk with trailing sentences as overlap.
            carried = []
            carried_tokens = 0
            for prev in reversed(buffer):
                prev_tokens = count_tokens(prev)
                if carried_tokens + prev_tokens >= overlap_tokens:
                    break
                carried.insert(0, prev)
                carried_tokens += prev_tokens

            buffer = carried
            buffer_tokens = carried_tokens

        buffer.append(sentence)
        buffer_tokens += tokens

    # Flush the final (possibly partial) chunk.
    if buffer:
        chunks.append(" ".join(buffer))

    return chunks if chunks else [text]
217
+
218
+
219
def analyze_chunk_quality(chunks: List[str], token_limit: int = 3000) -> dict:
    """
    Summarize chunk sizes for debugging.

    Args:
        chunks: List of chunk strings to analyze.
        token_limit: Threshold used for the "chunks_over_limit" count.
            Previously hard-coded to 3000; the default preserves the old
            behavior while letting callers match their actual chunk budget.

    Returns:
        Dict with count/avg/min/max/total token stats and the number of
        chunks exceeding `token_limit`, or {"error": ...} when `chunks`
        is empty.
    """
    if not chunks:
        return {"error": "No chunks"}

    token_counts = [count_tokens(chunk) for chunk in chunks]

    return {
        "num_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "total_tokens": sum(token_counts),
        "chunks_over_limit": sum(1 for t in token_counts if t > token_limit)
    }
config.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, Any
3
+
4
# ============================================================================
# LLM CONFIGURATION
# ============================================================================
# All settings below are overridable via environment variables; the second
# argument to os.getenv is the default used when the variable is unset.

# Choose LLM backend: "hf_api" (recommended), "local", or "openai"
LLM_BACKEND = os.getenv("LLM_BACKEND", "hf_api")

# Hugging Face Configuration
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
HF_MODEL = os.getenv("HF_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

# Local Model Configuration
LOCAL_MODEL = os.getenv("LOCAL_MODEL", "google/flan-t5-xl")
DEVICE = os.getenv("DEVICE", "auto")  # "auto", "cpu", "cuda", "mps"

# OpenAI Configuration (if using OpenAI)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4")

# LLM Parameters
MAX_TOKENS_PER_REQUEST = int(os.getenv("MAX_TOKENS_PER_REQUEST", "300"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.3"))
# NOTE(review): presumably seconds — confirm against the LLM client code.
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120"))

# ============================================================================
# CHUNKING CONFIGURATION
# ============================================================================

MAX_CHUNK_TOKENS = int(os.getenv("MAX_CHUNK_TOKENS", "6000"))
OVERLAP_TOKENS = int(os.getenv("OVERLAP_TOKENS", "150"))
TOKENIZER_ENCODING = os.getenv("TOKENIZER_ENCODING", "cl100k_base")

# ============================================================================
# QUALITY THRESHOLDS
# ============================================================================

MIN_QUALITY_SCORE = float(os.getenv("MIN_QUALITY_SCORE", "0.3"))
MIN_WORD_COUNT = int(os.getenv("MIN_WORD_COUNT", "50"))
MIN_TEXT_LENGTH = int(os.getenv("MIN_TEXT_LENGTH", "100"))

# Quality grade thresholds (scores are on a 0-1 scale)
QUALITY_EXCELLENT = 0.8
QUALITY_GOOD = 0.6
QUALITY_FAIR = 0.4

# ============================================================================
# FILE PROCESSING CONFIGURATION
# ============================================================================

MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
SUPPORTED_FORMATS = [".docx", ".pdf"]
MAX_FILES_PER_BATCH = int(os.getenv("MAX_FILES_PER_BATCH", "10"))

# ============================================================================
# OUTPUT CONFIGURATION
# ============================================================================

OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./outputs")
CSV_FILENAME = "transcript_analysis.csv"
PDF_FILENAME = "transcript_report.pdf"

# Ensure output directory exists (side effect at import time)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================================
# DEBUG AND LOGGING
# ============================================================================

DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
VERBOSE_LOGGING = os.getenv("VERBOSE_LOGGING", "False").lower() == "true"
LOG_FILE = os.getenv("LOG_FILE", "transcript_analysis.log")

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Cache extracted text to avoid re-processing
ENABLE_CACHING = os.getenv("ENABLE_CACHING", "True").lower() == "true"
CACHE_DIR = os.getenv("CACHE_DIR", "./.cache")

# Parallel processing
ENABLE_PARALLEL_PROCESSING = os.getenv("ENABLE_PARALLEL_PROCESSING", "False").lower() == "true"
MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))

# ============================================================================
# SYSTEM PROMPTS
# ============================================================================
# BASE_SYSTEM_PROMPT is shared; the HCP and Patient prompts append their
# role-specific instructions to it via string concatenation below.

BASE_SYSTEM_PROMPT = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
- Maintain objectivity and clinical accuracy
"""

HCP_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

PATIENT_SYSTEM_PROMPT = BASE_SYSTEM_PROMPT + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

SUMMARY_SYSTEM_PROMPT = """You are analyzing multiple transcripts to identify cross-cutting trends.

Focus on:
- Frequency analysis (how many interviewees mentioned X?)
- Common patterns and themes
- Consensus and disagreements
- Statistical insights (percentages, distributions)
- Actionable recommendations for stakeholders

Provide:
1. Quantitative summary (X% mentioned Y)
2. Key trends and patterns
3. Notable outliers or unique insights
4. Actionable recommendations
5. Data gaps or areas needing follow-up
"""

# ============================================================================
# VALIDATION SETTINGS
# ============================================================================

VALIDATION_CONFIG = {
    "min_word_ratio": 0.3,
    "max_repetition_ratio": 1.5,
    "min_sentences": 3,
    "check_errors": True,
    "check_gibberish": True
}

# ============================================================================
# DASHBOARD SETTINGS
# ============================================================================

DASHBOARD_CONFIG = {
    "figure_size": (14, 10),
    "dpi": 100,
    "style": "default",
    "top_n_items": 8,
    "color_scheme": {
        "primary": "#3498db",
        "secondary": "#2ecc71",
        "accent": "#e74c3c",
        "warning": "#f39c12"
    }
}
188
+
189
+ # ============================================================================
190
+ # HELPER FUNCTIONS
191
+ # ============================================================================
192
+
193
def get_config() -> Dict[str, Any]:
    """Assemble the module's settings into a nested dictionary."""
    active_model = HF_MODEL if LLM_BACKEND == "hf_api" else LOCAL_MODEL

    llm_settings = {
        "backend": LLM_BACKEND,
        "model": active_model,
        "max_tokens": MAX_TOKENS_PER_REQUEST,
        "temperature": LLM_TEMPERATURE,
        "timeout": LLM_TIMEOUT,
    }
    file_settings = {
        "max_size_mb": MAX_FILE_SIZE_MB,
        "max_per_batch": MAX_FILES_PER_BATCH,
        "supported": SUPPORTED_FORMATS,
    }
    output_settings = {
        "directory": OUTPUT_DIR,
        "csv": CSV_FILENAME,
        "pdf": PDF_FILENAME,
    }

    return {
        "llm": llm_settings,
        "chunking": {"max_tokens": MAX_CHUNK_TOKENS, "overlap": OVERLAP_TOKENS},
        "quality": {"min_score": MIN_QUALITY_SCORE, "min_words": MIN_WORD_COUNT},
        "files": file_settings,
        "output": output_settings,
        "debug": DEBUG_MODE,
        "caching": ENABLE_CACHING,
        "parallel": ENABLE_PARALLEL_PROCESSING,
    }
225
+
226
+
227
def print_config():
    """Print the current configuration to stdout.

    Bug fix: the original assumed every top-level value from get_config()
    was a dict and called .items() on it, which raised AttributeError for
    the scalar entries ("debug", "caching", "parallel"). Scalar sections
    are now printed directly.
    """
    config = get_config()
    print("=" * 60)
    print("TRANSCRIPTORAI CONFIGURATION")
    print("=" * 60)
    for section, settings in config.items():
        print(f"\n{section.upper()}:")
        if isinstance(settings, dict):
            for key, value in settings.items():
                print(f"  {key}: {value}")
        else:
            print(f"  {settings}")
    print("=" * 60)
238
+
239
+
240
def validate_config() -> bool:
    """Validate configuration settings.

    Prints any problems found and returns False; returns True when the
    configuration looks sane.

    Fixes vs. the original:
    - bare ``except:`` replaced with ``except OSError`` (the bare form also
      swallowed KeyboardInterrupt/SystemExit);
    - ``os.makedirs(..., exist_ok=True)`` replaces the exists-check +
      create pair, removing the race between check and creation.
    """
    issues = []

    # Remote backends need their credentials set.
    if LLM_BACKEND == "hf_api" and not HUGGINGFACE_TOKEN:
        issues.append("HF API selected but HUGGINGFACE_TOKEN not set")

    if LLM_BACKEND == "openai" and not OPENAI_API_KEY:
        issues.append("OpenAI selected but OPENAI_API_KEY not set")

    # Output directory must exist or be creatable.
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    except OSError:
        issues.append(f"Cannot create output directory: {OUTPUT_DIR}")

    # Sanity-check numeric settings.
    if MAX_CHUNK_TOKENS < 500:
        issues.append("MAX_CHUNK_TOKENS too small (< 500)")

    if MAX_TOKENS_PER_REQUEST < 100:
        issues.append("MAX_TOKENS_PER_REQUEST too small (< 100)")

    if issues:
        print("Configuration Issues:")
        for issue in issues:
            print(f"  - {issue}")
        return False

    return True
272
+
273
+
274
+ # ============================================================================
275
+ # INITIALIZATION
276
+ # ============================================================================
277
+
278
if __name__ == "__main__":
    # Print the active settings, then report whether they validate.
    print_config()
    if validate_config():
        print("\n✓ Configuration valid")
    else:
        print("\n✗ Configuration has issues")
dashboard.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import matplotlib.patches as mpatches
3
+ import pandas as pd
4
+ import numpy as np
5
+ from collections import Counter
6
+ from typing import List, Dict
7
+ import re
8
+
9
def generate_dashboard(data):
    """Legacy entry point kept for backwards compatibility.

    Delegates to the comprehensive dashboard with the "Other" profile.
    """
    return generate_comprehensive_dashboard(data, "Other")
12
+
13
+
14
def extract_items_from_field(data: List[Dict], field_name: str) -> List[str]:
    """Collect semicolon-separated items of `field_name` across all rows.

    Non-string and empty values are skipped; items are stripped of
    surrounding whitespace and empty pieces dropped. Row order is kept.
    """
    collected = []
    for record in data:
        raw = record.get(field_name, "")
        if not raw or not isinstance(raw, str):
            continue
        collected.extend(item.strip() for item in raw.split(';') if item.strip())
    return collected
24
+
25
+
26
def generate_comprehensive_dashboard(
    data: List[Dict],
    interviewee_type: str
) -> plt.Figure:
    """
    Build the dashboard figure appropriate to the interviewee type.

    Returns a placeholder figure with a message when `data` is empty;
    otherwise dispatches to the HCP, Patient, or general dashboard
    builder and applies tight_layout.
    """
    if not data:
        # Nothing to plot — show an explanatory placeholder instead.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.text(0.5, 0.5, 'No data available for visualization',
                ha='center', va='center', fontsize=14)
        ax.axis('off')
        return fig

    df = pd.DataFrame(data)

    builders = {
        "HCP": create_hcp_dashboard,
        "Patient": create_patient_dashboard,
    }
    fig = builders.get(interviewee_type, create_general_dashboard)(df)

    plt.tight_layout()
    return fig
54
+
55
+
56
def create_hcp_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 dashboard for HCP interviews.

    Panels: quality-score histogram, top diagnoses, top prescriptions,
    and per-transcript word counts. Each panel is drawn only when its
    source column exists and yields usable data; otherwise it is left
    blank.
    """

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Healthcare Professional Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution (histogram with mean marker)
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        # Coerce to numeric and drop unparseable entries.
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Top Diagnoses (horizontal bars, most frequent first)
    ax2 = axes[0, 1]
    if 'Diagnoses' in df.columns:
        diagnoses = extract_items_from_field(df.to_dict('records'), 'Diagnoses')
        if diagnoses:
            diagnosis_counts = Counter(diagnoses)
            top_diagnoses = dict(diagnosis_counts.most_common(8))

            if top_diagnoses:
                labels = list(top_diagnoses.keys())
                # Truncate long labels so they fit the axis.
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_diagnoses.values())

                bars = ax2.barh(labels, values, color='#2ecc71', edgecolor='black')
                ax2.set_xlabel('Frequency')
                ax2.set_title('Most Common Diagnoses')
                ax2.invert_yaxis()  # most frequent at the top

                # Add value labels at the end of each bar.
                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax2.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 3. Prescription Analysis (same layout as diagnoses)
    ax3 = axes[1, 0]
    if 'Prescriptions' in df.columns:
        prescriptions = extract_items_from_field(df.to_dict('records'), 'Prescriptions')
        if prescriptions:
            rx_counts = Counter(prescriptions)
            top_rx = dict(rx_counts.most_common(8))

            if top_rx:
                labels = list(top_rx.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_rx.values())

                bars = ax3.barh(labels, values, color='#e74c3c', edgecolor='black')
                ax3.set_xlabel('Frequency')
                ax3.set_title('Most Mentioned Prescriptions')
                ax3.invert_yaxis()

                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax3.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 4. Word Count by Transcript (bar per transcript with average line)
    ax4 = axes[1, 1]
    if 'Word Count' in df.columns and 'Transcript ID' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        # NOTE(review): positional slice assumes dropped rows were at the
        # tail; IDs can mis-align with counts if dropna removed earlier
        # rows — verify against real data.
        transcript_ids = df['Transcript ID'][:len(word_counts)]

        if len(word_counts) > 0:
            bars = ax4.bar(range(len(word_counts)), word_counts, color='#9b59b6',
                           edgecolor='black', alpha=0.7)
            ax4.set_xlabel('Transcript')
            ax4.set_ylabel('Word Count')
            ax4.set_title('Interview Length by Transcript')
            ax4.set_xticks(range(len(word_counts)))
            ax4.set_xticklabels(transcript_ids, rotation=45, ha='right')
            ax4.grid(axis='y', alpha=0.3)

            # Add mean line
            ax4.axhline(word_counts.mean(), color='red', linestyle='--',
                        label=f'Average: {int(word_counts.mean())}')
            ax4.legend()

    return fig
146
+
147
+
148
def create_patient_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 dashboard for Patient interviews.

    Panels: quality-score histogram, top symptoms, concern distribution
    (pie), and reported side effects. Panels are drawn only when their
    source columns exist and yield usable data.
    """

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Patient Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution (histogram with mean marker)
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Top Symptoms (horizontal bars, most frequent first)
    ax2 = axes[0, 1]
    if 'Primary Symptoms' in df.columns:
        symptoms = extract_items_from_field(df.to_dict('records'), 'Primary Symptoms')
        if symptoms:
            symptom_counts = Counter(symptoms)
            top_symptoms = dict(symptom_counts.most_common(8))

            if top_symptoms:
                labels = list(top_symptoms.keys())
                # Truncate long labels so they fit the axis.
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_symptoms.values())

                bars = ax2.barh(labels, values, color='#e67e22', edgecolor='black')
                ax2.set_xlabel('Frequency')
                ax2.set_title('Most Common Symptoms')
                ax2.invert_yaxis()

                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax2.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)

    # 3. Patient Concerns (pie chart of the top 6)
    ax3 = axes[1, 0]
    if 'Main Concerns' in df.columns:
        concerns = extract_items_from_field(df.to_dict('records'), 'Main Concerns')
        if concerns:
            concern_counts = Counter(concerns)
            top_concerns = dict(concern_counts.most_common(6))

            if top_concerns:
                # Create word cloud style pie chart
                labels = list(top_concerns.keys())
                labels = [label[:25] + '...' if len(label) > 25 else label for label in labels]
                sizes = list(top_concerns.values())
                colors_list = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#f9ca24', '#6c5ce7', '#a29bfe']

                ax3.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
                        colors=colors_list[:len(sizes)])
                ax3.set_title('Distribution of Patient Concerns')

    # 4. Side Effects (bars, or a placeholder message when none reported)
    ax4 = axes[1, 1]
    if 'Side Effects' in df.columns:
        side_effects = extract_items_from_field(df.to_dict('records'), 'Side Effects')
        if side_effects:
            se_counts = Counter(side_effects)
            top_se = dict(se_counts.most_common(6))

            if top_se:
                labels = list(top_se.keys())
                labels = [label[:30] + '...' if len(label) > 30 else label for label in labels]
                values = list(top_se.values())

                bars = ax4.barh(labels, values, color='#e74c3c', edgecolor='black')
                ax4.set_xlabel('Frequency')
                ax4.set_title('Reported Side Effects')
                ax4.invert_yaxis()

                for i, bar in enumerate(bars):
                    width = bar.get_width()
                    ax4.text(width, bar.get_y() + bar.get_height()/2,
                             f' {int(width)}', ha='left', va='center', fontsize=9)
        else:
            # Column present but no side effects extracted — say so.
            ax4.text(0.5, 0.5, 'No side effects reported',
                     ha='center', va='center', transform=ax4.transAxes, fontsize=12)
            ax4.axis('off')

    return fig
238
+
239
+
240
def create_general_dashboard(df: pd.DataFrame) -> plt.Figure:
    """Create the 2x2 general-purpose dashboard.

    Panels: quality-score histogram, word-count histogram, quality
    category counts, and a summary-statistics table. Panels are drawn
    only when their source columns exist and yield usable data.
    """

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('General Interview Analysis', fontsize=16, fontweight='bold')

    # 1. Quality Score Distribution (histogram with mean marker)
    ax1 = axes[0, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            ax1.hist(quality_scores, bins=10, color='#3498db', edgecolor='black', alpha=0.7)
            ax1.axvline(quality_scores.mean(), color='red', linestyle='--',
                        label=f'Mean: {quality_scores.mean():.2f}')
            ax1.set_xlabel('Quality Score')
            ax1.set_ylabel('Frequency')
            ax1.set_title('Transcript Quality Distribution')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

    # 2. Word Count Distribution
    ax2 = axes[0, 1]
    if 'Word Count' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        if len(word_counts) > 0:
            ax2.hist(word_counts, bins=15, color='#2ecc71', edgecolor='black', alpha=0.7)
            ax2.set_xlabel('Word Count')
            ax2.set_ylabel('Frequency')
            ax2.set_title('Interview Length Distribution')
            ax2.grid(axis='y', alpha=0.3)

    # 3. Processing Summary: counts per quality band.
    # Note the band edges: 0.8 itself falls into "Good" (<=0.8), while
    # "Excellent" is strictly >0.8 — together the four bands partition
    # the score range.
    ax3 = axes[1, 0]
    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()

        categories = ['Excellent\n(>0.8)', 'Good\n(0.6-0.8)', 'Fair\n(0.4-0.6)', 'Poor\n(<0.4)']
        counts = [
            sum(quality_scores > 0.8),
            sum((quality_scores >= 0.6) & (quality_scores <= 0.8)),
            sum((quality_scores >= 0.4) & (quality_scores < 0.6)),
            sum(quality_scores < 0.4)
        ]

        colors_list = ['#2ecc71', '#f39c12', '#e67e22', '#e74c3c']
        bars = ax3.bar(categories, counts, color=colors_list, edgecolor='black', alpha=0.7)
        ax3.set_ylabel('Number of Transcripts')
        ax3.set_title('Quality Score Categories')
        ax3.grid(axis='y', alpha=0.3)

        # Add value labels above non-empty bars.
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax3.text(bar.get_x() + bar.get_width()/2., height,
                         f'{int(height)}', ha='center', va='bottom', fontsize=10)

    # 4. Summary Statistics Table (rendered as a matplotlib table)
    ax4 = axes[1, 1]
    ax4.axis('off')

    stats_data = []
    if 'Transcript ID' in df.columns:
        stats_data.append(['Total Transcripts', str(len(df))])

    if 'Quality Score' in df.columns:
        quality_scores = pd.to_numeric(df['Quality Score'], errors='coerce').dropna()
        if len(quality_scores) > 0:
            stats_data.append(['Avg Quality Score', f"{quality_scores.mean():.2f}"])
            stats_data.append(['Min Quality Score', f"{quality_scores.min():.2f}"])
            stats_data.append(['Max Quality Score', f"{quality_scores.max():.2f}"])

    if 'Word Count' in df.columns:
        word_counts = pd.to_numeric(df['Word Count'], errors='coerce').dropna()
        if len(word_counts) > 0:
            stats_data.append(['Avg Word Count', f"{int(word_counts.mean()):,}"])
            stats_data.append(['Total Words', f"{int(word_counts.sum()):,}"])

    if stats_data:
        table = ax4.table(cellText=stats_data, cellLoc='left',
                          colWidths=[0.5, 0.3], loc='center',
                          colLabels=['Metric', 'Value'])
        table.auto_set_font_size(False)
        table.set_fontsize(11)
        table.scale(1, 2)

        # Style the table: dark header row, zebra-striped body rows.
        for i in range(len(stats_data) + 1):
            if i == 0:
                table[(i, 0)].set_facecolor('#34495e')
                table[(i, 1)].set_facecolor('#34495e')
                table[(i, 0)].set_text_props(weight='bold', color='white')
                table[(i, 1)].set_text_props(weight='bold', color='white')
            else:
                if i % 2 == 0:
                    table[(i, 0)].set_facecolor('#ecf0f1')
                    table[(i, 1)].set_facecolor('#ecf0f1')

        ax4.set_title('Summary Statistics', fontsize=12, fontweight='bold', pad=20)

    return fig
extractors.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docx import Document
2
+ import pdfplumber
3
+ import re
4
+ from typing import Tuple
5
+ import os
6
+
7
def extract_docx(file_obj) -> str:
    """
    Pull readable text out of a DOCX file, including table contents.

    Paragraph text is gathered in document order, then each table row is
    flattened into a single "cell | cell | ..." line so tabular content
    is not silently dropped.  The combined text is normalised via
    clean_extracted_text().  On failure a human-readable error string is
    returned instead of raising.
    """
    try:
        document = Document(file_obj)

        # Non-empty paragraphs only, in document order.
        pieces = [p.text.strip() for p in document.paragraphs if p.text.strip()]

        # Flatten table rows into pipe-separated lines.
        for tbl in document.tables:
            for row in tbl.rows:
                cells = [c.text.strip() for c in row.cells if c.text.strip()]
                if cells:
                    pieces.append(" | ".join(cells))

        # Normalise whitespace, quotes, etc. before handing back.
        return clean_extracted_text("\n\n".join(pieces))

    except Exception as e:
        error_msg = f"[DOCX Extraction Error] {str(e)}"
        print(error_msg)
        return f"Error extracting DOCX: {str(e)}"
43
+
44
+
45
def extract_pdf(file_obj) -> str:
    """
    Extract text from PDF with multiple strategies and enhanced error handling.

    Per page, tries plain extraction first, then layout-aware extraction,
    then tighter x/y character-grouping tolerances; any result shorter
    than 50 characters is treated as a failed attempt.  Pages are joined
    with "--- Page N ---" markers and the result is passed through
    clean_extracted_text().  Returns an error string (never raises) when
    nothing could be extracted.
    """
    try:
        extracted_pages = []

        with pdfplumber.open(file_obj) as pdf:
            # Track extraction success so we can warn about partial results.
            successful_pages = 0
            total_pages = len(pdf.pages)

            for page_num, page in enumerate(pdf.pages, 1):
                try:
                    # Strategy 1: Standard text extraction
                    page_text = page.extract_text()

                    # Strategy 2: If standard fails (< 50 chars), retry
                    # preserving the physical page layout.
                    if not page_text or len(page_text.strip()) < 50:
                        page_text = page.extract_text(layout=True)

                    # Strategy 3: If still poor, tighten the character
                    # grouping tolerances.
                    if not page_text or len(page_text.strip()) < 50:
                        page_text = page.extract_text(
                            x_tolerance=2,
                            y_tolerance=2
                        )

                    if page_text and page_text.strip():
                        # Clean and add page marker
                        clean_text = page_text.strip()
                        extracted_pages.append(f"--- Page {page_num} ---\n{clean_text}")
                        successful_pages += 1
                    else:
                        print(f"[PDF Warning] Page {page_num} yielded no text")

                except Exception as page_error:
                    # A bad page should not abort the whole document.
                    print(f"[PDF Warning] Error on page {page_num}: {page_error}")
                    continue

        if successful_pages == 0:
            return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."

        # Warn when fewer than half of the pages produced text.
        if successful_pages < total_pages * 0.5:
            print(f"[PDF Warning] Only {successful_pages}/{total_pages} pages extracted successfully")

        full_text = "\n\n".join(extracted_pages)

        # Clean up the extracted text
        full_text = clean_extracted_text(full_text)

        return full_text

    except Exception as e:
        error_msg = f"[PDF Extraction Error] {str(e)}"
        print(error_msg)
        return f"Error extracting PDF: {str(e)}"
102
+
103
+
104
def clean_extracted_text(text: str) -> str:
    """
    Clean up common issues in extracted text.

    - collapses runs of blank lines and repeated spaces
    - strips lone page numbers and "Page X of Y" headers/footers
    - normalises curly quotes and dashes to ASCII equivalents
    - removes zero-width characters and BOMs

    Args:
        text: raw text produced by a DOCX/PDF extractor.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Remove excessive whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)

    # Remove page numbers that appear alone on lines
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove common headers/footers patterns
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)

    # Normalise typographic punctuation to ASCII.  Explicit \uXXXX
    # escapes are used deliberately: spelling the characters literally
    # is fragile under re-encoding, and a mis-encoded literal such as
    # text.replace('', "'") would insert an apostrophe between every
    # character of the text (str.replace with an empty needle matches
    # at every position).
    text = text.translate(str.maketrans({
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark (curly apostrophe)
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2013': '-',   # en dash
        '\u2014': '-',   # em dash
    }))

    # Remove zero-width characters
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return text.strip()
131
+
132
+
133
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Sanity-check the quality of extracted text.

    Args:
        text: the extracted text to validate.
        filename: name of the source file (currently unused; kept for
            the caller-facing interface).

    Returns:
        (ok, message) where ok is True when the text passes all checks
        and message describes the pass/fail reason.
    """
    # Empty extraction.
    if not text or not text.strip():
        return False, "No text extracted"

    # Too little raw content to be a real transcript.
    if len(text) < 100:
        return False, f"Extracted text too short ({len(text)} characters)"

    # Extractor error strings begin with "Error" or a bracketed tag.
    if text.startswith("Error") or text.startswith("["):
        return False, "Extraction error detected"

    # Require a minimum number of words.  (The gibberish-ratio and
    # average-word-length heuristics were intentionally disabled
    # upstream, so only the word-count check remains.)
    words = text.split()
    if len(words) < 50:
        return False, f"Too few words ({len(words)})"

    return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
168
+
169
+
170
def detect_file_encoding(file_path: str) -> str:
    """
    Best-effort detection of a text file's encoding.

    Uses chardet when it is installed; falls back to 'utf-8' when
    chardet is unavailable, the file cannot be read, or detection is
    inconclusive (chardet reports encoding=None for empty or
    undecidable input - the original code would have returned that
    None to callers expecting a string).

    Args:
        file_path: path of the file to inspect.

    Returns:
        An encoding name such as 'utf-8' or 'ascii'; never None.
    """
    try:
        import chardet
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        # Guard against encoding=None from chardet.
        return result.get('encoding') or 'utf-8'
    except Exception:
        # Narrowed from a bare except; ImportError and OSError both
        # land here and share the same fallback.
        return 'utf-8'  # Default fallback
182
+
183
+
184
def extract_text_file(file_obj) -> str:
    """
    Read a plain-text file object, trying UTF-8 first and then common
    legacy encodings.

    The original implementation tried utf-8 -> latin-1 -> cp1252, but
    latin-1 maps every byte and can never raise, so the cp1252 branch
    was unreachable.  cp1252 is now tried before latin-1 so Windows
    punctuation (curly quotes, dashes) decodes correctly, with latin-1
    kept last as the never-failing catch-all.

    Args:
        file_obj: a binary file-like object supporting read().

    Returns:
        The decoded text, or an "Error reading text file: ..." string.
    """
    try:
        # Read once; avoids relying on seek() between attempts.
        raw = file_obj.read()
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        # Unreachable (latin-1 always succeeds), kept as a safety net.
        return raw.decode('latin-1', errors='replace')
    except Exception as e:
        return f"Error reading text file: {str(e)}"
llm.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ from typing import Tuple, Dict, List
5
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as ThreadTimeout
6
+
7
+
8
+ # Option 1: Use Hugging Face Inference API (recommended for better quality)
9
+ # Option 2: Use larger local model
10
+ # Option 3: Use OpenAI/Anthropic API if available
11
+
12
+ DEBUG_MODE = os.getenv("DEBUG_MODE", "False").lower() == "true"
13
+ USE_HF_API = os.getenv("USE_HF_API", "False").lower() == "true" # Set default to False
14
+ HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
15
+
16
+ #if HF_TOKEN:
17
+ # huggingface_hub import login
18
+ # login(token=HF_TOKEN)
19
def log(msg):
    """Print *msg* with an [LLM Debug] prefix when DEBUG_MODE is enabled."""
    if not DEBUG_MODE:
        return
    print(f"[LLM Debug] {msg}")
22
+
23
+
24
def get_system_prompt(interviewee_type: str, is_summary: bool = False) -> str:
    """Generate context-aware system prompts.

    Args:
        interviewee_type: "HCP", "Patient", or any other value for the
            generic analysis prompt.
        is_summary: when True, return the cross-interview synthesis
            prompt (takes precedence over interviewee_type).

    Returns:
        The full system-prompt string for the LLM.
    """

    # Shared preamble for every prompt variant.
    base_prompt = """You are an expert medical transcript analyzer specializing in healthcare interviews.

Your task is to extract structured, actionable insights from interview transcripts.

Core Principles:
- Focus on factual, verifiable medical information
- Distinguish between speaker roles accurately
- Filter out pleasantries, disclaimers, and off-topic content
- Extract specific medical terms, dosages, and treatment details
- Identify patterns and clinical reasoning
"""

    # Summary mode overrides the per-interviewee branches below.
    if is_summary:
        return base_prompt + """
CROSS-INTERVIEW SYNTHESIS & VALIDATION TASK:

You are analyzing multiple transcripts. Extract verified patterns and flag inconsistencies.

STEP 1 - PATTERN IDENTIFICATION:
For each theme, count occurrences across transcripts:
- How many participants mentioned X? (e.g., "7 out of 10 participants")
- Calculate percentages when relevant
- What's the range of perspectives?

STEP 2 - CLASSIFY BY CONSENSUS LEVEL:
- STRONG CONSENSUS (80%+ agreement): Findings most participants agree on
- MAJORITY VIEW (60-79%): Significant but not universal agreement
- SPLIT PERSPECTIVES (40-59%): Where views diverge
- OUTLIERS (<40%): Unique but noteworthy perspectives

STEP 3 - CROSS-VALIDATE:
- Check for contradictions between transcripts
- Note where perspectives differ and why
- Flag quality issues (brief transcripts, vague responses)

STEP 4 - CITE EVIDENCE:
- Reference specific transcript numbers
- Include brief supporting quotes/details
- Distinguish fact from interpretation

OUTPUT FORMAT:
Start with 2-3 sentence executive overview, then:

**STRONG CONSENSUS FINDINGS:**
[List with counts and evidence]

**MAJORITY FINDINGS:**
[List with counts]

**DIVERGENT PERSPECTIVES:**
[Where participants disagreed and context]

**NOTABLE OUTLIERS:**
[Unique but important points]

**QUALITY NOTES:**
[Any gaps or transcript issues]

CRITICAL RULES:
- NEVER use vague terms like "many," "most," "some" - always use specific numbers
- ALWAYS cite transcript numbers for claims
- FLAG weak evidence explicitly
- Separate facts from interpretations
- NO JSON output - write in clear narrative prose
"""

    if interviewee_type == "HCP":
        return base_prompt + """
Healthcare Professional Analysis Focus:
- Prescribing patterns and medication choices
- Diagnostic reasoning and clinical decision-making
- Treatment protocols and guidelines referenced
- Peer perspectives on efficacy and safety
- Barriers to treatment or adoption
- Off-label uses or emerging practices

Extract and structure:
1. Diagnoses mentioned with context
2. Prescriptions with dosage, frequency, and rationale
3. Treatment strategies and their justifications
4. Clinical guidelines or studies referenced
5. Challenges or barriers discussed
6. Key clinical insights or pearls
"""

    elif interviewee_type == "Patient":
        return base_prompt + """
Patient Interview Analysis Focus:
- Symptom descriptions and severity
- Treatment experiences and outcomes
- Side effects and tolerability
- Quality of life impacts
- Adherence challenges and enablers
- Emotional and psychological factors
- Healthcare system interactions

Extract and structure:
1. Primary symptoms with duration and severity
2. Current and past treatments
3. Treatment effectiveness and satisfaction
4. Side effects experienced
5. Concerns and unmet needs
6. Quality of life impacts
7. Support systems and resources
"""

    else:
        # Fallback prompt for any other interviewee type.
        return base_prompt + """
General Interview Analysis Focus:
- Main themes and topics discussed
- Key insights and observations
- Recommendations or suggestions
- Contextual factors
- Areas of emphasis or concern

Extract and structure relevant information based on interview content.
"""
144
+
145
+
146
def build_extraction_template(interviewee_type: str) -> str:
    """Return the JSON skeleton the LLM is asked to fill in.

    "HCP" and "Patient" get tailored field sets; any other value gets
    the generic template.  The returned string is valid JSON and is
    embedded verbatim into the prompt.
    """
    hcp_template = """{
"diagnoses": ["condition 1", "condition 2"],
"prescriptions": ["medication (dose, frequency, indication)"],
"treatment_rationale": ["reason for treatment choice"],
"guidelines_mentioned": ["guideline or study name"],
"clinical_decisions": ["key clinical decision with reasoning"],
"barriers": ["barrier to treatment"],
"key_insights": ["notable clinical insight"]
}"""

    patient_template = """{
"symptoms": ["symptom (severity, duration)"],
"concerns": ["patient concern or question"],
"treatments_current": ["current treatment"],
"treatments_past": ["past treatment with outcome"],
"treatment_response": ["description of how treatment is working"],
"side_effects": ["side effect experienced"],
"quality_of_life": ["impact on daily life"],
"adherence_factors": ["factor affecting medication adherence"]
}"""

    generic_template = """{
"key_insights": ["main insight or finding"],
"themes": ["recurring theme"],
"recommendations": ["recommendation or suggestion"],
"context": ["important contextual information"]
}"""

    if interviewee_type == "HCP":
        return hcp_template
    if interviewee_type == "Patient":
        return patient_template
    return generic_template
179
+
180
+
181
def parse_structured_response(text: str, interviewee_type: str) -> Dict:
    """
    Extract structured data from an LLM response.

    Prefers a JSON object embedded in the response; falls back to
    regex extraction of labelled lines ("diagnosis: ...", etc.) when no
    parseable JSON is found.

    Args:
        text: raw LLM output.
        interviewee_type: "HCP" or "Patient" select tailored fallback
            patterns; any other value yields an empty fallback dict.

    Returns:
        Dict mapping field names to lists of extracted strings (or
        whatever structure the embedded JSON contained).
    """
    # Try to find a JSON block (pattern tolerates one level of nesting).
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL)

    if json_match:
        try:
            data = json.loads(json_match.group())
            log(f"Successfully extracted JSON: {data}")
            return data
        except json.JSONDecodeError:
            log("Failed to parse JSON from response")

    # Fallback: Extract from text using patterns
    data = {}

    if interviewee_type == "HCP":
        # Extract diagnoses
        diag_pattern = r'(?:diagnos[ei]s|condition):\s*([^\n]+)'
        data["diagnoses"] = re.findall(diag_pattern, text, re.IGNORECASE)

        # Extract prescriptions
        rx_pattern = r'(?:prescri[bp]\w*|medication):\s*([^\n]+)'
        data["prescriptions"] = re.findall(rx_pattern, text, re.IGNORECASE)

        # Extract treatment rationale
        treat_pattern = r'(?:treatment|therapy|rationale):\s*([^\n]+)'
        data["treatment_rationale"] = re.findall(treat_pattern, text, re.IGNORECASE)

    elif interviewee_type == "Patient":
        # Extract symptoms
        symptom_pattern = r'(?:symptom|complaint|experienc\w*):\s*([^\n]+)'
        data["symptoms"] = re.findall(symptom_pattern, text, re.IGNORECASE)

        # Extract concerns
        concern_pattern = r'(?:concern|worry|question|anxious):\s*([^\n]+)'
        data["concerns"] = re.findall(concern_pattern, text, re.IGNORECASE)

        # Extract side effects
        se_pattern = r'(?:side effect|adverse|reaction):\s*([^\n]+)'
        data["side_effects"] = re.findall(se_pattern, text, re.IGNORECASE)

    # Strip whitespace and deduplicate.  dict.fromkeys preserves
    # first-seen order, so output is deterministic across runs
    # (list(set(...)) reordered items arbitrarily per process).
    for key in data:
        cleaned = [item.strip() for item in data[key] if item.strip()]
        data[key] = list(dict.fromkeys(cleaned))

    log(f"Extracted data from text: {data}")
    return data
230
+
231
+
232
def query_llm_hf_api(prompt: str, max_tokens: int = 500) -> str:
    """Query the Hugging Face Inference API (chat-completion endpoint).

    Sends *prompt* as the user turn with a fixed analyzer system
    message to microsoft/Phi-3-mini-4k-instruct.  Returns the stripped
    assistant reply, or an "[Error] ..." string on any failure (the
    full traceback is logged and echoed to the console).
    """
    try:
        from huggingface_hub import InferenceClient

        chat = [
            {"role": "system", "content": "You are an expert transcript analyzer. Provide detailed, structured analysis."},
            {"role": "user", "content": prompt},
        ]

        client = InferenceClient(token=HF_TOKEN)
        reply = client.chat_completion(
            messages=chat,
            model="microsoft/Phi-3-mini-4k-instruct",
            max_tokens=max_tokens,
            temperature=0.3,
        )
        return reply.choices[0].message.content.strip()

    except Exception as e:
        import traceback
        full_error = traceback.format_exc()
        log(f"HF API error: {e}\n{full_error}")
        print(f"[HF API Full Error]\n{full_error}")  # Print to console
        return f"[Error] HF API failed: {e}"
260
+
261
+
262
def query_llm_local(prompt: str, max_tokens: int = 500) -> str:
    """
    Run generation on a locally loaded FLAN-T5-XXL model.

    The tokenizer and model are loaded lazily on first call and cached
    as function attributes, so subsequent calls reuse them.  The prompt
    is truncated to the model's 512-token encoder window.

    Args:
        prompt: full prompt text.
        max_tokens: maximum number of new tokens to generate.

    Returns:
        The generated text, or an "[Error] ..." string on failure.
    """
    try:
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        import torch

        if not hasattr(query_llm_local, 'model'):
            log("Loading local model on L4...")
            query_llm_local.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
            query_llm_local.model = AutoModelForSeq2SeqLM.from_pretrained(
                "google/flan-t5-xxl",
                torch_dtype=torch.float16,
                device_map="auto"
            )

        # Tokenize and truncate to 512 tokens
        inputs = query_llm_local.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        # Follow the model's actual placement instead of hard-coding
        # "cuda": with device_map="auto" the model may live on CPU (or
        # be sharded), and .to("cuda") crashed on GPU-less hosts.
        inputs = inputs.to(query_llm_local.model.device)

        outputs = query_llm_local.model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False
        )

        response = query_llm_local.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        log(f"Local model error: {e}")
        return f"[Error] Local model failed: {e}"
297
+
298
+
299
def query_llm(
    chunk: str,
    user_context: str,
    interviewee_type: str,
    extract_structured: bool = False,
    is_summary: bool = False,
    timeout: int = 120
) -> Tuple[str, Dict]:
    """
    Main LLM query function with structured extraction.

    Builds the system+user prompt for one transcript chunk, routes it
    to a backend (LM Studio env flag -> HF Inference API -> local
    model), enforces a wall-clock timeout via a worker thread, and
    optionally parses structured JSON out of the response.

    Args:
        chunk: transcript segment to analyze.
        user_context: free-form user instructions inserted into the prompt.
        interviewee_type: "HCP" / "Patient" / other; selects the prompt
            and extraction template.
        extract_structured: when True, also request and parse a JSON block.
        is_summary: when True, use the cross-interview synthesis prompt.
        timeout: seconds to wait for generation before giving up.

    Returns:
        Tuple of (response_text, structured_data_dict); on error the
        text is an "[Error] ..." string and the dict is empty.
    """

    system_prompt = get_system_prompt(interviewee_type, is_summary)
    extraction_template = build_extraction_template(interviewee_type) if extract_structured else ""

    # Build comprehensive prompt
    full_prompt = f"""{system_prompt}

User Instructions:
{user_context}

Transcript Segment to Analyze:
{chunk}

"""

    if extract_structured:
        full_prompt += f"""
IMPORTANT: Provide your analysis in two parts:
1. A clear narrative summary (3-5 sentences)
2. Structured data in this exact JSON format:
{extraction_template}

Be specific and include relevant details (dosages, durations, severity levels, etc.)
"""

    # Truncate if needed: only the transcript chunk is trimmed; the
    # fixed prompt parts are rebuilt around the shortened chunk.
    max_prompt_length = 6000  # Increased from 2000
    if len(full_prompt) > max_prompt_length:
        chunk_limit = max_prompt_length - len(system_prompt) - len(user_context) - len(extraction_template) - 500
        chunk = chunk[:chunk_limit]
        full_prompt = f"{system_prompt}\n\nUser Instructions:\n{user_context}\n\nTranscript Segment:\n{chunk}\n\n"
        if extract_structured:
            full_prompt += f"Provide analysis and structured JSON: {extraction_template}"
        log(f"Prompt truncated to {len(full_prompt)} characters")

    def generate():
        # Backend priority: LM Studio env flag, then HF API, then local.
        # NOTE(review): query_llm_lmstudio is not defined in this
        # module's visible code - confirm it exists before enabling
        # USE_LMSTUDIO, otherwise this raises NameError.
        if os.getenv("USE_LMSTUDIO", "False").lower() == "true":
            return query_llm_lmstudio(full_prompt, max_tokens=600)
        elif USE_HF_API and HF_TOKEN:
            return query_llm_hf_api(full_prompt, max_tokens=600)
        else:
            return query_llm_local(full_prompt, max_tokens=600)

    # Execute with timeout
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(generate)
        try:
            response = future.result(timeout=timeout)
            log(f"LLM response received ({len(response)} chars)")

            # Extract structured data if requested
            structured_data = {}
            if extract_structured:
                structured_data = parse_structured_response(response, interviewee_type)

            return response, structured_data

        except ThreadTimeout:
            log("LLM generation timed out")
            return "[Error] LLM generation timed out.", {}
        except Exception as e:
            log(f"LLM generation failed: {e}")
            return f"[Error] LLM generation failed: {e}", {}
376
+
377
+
378
def extract_structured_data(text: str, interviewee_type: str) -> Dict:
    """
    Standalone function to extract structured data from existing text.

    Thin wrapper around parse_structured_response(); useful for
    post-processing analyses that were generated without structured
    extraction enabled.

    Args:
        text: previously generated analysis text.
        interviewee_type: "HCP" / "Patient" / other (selects fallback
            extraction patterns).

    Returns:
        Dict of extracted fields (see parse_structured_response).
    """
    return parse_structured_response(text, interviewee_type)
narrative_report_generator.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from typing import Tuple
4
+ from docx import Document
5
+ from docx.shared import Inches
6
+ from reportlab.lib.pagesizes import letter
7
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
8
+ from reportlab.lib.styles import getSampleStyleSheet
9
+ from report_parser import parse_transcriptor_output
10
+ from table_builder import build_all_tables
11
+ from story_writer import generate_narrative
12
+
13
def generate_narrative_report(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient",
                              report_style: str = "executive", llm_backend: str = "lmstudio",
                              output_dir: str = "./outputs") -> Tuple[str, str, str]:
    """
    End-to-end pipeline: parse the transcriptor CSV, build summary
    tables, generate the LLM narrative, and render PDF/Word/HTML files
    into *output_dir* with a timestamped base name.

    Returns:
        (pdf_path, word_path, html_path) of the generated reports.
    """
    print("[1/4] Parsing...")
    parsed_data = parse_transcriptor_output(csv_path, summary_path, interviewee_type)

    print("[2/4] Building tables...")
    tables = build_all_tables(parsed_data)

    print("[3/4] Generating narrative (1-2 min)...")
    narrative = generate_narrative(parsed_data, tables, report_style, llm_backend)

    print("[4/4] Creating outputs...")
    os.makedirs(output_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base = f"{output_dir}/narrative_report_{stamp}"
    pdf, word, html = f"{base}.pdf", f"{base}.docx", f"{base}.html"

    create_pdf(narrative, tables, parsed_data, pdf)
    create_word(narrative, tables, parsed_data, word)
    create_html(narrative, tables, parsed_data, html)

    print(f"Done!\nPDF: {pdf}\nWord: {word}\nHTML: {html}")
    return pdf, word, html
40
+
41
def create_pdf(narrative, tables, data, path):
    """Render the narrative as a simple single-flow PDF via ReportLab."""
    styles = getSampleStyleSheet()
    flowables = [
        Paragraph("Narrative Research Report", styles['Title']),
        Spacer(1, 0.3 * 72),
    ]

    for block in narrative.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        # Escape the two characters ReportLab's mini-XML chokes on.
        safe = block.replace('&', '&amp;').replace('<', '&lt;')
        flowables.append(Paragraph(safe, styles['BodyText']))
        flowables.append(Spacer(1, 0.1 * 72))

    SimpleDocTemplate(path, pagesize=letter).build(flowables)
55
+
56
def create_word(narrative, tables, data, path):
    """Write the narrative to a .docx with one paragraph per section."""
    document = Document()
    document.add_heading('Narrative Research Report', 0)
    for block in (s.strip() for s in narrative.split('\n\n')):
        if block:
            document.add_paragraph(block)
    document.save(path)
63
+
64
def create_html(narrative, tables, data, path):
    """
    Write the narrative as a standalone HTML page.

    Section text is HTML-escaped so characters such as '<' and '&' in
    the narrative cannot break the markup or inject tags (the original
    interpolated raw text into the page).  The file is written as
    UTF-8 explicitly, with a matching meta charset, so output does not
    depend on the platform's default encoding.
    """
    import html as html_mod

    page = """<!DOCTYPE html><html><head><meta charset="utf-8"><style>
body{font-family:Arial;max-width:900px;margin:40px auto;padding:20px;line-height:1.6}
h1{color:#2c3e50;text-align:center}
</style></head><body><h1>Narrative Research Report</h1>"""
    for section in narrative.split('\n\n'):
        if section.strip():
            page += f"<p>{html_mod.escape(section.strip())}</p>"
    page += "</body></html>"
    with open(path, 'w', encoding='utf-8') as f:
        f.write(page)
outputs/sample.txt ADDED
File without changes
report.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Transcript ID,File Name,Quality Score,Word Count,Key Insights,Recommendations
2
+ Transcript 1,570_24_July30_2pmET_TDredacted.docx,1.00,6812,"Interviewee is an independent researcher with expertise in healthcare topics but not a medical professional.; Focus on sharing medical materials and seeking opinions.; The interviewee expresses concern about potential TV ads for treatment of cramps and mentions the importance of managing cramps; Respondent diagnosed with fibe years ago, currently on whole bunch of medication including Vimpat and Gabapentin. Experienced side effects from one called Xcopr and had to switch.; The respondent emphasizes the importance of early intervention in mental health; The respondent confirms prior discussions and shows willingness to engage; The interviewee emphasizes the importance of clear communication and understanding in medical contexts; The interviewee expresses comfort with the application but lacks confidence in their ability to manage it independently; The interviewee emphasizes the importance of minimizing complications in medical treatment; The interviewee is focusing on the main idea of motivating participants to collaborate; The interviewee emphasizes the importance of full disclosure about a four-year period, highlighting potential gaps in knowledge and the need for transparency.; The interviewee expresses uncertainty about the TV ad's effectiveness and uniqueness; TV ad version compatibility and user preference for consistent format; The interviewee's ability to communicate clearly improved over time; The interviewee expresses a strong preference for personalized medical advice and emphasizes the importance of understanding individual differences in treatment responses.; The respondent values comparison with past experiences but emphasizes the importance of maintaining a strong foundation; The interviewee values personalized care and emphasizes the importance of having compassionate caregivers.; The interviewee expresses uncertainty about their own reactions compared to others, noting increased emotion 
and concern; Lack of alignment with current context; preference for happiness and movement forward; The respondent's condition involves deep-seated issues requiring careful consideration; The respondent emphasizes the importance of providing reliable pharmaceutical products with clear instructions; The interviewee expressed difficulty in transitioning from a previous role and uncertainty about their current responsibilities; The respondent found the ad motivating and engaging; The respondent finds the ad relevant but lacks specific details about its content; The respondent appreciates the detailed analysis but expresses concern about the specific recommendations; The respondent's behavior during the crisis was notably calm and composed, contrasting with the expected heightened emotional response.; Positive rapport between interviewer and respondent",Engage in discussions for mutual benefit; Clarify roles and expectations; Further investigation into specific side effects or problems with particular medical treatments; Consider alternative treatments for managing side effects; Consider implementing potential TV ads for mental health awareness; Encourage further detailed exploration of ideas; Ensure all parties involved have a clear understanding of the medication and its administration; Encourage gradual independence; Provide additional support; Further testing to identify specific causes; Encourage open communication and shared goals; Ensure thorough disclosure of all relevant medical history to avoid misunderstandings.; Conduct further research on similar ads to gauge effectiveness; Maintain consistent TV ad formats across platforms; Encourage continued practice to enhance communication; Encourage healthcare providers to tailor treatments based on patient specifics; Focus on building a solid foundation; Focus on hiring caregivers with strong interpersonal skills and a history of providing personalized care.; Further exploration of emotional triggers; Assessment of 
comparative emotional responses; Encourage alignment with current realities while maintaining focus on happiness; Consider a comprehensive treatment plan addressing underlying issues; Ensure all pharmaceutical products come with detailed usage guidelines; Clarify job expectations and provide support during the transition; Consider enhancing ad content to maintain motivation; Provide more detailed information about the ad; Clarify and validate specific recommendations; Encourage further training in crisis management techniques; Maintain positive communication
report.pdf ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %PDF-1.4
2
+ %���� ReportLab Generated PDF document http://www.reportlab.com
3
+ 1 0 obj
4
+ <<
5
+ /F1 2 0 R /F2 3 0 R
6
+ >>
7
+ endobj
8
+ 2 0 obj
9
+ <<
10
+ /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
11
+ >>
12
+ endobj
13
+ 3 0 obj
14
+ <<
15
+ /BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
16
+ >>
17
+ endobj
18
+ 4 0 obj
19
+ <<
20
+ /Contents 10 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
21
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
22
+ >> /Rotate 0 /Trans <<
23
+
24
+ >>
25
+ /Type /Page
26
+ >>
27
+ endobj
28
+ 5 0 obj
29
+ <<
30
+ /Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
31
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
32
+ >> /Rotate 0 /Trans <<
33
+
34
+ >>
35
+ /Type /Page
36
+ >>
37
+ endobj
38
+ 6 0 obj
39
+ <<
40
+ /Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 9 0 R /Resources <<
41
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
42
+ >> /Rotate 0 /Trans <<
43
+
44
+ >>
45
+ /Type /Page
46
+ >>
47
+ endobj
48
+ 7 0 obj
49
+ <<
50
+ /PageMode /UseNone /Pages 9 0 R /Type /Catalog
51
+ >>
52
+ endobj
53
+ 8 0 obj
54
+ <<
55
+ /Author (\(anonymous\)) /CreationDate (D:20251005104519-04'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251005104519-04'00') /Producer (ReportLab PDF Library - www.reportlab.com)
56
+ /Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
57
+ >>
58
+ endobj
59
+ 9 0 obj
60
+ <<
61
+ /Count 3 /Kids [ 4 0 R 5 0 R 6 0 R ] /Type /Pages
62
+ >>
63
+ endobj
64
+ 10 0 obj
65
+ <<
66
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1291
67
+ >>
68
+ stream
69
+ Gat=*;0/3r&:W67f[3@JN(a6QP5OGhDOO7f1bip>9RT)?koAXFh-jIRoY5VE5ZkH.a6n0tn?VrY\)6TpMBJ_0aFDAOi9))/qB^_AL:i0f#nk>:_-&ttCD%)J=U9qAS]MX8@C40$\:6C%#m!I>p+aD6"<9e!0Q8!flm`LNGY*tf!fh:G)Cpe^*Lo$^g0&"G'=QW)+!JCNqun&^q@f]doQQ'kQDHme`")18&CP[WgPPDL:6QCqX!EQ<).1j3[:Hu\HW/TpP/&g0(khG`<Oh6;iaa+T(D-([S.69S7AL)['b<VOe=FE*iOY*H3oDkRBS=:_A7&7'mfi_<8sG'WS.F'49/I3W1PeD-+WLZW,Uo8_Fs$9p@Thgoh_Yhf[DFL>jLZ2"9l.W@i&58?a'cXROh#Y0K%oGBB2]XtU?AB9B,#LF8Vu#a>ug:MGGFu+s3[G>m>qEGi*6%_9nL><Cl9iIKR@'[o2Q3k?;H.e[c?H!ON_3a=2DGVL4_(j@gNZnqPOiKQcWX2S%UDu"rlCbD7!'ONGa%]cT`ZFFcZ=fiNB!kF)TOKRa@];#K(%eF*fU"k<bDYI@%&(dFeuOcDbT4o<,6MfcB;QJ4[%;K&p(DbSO]+Ila(\U8L@QNV`7mds3STXmH.['0)7H>73:]+S`>?)[gVo6TbH:kD/D_;9bQCC'<)!X4rp:_*Y,4B0eKR!fo5f.^tOl:*5*(Kom=s:k7%q,q_(HcqON9`mDlnc@ISSUbg@)j-3NFBUf7=BoS`qMJFK+`W?<fj&(+JMJFJebOeDDTZ0;i>"!""\/H;=Th"H!@;%S.Y(\=W>oHSL2LtP/\X1ia7dTY!4)d0A`*tj=_][0tPS\I8WM*-th.T"1RCp+!fMS>1ENNlX;<pqF+31oq1ffgS>S%g-</W%+TdJ1=m*Nqp<>LIU>*]@%mSem)-"R_UA$N:Fm_0>lT?;a>R>46/_MCAhakEbLnTs^s"'+daZ-cQIThq%YB?-O;['d=PKOd!:%l`9a3PPM3YZZlAh?2tGd(,/d]qgu_W>o_5ap2:KF_WM@*4A;88gV<Ri\hsG,t_o5@C/L&b1a^5OX^m2)EbH!o5+EqouZn])IK+4>U4V?I;P+lS`us9%*dlaiPZ%->[1N2KAdo>I_jj4K2d#?!e3mNhO.XDrkqQEUEn;ih]B&ek8?tl9(mOZIEKQ(.H]`9=33hYhWpU9'gAkJEV@>RF.,$tbAh@nR#Dsp6c4=1RVIU:c=\$Pk120g8PMO8\o0Z6i-EuTAse%%Ip)Nd-6Z^c%mN8b0_l5SQHu6~>endstream
70
+ endobj
71
+ 11 0 obj
72
+ <<
73
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1910
74
+ >>
75
+ stream
76
+ GauHK?#SIU'Re<2\F<ETUoG\iVJf\&%NAHA?+k-k:o#p:>APT_,q5bKYlDF);O6Fs8Y)t#!Z6qkh/i%qkC,+Y(k7DBj<`4JF*d[,Td6K1P*/[!UA<,W4[8gcQF&"e0hmiD%c^]V9[iOqP+m%T9SQK'gh/Yp]fLe>UpCMG-1%Ln9YB2LZ[.'H/<be/BWbKL\<aP!mj_@SN&9G=qub3YFjR>^[g,-c9r!Pdi9p2.&t["G1`fu#.3+gOdchU/:#)';a-+`Q@0kLIf(18_-T%F72sKPgVl&)TnZ=FIk;Vu_/Mb>JaJ)AuchG#?bhuPXh"hc_a2&/(o;+rO>EX?99aKUm\+A2\/#;"J)m6Pmf#-_m:>g>C4Gi(mWV\Z(E^Ja;Db=[7>W%dp^goDc\X4[Wn'EF(iEba/8MiJkP4:5d*iYFpA]T`"@@*0*E>5>@Cksq(deWe7mlqXhd9ea*-oO#i13/Xa1O)B=%">:Zctb(JKck)c<@"V=e,g_W,qY'`Rcd#dXgC*FcaD,#)P!oX(Tai$B3LBs[H*kc@un37/60h*UJbWcZ[uqoQGOK<T\='u6%]u22hp7YadJIFW]O?=GU6MO*6Q>.n^7:BO:!Y1B^kG!Mfh4f\L9lPk5$.n[UD/>2m"g;MLmFEAel^V9P]+'\/&>7-.q_->*Nd@<ZWHK"p)5a$n[aAZE_au/OUnNdQ4bu"ThnG8?5(1?lXmU[c[E_>TjiSm=&*oAsG6VE/7XhT>`OuH3mk;dThLR0YX2::</!2oJ[r@%s<JD3fMK3Z5\e#iV@tgh1J=hkDlm>g->f3@R(<1j?b't$O+f)b!LDF50_stb6#"N$YE$Z8[[n5j-l6U@+G5:dpO1g=Y">,N8c\$<3__7\3CJKWNV`Q!Sk'G/6dH-iItQT]F!!&p[&raa,<Dk="pp`$Y/9J#RlhG1.])`8kqif+CMa5&tja1!MMgqKd]Zo[[P#GhEXD5@<=U7r1f\eQ+YWo!l)ol#NqJ,Y>RI)d%6f.%ZjF_en>a>dFE4,<LVTnWqo/O>(pRR8[bX`6)f,&Q?"-^K=XQf=6bB^UG\(*Amjm^7mYRtWHd9$<!^34(b_?%dJ)4SA%g*8?jg!s/useUZ]P=SanG>:e>`Y++OK?7p$Z/9g:c@Zj:&[g%e,\Fd9T1IKfG13>T0[8`]'(=4nUqe@O7-ZhpcuI?gM,r]^lpl'i7SX?e1r!XO,(t7:Z$CpE\@A*"EQ/f?02<:4toG>BOd-4$d)gnKaPD;-/'($I;N"f4b)c\sD2e9U'\FN;QTolSQs,e.m:7kXTM]3)WnabInujGB<c3+)5F(RAP.N?"U8NCu3ok[X%85i*G<Rf!-P5d_#h.DD<[mo]1Y]Eja)#>=M@A<Nh[0JSpH1Wdgi=m&4VZmWR+[/]S-E.5dQ5XX,sl%sbpLjQb;Rkb8%tN\-K97t^<C_3#Vq/;CHa:b_[q;Dk=igF(ajmH9G_0")-P-=tRW.OQbp>W(7&-d-ZC*Ac`GJ5k0fVFZK=d.Me0g+I,7@3sDlC]#:4HT"YjGVV@l!@OZ7lYO@t>=BU5C0mR9aYDUNgXb+tjpb:b%?C<o+a4U,V=eff3&Ki^s6H8Iejt9Hi-nnA\,<E[dpUeHbR'10K#%egMD\;(1UMDXL9g8%,(smT-D2$a`&qF1M8e-":4`;"EFeKZN'd*&GYR58jS+Qo(A\+LpC5^!CIS]rpFCum_HTL8;j<LTa%)KMl0tr>mIAK\Y#<C'AMX5D<U-7][m[^5+\kg(>o-_6h*%`Fr1aRX-Z^6Gr_3`g$Ib1jUqb`[&Aq+tr:KGn`TFDroT9='.oiO5h^EpXAj7#"'[%kko^IcW*`1Ou^T998ghg\Zc\8_&8uo@()H`:slgK-]Lj#ql^:H@Ri1'6u8qsJTN"\T6Bnm"cr<&F-Ab5~>endstream
77
+ endobj
78
+ 12 0 obj
79
+ <<
80
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 876
81
+ >>
82
+ stream
83
+ Gau0A9lJKG&A@7.N-m*'R+V3a*'p\lU'Dt$&>I'gCiAXq27r1-'_1'BlSumY9,.c,83'@UhqNGFm3I6#J!05(."HLZ]Y7&^8KfT,=i:]h5/BZGq(YJ;cIE>(lP2@"/Pam]Eq'@nF>6SRCP]lP\cc54(ap9+(JDj!*]VgT#M?BKlg#L&2#MshQ?M/]G-\3^Mr)=;.'!WLp1-+F[R<B\WR/aKEo=i2W\N)"('LK7-+-_C:?iIF;,&VGH>0`rKGTa+5kYi9']iT&_-9708GhW:'I(OpZtI&J$V83Q=3$)IMLpMJ)q*de6Oc/?,sKU;H%.qa?ac8g/9l2!&9n:4B,Fe<^bmBJ+U&-f*P(TA_XQbRI<YDF8;1;0d5p\6kSR?(^o"@@72:=`K*nsgW,nPbA[8BAIQ+Ba`q[?n*KKU8&&P^C)q^5S0RNee57`0LS]Bp0q;Zc^gXf4s`'!kY6q@Q:@5WY.HmGLSU!uA1lp>.5*phs*d!@-OKA(BobTNK2Fj6X_o^jO@rFr$Zs6k`_:^F1!\U:*)Be>X@abKGD7B'>^EU"N`'ml-3Bs<'QpBSRh=.O]U\8Z+@`MI)dYF]]IRJ-=&?!>SSn%11*]'_=Z)qThOFofQJ\Urpj[3N'*OHh/b6H)W@@-_VLP$DH0*]r.lZ`PZ0pV@AnRTSdjnC+_B*=%/&fYnBV*pECmUS`QcGY=<DhUpZVkl0D0+<MqB$M@(R7N<5US(1T[R^T<;o.qYt,*lutE)r%a_s>=%OZ>'WfBSm-j/_5lKo7%mBKT0MXGmcU9,gfJfF;A7ed15:AB7soeD4/g(CG'jO!>rqe,3@P>I$(^C#_;Rob;/X]O,a+GSHd5k%1jX5G9IQ4A5EKR+qd3C*R]~>endstream
84
+ endobj
85
+ xref
86
+ 0 13
87
+ 0000000000 65535 f
88
+ 0000000073 00000 n
89
+ 0000000114 00000 n
90
+ 0000000221 00000 n
91
+ 0000000333 00000 n
92
+ 0000000527 00000 n
93
+ 0000000721 00000 n
94
+ 0000000915 00000 n
95
+ 0000000983 00000 n
96
+ 0000001266 00000 n
97
+ 0000001337 00000 n
98
+ 0000002720 00000 n
99
+ 0000004722 00000 n
100
+ trailer
101
+ <<
102
+ /ID
103
+ [<6812a7b40f4b5abfbec04669e48f4c7d><6812a7b40f4b5abfbec04669e48f4c7d>]
104
+ % ReportLab generated PDF document -- digest (http://www.reportlab.com)
105
+
106
+ /Info 8 0 R
107
+ /Root 7 0 R
108
+ /Size 13
109
+ >>
110
+ startxref
111
+ 5689
112
+ %%EOF
report_parser.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from typing import Dict, List, Tuple
4
+ from collections import Counter
5
+
6
def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load a transcript-report CSV and derive summary metadata.

    Returns the raw DataFrame plus a dict holding the transcript count,
    average quality score, average word count, and transcript-ID list.
    Each derived field is None/empty when its source column is absent.
    """
    df = pd.read_csv(csv_path)

    has_quality = "Quality Score" in df
    has_words = "Word Count" in df
    has_ids = "Transcript ID" in df

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": df["Quality Score"].astype(float).mean() if has_quality else None,
        "avg_word_count": df["Word Count"].astype(int).mean() if has_words else None,
        "transcript_ids": df["Transcript ID"].tolist() if has_ids else [],
    }
    return df, metadata
15
+
16
def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Tally the most frequent semicolon-separated items per theme column.

    Which columns are inspected depends on the interviewee type; every
    present column maps to its top-10 items as {"item", "count"} dicts.
    """
    column_map = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Side Effects"],
    }
    theme_columns = column_map.get(interviewee_type, ["Key Insights"])

    themes: Dict[str, List] = {}
    for col in theme_columns:
        if col not in df.columns:
            continue
        # Split each cell on ';' and keep non-empty trimmed pieces.
        items = [
            piece.strip()
            for val in df[col].dropna()
            if isinstance(val, str)
            for piece in val.split(';')
            if piece.strip()
        ]
        themes[col] = [
            {"item": item, "count": count}
            for item, count in Counter(items).most_common(10)
        ]
    return themes
34
+
35
def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Summarize quality-score tier counts and word-count aggregates.

    Tiers: excellent > 0.8, good [0.6, 0.8], fair [0.4, 0.6), poor < 0.4.
    A section is omitted entirely when its source column is missing.
    """
    stats: Dict = {}

    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        tiers = {
            "excellent_count": scores > 0.8,
            "good_count": (scores >= 0.6) & (scores <= 0.8),
            "fair_count": (scores >= 0.4) & (scores < 0.6),
            "poor_count": scores < 0.4,
        }
        quality = {"mean": scores.mean()}
        for tier_name, mask in tiers.items():
            quality[tier_name] = sum(mask)
        stats["quality"] = quality

    if "Word Count" in df.columns:
        words = df["Word Count"].astype(int)
        stats["word_count"] = {"mean": int(words.mean()), "total": int(words.sum())}

    return stats
50
+
51
def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    """Bundle the parsed CSV, metadata, themes, and statistics in one dict.

    ``summary_path`` is accepted for interface compatibility but is not
    read by the current implementation.
    """
    df, metadata = parse_csv_output(csv_path)
    return {
        "dataframe": df,
        "metadata": metadata,
        "themes": extract_key_themes(df, interviewee_type),
        "statistics": calculate_statistics(df),
        "interviewee_type": interviewee_type,
    }
reporting.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from reportlab.lib.pagesizes import letter, A4
3
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
4
+ from reportlab.lib.units import inch
5
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
6
+ from reportlab.lib import colors
7
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT
8
+ from datetime import datetime
9
+ from typing import List, Dict
10
+ import os
11
+
12
def generate_csv(data, path="report.csv"):
    """Legacy entry point.

    Kept for backwards compatibility; simply delegates to
    generate_enhanced_csv with the generic "Other" interviewee type.
    """
    return generate_enhanced_csv(data, "Other", path)
15
+
16
+
17
def generate_enhanced_csv(
    data: List[Dict],
    interviewee_type: str,
    path: str = "report.csv"
) -> str:
    """Write analysis rows to a UTF-8 (BOM) CSV with priority columns first.

    An empty ``data`` list produces a header-only file with
    "Transcript ID"/"Status" columns. Returns the output path.
    ``interviewee_type`` is accepted for interface parity (currently unused).
    """
    if not data:
        # Header-only placeholder so downstream readers still find a file.
        pd.DataFrame(columns=["Transcript ID", "Status"]).to_csv(path, index=False)
        return path

    df = pd.DataFrame(data)

    # Put identifying/metric columns up front, keeping the rest in order.
    priority_cols = ["Transcript ID", "File Name", "Quality Score", "Word Count"]
    leading = [c for c in priority_cols if c in df.columns]
    trailing = [c for c in df.columns if c not in priority_cols]

    # utf-8-sig adds a BOM so Excel detects the encoding correctly.
    df[leading + trailing].to_csv(path, index=False, encoding='utf-8-sig')
    return path
46
+
47
+
48
def generate_pdf(summary, details, path="report.pdf"):
    """Legacy entry point: wrap raw detail text as a single-transcript result.

    Builds a stub result dict around ``details`` and delegates to
    generate_enhanced_pdf with no processing errors.
    """
    stub_result = {
        "transcript_id": "Transcript 1",
        "file_name": "analysis.txt",
        "full_text": details,
        "quality_score": 0.8,
        "word_count": len(details.split()),
    }
    return generate_enhanced_pdf(summary, [stub_result], "Other", [], path)
59
+
60
+
61
def generate_enhanced_pdf(
    summary: str,
    results: List[Dict],
    interviewee_type: str,
    processing_errors: List[str],
    path: str = "report.pdf"
) -> str:
    """
    Generate a professional PDF report with proper formatting.

    Args:
        summary: Executive-summary text; '\\n\\n' separates paragraphs.
        results: Per-transcript dicts with transcript_id, file_name,
            full_text, quality_score and word_count keys (missing keys
            fall back to safe defaults instead of raising).
        interviewee_type: "HCP" / "Patient" / "Other" label for the cover.
        processing_errors: Human-readable error strings; rendered on their
            own page when non-empty.
        path: Output file path; returned to the caller.

    Falls back to a minimal error PDF if the full build raises.
    """

    def _escape(raw: str) -> str:
        # Paragraph() parses mini-XML, so &, < and > must be entity-escaped
        # (ampersand first so it doesn't re-escape the other entities).
        return raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    doc = SimpleDocTemplate(
        path,
        pagesize=letter,
        rightMargin=0.75*inch,
        leftMargin=0.75*inch,
        topMargin=0.75*inch,
        bottomMargin=0.75*inch
    )

    # Container for the 'Flowable' objects
    story = []
    styles = getSampleStyleSheet()

    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=24,
        textColor=colors.HexColor('#1a1a1a'),
        spaceAfter=30,
        alignment=TA_CENTER,
        fontName='Helvetica-Bold'
    )
    heading_style = ParagraphStyle(
        'CustomHeading',
        parent=styles['Heading2'],
        fontSize=16,
        textColor=colors.HexColor('#2c3e50'),
        spaceAfter=12,
        spaceBefore=20,
        fontName='Helvetica-Bold'
    )
    subheading_style = ParagraphStyle(
        'CustomSubheading',
        parent=styles['Heading3'],
        fontSize=13,
        textColor=colors.HexColor('#34495e'),
        spaceAfter=8,
        spaceBefore=12,
        fontName='Helvetica-Bold'
    )
    body_style = ParagraphStyle(
        'CustomBody',
        parent=styles['BodyText'],
        fontSize=11,
        leading=14,
        textColor=colors.HexColor('#2c3e50'),
        alignment=TA_LEFT
    )

    # --- Title page ---
    story.append(Paragraph("Transcript Analysis Report", title_style))
    story.append(Spacer(1, 0.2*inch))

    metadata = [
        ["Report Generated:", datetime.now().strftime("%B %d, %Y at %I:%M %p")],
        ["Interviewee Type:", interviewee_type],
        ["Total Transcripts:", str(len(results))],
        ["Successfully Processed:", str(sum(1 for r in results if r.get("quality_score", 0) > 0))]
    ]
    metadata_table = Table(metadata, colWidths=[2*inch, 4*inch])
    metadata_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#ecf0f1')),
        ('TEXTCOLOR', (0, 0), (-1, -1), colors.HexColor('#2c3e50')),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#bdc3c7'))
    ]))
    story.append(metadata_table)
    story.append(Spacer(1, 0.3*inch))

    # --- Executive summary ---
    story.append(Paragraph("Executive Summary", heading_style))
    story.append(Spacer(1, 0.1*inch))
    for para in summary.split('\n\n'):
        if para.strip():
            story.append(Paragraph(_escape(para.strip()), body_style))
            story.append(Spacer(1, 0.1*inch))

    # --- Processing errors (only when some occurred) ---
    if processing_errors:
        story.append(PageBreak())
        story.append(Paragraph("Processing Issues", heading_style))
        story.append(Spacer(1, 0.1*inch))
        for error in processing_errors:
            story.append(Paragraph(f"• {_escape(error)}", body_style))
            story.append(Spacer(1, 0.05*inch))

    # --- Individual transcript details ---
    story.append(PageBreak())
    story.append(Paragraph("Detailed Transcript Analysis", heading_style))
    story.append(Spacer(1, 0.2*inch))

    for idx, result in enumerate(results):
        transcript_title = f"{result.get('transcript_id', 'Unknown')} - {result.get('file_name', '')}"
        story.append(Paragraph(transcript_title, subheading_style))

        stats_data = [
            ["Quality Score:", f"{result.get('quality_score', 0):.2f}/1.00"],
            ["Word Count:", f"{result.get('word_count', 0):,}"]
        ]
        stats_table = Table(stats_data, colWidths=[1.5*inch, 2*inch])
        stats_table.setStyle(TableStyle([
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
            ('TOPPADDING', (0, 0), (-1, -1), 4),
        ]))
        story.append(stats_table)
        story.append(Spacer(1, 0.1*inch))

        # Cap chunk count and length so one transcript can't balloon the PDF.
        for chunk in result.get('full_text', '').split('\n\n')[:10]:
            if chunk.strip():
                clean_chunk = _escape(chunk.strip())
                if len(clean_chunk) > 1000:
                    clean_chunk = clean_chunk[:1000] + "..."
                story.append(Paragraph(clean_chunk, body_style))
                story.append(Spacer(1, 0.1*inch))

        story.append(Spacer(1, 0.2*inch))

        # BUG FIX: the original used `if result != results[-1]`, which compares
        # dicts by value — a transcript equal in content to the last one would
        # wrongly skip its page break. Compare positions instead.
        if idx < len(results) - 1:
            story.append(PageBreak())

    # Build PDF, falling back to a minimal error document on failure.
    try:
        doc.build(story)
        return path
    except Exception as e:
        print(f"[PDF Error] Failed to generate PDF: {e}")
        simple_doc = SimpleDocTemplate(path, pagesize=letter)
        simple_story = [
            Paragraph("Transcript Analysis Report", title_style),
            Paragraph(f"Error generating full report: {str(e)}", body_style),
            Paragraph(summary, body_style)
        ]
        simple_doc.build(simple_story)
        return path
requirements.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core frameworks
2
+ gradio>=4.0.0
3
+ transformers>=4.35.0
4
+ torch>=2.0.0
5
+
6
+ # NLP and text processing
7
+ nltk>=3.8.0
8
+ tiktoken>=0.5.0
9
+
10
+ # Document processing
11
+ python-docx>=1.1.0
12
+ pdfplumber>=0.10.0
13
+
14
+ # Data processing and analysis
15
+ pandas>=2.0.0
16
+ numpy>=1.24.0
17
+
18
+ # Visualization
19
+ matplotlib>=3.7.0
20
+ seaborn>=0.12.0
21
+
22
+ # PDF generation
23
+ reportlab>=4.0.0
24
+
25
+ # API integrations
26
+ huggingface_hub>=0.19.0
27
+
28
+ # Utilities
29
+ chardet>=5.0.0
30
+ python-dateutil>=2.8.0
31
+
32
+ # Optional but recommended
33
+ accelerate>=0.24.0
34
+ sentencepiece>=0.1.99
35
+ protobuf>=4.24.0
36
+
37
+ # Audio transcription
38
+ faster-whisper>=0.10.0
39
+ torchaudio>=2.0.0
40
+ speechbrain>=0.5.16
41
+ scikit-learn>=1.3.0 # For clustering speaker embeddings
story_writer.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from typing import Dict
4
+
5
def format_table_for_llm(df: pd.DataFrame, name: str) -> str:
    """Render a DataFrame as a labelled text block, or a no-data placeholder."""
    if df.empty:
        return f"[{name}: No data]\n"
    return f"\n{name}:\n{df.to_string()}\n"
7
+
8
def build_narrative_prompt(parsed_data: Dict, tables: Dict, style: str) -> str:
    """Compose the LLM prompt for the executive research narrative.

    ``style`` and the parsed statistics are read for interface parity but
    do not currently alter the generated prompt.
    """
    metadata = parsed_data["metadata"]
    stats = parsed_data["statistics"]
    interviewee_type = parsed_data["interviewee_type"]

    rendered_tables = [format_table_for_llm(df, name) for name, df in tables.items()]
    tables_text = "\n".join(rendered_tables)

    return f"""Write an executive research report for {metadata['total_transcripts']} {interviewee_type.lower()} interviews.

DATA TABLES:
{tables_text}

STRUCTURE:
1. EXECUTIVE OVERVIEW (2-3 paragraphs): Context, sample, high-level findings
2. KEY FINDINGS (3-5 sections): Each with narrative + data + significance
3. PATTERNS & THEMES (2 paragraphs): Cross-cutting insights
4. RECOMMENDATIONS (3-5 bullets): Actionable next steps

Write professionally. Quantify everything. Be specific. Lead with insights."""
27
+
28
def call_lmstudio(prompt: str) -> str:
    """POST the prompt to a local LM Studio server.

    Server base URL comes from the LM_STUDIO_URL env var. Returns the
    completion text, or an "[Error: ...]" tag on any failure.
    """
    import requests
    base = os.getenv("LM_STUDIO_URL", "http://192.168.1.245:1234")
    payload = {
        "messages": [
            {"role": "system", "content": "You are an expert research report writer."},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 2000,
        "temperature": 0.7,
    }
    try:
        resp = requests.post(f"{base}/v1/chat/completions", json=payload, timeout=180)
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[Error: {e}]"
40
+
41
def call_hf_api(prompt: str) -> str:
    """Run the prompt through the HF Inference API (Mixtral 8x7B).

    Token comes from the HUGGINGFACE_TOKEN env var. Returns the generated
    text, or an "[Error: ...]" tag on any failure.
    """
    from huggingface_hub import InferenceClient
    try:
        client = InferenceClient(token=os.getenv("HUGGINGFACE_TOKEN", ""))
        return client.text_generation(
            prompt,
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            max_new_tokens=2000,
            temperature=0.7,
        )
    except Exception as e:
        return f"[Error: {e}]"
49
+
50
def generate_narrative(parsed_data: Dict, tables: Dict, style: str, llm_backend: str) -> str:
    """Build the report prompt and dispatch it to the selected LLM backend.

    ``llm_backend`` == "lmstudio" routes to the local server; anything
    else goes to the Hugging Face Inference API.
    """
    prompt = build_narrative_prompt(parsed_data, tables, style)
    backend = call_lmstudio if llm_backend == "lmstudio" else call_hf_api
    return backend(prompt)
table_builder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import Dict
3
+ from collections import Counter
4
+
5
def build_participant_profile_table(metadata: Dict) -> pd.DataFrame:
    """Build the two-column participant-profile summary table.

    BUG FIX: parse_csv_output stores None for avg_quality_score /
    avg_word_count when the source column is missing; formatting None with
    ``:.2f`` raised TypeError. Missing or None averages now render as 0.
    """
    avg_quality = metadata.get("avg_quality_score") or 0
    avg_words = metadata.get("avg_word_count") or 0
    return pd.DataFrame({
        "Metric": ["Total Participants", "Avg Quality Score", "Avg Words"],
        "Value": [
            metadata.get("total_transcripts", 0),
            f"{avg_quality:.2f}",
            f"{avg_words:,.0f}",
        ],
    })
14
+
15
def build_quality_distribution_table(stats: Dict) -> pd.DataFrame:
    """Turn quality tier counts into a table with percentage shares.

    Returns an empty frame when no quality stats exist. BUG FIX: when every
    tier count was zero the percentage column divided by zero and produced
    NaN; it is now 0.0 for all tiers.
    """
    if "quality" not in stats:
        return pd.DataFrame()
    q = stats["quality"]
    df = pd.DataFrame({
        "Quality Tier": ["Excellent (>0.8)", "Good (0.6-0.8)", "Fair (0.4-0.6)", "Poor (<0.4)"],
        "Count": [q.get("excellent_count", 0), q.get("good_count", 0),
                  q.get("fair_count", 0), q.get("poor_count", 0)],
    })
    total = df["Count"].sum()
    if total > 0:
        df["Percentage"] = (df["Count"] / total * 100).round(1)
    else:
        df["Percentage"] = 0.0
    return df
26
+
27
def build_frequency_table(themes: Dict) -> pd.DataFrame:
    """Flatten per-theme top items into a Category/Item/Frequency table."""
    rows = [
        {"Category": theme_name, "Item": entry["item"], "Frequency": entry["count"]}
        for theme_name, entries in themes.items()
        for entry in entries[:10]
    ]
    return pd.DataFrame(rows) if rows else pd.DataFrame()
33
+
34
def build_all_tables(parsed_data: Dict) -> Dict[str, pd.DataFrame]:
    """Build every report table from the parsed transcript data.

    Always includes the participant profile; the quality distribution and
    theme frequency tables are added only when they contain data.

    Fix: the original also extracted parsed_data["dataframe"] into an
    unused local, which has been removed.
    """
    tables = {
        "participant_profile": build_participant_profile_table(parsed_data["metadata"]),
    }

    quality_table = build_quality_distribution_table(parsed_data["statistics"])
    if not quality_table.empty:
        tables["quality_distribution"] = quality_table

    freq_table = build_frequency_table(parsed_data["themes"])
    if not freq_table.empty:
        tables["theme_frequency"] = freq_table

    return tables
tagging.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Tuple
3
+ from collections import Counter
4
+
5
def detect_speaker_patterns(text: str) -> dict:
    """Detect which speaker-labelling convention a transcript uses.

    Counts line-anchored matches for the colon ("Speaker 1: ..."), bracket
    ("[Interviewer] ..."), dash ("- Doctor: ...") and Q/A conventions,
    picks the most frequent, and reports the unique speaker labels found.
    """
    colon = re.findall(r'^([A-Z][a-z\s]+\d*):\s', text, re.MULTILINE)
    bracket = re.findall(r'^\[([^\]]+)\]\s', text, re.MULTILINE)
    dash = re.findall(r'^-\s*([A-Z][a-z\s]+):\s', text, re.MULTILINE)
    has_qa = bool(re.search(r'^(Q|A):\s', text, re.MULTILINE))

    pattern_counts = {
        "colon_based": len(colon),
        "bracket_based": len(bracket),
        "dash_based": len(dash),
        "q_a_based": 1 if has_qa else 0,
    }

    if any(pattern_counts.values()):
        most_common = max(pattern_counts, key=pattern_counts.get)
    else:
        most_common = None

    if most_common == "q_a_based":
        speakers = ["Q", "A"]
    elif most_common is None:
        speakers = []
    else:
        found = {"colon_based": colon, "bracket_based": bracket, "dash_based": dash}[most_common]
        speakers = list(set(found))

    return {
        "pattern_type": most_common,
        "speakers_found": speakers,
        "speaker_count": len(speakers),
        "has_structure": most_common is not None,
    }
39
+
40
+
41
def classify_speaker_role(text: str, speaker_label: str, interviewee_type: str) -> str:
    """
    Classify one utterance as interviewer, interviewee, or neutral.

    Scores the text against question, clinical, patient-experience, and
    closing-phrase pattern sets, then applies thresholds chosen by the
    ``interviewee_type`` ("HCP", "Patient", or anything else for generic).

    Args:
        text: The utterance to classify.
        speaker_label: Original transcript label (kept for interface
            compatibility; not used by the heuristics).
        interviewee_type: Selects the decision thresholds.

    BUG FIX: the patient-experience patterns contained uppercase "I" but
    were matched against lowercased text, so they could never fire and
    patient utterances fell through to Interviewer/Unknown. The patterns
    are now lowercase.
    """
    text_lower = text.lower()

    def _score(patterns):
        # One point per pattern family that matches anywhere in the text.
        return sum(1 for p in patterns if re.search(p, text_lower))

    # Question patterns (likely interviewer)
    question_score = _score([
        r'\?$',
        r'^(what|how|why|when|where|who|can you|could you|would you|do you|have you)',
        r'(tell me|explain|describe|walk me through)',
        r'(your thoughts|your experience|your perspective)',
    ])

    # Medical/clinical patterns
    clinical_score = _score([
        r'\b(prescribe|prescription|rx|medication|drug|dose|dosage|mg|ml)\b',
        r'\b(diagnos[ei]s|diagnosed|condition|disease|disorder)\b',
        r'\b(treatment|therapy|intervention|protocol)\b',
        r'\b(patient|case|clinical|medical|symptom)\b',
        r'\b(efficacy|effectiveness|outcome|response|adverse)\b',
        r'\b(guideline|recommendation|standard of care|first-line)\b',
    ])

    # Patient experience patterns (lowercased to match text_lower)
    patient_score = _score([
        r"\b(i feel|i felt|i'm experiencing|i have)\b",
        r'\b(my symptoms|my condition|my pain|my treatment)\b',
        r"\b(it hurts|it bothers|it helps|it doesn't work)\b",
        r'\b(i tried|i take|i stopped|i started)\b',
        r'\b(doctor told me|doctor said|doctor prescribed)\b',
    ])

    # Neutral/closing patterns
    neutral_score = _score([
        r'\b(thank you|thanks|appreciate|goodbye|bye|closing)\b',
        r"\b(that concludes|that's all|we're done)\b",
    ])

    # Short closing pleasantries are neutral regardless of speaker.
    if neutral_score > 0 and len(text.split()) < 15:
        return "Neutral"

    if interviewee_type == "HCP":
        # In HCP interviews, heavy clinical language marks the doctor.
        if clinical_score >= 3:
            return "Doctor"
        if question_score >= 2:
            return "Interviewer"
        if clinical_score >= 1:
            return "Doctor"
        return "Unknown"

    if interviewee_type == "Patient":
        # In patient interviews, first-person experience marks the patient.
        if patient_score >= 2:
            return "Patient"
        if question_score >= 2:
            return "Interviewer"
        if clinical_score >= 2:
            return "Interviewer"  # likely interviewer explaining medical info
        if patient_score >= 1:
            return "Patient"
        return "Unknown"

    # Generic classification for any other interview type.
    if question_score >= 2:
        return "Interviewer"
    if clinical_score >= 2:
        return "Respondent"
    return "Unknown"
125
+
126
+
127
def parse_existing_tags(text: str, pattern_info: dict) -> List[Tuple[str, str]]:
    """Split labelled transcript text into (speaker, utterance) pairs.

    Uses the pattern type reported by detect_speaker_patterns. Text with no
    recognized structure comes back as a single ("Unknown", text) segment.

    BUG FIX: the dash_based convention ("- Doctor: text") was detected by
    detect_speaker_patterns but never parsed here, so dash-labelled
    transcripts fell into the Unknown branch; it now has a split pattern.
    """
    pattern_type = pattern_info["pattern_type"]

    # One line-anchored split regex per labelling convention.
    split_patterns = {
        "colon_based": r'^([A-Z][a-z\s]+\d*):\s',
        "bracket_based": r'^\[([^\]]+)\]\s',
        "dash_based": r'^-\s*([A-Z][a-z\s]+):\s',
        "q_a_based": r'^([QA]):\s',
    }

    if pattern_type not in split_patterns:
        # No clear pattern - treat as single block
        return [("Unknown", text)]

    parts = re.split(split_patterns[pattern_type], text, flags=re.MULTILINE)
    segments = []
    # re.split with one capture group yields [pre, label, content, label, ...].
    for i in range(1, len(parts), 2):
        if i + 1 >= len(parts):
            continue
        if pattern_type == "q_a_based":
            speaker = "Interviewer" if parts[i] == "Q" else "Respondent"
        else:
            speaker = parts[i].strip()
        content = parts[i + 1].strip()
        if content:
            segments.append((speaker, content))
    return segments
168
+
169
+
170
def tag_speakers_advanced(text: str, role_hint: str = "", interviewee_type: str = "Other") -> str:
    """Tag every utterance in a transcript with a [Role] prefix.

    Detects the labelling convention, applies user-supplied role hints
    (e.g. "Speaker 1 = Interviewer, Speaker 2 = Doctor"), and
    auto-classifies any remaining speakers from their content.
    """
    pattern_info = detect_speaker_patterns(text)

    # Optional "original = mapped" hints, keyed case-insensitively.
    role_mapping = {}
    if role_hint:
        for original, mapped in re.findall(r'([^,=]+)\s*=\s*([^,=]+)', role_hint):
            role_mapping[original.strip().lower()] = mapped.strip()

    if pattern_info["has_structure"]:
        segments = parse_existing_tags(text, pattern_info)
    else:
        # No recognizable labels: treat each non-empty line as a segment.
        segments = [("Unknown", line.strip()) for line in text.split('\n') if line.strip()]

    tagged_lines = []
    for speaker_label, content in segments:
        hint_key = speaker_label.lower()
        if hint_key in role_mapping:
            role = role_mapping[hint_key]
        else:
            role = classify_speaker_role(content, speaker_label, interviewee_type)
        tagged_lines.append(f"[{role}] {content}")

    return "\n\n".join(tagged_lines)
210
+
211
+
212
def analyze_speaker_distribution(tagged_text: str) -> dict:
    """Count how often each [Role] tag opens a line — QC for tagging output.

    Returns total segment count, the number of unique roles, the raw
    distribution, and each role's percentage share (empty when no tags).
    """
    tags = re.findall(r'^\[([^\]]+)\]', tagged_text, re.MULTILINE)
    counts = Counter(tags)
    total = len(tags)

    if total > 0:
        percentages = {role: occurrences / total * 100 for role, occurrences in counts.items()}
    else:
        percentages = {}

    return {
        "total_segments": total,
        "unique_speakers": len(counts),
        "distribution": dict(counts),
        "percentages": percentages,
    }
utils.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for TranscriptorAI
3
+ """
4
+
5
+ import os
6
+ import json
7
+ import hashlib
8
+ import pickle
9
+ from datetime import datetime
10
+ from typing import Any, Dict, List, Optional
11
+ from pathlib import Path
12
+ import logging
13
+
14
+ # ============================================================================
15
+ # LOGGING SETUP
16
+ # ============================================================================
17
+
18
def setup_logging(log_file: str = "transcript_analysis.log", level: str = "INFO"):
    """Configure root logging to a file and the console; return this module's logger.

    Args:
        log_file: Path of the log file to write to.
        level: Logging level name (case-insensitive), e.g. "DEBUG".
    """
    handlers = [logging.FileHandler(log_file), logging.StreamHandler()]
    logging.basicConfig(
        level=getattr(logging, level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=handlers,
    )
    return logging.getLogger(__name__)

# Module-wide logger, configured once on import.
logger = setup_logging()
31
+
32
+ # ============================================================================
33
+ # CACHING UTILITIES
34
+ # ============================================================================
35
+
36
def get_file_hash(file_path: str) -> str:
    """Return the MD5 hex digest of a file, reading it in 64 KiB chunks."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
45
+
46
+
47
def cache_result(key: str, data: Any, cache_dir: str = "./.cache") -> bool:
    """Pickle ``data`` to <cache_dir>/<key>.pkl; return True on success."""
    try:
        os.makedirs(cache_dir, exist_ok=True)
        target = os.path.join(cache_dir, f"{key}.pkl")
        with open(target, 'wb') as fh:
            pickle.dump(data, fh)
        logger.debug(f"Cached result for key: {key}")
        return True
    except Exception as e:
        logger.error(f"Failed to cache result: {e}")
        return False
61
+
62
+
63
def load_cached_result(key: str, cache_dir: str = "./.cache") -> Optional[Any]:
    """Unpickle a cached value; None when absent, older than 7 days, or unreadable.

    NOTE: uses pickle, so the cache directory must be trusted.
    """
    try:
        cache_file = os.path.join(cache_dir, f"{key}.pkl")
        if not os.path.exists(cache_file):
            return None

        # Expire entries older than one week.
        age_seconds = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if age_seconds > 7 * 24 * 3600:
            logger.debug(f"Cache expired for key: {key}")
            return None

        with open(cache_file, 'rb') as fh:
            data = pickle.load(fh)
        logger.debug(f"Loaded cached result for key: {key}")
        return data
    except Exception as e:
        logger.error(f"Failed to load cached result: {e}")
        return None
85
+
86
+
87
def clear_cache(cache_dir: str = "./.cache"):
    """Delete every file in the cache directory (missing directory is a no-op)."""
    try:
        if os.path.exists(cache_dir):
            for entry in os.listdir(cache_dir):
                os.remove(os.path.join(cache_dir, entry))
            logger.info(f"Cleared cache directory: {cache_dir}")
    except Exception as e:
        logger.error(f"Failed to clear cache: {e}")
97
+
98
+
99
+ # ============================================================================
100
+ # FILE UTILITIES
101
+ # ============================================================================
102
+
103
def ensure_directory(path: str) -> str:
    """Create ``path`` (and any parents) if needed; return it unchanged."""
    Path(path).mkdir(parents=True, exist_ok=True)
    return path
107
+
108
+
109
def get_unique_filename(base_path: str, extension: str = "") -> str:
    """Insert a YYYYMMDD_HHMMSS timestamp before the extension to avoid clashes.

    ``extension`` overrides the extension taken from ``base_path`` when given.
    """
    stem, original_ext = os.path.splitext(base_path)
    suffix = extension if extension else original_ext
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{stem}_{stamp}{suffix}"
115
+
116
+
117
def get_file_size_mb(file_path: str) -> float:
    """Return the size of ``file_path`` in mebibytes."""
    size_bytes = os.path.getsize(file_path)
    return size_bytes / (1024 * 1024)
120
+
121
+
122
def validate_file(file_path: str, max_size_mb: int = 50, allowed_extensions: List[str] = None) -> tuple:
    """Validate existence, size cap, and extension; return (ok, reason).

    ``allowed_extensions`` defaults to ['.docx', '.pdf'] when None.
    """
    allowed = allowed_extensions if allowed_extensions is not None else ['.docx', '.pdf']

    if not os.path.exists(file_path):
        return False, "File does not exist"

    if get_file_size_mb(file_path) > max_size_mb:
        return False, f"File exceeds {max_size_mb}MB limit"

    ext = os.path.splitext(file_path)[1].lower()
    if ext not in allowed:
        return False, f"File type {ext} not supported"

    return True, "Valid"
138
+
139
+
140
+ # ============================================================================
141
+ # DATA PROCESSING UTILITIES
142
+ # ============================================================================
143
+
144
def sanitize_text(text: str) -> str:
    """Strip null bytes and collapse all whitespace runs to single spaces."""
    without_nulls = text.replace('\x00', '')
    return ' '.join(without_nulls.split()).strip()
153
+
154
+
155
def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Clip ``text`` to ``max_length`` characters, ending in ``suffix`` when clipped."""
    if len(text) > max_length:
        return text[:max_length - len(suffix)] + suffix
    return text
160
+
161
+
162
def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """Return the ``top_n`` most frequent 3+-letter words, minus basic stop words."""
    from collections import Counter
    import re

    stop_words = {
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'with',
        'this', 'that', 'from', 'they', 'have', 'has', 'was', 'were'
    }

    # Lowercase tokenization, keeping only alphabetic words of length >= 3.
    tokens = (w for w in re.findall(r'\b[a-z]{3,}\b', text.lower()) if w not in stop_words)
    return [word for word, _ in Counter(tokens).most_common(top_n)]
181
+
182
+
183
+ # ============================================================================
184
+ # STATISTICS UTILITIES
185
+ # ============================================================================
186
+
187
def calculate_statistics(values: List[float]) -> Dict[str, float]:
    """Return mean/median/std/min/max/count for ``values`` (empty dict if none)."""
    if not values:
        return {}

    import numpy as np

    metrics = {
        "mean": np.mean,
        "median": np.median,
        "std": np.std,
        "min": np.min,
        "max": np.max,
    }
    summary = {name: fn(values) for name, fn in metrics.items()}
    summary["count"] = len(values)
    return summary
202
+
203
+
204
def calculate_percentile(values: List[float], percentile: int) -> float:
    """Calculate the given percentile of *values*.

    Returns a plain Python float, matching the declared return type (the
    previous version leaked a numpy scalar).  Raises if *values* is empty,
    as numpy does.
    """
    import numpy as np
    return float(np.percentile(values, percentile))
208
+
209
+
210
+ # ============================================================================
211
+ # JSON UTILITIES
212
+ # ============================================================================
213
+
214
def save_json(data: Dict, filepath: str, pretty: bool = True) -> bool:
    """Save data as a JSON file; returns True on success, False on failure."""
    try:
        # indent=None reproduces json.dump's compact default formatting.
        indent = 2 if pretty else None
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=indent, ensure_ascii=False)
        logger.debug(f"Saved JSON to: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to save JSON: {e}")
        return False
227
+
228
+
229
def load_json(filepath: str) -> Optional[Dict]:
    """Load a JSON file; returns the parsed data, or None on any failure."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        logger.debug(f"Loaded JSON from: {filepath}")
        return payload
    except Exception as e:
        logger.error(f"Failed to load JSON: {e}")
        return None
239
+
240
+
241
+ # ============================================================================
242
+ # PROGRESS TRACKING
243
+ # ============================================================================
244
+
245
class ProgressTracker:
    """Simple progress tracker for long operations.

    Renders an in-place text progress bar (carriage-return overwrite)
    with percentage, counts, and a naive ETA estimate.
    """

    def __init__(self, total: int, description: str = "Processing"):
        # Public state: callers may read total/current/description.
        self.total = total
        self.current = 0
        self.description = description
        self.start_time = datetime.now()

    def update(self, n: int = 1):
        """Advance progress by *n* steps (clamped to total) and redraw."""
        self.current = min(self.total, self.current + n)
        self._print_progress()

    def _print_progress(self):
        """Draw the bar; emits a final newline once complete."""
        if self.total > 0:
            percentage = (self.current / self.total) * 100
            filled = int(40 * self.current / self.total)
        else:
            percentage = 0
            filled = 0
        bar = '█' * filled + '-' * (40 - filled)

        # ETA: linear extrapolation from elapsed time per completed step.
        elapsed = (datetime.now() - self.start_time).total_seconds()
        eta = (elapsed / self.current * (self.total - self.current)) if self.current > 0 else 0

        print(f'\r{self.description}: |{bar}| {percentage:.1f}% ({self.current}/{self.total}) ETA: {eta:.0f}s', end='')

        if self.current >= self.total:
            print()  # New line when complete
273
+
274
+
275
+ # ============================================================================
276
+ # ERROR HANDLING UTILITIES
277
+ # ============================================================================
278
+
279
def safe_execute(func, *args, default=None, error_msg="Operation failed", **kwargs):
    """Safely execute a function with error handling.

    Returns func(*args, **kwargs); on any exception the error is logged
    with *error_msg* and *default* is returned instead of raising.
    """
    try:
        result = func(*args, **kwargs)
    except Exception as e:
        logger.error(f"{error_msg}: {e}")
        return default
    return result
286
+
287
+
288
+ # ============================================================================
289
+ # TEXT COMPARISON UTILITIES
290
+ # ============================================================================
291
+
292
def calculate_similarity(text1: str, text2: str) -> float:
    """Calculate a simple similarity score between two texts.

    Uses Jaccard similarity over lowercase word sets:
    |intersection| / |union|, in [0, 1].
    """
    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    # Either side empty => no basis for comparison.
    if not tokens_a or not tokens_b:
        return 0.0

    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined) if combined else 0.0
304
+
305
+
306
+ # ============================================================================
307
+ # BATCH PROCESSING UTILITIES
308
+ # ============================================================================
309
+
310
def batch_items(items: List, batch_size: int) -> List[List]:
    """Split a list into consecutive batches of at most *batch_size*."""
    starts = range(0, len(items), batch_size)
    return [items[start:start + batch_size] for start in starts]
313
+
314
+
315
def parallel_process(func, items: List, max_workers: int = 4):
    """Process items in parallel with a thread pool.

    Results are returned in the SAME order as *items*.  The previous
    implementation collected results via as_completed(), so output order
    depended on task completion timing and could not be aligned with the
    inputs.  Items whose call raises are logged and yield None.

    Args:
        func: Callable applied to each item.
        items: Input items.
        max_workers: Maximum number of worker threads.

    Returns:
        List of results (None for failed items), aligned with *items*.
    """
    from concurrent.futures import ThreadPoolExecutor

    results = [None] * len(items)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(func, item) for item in items]
        for idx, future in enumerate(futures):
            try:
                results[idx] = future.result()
            except Exception as e:
                logger.error(f"Parallel processing error: {e}")
                # Slot stays None so callers can map failures back to inputs.

    return results
331
+
332
+
333
+ # ============================================================================
334
+ # EXPORT UTILITIES
335
+ # ============================================================================
336
+
337
def export_to_excel(data: Dict[str, List[Dict]], filepath: str) -> bool:
    """Export multiple row-lists to an Excel workbook, one sheet per key.

    Returns True on success, False on any failure (logged).
    """
    try:
        import pandas as pd

        # Each dict key becomes a sheet; each row-list becomes a DataFrame.
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            for sheet_name, rows in data.items():
                pd.DataFrame(rows).to_excel(writer, sheet_name=sheet_name, index=False)

        logger.info(f"Exported to Excel: {filepath}")
        return True
    except Exception as e:
        logger.error(f"Failed to export to Excel: {e}")
        return False
352
+
353
+
354
+ # ============================================================================
355
+ # VALIDATION UTILITIES
356
+ # ============================================================================
357
+
358
def is_valid_email(email: str) -> bool:
    """Basic email validation (single local@domain.tld shape, no RFC edge cases)."""
    import re

    email_re = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    return email_re.match(email) is not None
363
+
364
+
365
def is_valid_url(url: str) -> bool:
    """Basic URL validation: http(s) scheme, no whitespace or angle brackets."""
    import re

    return re.match(r'^https?://[^\s<>"]+$', url) is not None
370
+
371
+
372
+ # ============================================================================
373
+ # MAIN (FOR TESTING)
374
+ # ============================================================================
375
+
376
if __name__ == "__main__":
    # Lightweight smoke tests for the helpers in this module.
    import time

    print("Testing utilities...")

    # File-system helper
    out_dir = ensure_directory("./test_output")
    print(f"Created test directory: {out_dir}")

    # JSON round-trip
    payload = {"key": "value", "number": 42}
    save_json(payload, "./test_output/test.json")
    reloaded = load_json("./test_output/test.json")
    assert reloaded == payload, "JSON save/load failed"
    print("✓ JSON operations work")

    # Statistics helper
    sample = [1, 2, 3, 4, 5]
    stats = calculate_statistics(sample)
    print(f"✓ Statistics: {stats}")

    # Progress tracker (visual check)
    bar = ProgressTracker(10, "Test")
    for _ in range(10):
        time.sleep(0.1)
        bar.update()
    print("✓ Progress tracker works")

    print("\n✓ All utility tests passed!")
validation.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Tuple, Dict, List
3
+
4
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Validate that text extraction was successful.

    Runs cheap heuristics (length, garbled-text patterns, word count and
    real-word ratio) and returns (is_valid, human-readable reason).
    *filename* is accepted for interface compatibility but not used here.
    """
    if not text or not text.strip():
        return False, "No text extracted"

    # Minimum raw length before deeper checks are worthwhile.
    if len(text) < 50:
        return False, f"Extracted text too short ({len(text)} chars)"

    # Patterns that usually indicate a garbled/broken extraction.
    for pattern, msg in (
        (r'[^\x00-\x7F]{50,}', "Contains large blocks of non-ASCII characters"),
        (r'(.)\1{20,}', "Contains suspicious character repetition"),
        (r'^[\W\d\s]+$', "Contains only symbols/numbers/whitespace"),
    ):
        if re.search(pattern, text):
            return False, msg

    words = text.split()
    if len(words) < 20:
        return False, f"Too few words ({len(words)})"

    # Share of tokens that look like plain alphabetic words (heuristic).
    real_words = [w for w in words if re.match(r'^[a-zA-Z]{2,}$', w)]
    word_ratio = len(real_words) / len(words) if words else 0

    if word_ratio < 0.3:
        return False, f"Low word ratio ({word_ratio:.2f}) - possible extraction issue"

    return True, f"Valid ({len(words)} words, {len(text)} chars)"
40
+
41
+
42
def validate_transcript_quality(
    analyzed_text: str,
    structured_data: Dict,
    interviewee_type: str
) -> Tuple[float, str]:
    """
    Assess quality of analyzed transcript.

    Starts at a perfect score of 1.0 and subtracts fixed penalties for:
    brevity, missing/empty structured data, missing type-specific
    terminology and key fields (HCP vs Patient), embedded error
    messages, and highly repetitive content.

    Args:
        analyzed_text: Narrative analysis text for one transcript.
        structured_data: Extracted fields; values are treated as
            populated when truthy.
        interviewee_type: "HCP", "Patient", or anything else (other
            values skip the type-specific checks).

    Returns:
        Tuple of (quality_score [0-1], issues_description)
    """
    score = 1.0
    issues = []

    # Check 1: Length of analysis
    if len(analyzed_text) < 100:
        score -= 0.3
        issues.append("Analysis too brief")
    elif len(analyzed_text) < 300:
        score -= 0.1
        issues.append("Analysis somewhat brief")

    # Check 2: Presence of structured data
    if not structured_data:
        score -= 0.2
        issues.append("No structured data extracted")
    else:
        empty_fields = sum(1 for v in structured_data.values() if not v)
        total_fields = len(structured_data)

        if empty_fields == total_fields:
            score -= 0.3
            issues.append("All structured fields empty")
        elif empty_fields > total_fields * 0.7:
            score -= 0.2
            issues.append("Most structured fields empty")

    # Check 3: Type-specific validation
    if interviewee_type == "HCP":
        # Expect medical terminology in an HCP analysis
        medical_terms = re.findall(
            r'\b(diagnos\w+|prescri\w+|treatment|medication|patient|clinical|therapy)\b',
            analyzed_text,
            re.IGNORECASE
        )
        if len(medical_terms) < 3:
            score -= 0.2
            issues.append("Limited medical terminology for HCP interview")

        # All key HCP fields missing/empty => nothing useful extracted
        key_fields = ["diagnoses", "prescriptions", "treatment_rationale"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key HCP data extracted")

    elif interviewee_type == "Patient":
        # Expect patient-centric language
        patient_terms = re.findall(
            r'\b(symptom|feel|concern|experience|treatment|side effect|quality of life)\b',
            analyzed_text,
            re.IGNORECASE
        )
        if len(patient_terms) < 3:
            score -= 0.2
            issues.append("Limited patient-centric content")

        key_fields = ["symptoms", "concerns", "treatment_response"]
        missing_fields = [f for f in key_fields if not structured_data.get(f)]
        if len(missing_fields) == len(key_fields):
            score -= 0.2
            issues.append("No key patient data extracted")

    # Check 4: Error indicators left behind by the pipeline/LLM
    error_patterns = [
        r'\[Error\]',
        r'failed to',
        r'could not',
        r'unable to',
        r'timeout'
    ]
    for pattern in error_patterns:
        if re.search(pattern, analyzed_text, re.IGNORECASE):
            score -= 0.3
            issues.append("Contains error messages")
            break

    # Check 5: Repetitive content (potential LLM failure).
    # BUGFIX: count only non-empty sentences.  The old code compared the
    # raw split('.') count (which includes empty trailing fragments)
    # against the unique NON-empty count, inflating the ratio and
    # falsely flagging "High content repetition".
    sentences = [s.strip() for s in analyzed_text.split('.') if s.strip()]
    if len(sentences) > 3:
        unique_sentences = set(s.lower() for s in sentences)
        repetition_ratio = len(sentences) / len(unique_sentences) if unique_sentences else 1

        if repetition_ratio > 1.5:
            score -= 0.2
            issues.append("High content repetition")

    # Clamp score into [0, 1]
    score = max(0.0, min(1.0, score))

    issues_text = "; ".join(issues) if issues else "No issues detected"

    return score, issues_text
153
+
154
+
155
def check_data_completeness(csv_rows: List[Dict], interviewee_type: str) -> Dict:
    """
    Analyze completeness of extracted data across all transcripts.

    Returns per-field fill counts, an overall fill percentage, and a
    coarse letter grade (Excellent/Good/Fair/Poor).
    """
    if not csv_rows:
        return {"error": "No data to check"}

    # Key fields to audit depend on the interviewee type.
    type_fields = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Treatment Response"],
    }
    key_fields = type_fields.get(interviewee_type, ["Key Insights"])

    completeness = {}
    row_count = len(csv_rows)

    for field in key_fields:
        # Only audit fields that actually exist in the data.
        if field not in csv_rows[0]:
            continue
        filled = sum(1 for row in csv_rows if row.get(field) and row[field].strip())
        completeness[field] = {
            "filled": filled,
            "total": row_count,
            "percentage": (filled / row_count * 100) if csv_rows else 0
        }

    # Aggregate across all audited fields.
    filled_fields = sum(item["filled"] for item in completeness.values())
    total_fields = sum(item["total"] for item in completeness.values())
    overall_percentage = (filled_fields / total_fields * 100) if total_fields > 0 else 0

    if overall_percentage >= 80:
        grade = "Excellent"
    elif overall_percentage >= 60:
        grade = "Good"
    elif overall_percentage >= 40:
        grade = "Fair"
    else:
        grade = "Poor"

    return {
        "by_field": completeness,
        "overall": {
            "filled": filled_fields,
            "total": total_fields,
            "percentage": overall_percentage
        },
        "quality_grade": grade
    }
201
+
202
+
203
def validate_structured_data_format(data: Dict, interviewee_type: str) -> Tuple[bool, List[str]]:
    """
    Validate that structured data has the expected format.

    Checks: data is a dict, the type-specific required fields are
    present, every value is a list, and not every list is empty.

    Args:
        data: Extracted structured data (field name -> list of values).
        interviewee_type: "HCP", "Patient", or "Other" (unknown types
            require no specific fields).

    Returns:
        (is_valid, list of issue descriptions); is_valid is True iff no
        issues were found.
    """
    issues = []

    if not isinstance(data, dict):
        return False, ["Data is not a dictionary"]

    # Required fields per interviewee type
    expected_fields = {
        "HCP": ["diagnoses", "prescriptions", "treatment_rationale"],
        "Patient": ["symptoms", "concerns", "treatment_response"],
        "Other": ["key_insights"]
    }
    required = expected_fields.get(interviewee_type, [])

    missing = [f for f in required if f not in data]
    if missing:
        issues.append(f"Missing expected fields: {', '.join(missing)}")

    # Every field value should be a list
    for key, value in data.items():
        if not isinstance(value, list):
            issues.append(f"Field '{key}' should be a list, got {type(value)}")

    # Flag the degenerate "everything empty" case.  BUGFIX: guard against
    # an empty dict, which previously satisfied len([]) == len({}) and was
    # wrongly reported as "All fields are empty lists" on top of the
    # missing-fields issue.
    empty_fields = [k for k, v in data.items() if isinstance(v, list) and not v]
    if data and len(empty_fields) == len(data):
        issues.append("All fields are empty lists")

    is_valid = len(issues) == 0

    return is_valid, issues
240
+
241
def validate_summary_quality(summary: str, num_transcripts: int) -> Tuple[float, List[str]]:
    """Check summary for rigor and accuracy.

    Penalizes missing quantification, vague or absolute language, missing
    consensus indicators, and overly short summaries.

    Args:
        summary: Cross-transcript summary text to assess.
        num_transcripts: Number of source transcripts (currently unused;
            kept for interface compatibility — TODO wire into
            quantification checks).

    Returns:
        (score clamped to >= 0.0, list of issue descriptions)
    """
    issues = []
    score = 1.0
    lowered = summary.lower()

    # Check for quantification (counts or percentages)
    if not re.search(r'\d+\s*(?:out of|of|participants|%)', summary):
        issues.append("No quantified findings (must include counts/percentages)")
        score -= 0.3

    # Check for vague claims.  BUGFIX: match whole words only — the old
    # substring test misfired on e.g. "almost" (contains "most") and
    # "something" (contains "some").
    vague_terms = ['many', 'most', 'some', 'several', 'often', 'frequently']
    if any(re.search(rf'\b{term}\b', lowered) for term in vague_terms):
        issues.append("Contains vague terms - should use specific numbers")
        score -= 0.2

    # Check for absolute claims (0.2 penalty per distinct term found)
    absolute_terms = ['all', 'everyone', 'nobody', 'never', 'always']
    for term in absolute_terms:
        if re.search(rf'\b{term}\b', lowered):
            issues.append(f"Absolute claim '{term}' found - likely overgeneralization")
            score -= 0.2

    # Check for evidence markers
    if 'consensus' not in lowered and 'majority' not in lowered:
        issues.append("Missing consensus indicators")
        score -= 0.1

    # Check length is substantial
    if len(summary) < 500:
        issues.append("Summary too brief for thorough analysis")
        score -= 0.2

    return max(0.0, score), issues