File size: 2,496 Bytes
54c99ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
import re
from typing import Dict, List, Tuple
from collections import Counter

def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load a transcript CSV and compute summary metadata.

    Args:
        csv_path: Path of the CSV file to read with ``pandas.read_csv``.

    Returns:
        A ``(df, metadata)`` tuple. ``metadata`` contains:

        * ``total_transcripts``: number of rows in the file.
        * ``avg_quality_score``: mean of the "Quality Score" column as
          floats, or ``None`` when the column is absent.
        * ``avg_word_count``: mean of the "Word Count" column after an
          integer cast, or ``None`` when the column is absent. Missing
          cells are ignored.
        * ``transcript_ids``: values of the "Transcript ID" column, or an
          empty list when the column is absent.
    """
    df = pd.read_csv(csv_path)

    avg_quality_score = None
    if "Quality Score" in df.columns:
        # Series.mean() skips NaN, so blank quality cells are ignored.
        avg_quality_score = df["Quality Score"].astype(float).mean()

    avg_word_count = None
    if "Word Count" in df.columns:
        # Blank cells are loaded as NaN, and NaN cannot be cast to int;
        # drop them first so one missing value does not abort the parse.
        avg_word_count = df["Word Count"].dropna().astype(int).mean()

    transcript_ids = []
    if "Transcript ID" in df.columns:
        transcript_ids = df["Transcript ID"].tolist()

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": avg_quality_score,
        "avg_word_count": avg_word_count,
        "transcript_ids": transcript_ids,
    }
    return df, metadata

def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Tally the most frequent semicolon-separated entries per theme column.

    The columns inspected depend on ``interviewee_type``: "HCP" and
    "Patient" each have their own set of three columns, and any other value
    falls back to the single "Key Insights" column. Columns missing from
    ``df`` are skipped.

    Args:
        df: Frame whose theme columns hold ``;``-delimited strings.
        interviewee_type: Selects which theme columns are examined.

    Returns:
        A dict mapping each column found to a list of up to ten
        ``{"item": ..., "count": ...}`` dicts, most frequent first.
    """
    columns_by_type = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Side Effects"],
    }
    wanted_columns = columns_by_type.get(interviewee_type, ["Key Insights"])

    result = {}
    for column in wanted_columns:
        if column not in df.columns:
            continue

        tally = Counter()
        for cell in df[column].dropna():
            # Only text cells can be split; other values are ignored.
            if not isinstance(cell, str):
                continue
            pieces = (raw.strip() for raw in cell.split(';'))
            tally.update(piece for piece in pieces if piece)

        result[column] = [
            {"item": label, "count": occurrences}
            for label, occurrences in tally.most_common(10)
        ]
    return result

def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Compute quality-score buckets and word-count totals for a frame.

    Args:
        df: Frame that may contain "Quality Score" and/or "Word Count"
            columns; each section is produced only if its column exists.

    Returns:
        A dict with up to two keys:

        * ``"quality"``: the mean score plus counts per band —
          excellent (> 0.8), good (0.6 to 0.8 inclusive), fair (0.4 up to
          but excluding 0.6) and poor (< 0.4). Missing scores fall into no
          band.
        * ``"word_count"``: integer ``mean`` and ``total`` of the non-missing
          word counts; both are 0 when there are none.
    """
    stats = {}

    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        # Vectorized .sum() on the boolean masks avoids a Python-level loop,
        # and int() yields plain ints rather than numpy scalars so the
        # result can be serialized (e.g. with json) without extra handling.
        stats["quality"] = {
            "mean": scores.mean(),
            "excellent_count": int((scores > 0.8).sum()),
            "good_count": int(((scores >= 0.6) & (scores <= 0.8)).sum()),
            "fair_count": int(((scores >= 0.4) & (scores < 0.6)).sum()),
            "poor_count": int((scores < 0.4).sum()),
        }

    if "Word Count" in df.columns:
        # NaN cannot be cast to int, so missing word counts are dropped first.
        words = df["Word Count"].dropna().astype(int)
        if words.empty:
            # The mean of an empty series is NaN and int(NaN) raises; report
            # zeros instead of failing on an empty input.
            stats["word_count"] = {"mean": 0, "total": 0}
        else:
            stats["word_count"] = {
                "mean": int(words.mean()),
                "total": int(words.sum()),
            }

    return stats

def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    """Run the full parsing pipeline on one CSV file.

    Loads the CSV via ``parse_csv_output``, then derives themes with
    ``extract_key_themes`` and statistics with ``calculate_statistics``.

    Args:
        csv_path: Path of the CSV file to load.
        summary_path: Accepted for interface compatibility; this function
            does not read it.
        interviewee_type: Passed to ``extract_key_themes`` and echoed back
            in the result.

    Returns:
        A dict with the keys "dataframe", "metadata", "themes",
        "statistics" and "interviewee_type".
    """
    frame, meta = parse_csv_output(csv_path)

    report = {"dataframe": frame, "metadata": meta}
    report["themes"] = extract_key_themes(frame, interviewee_type)
    report["statistics"] = calculate_statistics(frame)
    report["interviewee_type"] = interviewee_type
    return report