"""Parse transcriptor CSV output into metadata, key themes and statistics."""

import re
from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd

# Columns holding ';'-separated theme lists, keyed by interviewee type.
_THEME_COLUMNS = {
    "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
    "Patient": ["Primary Symptoms", "Main Concerns", "Side Effects"],
}
# Used for any interviewee type that is not listed in _THEME_COLUMNS.
_DEFAULT_THEME_COLUMNS = ["Key Insights"]


def _word_counts(df: pd.DataFrame) -> pd.Series:
    """Return the "Word Count" column as integers, skipping missing cells."""
    # A blank CSV cell is read as NaN and NaN cannot be cast to int, so
    # missing values are dropped before the cast instead of crashing on it.
    return df["Word Count"].dropna().astype(int)


def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load the transcriptor CSV and compute summary metadata.

    Args:
        csv_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(dataframe, metadata)`` tuple. ``metadata`` has the keys
        ``total_transcripts``, ``avg_quality_score``, ``avg_word_count``
        and ``transcript_ids``. An average is ``None`` when its column is
        absent; ``transcript_ids`` is ``[]`` when "Transcript ID" is absent.
        Rows with a missing word count are ignored by ``avg_word_count``.
    """
    df = pd.read_csv(csv_path)
    columns = df.columns

    avg_quality = None
    if "Quality Score" in columns:
        avg_quality = df["Quality Score"].astype(float).mean()

    avg_words = None
    if "Word Count" in columns:
        avg_words = _word_counts(df).mean()

    transcript_ids = []
    if "Transcript ID" in columns:
        transcript_ids = df["Transcript ID"].tolist()

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": avg_quality,
        "avg_word_count": avg_words,
        "transcript_ids": transcript_ids,
    }
    return df, metadata


def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Count the most frequent ';'-separated items in the theme columns.

    Args:
        df: Parsed transcript table.
        interviewee_type: "HCP" or "Patient" select their own column sets;
            any other value falls back to the "Key Insights" column.

    Returns:
        A dict mapping each theme column present in ``df`` to a list of at
        most ten ``{"item": ..., "count": ...}`` dicts, most common first.
        Non-string cells and empty items are ignored.
    """
    theme_columns = _THEME_COLUMNS.get(interviewee_type, _DEFAULT_THEME_COLUMNS)

    themes = {}
    for column in theme_columns:
        if column not in df.columns:
            continue
        stripped_items = (
            piece.strip()
            for cell in df[column].dropna()
            if isinstance(cell, str)
            for piece in cell.split(";")
        )
        # Drop pieces that were empty or whitespace-only before stripping.
        counts = Counter(item for item in stripped_items if item)
        themes[column] = [
            {"item": item, "count": count}
            for item, count in counts.most_common(10)
        ]
    return themes


def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Summarise quality scores and word counts.

    Args:
        df: Parsed transcript table.

    Returns:
        A dict that contains ``"quality"`` when the "Quality Score" column
        exists (mean plus counts for the bands excellent ``> 0.8``, good
        ``0.6..0.8`` inclusive, fair ``0.4..<0.6`` and poor ``< 0.4``) and
        ``"word_count"`` when the "Word Count" column exists (integer mean
        and total). Counts are plain ``int`` values. With no usable word
        counts the mean and total are both ``0``.
    """
    stats = {}

    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        # Vectorised .sum() on the boolean masks; int() turns the numpy
        # scalar into a plain Python int.
        stats["quality"] = {
            "mean": scores.mean(),
            "excellent_count": int((scores > 0.8).sum()),
            "good_count": int(((scores >= 0.6) & (scores <= 0.8)).sum()),
            "fair_count": int(((scores >= 0.4) & (scores < 0.6)).sum()),
            "poor_count": int((scores < 0.4).sum()),
        }

    if "Word Count" in df.columns:
        words = _word_counts(df)
        # The mean of an empty series is NaN, which int() rejects.
        mean_words = 0 if words.empty else int(words.mean())
        stats["word_count"] = {"mean": mean_words, "total": int(words.sum())}

    return stats


def parse_transcriptor_output(
    csv_path: str,
    summary_path: Optional[str] = None,
    interviewee_type: str = "Patient",
) -> Dict:
    """Run the full parsing pipeline on a transcriptor CSV file.

    Args:
        csv_path: Location of the CSV file to load.
        summary_path: Currently unused; kept so existing callers that pass
            it keep working.
        interviewee_type: Selects which theme columns are analysed (see
            ``extract_key_themes``).

    Returns:
        A dict with the keys ``dataframe``, ``metadata``, ``themes``,
        ``statistics`` and ``interviewee_type``.
    """
    df, metadata = parse_csv_output(csv_path)
    return {
        "dataframe": df,
        "metadata": metadata,
        "themes": extract_key_themes(df, interviewee_type),
        "statistics": calculate_statistics(df),
        "interviewee_type": interviewee_type,
    }