Spaces:
Sleeping
Sleeping
File size: 2,496 Bytes
54c99ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import pandas as pd
import re
from typing import Dict, List, Tuple
from collections import Counter
def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load a transcript CSV and compute summary metadata for it.

    Args:
        csv_path: Location of the CSV file; handed directly to
            ``pd.read_csv``.

    Returns:
        A ``(df, metadata)`` tuple. ``metadata`` contains:

        * ``total_transcripts`` -- number of rows in the frame.
        * ``avg_quality_score`` -- mean of the "Quality Score" column cast
          to float, or ``None`` when the column is absent.
        * ``avg_word_count`` -- mean of the non-missing "Word Count" values
          cast to int, or ``None`` when the column is absent (NaN when the
          column exists but holds no values).
        * ``transcript_ids`` -- the "Transcript ID" column as a list, or an
          empty list when the column is absent.

    Raises:
        ValueError: If a present score/count cell cannot be converted to a
            number.
    """
    df = pd.read_csv(csv_path)
    columns = df.columns

    avg_quality = None
    if "Quality Score" in columns:
        # Series.mean() skips NaN, so missing scores need no special handling.
        avg_quality = df["Quality Score"].astype(float).mean()

    avg_words = None
    if "Word Count" in columns:
        # Blank cells are loaded as NaN, and NaN cannot be cast to int;
        # drop them first so one missing count does not abort the parse.
        avg_words = df["Word Count"].dropna().astype(int).mean()

    transcript_ids = []
    if "Transcript ID" in columns:
        transcript_ids = df["Transcript ID"].tolist()

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": avg_quality,
        "avg_word_count": avg_words,
        "transcript_ids": transcript_ids,
    }
    return df, metadata
def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Rank the most frequent semicolon-separated items in the theme columns.

    The columns inspected depend on ``interviewee_type``: "HCP" and
    "Patient" each have their own set, and any other value falls back to
    the single "Key Insights" column. Columns missing from ``df`` are
    skipped.

    Args:
        df: Frame whose theme columns hold ';'-delimited strings.
        interviewee_type: Selects which set of columns is examined.

    Returns:
        A dict mapping each examined column that exists in ``df`` to a list
        of at most ten ``{"item": ..., "count": ...}`` dicts, most frequent
        first.
    """
    known_layouts = (
        ("HCP", ["Diagnoses", "Prescriptions", "Treatment Strategies"]),
        ("Patient", ["Primary Symptoms", "Main Concerns", "Side Effects"]),
    )
    wanted_columns = next(
        (names for label, names in known_layouts if interviewee_type == label),
        ["Key Insights"],
    )

    ranked = {}
    for column in wanted_columns:
        if column not in df.columns:
            continue
        tally = Counter()
        for cell in df[column].dropna():
            # Only string cells can be split; anything else is ignored.
            if not isinstance(cell, str):
                continue
            pieces = (piece.strip() for piece in cell.split(';'))
            # Whitespace-only fragments are discarded.
            tally.update(piece for piece in pieces if piece)
        ranked[column] = [
            {"item": item, "count": count}
            for item, count in tally.most_common(10)
        ]
    return ranked
def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Compute quality-score buckets and word-count totals for a frame.

    Args:
        df: Frame that may contain "Quality Score" and/or "Word Count"
            columns.

    Returns:
        A dict with up to two entries, each present only when its source
        column exists:

        * ``"quality"`` -- the mean score plus the number of scores that
          are excellent (> 0.8), good (0.6 to 0.8 inclusive), fair
          (0.4 up to but excluding 0.6) and poor (< 0.4). Missing scores
          fall into no bucket.
        * ``"word_count"`` -- integer ``mean`` and ``total`` of the
          non-missing word counts; both are 0 when there are none.
    """
    stats = {}

    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        # Vectorised .sum() on the boolean masks; int() yields plain Python
        # integers rather than NumPy scalars.
        stats["quality"] = {
            "mean": scores.mean(),
            "excellent_count": int((scores > 0.8).sum()),
            "good_count": int(((scores >= 0.6) & (scores <= 0.8)).sum()),
            "fair_count": int(((scores >= 0.4) & (scores < 0.6)).sum()),
            "poor_count": int((scores < 0.4).sum()),
        }

    if "Word Count" in df.columns:
        # NaN cannot be cast to int, so missing counts are dropped first.
        words = df["Word Count"].dropna().astype(int)
        stats["word_count"] = {
            # The mean of an empty series is NaN, which int() rejects.
            "mean": 0 if words.empty else int(words.mean()),
            "total": int(words.sum()),
        }

    return stats
def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    """Parse a transcript CSV and bundle the frame with derived summaries.

    Args:
        csv_path: CSV file to load via ``parse_csv_output``.
        summary_path: Accepted for interface compatibility; this function
            does not read it.
        interviewee_type: Forwarded to ``extract_key_themes`` and echoed
            back in the result.

    Returns:
        A dict with the keys ``"dataframe"``, ``"metadata"``, ``"themes"``,
        ``"statistics"`` and ``"interviewee_type"``.
    """
    frame, meta = parse_csv_output(csv_path)

    result = {"dataframe": frame, "metadata": meta}
    result["themes"] = extract_key_themes(frame, interviewee_type)
    result["statistics"] = calculate_statistics(frame)
    result["interviewee_type"] = interviewee_type
    return result