Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| from typing import Dict, List, Tuple | |
| from collections import Counter | |
def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load a transcript CSV and compute summary metadata about it.

    Args:
        csv_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(df, metadata)`` tuple. ``df`` is the parsed DataFrame and
        ``metadata`` is a dict with these keys:

        * ``"total_transcripts"`` -- number of rows in the file.
        * ``"avg_quality_score"`` -- mean of the ``"Quality Score"`` column,
          or ``None`` when the column is absent.
        * ``"avg_word_count"`` -- mean of the non-missing ``"Word Count"``
          values, or ``None`` when the column is absent.
        * ``"transcript_ids"`` -- the ``"Transcript ID"`` column as a list,
          or ``[]`` when the column is absent.

    Raises:
        ValueError: If a present ``"Quality Score"`` or ``"Word Count"`` cell
            holds a value that cannot be converted to a number.
    """
    df = pd.read_csv(csv_path)
    columns = df.columns

    avg_quality = None
    if "Quality Score" in columns:
        # mean() already skips NaN, so blank score cells are tolerated.
        avg_quality = df["Quality Score"].astype(float).mean()

    avg_words = None
    if "Word Count" in columns:
        # Blank cells are read as NaN, which astype(int) rejects outright;
        # drop them first so one missing count does not abort the whole parse.
        avg_words = df["Word Count"].dropna().astype(int).mean()

    transcript_ids = df["Transcript ID"].tolist() if "Transcript ID" in columns else []

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": avg_quality,
        "avg_word_count": avg_words,
        "transcript_ids": transcript_ids,
    }
    return df, metadata
def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Tally the most frequent semicolon-separated entries per theme column.

    The columns inspected depend on ``interviewee_type``: ``"HCP"`` and
    ``"Patient"`` each map to their own three columns, and any other value
    falls back to the single ``"Key Insights"`` column. Columns missing from
    ``df`` are skipped and do not appear in the result.

    Args:
        df: Frame whose theme columns hold ``;``-delimited strings.
        interviewee_type: Selects which set of columns to analyse.

    Returns:
        A dict mapping each analysed column name to a list of at most ten
        ``{"item": <text>, "count": <occurrences>}`` dicts, most frequent
        first.
    """
    columns_by_type = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Side Effects"],
    }
    selected_columns = columns_by_type.get(interviewee_type, ["Key Insights"])

    result = {}
    for column in selected_columns:
        if column not in df.columns:
            continue

        tally = Counter()
        for cell in df[column].dropna():
            # Non-string cells (e.g. numbers) carry no delimited entries.
            if not isinstance(cell, str):
                continue
            for piece in cell.split(';'):
                entry = piece.strip()
                if entry:
                    tally[entry] += 1

        result[column] = [
            {"item": entry, "count": occurrences}
            for entry, occurrences in tally.most_common(10)
        ]
    return result
def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Summarise the quality-score distribution and word counts of ``df``.

    Args:
        df: Frame that may contain ``"Quality Score"`` and/or ``"Word Count"``
            columns. Each section of the result is produced only when its
            column is present.

    Returns:
        A dict with up to two keys:

        * ``"quality"`` -- ``mean`` (float, NaN when there are no scores) plus
          bucket counts: ``excellent_count`` (> 0.8), ``good_count``
          (0.6 to 0.8 inclusive), ``fair_count`` (0.4 up to but excluding
          0.6) and ``poor_count`` (< 0.4). Missing scores fall in no bucket.
        * ``"word_count"`` -- ``mean`` (truncated to int, 0 when there are no
          counts) and ``total`` over the non-missing word counts.

        All counts and word-count figures are plain Python ``int`` values so
        the result can be serialised (e.g. with ``json``) without conversion.

    Raises:
        ValueError: If a present cell holds a value that cannot be converted
            to a number.
    """
    stats = {}

    if "Quality Score" in df.columns:
        scores = df["Quality Score"].astype(float)
        # Series.sum() on a boolean mask counts the True entries in one
        # vectorised pass; int() turns the numpy scalar into a Python int.
        stats["quality"] = {
            "mean": float(scores.mean()),
            "excellent_count": int((scores > 0.8).sum()),
            "good_count": int(((scores >= 0.6) & (scores <= 0.8)).sum()),
            "fair_count": int(((scores >= 0.4) & (scores < 0.6)).sum()),
            "poor_count": int((scores < 0.4).sum()),
        }

    if "Word Count" in df.columns:
        # Missing cells are NaN and cannot be cast to int, so exclude them.
        words = df["Word Count"].dropna().astype(int)
        stats["word_count"] = {
            # mean() of an empty Series is NaN, which int() cannot convert.
            "mean": int(words.mean()) if not words.empty else 0,
            "total": int(words.sum()),
        }

    return stats
def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    """Parse a transcript CSV and bundle every derived artefact in one dict.

    Loads the CSV with ``parse_csv_output``, then derives themes with
    ``extract_key_themes`` and statistics with ``calculate_statistics``.

    Args:
        csv_path: Location of the CSV file to load.
        summary_path: Accepted for interface compatibility; this function
            does not read it.
        interviewee_type: Forwarded to ``extract_key_themes`` and echoed back
            in the result.

    Returns:
        A dict with the keys ``"dataframe"``, ``"metadata"``, ``"themes"``,
        ``"statistics"`` and ``"interviewee_type"``.
    """
    frame, meta = parse_csv_output(csv_path)

    report = {"dataframe": frame, "metadata": meta}
    report["themes"] = extract_key_themes(frame, interviewee_type)
    report["statistics"] = calculate_statistics(frame)
    report["interviewee_type"] = interviewee_type
    return report