TranscriptWriting / report_parser.py
jmisak's picture
Upload 23 files
54c99ad verified
raw
history blame
2.5 kB
import pandas as pd
import re
from typing import Dict, List, Tuple
from collections import Counter
def parse_csv_output(csv_path: str) -> Tuple[pd.DataFrame, Dict]:
    """Load a transcript CSV and build summary metadata for it.

    Args:
        csv_path: Location of the CSV file; passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(df, metadata)`` tuple. ``df`` is the frame exactly as read.
        ``metadata`` contains:

        * ``total_transcripts`` -- number of rows in the frame.
        * ``avg_quality_score`` -- mean of the "Quality Score" column, or
          ``None`` when the column is absent.
        * ``avg_word_count`` -- mean of the "Word Count" column, or ``None``
          when the column is absent.
        * ``transcript_ids`` -- values of the "Transcript ID" column as a
          list, or ``[]`` when the column is absent.
    """
    df = pd.read_csv(csv_path)

    def _column_mean(column: str):
        """Return the numeric mean of *column*, or None if it is missing."""
        if column not in df.columns:
            return None
        # Blank cells arrive as NaN and stray text cannot be cast; a plain
        # astype(int)/astype(float) raises on either. Coercing turns them
        # into NaN, which mean() skips, so one bad row does not abort the
        # whole parse.
        return pd.to_numeric(df[column], errors="coerce").mean()

    metadata = {
        "total_transcripts": len(df),
        "avg_quality_score": _column_mean("Quality Score"),
        "avg_word_count": _column_mean("Word Count"),
        "transcript_ids": (
            df["Transcript ID"].tolist() if "Transcript ID" in df.columns else []
        ),
    }
    return df, metadata
def extract_key_themes(df: pd.DataFrame, interviewee_type: str) -> Dict[str, List]:
    """Tally the most frequent ';'-separated entries in the theme columns.

    The columns inspected depend on ``interviewee_type``: "HCP" and
    "Patient" each have their own set, and any other value falls back to
    the single "Key Insights" column. Columns missing from ``df`` are
    skipped.

    Args:
        df: Frame holding the theme columns.
        interviewee_type: Selects which columns are inspected.

    Returns:
        A dict mapping each inspected column that exists in ``df`` to a
        list of up to ten ``{"item": ..., "count": ...}`` dicts, most
        frequent first.
    """
    columns_by_type = {
        "HCP": ["Diagnoses", "Prescriptions", "Treatment Strategies"],
        "Patient": ["Primary Symptoms", "Main Concerns", "Side Effects"],
    }
    wanted_columns = columns_by_type.get(interviewee_type, ["Key Insights"])

    result = {}
    for column in wanted_columns:
        if column not in df.columns:
            continue

        tally = Counter()
        for cell in df[column].dropna():
            # Only string cells carry ';'-separated entries; anything else
            # is ignored.
            if not isinstance(cell, str):
                continue
            pieces = (part.strip() for part in cell.split(';'))
            tally.update(piece for piece in pieces if piece)

        result[column] = [
            {"item": entry, "count": occurrences}
            for entry, occurrences in tally.most_common(10)
        ]
    return result
def calculate_statistics(df: pd.DataFrame) -> Dict:
    """Compute quality-score and word-count statistics for a transcript frame.

    Args:
        df: Frame that may contain "Quality Score" and/or "Word Count"
            columns.

    Returns:
        A dict with up to two keys, each present only when its source
        column exists:

        * ``"quality"`` -- the mean score plus how many scores fall in each
          band: excellent (> 0.8), good (0.6 to 0.8 inclusive), fair
          (0.4 up to but excluding 0.6) and poor (< 0.4). Cells that are
          blank or not numeric are counted in no band.
        * ``"word_count"`` -- integer ``mean`` and ``total`` over the
          usable cells; both are 0 when there are none.
    """
    stats = {}

    if "Quality Score" in df.columns:
        # Coerce instead of astype(float) so a stray text cell becomes NaN
        # rather than raising; NaN compares False against every band.
        scores = pd.to_numeric(df["Quality Score"], errors="coerce")
        # Vectorised .sum() cast to int gives plain Python ints rather than
        # the numpy scalars produced by built-in sum() over a Series.
        stats["quality"] = {
            "mean": scores.mean(),
            "excellent_count": int((scores > 0.8).sum()),
            "good_count": int(((scores >= 0.6) & (scores <= 0.8)).sum()),
            "fair_count": int(((scores >= 0.4) & (scores < 0.6)).sum()),
            "poor_count": int((scores < 0.4).sum()),
        }

    if "Word Count" in df.columns:
        # astype(int) raises on blank (NaN) cells, so coerce and drop them.
        words = pd.to_numeric(df["Word Count"], errors="coerce").dropna()
        stats["word_count"] = {
            # mean() of nothing is NaN, which int() cannot convert.
            "mean": int(words.mean()) if not words.empty else 0,
            "total": int(words.sum()),
        }

    return stats
def parse_transcriptor_output(csv_path: str, summary_path: str = None, interviewee_type: str = "Patient") -> Dict:
    """Parse a transcript CSV and bundle every derived artefact in one dict.

    Args:
        csv_path: CSV file handed to ``parse_csv_output``.
        summary_path: Accepted for interface compatibility; this function
            does not read it.
        interviewee_type: Forwarded to ``extract_key_themes`` and echoed
            back in the result.

    Returns:
        A dict with the keys ``"dataframe"``, ``"metadata"``, ``"themes"``,
        ``"statistics"`` and ``"interviewee_type"``.
    """
    frame, meta = parse_csv_output(csv_path)

    bundle = {"dataframe": frame, "metadata": meta}
    bundle["themes"] = extract_key_themes(frame, interviewee_type)
    bundle["statistics"] = calculate_statistics(frame)
    bundle["interviewee_type"] = interviewee_type
    return bundle