""" Parse multilingual pain descriptor data from xlsx Generate Python dictionaries for each language """ import pandas as pd import json def categorize_pain_type(english_word): """Categorize pain descriptor into neuropathic, nociceptive, or affective""" neuropathic_keywords = [ 'sharp', 'shooting', 'burning', 'tingling', 'numb', 'electric', 'stabbing', 'pricking', 'shock', 'sting', 'piercing', 'needle' ] nociceptive_keywords = [ 'aching', 'sore', 'throbbing', 'cramping', 'pressing', 'dull', 'heavy', 'tight', 'tender', 'stiff', 'pulling', 'squeezing' ] affective_keywords = [ 'exhausting', 'tiring', 'unbearable', 'miserable', 'annoying', 'troublesome', 'depressing', 'frustrating', 'worrying', 'frightening' ] word_lower = english_word.lower() # Check each category if any(kw in word_lower for kw in neuropathic_keywords): return 'neuropathic' elif any(kw in word_lower for kw in nociceptive_keywords): return 'nociceptive' elif any(kw in word_lower for kw in affective_keywords): return 'affective' else: return 'nociceptive' # Default to nociceptive def parse_sheet(xlsx_path, sheet_name, english_col, foreign_col): """Parse a specific sheet and return structured data""" df = pd.read_excel(xlsx_path, sheet_name=sheet_name) # Special handling for Korean sheet (header is in first row) if sheet_name == 'ko-en': # First row contains data, not headers # Read without header df = pd.read_excel(xlsx_path, sheet_name=sheet_name, header=None) # Assume column 0 is Korean, column 1 is English df.columns = ['Korean', 'English'] + [f'Col{i}' for i in range(len(df.columns) - 2)] english_col = 'English' foreign_col = 'Korean' # Remove rows with NaN in critical columns if english_col not in df.columns or foreign_col not in df.columns: print(f"⚠️ Warning: Expected columns not found in {sheet_name}") print(f" Available columns: {list(df.columns)}") return {'neuropathic': {}, 'nociceptive': {}, 'affective': {}} df = df.dropna(subset=[english_col, foreign_col]) pain_dict = { 'neuropathic': {}, 'nociceptive': {}, 'affective': {} } for _, row in df.iterrows(): english = str(row[english_col]).strip() foreign = str(row[foreign_col]).strip() # Skip empty or invalid entries if not english or not foreign or english == 'nan' or foreign == 'nan': continue # Categorize category = categorize_pain_type(english) # Add to dictionary (without snomed_ct) pain_dict[category][foreign] = { 'english': english, 'mcgill_dimension': 'sensory' # Default, can be refined } return pain_dict def main(): xlsx_path = r'c:\Users\ChaCha ship\Documents\Github\PainReport\Backend\data\questionnaire_form.xlsx' # Parse each language sheet with correct column names # Format: (sheet_name, english_column, foreign_column) languages = { 'chinese': ('cn-en', 'English', 'Chinese'), 'korean': ('ko-en', 'English', 'Korean'), # Will be handled specially 'spanish': ('es-en', 'English', 'Spanish'), 'hmong': ('hmong-en', 'English pain words', 'Hmong pain words') } results = {} for lang_name, (sheet_name, english_col, foreign_col) in languages.items(): print(f"\n{'='*60}") print(f"Parsing {lang_name.upper()} ({sheet_name})") print(f"{'='*60}") pain_dict = parse_sheet(xlsx_path, sheet_name, english_col, foreign_col) results[lang_name] = pain_dict # Print statistics total = sum(len(pain_dict[cat]) for cat in pain_dict) print(f"Total terms: {total}") print(f" - Neuropathic: {len(pain_dict['neuropathic'])}") print(f" - Nociceptive: {len(pain_dict['nociceptive'])}") print(f" - Affective: {len(pain_dict['affective'])}") # Show samples if len(pain_dict['neuropathic']) > 0: print(f"\nSample neuropathic terms:") for i, (foreign, data) in enumerate(list(pain_dict['neuropathic'].items())[:3]): print(f" {foreign} -> {data['english']}") # Save to JSON for inspection import os scripts_dir = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.join(scripts_dir, 'multilingual_pain_data.json') with open(output_path, 'w', encoding='utf-8') as f: json.dump(results, f, ensure_ascii=False, indent=2) print(f"\n\n✅ Multilingual data saved to: {output_path}") return results if __name__ == '__main__': main()