File size: 4,888 Bytes
acaf471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Parse multilingual pain descriptor data from xlsx
Generate Python dictionaries for each language
"""
import json
import os
import re

import pandas as pd

def categorize_pain_type(english_word):
    """Categorize a pain descriptor as 'neuropathic', 'nociceptive', or 'affective'.

    Matching is case-insensitive and anchored at a word start, so a keyword
    may match as a prefix of a longer word ('numb' matches 'numbness') but
    not as an interior substring. This fixes misclassifications from plain
    substring matching, e.g. 'exhausting' containing 'sting' (neuropathic)
    or 'depressing' containing 'pressing' (nociceptive).

    Args:
        english_word: English pain descriptor, possibly multi-word.

    Returns:
        One of 'neuropathic', 'nociceptive', 'affective'. Unrecognized
        descriptors default to 'nociceptive'.
    """
    neuropathic_keywords = [
        'sharp', 'shooting', 'burning', 'tingling', 'numb', 'electric',
        'stabbing', 'pricking', 'shock', 'sting', 'piercing', 'needle'
    ]

    nociceptive_keywords = [
        'aching', 'sore', 'throbbing', 'cramping', 'pressing', 'dull',
        'heavy', 'tight', 'tender', 'stiff', 'pulling', 'squeezing'
    ]

    affective_keywords = [
        'exhausting', 'tiring', 'unbearable', 'miserable', 'annoying',
        'troublesome', 'depressing', 'frustrating', 'worrying', 'frightening'
    ]

    word_lower = english_word.lower()

    def _matches(keywords):
        # \b anchors the keyword at a word start; the end is left open so
        # inflected forms ('numbness', 'shocking') still match.
        return any(re.search(r'\b' + re.escape(kw), word_lower)
                   for kw in keywords)

    if _matches(neuropathic_keywords):
        return 'neuropathic'
    elif _matches(nociceptive_keywords):
        return 'nociceptive'
    elif _matches(affective_keywords):
        return 'affective'
    else:
        return 'nociceptive'  # Default to nociceptive

def parse_sheet(xlsx_path, sheet_name, english_col, foreign_col):
    """Parse one translation sheet into category-keyed dictionaries.

    Args:
        xlsx_path: Path to the workbook.
        sheet_name: Sheet to read. 'ko-en' is special-cased: it has no
            header row, so columns are read positionally (0 = Korean,
            1 = English).
        english_col: Name of the English column (ignored for 'ko-en').
        foreign_col: Name of the foreign-language column (ignored for 'ko-en').

    Returns:
        {'neuropathic': {...}, 'nociceptive': {...}, 'affective': {...}}
        where each inner dict maps foreign term ->
        {'english': ..., 'mcgill_dimension': 'sensory'}.
        Returns empty category dicts (with a console warning) when the
        expected columns are missing.
    """
    if sheet_name == 'ko-en':
        # Korean sheet has data in its first row, not headers; read raw
        # exactly once (previously the sheet was read twice, discarding
        # the first result) and name the columns ourselves.
        df = pd.read_excel(xlsx_path, sheet_name=sheet_name, header=None)
        if len(df.columns) < 2:
            # Guard: the positional rename below needs at least 2 columns.
            print(f"⚠️  Warning: Expected columns not found in {sheet_name}")
            print(f"    Available columns: {list(df.columns)}")
            return {'neuropathic': {}, 'nociceptive': {}, 'affective': {}}
        df.columns = ['Korean', 'English'] + [f'Col{i}' for i in range(len(df.columns) - 2)]
        english_col = 'English'
        foreign_col = 'Korean'
    else:
        df = pd.read_excel(xlsx_path, sheet_name=sheet_name)

    if english_col not in df.columns or foreign_col not in df.columns:
        print(f"⚠️  Warning: Expected columns not found in {sheet_name}")
        print(f"    Available columns: {list(df.columns)}")
        return {'neuropathic': {}, 'nociceptive': {}, 'affective': {}}

    # Drop rows missing either translation.
    df = df.dropna(subset=[english_col, foreign_col])

    pain_dict = {
        'neuropathic': {},
        'nociceptive': {},
        'affective': {}
    }

    for _, row in df.iterrows():
        english = str(row[english_col]).strip()
        foreign = str(row[foreign_col]).strip()

        # Skip empty entries and NaNs that survived as the string 'nan'.
        if not english or not foreign or english == 'nan' or foreign == 'nan':
            continue

        category = categorize_pain_type(english)

        # Keyed by foreign term; no snomed_ct mapping at this stage.
        pain_dict[category][foreign] = {
            'english': english,
            'mcgill_dimension': 'sensory'  # Default, can be refined
        }

    return pain_dict

def main(xlsx_path=None):
    """Parse every language sheet, print statistics, and dump results to JSON.

    Args:
        xlsx_path: Optional path to the questionnaire workbook. When omitted,
            falls back to the original hard-coded project location (kept for
            backward compatibility; pass a path to run elsewhere).

    Returns:
        dict mapping language name -> category dict from parse_sheet().
    """
    if xlsx_path is None:
        # Historical default; override via the xlsx_path argument on
        # machines with a different layout.
        xlsx_path = r'c:\Users\ChaCha ship\Documents\Github\PainReport\Backend\data\questionnaire_form.xlsx'

    # Format: language -> (sheet_name, english_column, foreign_column)
    languages = {
        'chinese': ('cn-en', 'English', 'Chinese'),
        'korean': ('ko-en', 'English', 'Korean'),  # Will be handled specially
        'spanish': ('es-en', 'English', 'Spanish'),
        'hmong': ('hmong-en', 'English pain words', 'Hmong pain words')
    }

    results = {}

    for lang_name, (sheet_name, english_col, foreign_col) in languages.items():
        print(f"\n{'='*60}")
        print(f"Parsing {lang_name.upper()} ({sheet_name})")
        print(f"{'='*60}")

        pain_dict = parse_sheet(xlsx_path, sheet_name, english_col, foreign_col)
        results[lang_name] = pain_dict

        # Per-language statistics.
        total = sum(len(pain_dict[cat]) for cat in pain_dict)
        print(f"Total terms: {total}")
        print(f"  - Neuropathic: {len(pain_dict['neuropathic'])}")
        print(f"  - Nociceptive: {len(pain_dict['nociceptive'])}")
        print(f"  - Affective: {len(pain_dict['affective'])}")

        # Show a few neuropathic samples as a sanity check.
        if len(pain_dict['neuropathic']) > 0:
            print(f"\nSample neuropathic terms:")
            for i, (foreign, data) in enumerate(list(pain_dict['neuropathic'].items())[:3]):
                print(f"  {foreign} -> {data['english']}")

    # Save next to this script for inspection.
    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    output_path = os.path.join(scripts_dir, 'multilingual_pain_data.json')

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\n\n✅ Multilingual data saved to: {output_path}")

    return results

if __name__ == '__main__':
    main()