"""
Parse multilingual pain descriptor data from xlsx
Generate Python dictionaries for each language
"""
import pandas as pd
import json
def categorize_pain_type(english_word):
    """Categorize a pain descriptor as 'neuropathic', 'nociceptive', or 'affective'.

    Args:
        english_word: English pain descriptor (single word or short phrase,
            e.g. 'burning', 'aching pain'). Case-insensitive.

    Returns:
        One of 'neuropathic', 'nociceptive', or 'affective'. Unrecognized
        words default to 'nociceptive'.
    """
    neuropathic_keywords = [
        'sharp', 'shooting', 'burning', 'tingling', 'numb', 'electric',
        'stabbing', 'pricking', 'shock', 'sting', 'piercing', 'needle'
    ]
    nociceptive_keywords = [
        'aching', 'sore', 'throbbing', 'cramping', 'pressing', 'dull',
        'heavy', 'tight', 'tender', 'stiff', 'pulling', 'squeezing'
    ]
    affective_keywords = [
        'exhausting', 'tiring', 'unbearable', 'miserable', 'annoying',
        'troublesome', 'depressing', 'frustrating', 'worrying', 'frightening'
    ]
    categories = [
        ('neuropathic', neuropathic_keywords),
        ('nociceptive', nociceptive_keywords),
        ('affective', affective_keywords),
    ]
    word_lower = english_word.lower()
    # Exact-match pass first: substring matching alone misclassifies several
    # affective keywords ('exhausting' contains 'sting' -> neuropathic;
    # 'depressing' contains 'pressing' -> nociceptive), so those categories
    # would otherwise be unreachable for them.
    for category, keywords in categories:
        if word_lower in keywords:
            return category
    # Substring fallback handles compound phrases like 'burning pain'.
    for category, keywords in categories:
        if any(kw in word_lower for kw in keywords):
            return category
    return 'nociceptive'  # Default to nociceptive
def parse_sheet(xlsx_path, sheet_name, english_col, foreign_col):
    """Parse one translation sheet and return categorized pain descriptors.

    Args:
        xlsx_path: Path to the workbook.
        sheet_name: Name of the sheet to read (e.g. 'cn-en').
        english_col: Column holding the English descriptor.
        foreign_col: Column holding the translated descriptor.

    Returns:
        Dict with keys 'neuropathic', 'nociceptive', 'affective'; each value
        maps the foreign-language term to {'english': ..., 'mcgill_dimension': ...}.
        Returns an all-empty dict if the expected columns are missing.
    """
    if sheet_name == 'ko-en':
        # The Korean sheet carries data in its first row instead of headers,
        # so read it once positionally (previously it was read twice: with
        # inferred headers, then again with header=None).
        df = pd.read_excel(xlsx_path, sheet_name=sheet_name, header=None)
        # Assume column 0 is Korean, column 1 is English -- TODO confirm
        # against the workbook layout.
        df.columns = ['Korean', 'English'] + [f'Col{i}' for i in range(len(df.columns) - 2)]
        english_col = 'English'
        foreign_col = 'Korean'
    else:
        df = pd.read_excel(xlsx_path, sheet_name=sheet_name)
    # Bail out gracefully (empty result) rather than raising KeyError below.
    if english_col not in df.columns or foreign_col not in df.columns:
        print(f"⚠️ Warning: Expected columns not found in {sheet_name}")
        print(f" Available columns: {list(df.columns)}")
        return {'neuropathic': {}, 'nociceptive': {}, 'affective': {}}
    # Remove rows with NaN in critical columns
    df = df.dropna(subset=[english_col, foreign_col])
    pain_dict = {
        'neuropathic': {},
        'nociceptive': {},
        'affective': {}
    }
    for _, row in df.iterrows():
        english = str(row[english_col]).strip()
        foreign = str(row[foreign_col]).strip()
        # Skip empty entries and the string 'nan' produced by str(NaN)
        if not english or not foreign or english == 'nan' or foreign == 'nan':
            continue
        category = categorize_pain_type(english)
        # Add to dictionary (without snomed_ct)
        pain_dict[category][foreign] = {
            'english': english,
            'mcgill_dimension': 'sensory'  # Default, can be refined
        }
    return pain_dict
def main(xlsx_path=r'c:\Users\ChaCha ship\Documents\Github\PainReport\Backend\data\questionnaire_form.xlsx'):
    """Parse every language sheet, print statistics, and dump the result to JSON.

    Args:
        xlsx_path: Path to the questionnaire workbook. Defaults to the
            original hard-coded location for backward compatibility; pass a
            different path to run against another copy of the workbook.

    Returns:
        Dict mapping language name -> categorized pain dictionary
        (see parse_sheet).
    """
    import os

    # Format: (sheet_name, english_column, foreign_column)
    languages = {
        'chinese': ('cn-en', 'English', 'Chinese'),
        'korean': ('ko-en', 'English', 'Korean'),  # Headerless sheet; handled in parse_sheet
        'spanish': ('es-en', 'English', 'Spanish'),
        'hmong': ('hmong-en', 'English pain words', 'Hmong pain words')
    }
    results = {}
    for lang_name, (sheet_name, english_col, foreign_col) in languages.items():
        print(f"\n{'='*60}")
        print(f"Parsing {lang_name.upper()} ({sheet_name})")
        print(f"{'='*60}")
        pain_dict = parse_sheet(xlsx_path, sheet_name, english_col, foreign_col)
        results[lang_name] = pain_dict
        # Per-language statistics
        total = sum(len(pain_dict[cat]) for cat in pain_dict)
        print(f"Total terms: {total}")
        print(f" - Neuropathic: {len(pain_dict['neuropathic'])}")
        print(f" - Nociceptive: {len(pain_dict['nociceptive'])}")
        print(f" - Affective: {len(pain_dict['affective'])}")
        # Show a few sample translations for a quick sanity check
        if len(pain_dict['neuropathic']) > 0:
            print(f"\nSample neuropathic terms:")
            for foreign, data in list(pain_dict['neuropathic'].items())[:3]:
                print(f" {foreign} -> {data['english']}")
    # Save next to this script for inspection
    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    output_path = os.path.join(scripts_dir, 'multilingual_pain_data.json')
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps CJK/Hmong characters readable in the file
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n\n✅ Multilingual data saved to: {output_path}")
    return results
# Script entry point: parse all sheets and write the JSON output.
if __name__ == '__main__':
    main()