egumasa commited on
Commit
42f8800
·
1 Parent(s): 492bb24

more sophistication indice selection

Browse files
clear_session.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ from web_app.session_manager import SessionManager
4
+
5
+ st.title("🔄 Session State Reset")
6
+
7
+ st.write("## Current Session State")
8
+ st.write("Reference lists currently loaded:")
9
+
10
+ if hasattr(st.session_state, 'reference_lists') and st.session_state.reference_lists:
11
+ for name, data in st.session_state.reference_lists.items():
12
+ st.write(f"- **{name}**")
13
+ else:
14
+ st.write("No reference lists loaded")
15
+
16
+ st.write("---")
17
+
18
+ if st.button("🗑️ Clear All Session State", type="primary"):
19
+ # Clear all session state
20
+ for key in list(st.session_state.keys()):
21
+ del st.session_state[key]
22
+
23
+ # Reinitialize
24
+ SessionManager.initialize_session_state()
25
+
26
+ st.success("✅ Session state cleared! Please refresh the page.")
27
+ st.balloons()
28
+
29
+ st.write("### Instructions:")
30
+ st.write("1. Click 'Clear All Session State' above")
31
+ st.write("2. Refresh your browser page")
32
+ st.write("3. Go back to the Lexical Sophistication tool")
33
+ st.write("4. Re-select your reference lists")
34
+ st.write("5. You should now see smart defaults!")
config/reference_lists.yaml CHANGED
@@ -1,17 +1,12 @@
1
- # Configuration for Default Reference Lists
2
- # Add new reference lists here and they'll automatically appear in the UI
3
- # Structure: language -> type -> list_name -> configuration
4
-
5
  english:
6
  unigrams:
7
- COCA_spoken_frequency:
8
- display_name: "COCA Spoken Frequency"
9
- description: "Frequency and range data from COCA spoken subcorpus"
10
- files:
11
- token: "resources/reference_lists/en/COCA_spoken_unigram_list.csv"
12
- lemma: "resources/reference_lists/en/COCA_spoken_unigram_list.csv" # Using same file for now
13
- format: "tsv"
14
- columns:
15
  word: 0
16
  frequency: 1
17
  normalized_freq: 2
@@ -19,59 +14,269 @@ english:
19
  dispersion: 4
20
  has_header: false
21
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- concreteness_ratings:
24
- display_name: "Concreteness Ratings"
25
- description: "Concreteness ratings for English words (1-5 scale)"
26
- files:
27
- token: "resources/reference_lists/en/concreteness_token.csv"
28
- lemma: "resources/reference_lists/en/concreteness_lemma.csv"
29
- format: "tsv"
30
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  word: 0
32
  concreteness: 1
33
  has_header: true
34
- header_prefix: "#"
35
- enabled: false # Disabled until files exist
36
-
37
- academic_words:
38
- display_name: "Academic Word List"
39
- description: "Common academic vocabulary for research writing"
40
- files:
41
- token: "resources/reference_lists/en/academic_words_token.csv"
42
- lemma: "resources/reference_lists/en/academic_words_lemma.csv"
43
- format: "csv"
44
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  word: 0
46
  frequency: 1
47
  has_header: true
48
- enabled: false # Disabled until files exist
49
-
50
- bigrams:
51
- COCA_bigram_frequency:
52
- display_name: "COCA Bigram Frequency"
53
- description: "Bigram frequencies and range data"
54
- files:
55
- token: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
56
- lemma: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
57
- format: "tsv"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  bigram: 0
60
  frequency: 1
61
  normalized_freq: 2
62
  documents: 3
63
  range: 4
64
  has_header: false
65
- enabled: true # Disabled until files exist
66
-
67
- COCA_bigram_association:
68
- display_name: "COCA Bigram Associations"
69
- description: "Bigram association measures (MI, T-score, Delta P)"
70
- files:
71
- token: "resources/reference_lists/en/COCA_bigram_association_token.csv"
72
- lemma: "resources/reference_lists/en/COCA_bigram_association_lemma.csv"
73
- format: "csv"
74
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  bigram: 0
76
  frequency: 1
77
  mi_score: 2
@@ -80,17 +285,199 @@ english:
80
  delta_p: 5
81
  ap_collex: 6
82
  has_header: true
83
- enabled: false # Disabled until files exist
84
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  trigrams:
86
- COCA_trigram_frequency:
87
- display_name: "COCA Trigram Frequency"
88
- description: "Trigram frequencies and range data"
89
- files:
90
- token: "resources/reference_lists/en/COCA_spoken_trigram_list.csv"
91
- lemma: "resources/reference_lists/en/COCA_spoken_trigram_list.csv"
92
- format: "tsv"
93
- columns:
94
  trigram: 0
95
  frequency: 1
96
  normalized_freq: 2
@@ -98,15 +485,62 @@ english:
98
  dispersion: 4
99
  has_header: false
100
  enabled: true
101
-
102
- COCA_trigram_assoc_uni_bi:
103
- display_name: "COCA Trigram→Bigram Associations"
104
- description: "Trigram to bigram association measures"
105
- files:
106
- token: "resources/reference_lists/en/COCA_trigram_assoc_uni_bi_token.csv"
107
- lemma: "resources/reference_lists/en/COCA_trigram_assoc_uni_bi_lemma.csv"
108
- format: "csv"
109
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  trigram: 0
111
  frequency: 1
112
  mi_score: 2
@@ -115,16 +549,65 @@ english:
115
  delta_p: 5
116
  ap_collex: 6
117
  has_header: true
118
- enabled: false # Disabled until files exist
119
-
120
- COCA_trigram_assoc_bi_uni:
121
- display_name: "COCA Trigram→Unigram Associations"
122
- description: "Trigram to unigram association measures"
123
- files:
124
- token: "resources/reference_lists/en/COCA_trigram_assoc_bi_uni_token.csv"
125
- lemma: "resources/reference_lists/en/COCA_trigram_assoc_bi_uni_lemma.csv"
126
- format: "csv"
127
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  trigram: 0
129
  frequency: 1
130
  mi_score: 2
@@ -133,118 +616,398 @@ english:
133
  delta_p: 5
134
  ap_collex: 6
135
  has_header: true
136
- enabled: false # Disabled until files exist
137
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  japanese:
139
  unigrams:
140
- BCCWJ_frequency:
141
- display_name: "BCCWJ Written - Frequency"
142
- description: "BCCWJ raw frequency counts for written Japanese"
143
- files:
144
- token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
145
- lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
146
- format: "tsv"
 
 
 
147
  has_header: true
148
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  japanese_corpus: true
150
- columns:
151
- surface_form: 1 # lForm
152
- lemma: 2 # lemma
153
- pos: 3 # pos
154
- frequency: 6 # primary measure column
155
-
156
- BCCWJ_pmw:
157
- display_name: "BCCWJ Written - Per Million Words"
158
- description: "BCCWJ normalized frequency for written Japanese"
159
- files:
160
- token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
161
- lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
162
- format: "tsv"
163
  has_header: true
164
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  japanese_corpus: true
166
- columns:
 
 
 
 
 
167
  surface_form: 1
168
  lemma: 2
169
  pos: 3
170
- frequency: 7 # pmw column
171
-
172
- BCCWJ_rank:
173
- display_name: "BCCWJ Written - Frequency Rank"
174
- description: "BCCWJ frequency ranking for written Japanese"
175
- files:
176
- token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
177
- lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
178
- format: "tsv"
179
  has_header: true
180
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  japanese_corpus: true
182
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  surface_form: 1
184
  lemma: 2
185
  pos: 3
186
- frequency: 0 # rank column
187
-
188
- CSJ_frequency:
189
- display_name: "CSJ Spoken - Frequency"
190
- description: "CSJ raw frequency counts for spoken Japanese"
191
- files:
192
- token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
193
- lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
194
- format: "tsv"
195
  has_header: true
196
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  japanese_corpus: true
198
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  surface_form: 1
200
  lemma: 2
201
  pos: 3
202
  frequency: 6
203
-
204
- CSJ_pmw:
205
- display_name: "CSJ Spoken - Per Million Words"
206
- description: "CSJ normalized frequency for spoken Japanese"
207
- files:
208
- token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
209
- lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
210
- format: "tsv"
211
  has_header: true
212
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  japanese_corpus: true
214
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  surface_form: 1
216
  lemma: 2
217
  pos: 3
218
  frequency: 7
219
-
220
- CSJ_rank:
221
- display_name: "CSJ Spoken - Frequency Rank"
222
- description: "CSJ frequency ranking for spoken Japanese"
223
- files:
224
- token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
225
- lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
226
- format: "tsv"
227
  has_header: true
228
  enabled: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  japanese_corpus: true
230
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  surface_form: 1
232
  lemma: 2
233
  pos: 3
234
  frequency: 0
235
-
236
- jp_frequency:
237
- display_name: "Japanese Frequency List"
238
- description: "Frequency data for Japanese words"
239
- files:
240
- token: "resources/reference_lists/ja/jp_frequency_token.csv"
241
- lemma: "resources/reference_lists/ja/jp_frequency_lemma.csv"
242
- format: "csv"
243
- columns:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  word: 0
245
  frequency: 1
246
  has_header: true
247
- enabled: false # Disabled until files exist
248
-
249
- # bigrams: {}
250
- # trigrams: {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  english:
2
  unigrams:
3
+ COCA_magazine_frequency_token:
4
+ display_name: COCA Magazine Frequency (Token)
5
+ description: Frequency and range data from COCA magazine subcorpus - token-based
6
+ analysis
7
+ file: resources/reference_lists/en/COCA_magazine_unigram_list.csv
8
+ format: tsv
9
+ columns: &id001
 
10
  word: 0
11
  frequency: 1
12
  normalized_freq: 2
 
14
  dispersion: 4
15
  has_header: false
16
  enabled: true
17
+ analysis_type: token
18
+ log_transformable:
19
+ - frequency
20
+ - normalized_freq
21
+ selectable_measures:
22
+ - frequency
23
+ - normalized_freq
24
+ - range
25
+ - dispersion
26
+ default_measures:
27
+ - frequency
28
+ - normalized_freq
29
+ - range
30
+ default_log_transforms:
31
+ - frequency
32
+ - normalized_freq
33
+ measure_classifications:
34
+ frequency: frequency
35
+ normalized_freq: frequency
36
+ range: range
37
+ dispersion: range
38
 
39
+ COCA_magazine_frequency_lemma:
40
+ display_name: COCA Magazine Frequency (Lemma)
41
+ description: Frequency and range data from COCA magazine subcorpus - lemma-based
42
+ analysis
43
+ file: resources/reference_lists/en/COCA_magazine_unigram_list.csv
44
+ format: tsv
45
+ columns: *id001
46
+ has_header: false
47
+ enabled: true
48
+ analysis_type: lemma
49
+ log_transformable:
50
+ - frequency
51
+ - normalized_freq
52
+ selectable_measures:
53
+ - frequency
54
+ - normalized_freq
55
+ - range
56
+ - dispersion
57
+ default_measures:
58
+ - frequency
59
+ - normalized_freq
60
+ - range
61
+ default_log_transforms:
62
+ - frequency
63
+ - normalized_freq
64
+ measure_classifications:
65
+ frequency: frequency
66
+ normalized_freq: frequency
67
+ range: range
68
+ dispersion: range
69
+
70
+ concreteness_ratings_token:
71
+ display_name: Concreteness Ratings (Token)
72
+ description: Concreteness ratings for English words (1-5 scale) - token-based
73
+ analysis
74
+ file: resources/reference_lists/en/Concreteness_Brysbaert.txt
75
+ format: tsv
76
+ columns: &id002
77
  word: 0
78
  concreteness: 1
79
  has_header: true
80
+ enabled: true
81
+ analysis_type: token
82
+ log_transformable: []
83
+ selectable_measures:
84
+ - concreteness
85
+ default_measures:
86
+ - concreteness
87
+ default_log_transforms: []
88
+ measure_classifications:
89
+ concreteness: psycholinguistic
90
+ header_prefix: '#'
91
+
92
+ concreteness_ratings_lemma:
93
+ display_name: Concreteness Ratings (Lemma)
94
+ description: Concreteness ratings for English words (1-5 scale) - lemma-based
95
+ analysis
96
+ file: resources/reference_lists/en/Concreteness_Brysbaert.txt
97
+ format: tsv
98
+ columns: *id002
99
+ has_header: true
100
+ enabled: true
101
+ analysis_type: lemma
102
+ log_transformable: []
103
+ selectable_measures:
104
+ - concreteness
105
+ default_measures:
106
+ - concreteness
107
+ default_log_transforms: []
108
+ measure_classifications:
109
+ concreteness: psycholinguistic
110
+ header_prefix: '#'
111
+ academic_words_token:
112
+ display_name: Academic Word List (Token)
113
+ description: Common academic vocabulary for research writing - token-based analysis
114
+ file: resources/reference_lists/en/academic_words_token.csv
115
+ format: csv
116
+ columns: &id003
117
  word: 0
118
  frequency: 1
119
  has_header: true
120
+ enabled: false
121
+ analysis_type: token
122
+ log_transformable:
123
+ - frequency
124
+ selectable_measures:
125
+ - frequency
126
+ default_measures:
127
+ - frequency
128
+ default_log_transforms:
129
+ - frequency
130
+ measure_classifications:
131
+ frequency: frequency
132
+ academic_words_lemma:
133
+ display_name: Academic Word List (Lemma)
134
+ description: Common academic vocabulary for research writing - lemma-based analysis
135
+ file: resources/reference_lists/en/academic_words_lemma.csv
136
+ format: csv
137
+ columns: *id003
138
+ has_header: true
139
+ enabled: false
140
+ analysis_type: lemma
141
+ log_transformable:
142
+ - frequency
143
+ selectable_measures:
144
+ - frequency
145
+ default_measures:
146
+ - frequency
147
+ default_log_transforms:
148
+ - frequency
149
+ measure_classifications:
150
+ frequency: frequency
151
+ COCA_spoken_frequency_token:
152
+ display_name: COCA Spoken Frequency (Token)
153
+ description: Frequency and range data from COCA spoken subcorpus - token-based
154
+ analysis
155
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
156
+ format: tsv
157
+ columns:
158
+ word: 0
159
+ frequency: 1
160
+ normalized_freq: 2
161
+ range: 3
162
+ dispersion: 4
163
+ has_header: false
164
+ enabled: true
165
+ analysis_type: token
166
+ log_transformable:
167
+ - frequency
168
+ - normalized_freq
169
+ selectable_measures:
170
+ - frequency
171
+ - normalized_freq
172
+ - range
173
+ - dispersion
174
+ default_measures:
175
+ - frequency
176
+ - normalized_freq
177
+ default_log_transforms:
178
+ - frequency
179
+ - normalized_freq
180
+ COCA_spoken_frequency_lemma:
181
+ display_name: COCA Spoken Frequency (Lemma)
182
+ description: Frequency and range data from COCA spoken subcorpus - lemma-based
183
+ analysis
184
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
185
+ format: tsv
186
  columns:
187
+ word: 0
188
+ frequency: 1
189
+ normalized_freq: 2
190
+ range: 3
191
+ dispersion: 4
192
+ has_header: false
193
+ enabled: true
194
+ analysis_type: lemma
195
+ log_transformable:
196
+ - frequency
197
+ - normalized_freq
198
+ selectable_measures:
199
+ - frequency
200
+ - normalized_freq
201
+ - range
202
+ - dispersion
203
+ default_measures:
204
+ - frequency
205
+ - normalized_freq
206
+ default_log_transforms:
207
+ - frequency
208
+ - normalized_freq
209
+ bigrams:
210
+ COCA_spoken_bigram_frequency_token:
211
+ display_name: COCA Spoken Bigram Frequency (Token)
212
+ description: Bigram frequencies and range data - token-based analysis
213
+ file: resources/reference_lists/en/COCA_spoken_bigram_list.csv
214
+ format: tsv
215
+ columns: &id004
216
  bigram: 0
217
  frequency: 1
218
  normalized_freq: 2
219
  documents: 3
220
  range: 4
221
  has_header: false
222
+ enabled: true
223
+ analysis_type: token
224
+ log_transformable:
225
+ - frequency
226
+ - normalized_freq
227
+ selectable_measures:
228
+ - frequency
229
+ - normalized_freq
230
+ - documents
231
+ - range
232
+ default_measures:
233
+ - frequency
234
+ - normalized_freq
235
+ - range
236
+ default_log_transforms:
237
+ - frequency
238
+ - normalized_freq
239
+ measure_classifications:
240
+ frequency: frequency
241
+ normalized_freq: frequency
242
+ documents: range
243
+ range: range
244
+ COCA_spoken_bigram_frequency_lemma:
245
+ display_name: COCA Spoken Bigram Frequency (Lemma)
246
+ description: Bigram frequencies and range data - lemma-based analysis
247
+ file: resources/reference_lists/en/COCA_spoken_bigram_list.csv
248
+ format: tsv
249
+ columns: *id004
250
+ has_header: false
251
+ enabled: true
252
+ analysis_type: lemma
253
+ log_transformable:
254
+ - frequency
255
+ - normalized_freq
256
+ selectable_measures:
257
+ - frequency
258
+ - normalized_freq
259
+ - documents
260
+ - range
261
+ default_measures:
262
+ - frequency
263
+ - normalized_freq
264
+ - range
265
+ default_log_transforms:
266
+ - frequency
267
+ - normalized_freq
268
+ measure_classifications:
269
+ frequency: frequency
270
+ normalized_freq: frequency
271
+ documents: range
272
+ range: range
273
+ COCA_spoken_bigram_association_token:
274
+ display_name: COCA Spoken Bigram Associations (Token)
275
+ description: Bigram association measures (MI, T-score, Delta P) - token-based
276
+ analysis
277
+ file: resources/reference_lists/en/spoken_bi_contingency.csv
278
+ format: csv
279
+ columns: &id005
280
  bigram: 0
281
  frequency: 1
282
  mi_score: 2
 
285
  delta_p: 5
286
  ap_collex: 6
287
  has_header: true
288
+ enabled: true
289
+ analysis_type: token
290
+ log_transformable:
291
+ - frequency
292
+ selectable_measures:
293
+ - frequency
294
+ - mi_score
295
+ - mi_2_score
296
+ - t_score
297
+ - delta_p
298
+ - ap_collex
299
+ default_measures:
300
+ - frequency
301
+ - t_score
302
+ default_log_transforms:
303
+ - frequency
304
+ measure_classifications:
305
+ frequency: frequency
306
+ mi_score: association
307
+ mi_2_score: association
308
+ t_score: association
309
+ delta_p: association
310
+ ap_collex: association
311
+ COCA_spoken_bigram_association_lemma:
312
+ display_name: COCA Spoken Bigram Associations (Lemma)
313
+ description: Bigram association measures (MI, T-score, Delta P) - lemma-based
314
+ analysis
315
+ file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
316
+ format: csv
317
+ columns: *id005
318
+ has_header: true
319
+ enabled: true
320
+ analysis_type: lemma
321
+ log_transformable:
322
+ - frequency
323
+ selectable_measures:
324
+ - frequency
325
+ - mi_score
326
+ - mi_2_score
327
+ - t_score
328
+ - delta_p
329
+ - ap_collex
330
+ default_measures:
331
+ - frequency
332
+ - t_score
333
+ default_log_transforms:
334
+ - frequency
335
+ measure_classifications:
336
+ frequency: frequency
337
+ mi_score: association
338
+ mi_2_score: association
339
+ t_score: association
340
+ delta_p: association
341
+ ap_collex: association
342
+ COCA_magazine_bigram_frequency_token:
343
+ display_name: COCA Magazine Bigram Frequency (Token)
344
+ description: Bigram frequencies and range data in Magazine - token-based analysis
345
+ file: resources/reference_lists/en/COCA_magazine_bigram_list.csv
346
+ format: tsv
347
+ columns: &id006
348
+ bigram: 0
349
+ frequency: 1
350
+ normalized_freq: 2
351
+ documents: 3
352
+ range: 4
353
+ has_header: false
354
+ enabled: true
355
+ analysis_type: token
356
+ log_transformable:
357
+ - frequency
358
+ - normalized_freq
359
+ selectable_measures:
360
+ - frequency
361
+ - normalized_freq
362
+ - documents
363
+ - range
364
+ default_measures:
365
+ - frequency
366
+ - normalized_freq
367
+ - range
368
+ default_log_transforms:
369
+ - frequency
370
+ - normalized_freq
371
+ measure_classifications:
372
+ frequency: frequency
373
+ normalized_freq: frequency
374
+ documents: range
375
+ range: range
376
+ COCA_magazine_bigram_frequency_lemma:
377
+ display_name: COCA Magazine Bigram Frequency (Lemma)
378
+ description: Bigram frequencies and range data in Magazine - lemma-based analysis
379
+ file: resources/reference_lists/en/COCA_spoken_bigram_list.csv
380
+ format: tsv
381
+ columns: *id006
382
+ has_header: false
383
+ enabled: true
384
+ analysis_type: lemma
385
+ log_transformable:
386
+ - frequency
387
+ - normalized_freq
388
+ selectable_measures:
389
+ - frequency
390
+ - normalized_freq
391
+ - documents
392
+ - range
393
+ default_measures:
394
+ - frequency
395
+ - normalized_freq
396
+ - range
397
+ default_log_transforms:
398
+ - frequency
399
+ - normalized_freq
400
+ measure_classifications:
401
+ frequency: frequency
402
+ normalized_freq: frequency
403
+ documents: range
404
+ range: range
405
+ COCA_magazine_bigram_association_token:
406
+ display_name: COCA Magazine Bigram Associations (Token)
407
+ description: Bigram association measures (MI, T-score, Delta P) - token-based
408
+ analysis
409
+ file: resources/reference_lists/en/magazine_bi_contingency.csv
410
+ format: csv
411
+ columns: &id007
412
+ bigram: 0
413
+ frequency: 1
414
+ mi_score: 2
415
+ mi_2_score: 3
416
+ t_score: 4
417
+ delta_p: 5
418
+ ap_collex: 6
419
+ has_header: true
420
+ enabled: true
421
+ analysis_type: token
422
+ log_transformable:
423
+ - frequency
424
+ selectable_measures:
425
+ - frequency
426
+ - mi_score
427
+ - mi_2_score
428
+ - t_score
429
+ - delta_p
430
+ - ap_collex
431
+ default_measures:
432
+ - frequency
433
+ - t_score
434
+ default_log_transforms:
435
+ - frequency
436
+ measure_classifications:
437
+ frequency: frequency
438
+ mi_score: association
439
+ mi_2_score: association
440
+ t_score: association
441
+ delta_p: association
442
+ ap_collex: association
443
+ COCA_magazine_bigram_association_lemma:
444
+ display_name: COCA Magazine Bigram Associations (Lemma)
445
+ description: Bigram association measures (MI, T-score, Delta P) - lemma-based
446
+ analysis
447
+ file: resources/reference_lists/en/magazine_bigram_lemma_contingency.csv
448
+ format: csv
449
+ columns: *id007
450
+ has_header: true
451
+ enabled: true
452
+ analysis_type: lemma
453
+ log_transformable:
454
+ - frequency
455
+ selectable_measures:
456
+ - frequency
457
+ - mi_score
458
+ - mi_2_score
459
+ - t_score
460
+ - delta_p
461
+ - ap_collex
462
+ default_measures:
463
+ - frequency
464
+ - t_score
465
+ default_log_transforms:
466
+ - frequency
467
+ measure_classifications:
468
+ frequency: frequency
469
+ mi_score: association
470
+ mi_2_score: association
471
+ t_score: association
472
+ delta_p: association
473
+ ap_collex: association
474
  trigrams:
475
+ COCA_trigram_frequency_token:
476
+ display_name: COCA Trigram Frequency (Token)
477
+ description: Trigram frequencies and range data - token-based analysis
478
+ file: resources/reference_lists/en/COCA_spoken_trigram_list.csv
479
+ format: tsv
480
+ columns: &id008
 
 
481
  trigram: 0
482
  frequency: 1
483
  normalized_freq: 2
 
485
  dispersion: 4
486
  has_header: false
487
  enabled: true
488
+ analysis_type: token
489
+ log_transformable:
490
+ - frequency
491
+ - normalized_freq
492
+ selectable_measures:
493
+ - frequency
494
+ - normalized_freq
495
+ - range
496
+ - dispersion
497
+ default_measures:
498
+ - frequency
499
+ - normalized_freq
500
+ - range
501
+ default_log_transforms:
502
+ - frequency
503
+ - normalized_freq
504
+ measure_classifications:
505
+ frequency: frequency
506
+ normalized_freq: frequency
507
+ range: range
508
+ dispersion: range
509
+ COCA_trigram_frequency_lemma:
510
+ display_name: COCA Trigram Frequency (Lemma)
511
+ description: Trigram frequencies and range data - lemma-based analysis
512
+ file: resources/reference_lists/en/COCA_spoken_trigram_list.csv
513
+ format: tsv
514
+ columns: *id008
515
+ has_header: false
516
+ enabled: true
517
+ analysis_type: lemma
518
+ log_transformable:
519
+ - frequency
520
+ - normalized_freq
521
+ selectable_measures:
522
+ - frequency
523
+ - normalized_freq
524
+ - range
525
+ - dispersion
526
+ default_measures:
527
+ - frequency
528
+ - normalized_freq
529
+ - range
530
+ default_log_transforms:
531
+ - frequency
532
+ - normalized_freq
533
+ measure_classifications:
534
+ frequency: frequency
535
+ normalized_freq: frequency
536
+ range: range
537
+ dispersion: range
538
+ COCA_trigram_assoc_uni_bi_token:
539
+ display_name: COCA Trigram→Bigram Associations (Token)
540
+ description: Trigram to bigram association measures - token-based analysis
541
+ file: resources/reference_lists/en/spoken_tri_contingency_1.csv
542
+ format: csv
543
+ columns: &id009
544
  trigram: 0
545
  frequency: 1
546
  mi_score: 2
 
549
  delta_p: 5
550
  ap_collex: 6
551
  has_header: true
552
+ enabled: true
553
+ analysis_type: token
554
+ log_transformable:
555
+ - frequency
556
+ selectable_measures:
557
+ - frequency
558
+ - mi_score
559
+ - mi_2_score
560
+ - t_score
561
+ - delta_p
562
+ - ap_collex
563
+ default_measures:
564
+ - frequency
565
+ - t_score
566
+ default_log_transforms:
567
+ - frequency
568
+ measure_classifications:
569
+ frequency: frequency
570
+ mi_score: association
571
+ mi_2_score: association
572
+ t_score: association
573
+ delta_p: association
574
+ ap_collex: association
575
+ COCA_trigram_assoc_uni_bi_lemma:
576
+ display_name: COCA Trigram→Bigram Associations (Lemma)
577
+ description: Trigram to bigram association measures - lemma-based analysis
578
+ file: resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv
579
+ format: csv
580
+ columns: *id009
581
+ has_header: true
582
+ enabled: true
583
+ analysis_type: lemma
584
+ log_transformable:
585
+ - frequency
586
+ selectable_measures:
587
+ - frequency
588
+ - mi_score
589
+ - mi_2_score
590
+ - t_score
591
+ - delta_p
592
+ - ap_collex
593
+ default_measures:
594
+ - frequency
595
+ - t_score
596
+ default_log_transforms:
597
+ - frequency
598
+ measure_classifications:
599
+ frequency: frequency
600
+ mi_score: association
601
+ mi_2_score: association
602
+ t_score: association
603
+ delta_p: association
604
+ ap_collex: association
605
+ COCA_trigram_assoc_bi_uni_token:
606
+ display_name: COCA Trigram→Unigram Associations (Token)
607
+ description: Trigram to unigram association measures - token-based analysis
608
+ file: resources/reference_lists/en/spoken_tri_contingency_2.csv
609
+ format: csv
610
+ columns: &id010
611
  trigram: 0
612
  frequency: 1
613
  mi_score: 2
 
616
  delta_p: 5
617
  ap_collex: 6
618
  has_header: true
619
+ enabled: true
620
+ analysis_type: token
621
+ log_transformable:
622
+ - frequency
623
+ selectable_measures:
624
+ - frequency
625
+ - mi_score
626
+ - mi_2_score
627
+ - t_score
628
+ - delta_p
629
+ - ap_collex
630
+ default_measures:
631
+ - frequency
632
+ - t_score
633
+ default_log_transforms:
634
+ - frequency
635
+ measure_classifications:
636
+ frequency: frequency
637
+ mi_score: association
638
+ mi_2_score: association
639
+ t_score: association
640
+ delta_p: association
641
+ ap_collex: association
642
+ COCA_trigram_assoc_bi_uni_lemma:
643
+ display_name: COCA Trigram→Unigram Associations (Lemma)
644
+ description: Trigram to unigram association measures - lemma-based analysis
645
+ file: resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv
646
+ format: csv
647
+ columns: *id010
648
+ has_header: true
649
+ enabled: true
650
+ analysis_type: lemma
651
+ log_transformable:
652
+ - frequency
653
+ selectable_measures:
654
+ - frequency
655
+ - mi_score
656
+ - mi_2_score
657
+ - t_score
658
+ - delta_p
659
+ - ap_collex
660
+ default_measures:
661
+ - frequency
662
+ - t_score
663
+ default_log_transforms:
664
+ - frequency
665
+ measure_classifications:
666
+ frequency: frequency
667
+ mi_score: association
668
+ mi_2_score: association
669
+ t_score: association
670
+ delta_p: association
671
+ ap_collex: association
672
  japanese:
673
  unigrams:
674
+ BCCWJ_frequency_token:
675
+ display_name: BCCWJ Written - Frequency (Token)
676
+ description: BCCWJ raw frequency counts for written Japanese - token-based analysis
677
+ file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
678
+ format: tsv
679
+ columns: &id011
680
+ surface_form: 1
681
+ lemma: 2
682
+ pos: 3
683
+ frequency: 6
684
  has_header: true
685
  enabled: true
686
+ analysis_type: token
687
+ log_transformable:
688
+ - frequency
689
+ selectable_measures:
690
+ - pos
691
+ - frequency
692
+ default_measures:
693
+ - frequency
694
+ - pos
695
+ default_log_transforms:
696
+ - frequency
697
+ measure_classifications:
698
+ pos: unknown
699
+ frequency: frequency
700
  japanese_corpus: true
701
+ BCCWJ_frequency_lemma:
702
+ display_name: BCCWJ Written - Frequency (Lemma)
703
+ description: BCCWJ raw frequency counts for written Japanese - lemma-based analysis
704
+ file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
705
+ format: tsv
706
+ columns: *id011
 
 
 
 
 
 
 
707
  has_header: true
708
  enabled: true
709
+ analysis_type: lemma
710
+ log_transformable:
711
+ - frequency
712
+ selectable_measures:
713
+ - pos
714
+ - frequency
715
+ default_measures:
716
+ - frequency
717
+ - pos
718
+ default_log_transforms:
719
+ - frequency
720
+ measure_classifications:
721
+ pos: unknown
722
+ frequency: frequency
723
  japanese_corpus: true
724
+ BCCWJ_pmw_token:
725
+ display_name: BCCWJ Written - Per Million Words (Token)
726
+ description: BCCWJ normalized frequency for written Japanese - token-based analysis
727
+ file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
728
+ format: tsv
729
+ columns: &id012
730
  surface_form: 1
731
  lemma: 2
732
  pos: 3
733
+ frequency: 7
 
 
 
 
 
 
 
 
734
  has_header: true
735
  enabled: true
736
+ analysis_type: token
737
+ log_transformable:
738
+ - frequency
739
+ selectable_measures:
740
+ - pos
741
+ - frequency
742
+ default_measures:
743
+ - frequency
744
+ - pos
745
+ default_log_transforms:
746
+ - frequency
747
+ measure_classifications:
748
+ pos: unknown
749
+ frequency: frequency
750
  japanese_corpus: true
751
+ BCCWJ_pmw_lemma:
752
+ display_name: BCCWJ Written - Per Million Words (Lemma)
753
+ description: BCCWJ normalized frequency for written Japanese - lemma-based analysis
754
+ file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
755
+ format: tsv
756
+ columns: *id012
757
+ has_header: true
758
+ enabled: true
759
+ analysis_type: lemma
760
+ log_transformable:
761
+ - frequency
762
+ selectable_measures:
763
+ - pos
764
+ - frequency
765
+ default_measures:
766
+ - frequency
767
+ - pos
768
+ default_log_transforms:
769
+ - frequency
770
+ measure_classifications:
771
+ pos: unknown
772
+ frequency: frequency
773
+ japanese_corpus: true
774
+ BCCWJ_rank_token:
775
+ display_name: BCCWJ Written - Frequency Rank (Token)
776
+ description: BCCWJ frequency ranking for written Japanese - token-based analysis
777
+ file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
778
+ format: tsv
779
+ columns: &id013
780
  surface_form: 1
781
  lemma: 2
782
  pos: 3
783
+ frequency: 0
 
 
 
 
 
 
 
 
784
  has_header: true
785
  enabled: true
786
+ analysis_type: token
787
+ log_transformable:
788
+ - frequency
789
+ selectable_measures:
790
+ - pos
791
+ - frequency
792
+ default_measures:
793
+ - frequency
794
+ - pos
795
+ default_log_transforms:
796
+ - frequency
797
+ measure_classifications:
798
+ pos: unknown
799
+ frequency: frequency
800
  japanese_corpus: true
801
+ BCCWJ_rank_lemma:
802
+ display_name: BCCWJ Written - Frequency Rank (Lemma)
803
+ description: BCCWJ frequency ranking for written Japanese - lemma-based analysis
804
+ file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
805
+ format: tsv
806
+ columns: *id013
807
+ has_header: true
808
+ enabled: true
809
+ analysis_type: lemma
810
+ log_transformable:
811
+ - frequency
812
+ selectable_measures:
813
+ - pos
814
+ - frequency
815
+ default_measures:
816
+ - frequency
817
+ - pos
818
+ default_log_transforms:
819
+ - frequency
820
+ measure_classifications:
821
+ pos: unknown
822
+ frequency: frequency
823
+ japanese_corpus: true
824
+ CSJ_frequency_token:
825
+ display_name: CSJ Spoken - Frequency (Token)
826
+ description: CSJ raw frequency counts for spoken Japanese - token-based analysis
827
+ file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
828
+ format: tsv
829
+ columns: &id014
830
  surface_form: 1
831
  lemma: 2
832
  pos: 3
833
  frequency: 6
 
 
 
 
 
 
 
 
834
  has_header: true
835
  enabled: true
836
+ analysis_type: token
837
+ log_transformable:
838
+ - frequency
839
+ selectable_measures:
840
+ - pos
841
+ - frequency
842
+ default_measures:
843
+ - frequency
844
+ - pos
845
+ default_log_transforms:
846
+ - frequency
847
+ measure_classifications:
848
+ pos: unknown
849
+ frequency: frequency
850
  japanese_corpus: true
851
+ CSJ_frequency_lemma:
852
+ display_name: CSJ Spoken - Frequency (Lemma)
853
+ description: CSJ raw frequency counts for spoken Japanese - lemma-based analysis
854
+ file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
855
+ format: tsv
856
+ columns: *id014
857
+ has_header: true
858
+ enabled: true
859
+ analysis_type: lemma
860
+ log_transformable:
861
+ - frequency
862
+ selectable_measures:
863
+ - pos
864
+ - frequency
865
+ default_measures:
866
+ - frequency
867
+ - pos
868
+ default_log_transforms:
869
+ - frequency
870
+ measure_classifications:
871
+ pos: unknown
872
+ frequency: frequency
873
+ japanese_corpus: true
874
+ CSJ_pmw_token:
875
+ display_name: CSJ Spoken - Per Million Words (Token)
876
+ description: CSJ normalized frequency for spoken Japanese - token-based analysis
877
+ file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
878
+ format: tsv
879
+ columns: &id015
880
  surface_form: 1
881
  lemma: 2
882
  pos: 3
883
  frequency: 7
 
 
 
 
 
 
 
 
884
  has_header: true
885
  enabled: true
886
+ analysis_type: token
887
+ log_transformable:
888
+ - frequency
889
+ selectable_measures:
890
+ - pos
891
+ - frequency
892
+ default_measures:
893
+ - frequency
894
+ - pos
895
+ default_log_transforms:
896
+ - frequency
897
+ measure_classifications:
898
+ pos: unknown
899
+ frequency: frequency
900
  japanese_corpus: true
901
+ CSJ_pmw_lemma:
902
+ display_name: CSJ Spoken - Per Million Words (Lemma)
903
+ description: CSJ normalized frequency for spoken Japanese - lemma-based analysis
904
+ file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
905
+ format: tsv
906
+ columns: *id015
907
+ has_header: true
908
+ enabled: true
909
+ analysis_type: lemma
910
+ log_transformable:
911
+ - frequency
912
+ selectable_measures:
913
+ - pos
914
+ - frequency
915
+ default_measures:
916
+ - frequency
917
+ - pos
918
+ default_log_transforms:
919
+ - frequency
920
+ measure_classifications:
921
+ pos: unknown
922
+ frequency: frequency
923
+ japanese_corpus: true
924
+ CSJ_rank_token:
925
+ display_name: CSJ Spoken - Frequency Rank (Token)
926
+ description: CSJ frequency ranking for spoken Japanese - token-based analysis
927
+ file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
928
+ format: tsv
929
+ columns: &id016
930
  surface_form: 1
931
  lemma: 2
932
  pos: 3
933
  frequency: 0
934
+ has_header: true
935
+ enabled: true
936
+ analysis_type: token
937
+ log_transformable:
938
+ - frequency
939
+ selectable_measures:
940
+ - pos
941
+ - frequency
942
+ default_measures:
943
+ - frequency
944
+ - pos
945
+ default_log_transforms:
946
+ - frequency
947
+ measure_classifications:
948
+ pos: unknown
949
+ frequency: frequency
950
+ japanese_corpus: true
951
+ CSJ_rank_lemma:
952
+ display_name: CSJ Spoken - Frequency Rank (Lemma)
953
+ description: CSJ frequency ranking for spoken Japanese - lemma-based analysis
954
+ file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
955
+ format: tsv
956
+ columns: *id016
957
+ has_header: true
958
+ enabled: true
959
+ analysis_type: lemma
960
+ log_transformable:
961
+ - frequency
962
+ selectable_measures:
963
+ - pos
964
+ - frequency
965
+ default_measures:
966
+ - frequency
967
+ - pos
968
+ default_log_transforms:
969
+ - frequency
970
+ measure_classifications:
971
+ pos: unknown
972
+ frequency: frequency
973
+ japanese_corpus: true
974
+ jp_frequency_token:
975
+ display_name: Japanese Frequency List (Token)
976
+ description: Frequency data for Japanese words - token-based analysis
977
+ file: resources/reference_lists/ja/jp_frequency_token.csv
978
+ format: csv
979
+ columns: &id017
980
  word: 0
981
  frequency: 1
982
  has_header: true
983
+ enabled: false
984
+ analysis_type: token
985
+ log_transformable:
986
+ - frequency
987
+ selectable_measures:
988
+ - frequency
989
+ default_measures:
990
+ - frequency
991
+ default_log_transforms:
992
+ - frequency
993
+ measure_classifications:
994
+ frequency: frequency
995
+ jp_frequency_lemma:
996
+ display_name: Japanese Frequency List (Lemma)
997
+ description: Frequency data for Japanese words - lemma-based analysis
998
+ file: resources/reference_lists/ja/jp_frequency_lemma.csv
999
+ format: csv
1000
+ columns: *id017
1001
+ has_header: true
1002
+ enabled: false
1003
+ analysis_type: lemma
1004
+ log_transformable:
1005
+ - frequency
1006
+ selectable_measures:
1007
+ - frequency
1008
+ default_measures:
1009
+ - frequency
1010
+ default_log_transforms:
1011
+ - frequency
1012
+ measure_classifications:
1013
+ frequency: frequency
config/reference_lists.yaml.backup_20250727_220815 ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Default Reference Lists
2
+ # Add new reference lists here and they'll automatically appear in the UI
3
+ # Structure: language -> type -> list_name -> configuration
4
+
5
+ english:
6
+ unigrams:
7
+ COCA_spoken_frequency:
8
+ display_name: "COCA Spoken Frequency"
9
+ description: "Frequency and range data from COCA spoken subcorpus"
10
+ files:
11
+ token: "resources/reference_lists/en/COCA_spoken_unigram_list.csv"
12
+ lemma: "resources/reference_lists/en/COCA_spoken_unigram_list.csv" # Using same file for now
13
+ format: "tsv"
14
+ columns:
15
+ word: 0
16
+ frequency: 1
17
+ normalized_freq: 2
18
+ range: 3
19
+ dispersion: 4
20
+ has_header: false
21
+ enabled: true
22
+
23
+ COCA_magazine_frequency:
24
+ display_name: "COCA Magazine Frequency"
25
+ description: "Frequency and range data from COCA magazine subcorpus"
26
+ files:
27
+ token: "resources/reference_lists/en/COCA_magazine_unigram_list.csv"
28
+ lemma: "resources/reference_lists/en/COCA_magazine_unigram_list.csv" # Using same file for now
29
+ format: "tsv"
30
+ columns:
31
+ word: 0
32
+ frequency: 1
33
+ normalized_freq: 2
34
+ range: 3
35
+ dispersion: 4
36
+ has_header: false
37
+ enabled: true
38
+
39
+ concreteness_ratings:
40
+ display_name: "Concreteness Ratings"
41
+ description: "Concreteness ratings for English words (1-5 scale)"
42
+ files:
43
+ token: "resources/reference_lists/en/Concreteness_Brysbaert.txt"
44
+ lemma: "resources/reference_lists/en/Concreteness_Brysbaert.txt"
45
+ format: "tsv"
46
+ columns:
47
+ word: 0
48
+ concreteness: 1
49
+ has_header: true
50
+ header_prefix: "#"
51
+ enabled: true
52
+
53
+ academic_words:
54
+ display_name: "Academic Word List"
55
+ description: "Common academic vocabulary for research writing"
56
+ files:
57
+ token: "resources/reference_lists/en/academic_words_token.csv"
58
+ lemma: "resources/reference_lists/en/academic_words_lemma.csv"
59
+ format: "csv"
60
+ columns:
61
+ word: 0
62
+ frequency: 1
63
+ has_header: true
64
+ enabled: false # Disabled until files exist
65
+
66
+ bigrams:
67
+ COCA_spoken_bigram_frequency:
68
+ display_name: "COCA Spoken Bigram Frequency"
69
+ description: "Bigram frequencies and range data"
70
+ files:
71
+ token: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
72
+ lemma: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
73
+ format: "tsv"
74
+ columns:
75
+ bigram: 0
76
+ frequency: 1
77
+ normalized_freq: 2
78
+ documents: 3
79
+ range: 4
80
+ has_header: false
81
+ enabled: true
82
+
83
+ COCA_spoken_bigram_association:
84
+ display_name: "COCA Spoken Bigram Associations"
85
+ description: "Bigram association measures (MI, T-score, Delta P)"
86
+ files:
87
+ token: "resources/reference_lists/en/spoken_bi_contingency.csv"
88
+ lemma: "resources/reference_lists/en/spoken_bigram_lemma_contingency.csv"
89
+ format: "csv"
90
+ columns:
91
+ bigram: 0
92
+ frequency: 1
93
+ mi_score: 2
94
+ mi_2_score: 3
95
+ t_score: 4
96
+ delta_p: 5
97
+ ap_collex: 6
98
+ has_header: true
99
+ enabled: true
100
+
101
+ COCA_magazine_bigram_frequency:
102
+ display_name: "COCA Magazine Bigram Frequency"
103
+ description: "Bigram frequencies and range data in Magazine"
104
+ files:
105
+ token: "resources/reference_lists/en/COCA_magazine_bigram_list.csv"
106
+ lemma: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
107
+ format: "tsv"
108
+ columns:
109
+ bigram: 0
110
+ frequency: 1
111
+ normalized_freq: 2
112
+ documents: 3
113
+ range: 4
114
+ has_header: false
115
+ enabled: true
116
+
117
+ COCA_magazine_bigram_association:
118
+ display_name: "COCA Magazine Bigram Associations"
119
+ description: "Bigram association measures (MI, T-score, Delta P)"
120
+ files:
121
+ token: "resources/reference_lists/en/magazine_bi_contingency.csv"
122
+ lemma: "resources/reference_lists/en/magazine_bigram_lemma_contingency.csv"
123
+ format: "csv"
124
+ columns:
125
+ bigram: 0
126
+ frequency: 1
127
+ mi_score: 2
128
+ mi_2_score: 3
129
+ t_score: 4
130
+ delta_p: 5
131
+ ap_collex: 6
132
+ has_header: true
133
+ enabled: true
134
+
135
+
136
+ trigrams:
137
+ COCA_trigram_frequency:
138
+ display_name: "COCA Trigram Frequency"
139
+ description: "Trigram frequencies and range data"
140
+ files:
141
+ token: "resources/reference_lists/en/COCA_spoken_trigram_list.csv"
142
+ lemma: "resources/reference_lists/en/COCA_spoken_trigram_list.csv"
143
+ format: "tsv"
144
+ columns:
145
+ trigram: 0
146
+ frequency: 1
147
+ normalized_freq: 2
148
+ range: 3
149
+ dispersion: 4
150
+ has_header: false
151
+ enabled: true
152
+
153
+ COCA_trigram_assoc_uni_bi:
154
+ display_name: "COCA Trigram→Bigram Associations"
155
+ description: "Trigram to bigram association measures"
156
+ files:
157
+ token: "resources/reference_lists/en/spoken_tri_contingency_1.csv"
158
+ lemma: "resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv"
159
+ format: "csv"
160
+ columns:
161
+ trigram: 0
162
+ frequency: 1
163
+ mi_score: 2
164
+ mi_2_score: 3
165
+ t_score: 4
166
+ delta_p: 5
167
+ ap_collex: 6
168
+ has_header: true
169
+ enabled: true
170
+
171
+ COCA_trigram_assoc_bi_uni:
172
+ display_name: "COCA Trigram→Unigram Associations"
173
+ description: "Trigram to unigram association measures"
174
+ files:
175
+ token: "resources/reference_lists/en/spoken_tri_contingency_2.csv"
176
+ lemma: "resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv"
177
+ format: "csv"
178
+ columns:
179
+ trigram: 0
180
+ frequency: 1
181
+ mi_score: 2
182
+ mi_2_score: 3
183
+ t_score: 4
184
+ delta_p: 5
185
+ ap_collex: 6
186
+ has_header: true
187
+ enabled: true
188
+
189
+ japanese:
190
+ unigrams:
191
+ BCCWJ_frequency:
192
+ display_name: "BCCWJ Written - Frequency"
193
+ description: "BCCWJ raw frequency counts for written Japanese"
194
+ files:
195
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
196
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
197
+ format: "tsv"
198
+ has_header: true
199
+ enabled: true
200
+ japanese_corpus: true
201
+ columns:
202
+ surface_form: 1 # lForm
203
+ lemma: 2 # lemma
204
+ pos: 3 # pos
205
+ frequency: 6 # primary measure column
206
+
207
+ BCCWJ_pmw:
208
+ display_name: "BCCWJ Written - Per Million Words"
209
+ description: "BCCWJ normalized frequency for written Japanese"
210
+ files:
211
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
212
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
213
+ format: "tsv"
214
+ has_header: true
215
+ enabled: true
216
+ japanese_corpus: true
217
+ columns:
218
+ surface_form: 1
219
+ lemma: 2
220
+ pos: 3
221
+ frequency: 7 # pmw column
222
+
223
+ BCCWJ_rank:
224
+ display_name: "BCCWJ Written - Frequency Rank"
225
+ description: "BCCWJ frequency ranking for written Japanese"
226
+ files:
227
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
228
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
229
+ format: "tsv"
230
+ has_header: true
231
+ enabled: true
232
+ japanese_corpus: true
233
+ columns:
234
+ surface_form: 1
235
+ lemma: 2
236
+ pos: 3
237
+ frequency: 0 # rank column
238
+
239
+ CSJ_frequency:
240
+ display_name: "CSJ Spoken - Frequency"
241
+ description: "CSJ raw frequency counts for spoken Japanese"
242
+ files:
243
+ token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
244
+ lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
245
+ format: "tsv"
246
+ has_header: true
247
+ enabled: true
248
+ japanese_corpus: true
249
+ columns:
250
+ surface_form: 1
251
+ lemma: 2
252
+ pos: 3
253
+ frequency: 6
254
+
255
+ CSJ_pmw:
256
+ display_name: "CSJ Spoken - Per Million Words"
257
+ description: "CSJ normalized frequency for spoken Japanese"
258
+ files:
259
+ token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
260
+ lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
261
+ format: "tsv"
262
+ has_header: true
263
+ enabled: true
264
+ japanese_corpus: true
265
+ columns:
266
+ surface_form: 1
267
+ lemma: 2
268
+ pos: 3
269
+ frequency: 7
270
+
271
+ CSJ_rank:
272
+ display_name: "CSJ Spoken - Frequency Rank"
273
+ description: "CSJ frequency ranking for spoken Japanese"
274
+ files:
275
+ token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
276
+ lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
277
+ format: "tsv"
278
+ has_header: true
279
+ enabled: true
280
+ japanese_corpus: true
281
+ columns:
282
+ surface_form: 1
283
+ lemma: 2
284
+ pos: 3
285
+ frequency: 0
286
+
287
+ jp_frequency:
288
+ display_name: "Japanese Frequency List"
289
+ description: "Frequency data for Japanese words"
290
+ files:
291
+ token: "resources/reference_lists/ja/jp_frequency_token.csv"
292
+ lemma: "resources/reference_lists/ja/jp_frequency_lemma.csv"
293
+ format: "csv"
294
+ columns:
295
+ word: 0
296
+ frequency: 1
297
+ has_header: true
298
+ enabled: false # Disabled until files exist
299
+
300
+ # bigrams: {}
301
+ # trigrams: {}
config/reference_lists.yaml.backup_20250727_230913 ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ english:
2
+ unigrams:
3
+ COCA_magazine_frequency:
4
+ display_name: COCA Magazine Frequency
5
+ description: Frequency and range data from COCA magazine subcorpus
6
+ files:
7
+ token: resources/reference_lists/en/COCA_magazine_unigram_list.csv
8
+ lemma: resources/reference_lists/en/COCA_magazine_unigram_list.csv
9
+ format: tsv
10
+ columns:
11
+ word: 0
12
+ frequency: 1
13
+ normalized_freq: 2
14
+ range: 3
15
+ dispersion: 4
16
+ has_header: false
17
+ enabled: true
18
+ concreteness_ratings:
19
+ display_name: Concreteness Ratings
20
+ description: Concreteness ratings for English words (1-5 scale)
21
+ files:
22
+ token: resources/reference_lists/en/Concreteness_Brysbaert.txt
23
+ lemma: resources/reference_lists/en/Concreteness_Brysbaert.txt
24
+ format: tsv
25
+ columns:
26
+ word: 0
27
+ concreteness: 1
28
+ has_header: true
29
+ header_prefix: '#'
30
+ enabled: true
31
+ academic_words:
32
+ display_name: Academic Word List
33
+ description: Common academic vocabulary for research writing
34
+ files:
35
+ token: resources/reference_lists/en/academic_words_token.csv
36
+ lemma: resources/reference_lists/en/academic_words_lemma.csv
37
+ format: csv
38
+ columns:
39
+ word: 0
40
+ frequency: 1
41
+ has_header: true
42
+ enabled: false
43
+ COCA_spoken_frequency_token:
44
+ display_name: COCA Spoken Frequency (Token)
45
+ description: Frequency and range data from COCA spoken subcorpus - token-based
46
+ analysis
47
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
48
+ format: tsv
49
+ columns:
50
+ word: 0
51
+ frequency: 1
52
+ normalized_freq: 2
53
+ range: 3
54
+ dispersion: 4
55
+ has_header: false
56
+ enabled: true
57
+ analysis_type: token
58
+ log_transformable:
59
+ - frequency
60
+ - normalized_freq
61
+ selectable_measures:
62
+ - frequency
63
+ - normalized_freq
64
+ - range
65
+ - dispersion
66
+ default_measures:
67
+ - frequency
68
+ - normalized_freq
69
+ default_log_transforms:
70
+ - frequency
71
+ - normalized_freq
72
+ COCA_spoken_frequency_lemma:
73
+ display_name: COCA Spoken Frequency (Lemma)
74
+ description: Frequency and range data from COCA spoken subcorpus - lemma-based
75
+ analysis
76
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
77
+ format: tsv
78
+ columns:
79
+ word: 0
80
+ frequency: 1
81
+ normalized_freq: 2
82
+ range: 3
83
+ dispersion: 4
84
+ has_header: false
85
+ enabled: true
86
+ analysis_type: lemma
87
+ log_transformable:
88
+ - frequency
89
+ - normalized_freq
90
+ selectable_measures:
91
+ - frequency
92
+ - normalized_freq
93
+ - range
94
+ - dispersion
95
+ default_measures:
96
+ - frequency
97
+ - normalized_freq
98
+ default_log_transforms:
99
+ - frequency
100
+ - normalized_freq
101
+ bigrams:
102
+ COCA_spoken_bigram_frequency:
103
+ display_name: COCA Spoken Bigram Frequency
104
+ description: Bigram frequencies and range data
105
+ files:
106
+ token: resources/reference_lists/en/COCA_spoken_bigram_list.csv
107
+ lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
108
+ format: tsv
109
+ columns:
110
+ bigram: 0
111
+ frequency: 1
112
+ normalized_freq: 2
113
+ documents: 3
114
+ range: 4
115
+ has_header: false
116
+ enabled: true
117
+ COCA_spoken_bigram_association:
118
+ display_name: COCA Spoken Bigram Associations
119
+ description: Bigram association measures (MI, T-score, Delta P)
120
+ files:
121
+ token: resources/reference_lists/en/spoken_bi_contingency.csv
122
+ lemma: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
123
+ format: csv
124
+ columns:
125
+ bigram: 0
126
+ frequency: 1
127
+ mi_score: 2
128
+ mi_2_score: 3
129
+ t_score: 4
130
+ delta_p: 5
131
+ ap_collex: 6
132
+ has_header: true
133
+ enabled: true
134
+ COCA_magazine_bigram_frequency:
135
+ display_name: COCA Magazine Bigram Frequency
136
+ description: Bigram frequencies and range data in Magazine
137
+ files:
138
+ token: resources/reference_lists/en/COCA_magazine_bigram_list.csv
139
+ lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
140
+ format: tsv
141
+ columns:
142
+ bigram: 0
143
+ frequency: 1
144
+ normalized_freq: 2
145
+ documents: 3
146
+ range: 4
147
+ has_header: false
148
+ enabled: true
149
+ COCA_magazine_bigram_association:
150
+ display_name: COCA Magazine Bigram Associations
151
+ description: Bigram association measures (MI, T-score, Delta P)
152
+ files:
153
+ token: resources/reference_lists/en/magazine_bi_contingency.csv
154
+ lemma: resources/reference_lists/en/magazine_bigram_lemma_contingency.csv
155
+ format: csv
156
+ columns:
157
+ bigram: 0
158
+ frequency: 1
159
+ mi_score: 2
160
+ mi_2_score: 3
161
+ t_score: 4
162
+ delta_p: 5
163
+ ap_collex: 6
164
+ has_header: true
165
+ enabled: true
166
+ trigrams:
167
+ COCA_trigram_frequency:
168
+ display_name: COCA Trigram Frequency
169
+ description: Trigram frequencies and range data
170
+ files:
171
+ token: resources/reference_lists/en/COCA_spoken_trigram_list.csv
172
+ lemma: resources/reference_lists/en/COCA_spoken_trigram_list.csv
173
+ format: tsv
174
+ columns:
175
+ trigram: 0
176
+ frequency: 1
177
+ normalized_freq: 2
178
+ range: 3
179
+ dispersion: 4
180
+ has_header: false
181
+ enabled: true
182
+ COCA_trigram_assoc_uni_bi:
183
+ display_name: COCA Trigram→Bigram Associations
184
+ description: Trigram to bigram association measures
185
+ files:
186
+ token: resources/reference_lists/en/spoken_tri_contingency_1.csv
187
+ lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv
188
+ format: csv
189
+ columns:
190
+ trigram: 0
191
+ frequency: 1
192
+ mi_score: 2
193
+ mi_2_score: 3
194
+ t_score: 4
195
+ delta_p: 5
196
+ ap_collex: 6
197
+ has_header: true
198
+ enabled: true
199
+ COCA_trigram_assoc_bi_uni:
200
+ display_name: COCA Trigram→Unigram Associations
201
+ description: Trigram to unigram association measures
202
+ files:
203
+ token: resources/reference_lists/en/spoken_tri_contingency_2.csv
204
+ lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv
205
+ format: csv
206
+ columns:
207
+ trigram: 0
208
+ frequency: 1
209
+ mi_score: 2
210
+ mi_2_score: 3
211
+ t_score: 4
212
+ delta_p: 5
213
+ ap_collex: 6
214
+ has_header: true
215
+ enabled: true
216
+ japanese:
217
+ unigrams:
218
+ BCCWJ_frequency:
219
+ display_name: BCCWJ Written - Frequency
220
+ description: BCCWJ raw frequency counts for written Japanese
221
+ files:
222
+ token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
223
+ lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
224
+ format: tsv
225
+ has_header: true
226
+ enabled: true
227
+ japanese_corpus: true
228
+ columns:
229
+ surface_form: 1
230
+ lemma: 2
231
+ pos: 3
232
+ frequency: 6
233
+ BCCWJ_pmw:
234
+ display_name: BCCWJ Written - Per Million Words
235
+ description: BCCWJ normalized frequency for written Japanese
236
+ files:
237
+ token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
238
+ lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
239
+ format: tsv
240
+ has_header: true
241
+ enabled: true
242
+ japanese_corpus: true
243
+ columns:
244
+ surface_form: 1
245
+ lemma: 2
246
+ pos: 3
247
+ frequency: 7
248
+ BCCWJ_rank:
249
+ display_name: BCCWJ Written - Frequency Rank
250
+ description: BCCWJ frequency ranking for written Japanese
251
+ files:
252
+ token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
253
+ lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
254
+ format: tsv
255
+ has_header: true
256
+ enabled: true
257
+ japanese_corpus: true
258
+ columns:
259
+ surface_form: 1
260
+ lemma: 2
261
+ pos: 3
262
+ frequency: 0
263
+ CSJ_frequency:
264
+ display_name: CSJ Spoken - Frequency
265
+ description: CSJ raw frequency counts for spoken Japanese
266
+ files:
267
+ token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
268
+ lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
269
+ format: tsv
270
+ has_header: true
271
+ enabled: true
272
+ japanese_corpus: true
273
+ columns:
274
+ surface_form: 1
275
+ lemma: 2
276
+ pos: 3
277
+ frequency: 6
278
+ CSJ_pmw:
279
+ display_name: CSJ Spoken - Per Million Words
280
+ description: CSJ normalized frequency for spoken Japanese
281
+ files:
282
+ token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
283
+ lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
284
+ format: tsv
285
+ has_header: true
286
+ enabled: true
287
+ japanese_corpus: true
288
+ columns:
289
+ surface_form: 1
290
+ lemma: 2
291
+ pos: 3
292
+ frequency: 7
293
+ CSJ_rank:
294
+ display_name: CSJ Spoken - Frequency Rank
295
+ description: CSJ frequency ranking for spoken Japanese
296
+ files:
297
+ token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
298
+ lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
299
+ format: tsv
300
+ has_header: true
301
+ enabled: true
302
+ japanese_corpus: true
303
+ columns:
304
+ surface_form: 1
305
+ lemma: 2
306
+ pos: 3
307
+ frequency: 0
308
+ jp_frequency:
309
+ display_name: Japanese Frequency List
310
+ description: Frequency data for Japanese words
311
+ files:
312
+ token: resources/reference_lists/ja/jp_frequency_token.csv
313
+ lemma: resources/reference_lists/ja/jp_frequency_lemma.csv
314
+ format: csv
315
+ columns:
316
+ word: 0
317
+ frequency: 1
318
+ has_header: true
319
+ enabled: false
config/reference_lists.yaml.backup_20250727_231728 ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ english:
2
+ unigrams:
3
+ COCA_magazine_frequency:
4
+ display_name: COCA Magazine Frequency
5
+ description: Frequency and range data from COCA magazine subcorpus
6
+ files:
7
+ token: resources/reference_lists/en/COCA_magazine_unigram_list.csv
8
+ lemma: resources/reference_lists/en/COCA_magazine_unigram_list.csv
9
+ format: tsv
10
+ columns:
11
+ word: 0
12
+ frequency: 1
13
+ normalized_freq: 2
14
+ range: 3
15
+ dispersion: 4
16
+ has_header: false
17
+ enabled: true
18
+ concreteness_ratings:
19
+ display_name: Concreteness Ratings
20
+ description: Concreteness ratings for English words (1-5 scale)
21
+ files:
22
+ token: resources/reference_lists/en/Concreteness_Brysbaert.txt
23
+ lemma: resources/reference_lists/en/Concreteness_Brysbaert.txt
24
+ format: tsv
25
+ columns:
26
+ word: 0
27
+ concreteness: 1
28
+ has_header: true
29
+ header_prefix: '#'
30
+ enabled: true
31
+ academic_words:
32
+ display_name: Academic Word List
33
+ description: Common academic vocabulary for research writing
34
+ files:
35
+ token: resources/reference_lists/en/academic_words_token.csv
36
+ lemma: resources/reference_lists/en/academic_words_lemma.csv
37
+ format: csv
38
+ columns:
39
+ word: 0
40
+ frequency: 1
41
+ has_header: true
42
+ enabled: false
43
+ COCA_spoken_frequency_token:
44
+ display_name: COCA Spoken Frequency (Token)
45
+ description: Frequency and range data from COCA spoken subcorpus - token-based
46
+ analysis
47
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
48
+ format: tsv
49
+ columns:
50
+ word: 0
51
+ frequency: 1
52
+ normalized_freq: 2
53
+ range: 3
54
+ dispersion: 4
55
+ has_header: false
56
+ enabled: true
57
+ analysis_type: token
58
+ log_transformable:
59
+ - frequency
60
+ - normalized_freq
61
+ selectable_measures:
62
+ - frequency
63
+ - normalized_freq
64
+ - range
65
+ - dispersion
66
+ default_measures:
67
+ - frequency
68
+ - normalized_freq
69
+ default_log_transforms:
70
+ - frequency
71
+ - normalized_freq
72
+ COCA_spoken_frequency_lemma:
73
+ display_name: COCA Spoken Frequency (Lemma)
74
+ description: Frequency and range data from COCA spoken subcorpus - lemma-based
75
+ analysis
76
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
77
+ format: tsv
78
+ columns:
79
+ word: 0
80
+ frequency: 1
81
+ normalized_freq: 2
82
+ range: 3
83
+ dispersion: 4
84
+ has_header: false
85
+ enabled: true
86
+ analysis_type: lemma
87
+ log_transformable:
88
+ - frequency
89
+ - normalized_freq
90
+ selectable_measures:
91
+ - frequency
92
+ - normalized_freq
93
+ - range
94
+ - dispersion
95
+ default_measures:
96
+ - frequency
97
+ - normalized_freq
98
+ default_log_transforms:
99
+ - frequency
100
+ - normalized_freq
101
+ bigrams:
102
+ COCA_spoken_bigram_frequency:
103
+ display_name: COCA Spoken Bigram Frequency
104
+ description: Bigram frequencies and range data
105
+ files:
106
+ token: resources/reference_lists/en/COCA_spoken_bigram_list.csv
107
+ lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
108
+ format: tsv
109
+ columns:
110
+ bigram: 0
111
+ frequency: 1
112
+ normalized_freq: 2
113
+ documents: 3
114
+ range: 4
115
+ has_header: false
116
+ enabled: true
117
+ COCA_spoken_bigram_association:
118
+ display_name: COCA Spoken Bigram Associations
119
+ description: Bigram association measures (MI, T-score, Delta P)
120
+ files:
121
+ token: resources/reference_lists/en/spoken_bi_contingency.csv
122
+ lemma: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
123
+ format: csv
124
+ columns:
125
+ bigram: 0
126
+ frequency: 1
127
+ mi_score: 2
128
+ mi_2_score: 3
129
+ t_score: 4
130
+ delta_p: 5
131
+ ap_collex: 6
132
+ has_header: true
133
+ enabled: true
134
+ COCA_magazine_bigram_frequency:
135
+ display_name: COCA Magazine Bigram Frequency
136
+ description: Bigram frequencies and range data in Magazine
137
+ files:
138
+ token: resources/reference_lists/en/COCA_magazine_bigram_list.csv
139
+ lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
140
+ format: tsv
141
+ columns:
142
+ bigram: 0
143
+ frequency: 1
144
+ normalized_freq: 2
145
+ documents: 3
146
+ range: 4
147
+ has_header: false
148
+ enabled: true
149
+ COCA_magazine_bigram_association:
150
+ display_name: COCA Magazine Bigram Associations
151
+ description: Bigram association measures (MI, T-score, Delta P)
152
+ files:
153
+ token: resources/reference_lists/en/magazine_bi_contingency.csv
154
+ lemma: resources/reference_lists/en/magazine_bigram_lemma_contingency.csv
155
+ format: csv
156
+ columns:
157
+ bigram: 0
158
+ frequency: 1
159
+ mi_score: 2
160
+ mi_2_score: 3
161
+ t_score: 4
162
+ delta_p: 5
163
+ ap_collex: 6
164
+ has_header: true
165
+ enabled: true
166
+ trigrams:
167
+ COCA_trigram_frequency:
168
+ display_name: COCA Trigram Frequency
169
+ description: Trigram frequencies and range data
170
+ files:
171
+ token: resources/reference_lists/en/COCA_spoken_trigram_list.csv
172
+ lemma: resources/reference_lists/en/COCA_spoken_trigram_list.csv
173
+ format: tsv
174
+ columns:
175
+ trigram: 0
176
+ frequency: 1
177
+ normalized_freq: 2
178
+ range: 3
179
+ dispersion: 4
180
+ has_header: false
181
+ enabled: true
182
+ COCA_trigram_assoc_uni_bi:
183
+ display_name: COCA Trigram→Bigram Associations
184
+ description: Trigram to bigram association measures
185
+ files:
186
+ token: resources/reference_lists/en/spoken_tri_contingency_1.csv
187
+ lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv
188
+ format: csv
189
+ columns:
190
+ trigram: 0
191
+ frequency: 1
192
+ mi_score: 2
193
+ mi_2_score: 3
194
+ t_score: 4
195
+ delta_p: 5
196
+ ap_collex: 6
197
+ has_header: true
198
+ enabled: true
199
+ COCA_trigram_assoc_bi_uni:
200
+ display_name: COCA Trigram→Unigram Associations
201
+ description: Trigram to unigram association measures
202
+ files:
203
+ token: resources/reference_lists/en/spoken_tri_contingency_2.csv
204
+ lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv
205
+ format: csv
206
+ columns:
207
+ trigram: 0
208
+ frequency: 1
209
+ mi_score: 2
210
+ mi_2_score: 3
211
+ t_score: 4
212
+ delta_p: 5
213
+ ap_collex: 6
214
+ has_header: true
215
+ enabled: true
216
+ japanese:
217
+ unigrams:
218
+ BCCWJ_frequency:
219
+ display_name: BCCWJ Written - Frequency
220
+ description: BCCWJ raw frequency counts for written Japanese
221
+ files:
222
+ token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
223
+ lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
224
+ format: tsv
225
+ has_header: true
226
+ enabled: true
227
+ japanese_corpus: true
228
+ columns:
229
+ surface_form: 1
230
+ lemma: 2
231
+ pos: 3
232
+ frequency: 6
233
+ BCCWJ_pmw:
234
+ display_name: BCCWJ Written - Per Million Words
235
+ description: BCCWJ normalized frequency for written Japanese
236
+ files:
237
+ token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
238
+ lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
239
+ format: tsv
240
+ has_header: true
241
+ enabled: true
242
+ japanese_corpus: true
243
+ columns:
244
+ surface_form: 1
245
+ lemma: 2
246
+ pos: 3
247
+ frequency: 7
248
+ BCCWJ_rank:
249
+ display_name: BCCWJ Written - Frequency Rank
250
+ description: BCCWJ frequency ranking for written Japanese
251
+ files:
252
+ token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
253
+ lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
254
+ format: tsv
255
+ has_header: true
256
+ enabled: true
257
+ japanese_corpus: true
258
+ columns:
259
+ surface_form: 1
260
+ lemma: 2
261
+ pos: 3
262
+ frequency: 0
263
+ CSJ_frequency:
264
+ display_name: CSJ Spoken - Frequency
265
+ description: CSJ raw frequency counts for spoken Japanese
266
+ files:
267
+ token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
268
+ lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
269
+ format: tsv
270
+ has_header: true
271
+ enabled: true
272
+ japanese_corpus: true
273
+ columns:
274
+ surface_form: 1
275
+ lemma: 2
276
+ pos: 3
277
+ frequency: 6
278
+ CSJ_pmw:
279
+ display_name: CSJ Spoken - Per Million Words
280
+ description: CSJ normalized frequency for spoken Japanese
281
+ files:
282
+ token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
283
+ lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
284
+ format: tsv
285
+ has_header: true
286
+ enabled: true
287
+ japanese_corpus: true
288
+ columns:
289
+ surface_form: 1
290
+ lemma: 2
291
+ pos: 3
292
+ frequency: 7
293
+ CSJ_rank:
294
+ display_name: CSJ Spoken - Frequency Rank
295
+ description: CSJ frequency ranking for spoken Japanese
296
+ files:
297
+ token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
298
+ lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
299
+ format: tsv
300
+ has_header: true
301
+ enabled: true
302
+ japanese_corpus: true
303
+ columns:
304
+ surface_form: 1
305
+ lemma: 2
306
+ pos: 3
307
+ frequency: 0
308
+ jp_frequency:
309
+ display_name: Japanese Frequency List
310
+ description: Frequency data for Japanese words
311
+ files:
312
+ token: resources/reference_lists/ja/jp_frequency_token.csv
313
+ lemma: resources/reference_lists/ja/jp_frequency_lemma.csv
314
+ format: csv
315
+ columns:
316
+ word: 0
317
+ frequency: 1
318
+ has_header: true
319
+ enabled: false
text_analyzer/lexical_sophistication.py CHANGED
@@ -484,16 +484,69 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
484
 
485
  return score
486
 
487
- def analyze_text(self, text: str, selected_indices: List[str],
488
- apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  """
490
  Analyze text and return lexical sophistication scores.
491
 
492
  Args:
493
  text: Input text to analyze
494
  selected_indices: List of reference indices to apply
495
- apply_log: Whether to apply log10 transformation
496
  word_type_filter: Filter by word type ('CW', 'FW', or None for all)
 
 
 
 
 
 
497
 
498
  Returns:
499
  Dictionary containing analysis results
@@ -607,13 +660,21 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
607
  token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
608
  token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
609
 
610
- # Collect for summary statistics
611
  if token_score is not None:
612
- score_val = np.log10(token_score) if apply_log and token_score > 0 else token_score
 
 
 
 
613
  all_scores[f"{index_name}_token_{word_type}"].append(score_val)
614
 
615
  if lemma_score is not None:
616
- score_val = np.log10(lemma_score) if apply_log and lemma_score > 0 else lemma_score
 
 
 
 
617
  all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
618
 
619
  results['token_details'].append(token_detail)
@@ -664,10 +725,19 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
664
  # Get available measures
665
  available_measures = ref_data.columns[1:].tolist()
666
 
 
667
  for measure in available_measures:
 
 
 
 
668
  score = self._lookup_score(ngram, index_name, ngram_type, measure)
669
  if score is not None:
670
- score_val = np.log10(score) if apply_log and score > 0 else score
 
 
 
 
671
  ngram_detail[f"{index_name}_{measure}"] = score_val
672
  else:
673
  ngram_detail[f"{index_name}_{measure}"] = None
@@ -686,12 +756,21 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
686
  # Get available measures (all columns except the first one)
687
  available_measures = ref_data.columns[1:].tolist()
688
 
 
689
  for measure in available_measures:
 
 
 
 
690
  ngram_scores = []
691
  for ngram in ngrams:
692
  score = self._lookup_score(ngram, index_name, ngram_type, measure)
693
  if score is not None:
694
- score_val = np.log10(score) if apply_log and score > 0 else score
 
 
 
 
695
  ngram_scores.append(score_val)
696
 
697
  if ngram_scores:
 
484
 
485
  return score
486
 
487
+ def _should_apply_log_transform(self, index_name: str, analysis_type: str,
488
+ measure_name: str, log_transforms: Optional[Dict[str, List[str]]],
489
+ apply_log_fallback: bool) -> bool:
490
+ """
491
+ Determine if a specific measure should be log-transformed.
492
+
493
+ Args:
494
+ index_name: Name of the reference index
495
+ analysis_type: 'token' or 'lemma'
496
+ measure_name: Name of the measure (e.g., 'frequency', 'MI')
497
+ log_transforms: Dict mapping index names to lists of measures to log-transform
498
+ apply_log_fallback: Legacy fallback boolean
499
+
500
+ Returns:
501
+ True if the measure should be log-transformed, False otherwise
502
+ """
503
+ # If new log_transforms parameter is provided, use it
504
+ if log_transforms is not None:
505
+ index_transforms = log_transforms.get(index_name, [])
506
+ return measure_name in index_transforms
507
+
508
+ # Fallback to legacy apply_log behavior for backward compatibility
509
+ return apply_log_fallback
510
+
511
+ def _should_compute_measure(self, index_name: str, measure_name: str,
512
+ selected_measures: Optional[Dict[str, List[str]]]) -> bool:
513
+ """
514
+ Determine if a specific measure should be computed.
515
+
516
+ Args:
517
+ index_name: Name of the reference index
518
+ measure_name: Name of the measure (e.g., 'frequency', 'MI')
519
+ selected_measures: Dict mapping index names to lists of measures to compute
520
+
521
+ Returns:
522
+ True if the measure should be computed, False otherwise
523
+ """
524
+ # If selected_measures is provided, use it for filtering
525
+ if selected_measures is not None:
526
+ index_measures = selected_measures.get(index_name, [])
527
+ return measure_name in index_measures
528
+
529
+ # If not specified, compute all measures (backward compatibility)
530
+ return True
531
+
532
+ def analyze_text(self, text: str, selected_indices: List[str],
533
+ apply_log: bool = False, word_type_filter: Optional[str] = None,
534
+ log_transforms: Optional[Dict[str, List[str]]] = None,
535
+ selected_measures: Optional[Dict[str, List[str]]] = None) -> Dict:
536
  """
537
  Analyze text and return lexical sophistication scores.
538
 
539
  Args:
540
  text: Input text to analyze
541
  selected_indices: List of reference indices to apply
542
+ apply_log: Whether to apply log10 transformation (legacy parameter, superseded by log_transforms)
543
  word_type_filter: Filter by word type ('CW', 'FW', or None for all)
544
+ log_transforms: Dict mapping index names to list of measures that should be log-transformed
545
+ e.g., {'COCA_spoken_frequency_token': ['frequency', 'normalized_freq']}
546
+ If None, falls back to apply_log behavior for backward compatibility
547
+ selected_measures: Dict mapping index names to list of measures to compute
548
+ e.g., {'COCA_spoken_frequency_token': ['frequency', 'range']}
549
+ If None, computes all available measures for backward compatibility
550
 
551
  Returns:
552
  Dictionary containing analysis results
 
660
  token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
661
  token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
662
 
663
+ # Collect for summary statistics with selective log transformation
664
  if token_score is not None:
665
+ # Check if this specific measure should be log-transformed
666
+ should_log_transform = self._should_apply_log_transform(
667
+ index_name, 'token', 'frequency', log_transforms, apply_log
668
+ )
669
+ score_val = np.log10(token_score) if should_log_transform and token_score > 0 else token_score
670
  all_scores[f"{index_name}_token_{word_type}"].append(score_val)
671
 
672
  if lemma_score is not None:
673
+ # Check if this specific measure should be log-transformed
674
+ should_log_transform = self._should_apply_log_transform(
675
+ index_name, 'lemma', 'frequency', log_transforms, apply_log
676
+ )
677
+ score_val = np.log10(lemma_score) if should_log_transform and lemma_score > 0 else lemma_score
678
  all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
679
 
680
  results['token_details'].append(token_detail)
 
725
  # Get available measures
726
  available_measures = ref_data.columns[1:].tolist()
727
 
728
+ # Filter measures based on selection
729
  for measure in available_measures:
730
+ # Check if this measure should be computed
731
+ if not self._should_compute_measure(index_name, measure, selected_measures):
732
+ continue
733
+
734
  score = self._lookup_score(ngram, index_name, ngram_type, measure)
735
  if score is not None:
736
+ # Check if this measure should be log-transformed
737
+ should_log_transform = self._should_apply_log_transform(
738
+ index_name, ngram_type, measure, log_transforms, apply_log
739
+ )
740
+ score_val = np.log10(score) if should_log_transform and score > 0 else score
741
  ngram_detail[f"{index_name}_{measure}"] = score_val
742
  else:
743
  ngram_detail[f"{index_name}_{measure}"] = None
 
756
  # Get available measures (all columns except the first one)
757
  available_measures = ref_data.columns[1:].tolist()
758
 
759
+ # Filter measures based on selection and compute summary statistics
760
  for measure in available_measures:
761
+ # Check if this measure should be computed
762
+ if not self._should_compute_measure(index_name, measure, selected_measures):
763
+ continue
764
+
765
  ngram_scores = []
766
  for ngram in ngrams:
767
  score = self._lookup_score(ngram, index_name, ngram_type, measure)
768
  if score is not None:
769
+ # Check if this measure should be log-transformed
770
+ should_log_transform = self._should_apply_log_transform(
771
+ index_name, ngram_type, measure, log_transforms, apply_log
772
+ )
773
+ score_val = np.log10(score) if should_log_transform and score > 0 else score
774
  ngram_scores.append(score_val)
775
 
776
  if ngram_scores:
web_app/components/ui_components.py CHANGED
@@ -173,20 +173,82 @@ class UIComponents:
173
 
174
  @staticmethod
175
  def render_analysis_options():
176
- """Render analysis options UI."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  col1, col2 = st.columns(2)
178
 
179
  with col1:
180
- apply_log = st.checkbox("Apply log₁₀ transformation")
181
-
182
  with col2:
183
- word_type_filter = st.selectbox(
184
- "Word Type Filter",
185
- options=[None, 'CW', 'FW'],
186
- format_func=lambda x: 'All Words' if x is None else ('Content Words' if x == 'CW' else 'Function Words')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  )
 
 
 
 
188
 
189
- return apply_log, word_type_filter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  @staticmethod
192
  def display_configured_indices():
@@ -233,3 +295,53 @@ class UIComponents:
233
 
234
  if success_count == 0:
235
  st.error("No valid configurations found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  @staticmethod
175
  def render_analysis_options():
176
+ """Render enhanced analysis options UI with sophisticated hierarchical interface."""
177
+ from web_app.defaults_manager import DefaultsManager
178
+ from web_app.config_manager import ConfigManager
179
+ from web_app.session_manager import SessionManager
180
+
181
+ st.subheader("🔧 Analysis Configuration")
182
+
183
+ # Get current configuration
184
+ config = ConfigManager.load_reference_config()
185
+ reference_lists = SessionManager.get_reference_lists()
186
+
187
+ # Enhanced Reference Lists & Measures Section
188
+ st.write("### 📋 Reference Lists & Measures")
189
+
190
+ # Render the sophisticated hierarchical interface
191
+ selected_measures, log_transforms = UIComponents.render_enhanced_reference_selection(config, reference_lists)
192
+
193
+ # Global Analysis Options
194
+ st.write("### 🎯 Analysis Types")
195
  col1, col2 = st.columns(2)
196
 
197
  with col1:
198
+ token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
 
199
  with col2:
200
+ lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")
201
+
202
+ # Global Options
203
+ st.write("### ⚙️ Global Options")
204
+ word_type_filter = st.selectbox(
205
+ "Word Type Filter:",
206
+ options=[None, 'CW', 'FW'],
207
+ format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
208
+ key="word_type_filter"
209
+ )
210
+
211
+ # Advanced Configuration Section
212
+ with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
213
+ st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
214
+ "Expand this section only if you need custom control.")
215
+
216
+ # Legacy log transformation toggle
217
+ legacy_log_toggle = st.checkbox(
218
+ "Apply log₁₀ transformation to ALL measures (Legacy Mode)",
219
+ value=False,
220
+ help="⚠️ Not recommended: This applies log transformation to all measures, "
221
+ "including those where it's scientifically inappropriate (e.g., concreteness ratings).",
222
+ key="legacy_log_transform"
223
  )
224
+
225
+ if legacy_log_toggle:
226
+ st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
227
+ "This may produce scientifically invalid results for psycholinguistic measures.")
228
 
229
+ # Return enhanced configuration
230
+ return {
231
+ 'token_analysis': token_analysis,
232
+ 'lemma_analysis': lemma_analysis,
233
+ 'word_type_filter': word_type_filter,
234
+ 'selected_measures': selected_measures,
235
+ 'log_transforms': log_transforms,
236
+ 'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
237
+ 'legacy_log_transform': st.session_state.get('legacy_log_transform', False)
238
+ }
239
+
240
+ @staticmethod
241
+ def _find_entry_config(entry_name: str, config: Dict[str, Any]) -> Optional[Dict[str, Any]]:
242
+ """Find configuration entry by name."""
243
+ for language, lang_data in config.items():
244
+ if not isinstance(lang_data, dict):
245
+ continue
246
+ for ngram_type, type_data in lang_data.items():
247
+ if not isinstance(type_data, dict):
248
+ continue
249
+ if entry_name in type_data:
250
+ return type_data[entry_name]
251
+ return None
252
 
253
  @staticmethod
254
  def display_configured_indices():
 
295
 
296
  if success_count == 0:
297
  st.error("No valid configurations found")
298
+
299
+ @staticmethod
300
+ def render_enhanced_reference_selection(config: Dict[str, Any], reference_lists: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
301
+ """Render the enhanced reference list selection interface with hierarchical display."""
302
+ from web_app.defaults_manager import DefaultsManager
303
+
304
+ # Initialize return values
305
+ selected_measures = {}
306
+ log_transforms = {}
307
+
308
+ if not reference_lists:
309
+ st.info("No reference lists selected. Please configure reference lists first.")
310
+ return selected_measures, log_transforms
311
+
312
+ # Simple hierarchical display showing selected lists with smart defaults info
313
+ for list_name in reference_lists.keys():
314
+ # Show smart defaults indicator
315
+ entry_config = UIComponents._find_entry_config(list_name, config)
316
+ if entry_config and entry_config.get('default_measures'):
317
+ defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
318
+ log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
319
+
320
+ # Determine analysis type badges
321
+ analysis_badges = []
322
+ if entry_config.get('analysis_type') == 'token' or not entry_config.get('analysis_type'):
323
+ analysis_badges.append("[Token ✓]")
324
+ if entry_config.get('analysis_type') == 'lemma' or not entry_config.get('analysis_type'):
325
+ analysis_badges.append("[Lemma ✓]")
326
+
327
+ analysis_info = " ".join(analysis_badges) if analysis_badges else ""
328
+
329
+ st.write(f"├─ **{list_name}** {analysis_info} [ℹ️ Smart defaults]")
330
+ st.write(f" {defaults_info}, {log_info}")
331
+
332
+ # Apply smart defaults to return values
333
+ selected_measures[list_name] = entry_config.get('default_measures', [])
334
+ log_transforms[list_name] = entry_config.get('default_log_transforms', [])
335
+ else:
336
+ st.write(f"├─ **{list_name}** [Legacy configuration]")
337
+
338
+ return selected_measures, log_transforms
339
+
340
+ @staticmethod
341
+ def group_has_smart_defaults(group_entries: List[str], config: Dict[str, Any]) -> bool:
342
+ """Check if a group has smart defaults configured."""
343
+ for entry_name in group_entries:
344
+ entry_config = UIComponents._find_entry_config(entry_name, config)
345
+ if entry_config and entry_config.get('default_measures'):
346
+ return True
347
+ return False
web_app/config_manager.py CHANGED
@@ -162,9 +162,17 @@ class ConfigManager:
162
 
163
  @staticmethod
164
  def load_reference_list_data(list_config: Dict[str, Any]) -> Dict[str, Any]:
165
- """Load actual data for a reference list based on its configuration."""
 
 
 
 
 
166
  data = {}
167
 
 
 
 
168
  # Check if this is a Japanese corpus
169
  is_japanese_corpus = list_config.get('japanese_corpus', False)
170
 
@@ -173,7 +181,21 @@ class ConfigManager:
173
  is_bigram = 'bigram' in columns
174
  is_trigram = 'trigram' in columns
175
 
176
- for file_type, file_path in list_config.get('files', {}).items():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  if file_path is None:
178
  continue
179
 
 
162
 
163
  @staticmethod
164
  def load_reference_list_data(list_config: Dict[str, Any]) -> Dict[str, Any]:
165
+ """Load actual data for a reference list based on its configuration.
166
+
167
+ Supports both old schema (files.token/files.lemma) and new schema (single file).
168
+ """
169
+ from web_app.schema_validator import SchemaValidator
170
+
171
  data = {}
172
 
173
+ # Detect schema version for this specific entry
174
+ is_new_schema = any(field in list_config for field in SchemaValidator.NEW_SCHEMA_FIELDS)
175
+
176
  # Check if this is a Japanese corpus
177
  is_japanese_corpus = list_config.get('japanese_corpus', False)
178
 
 
181
  is_bigram = 'bigram' in columns
182
  is_trigram = 'trigram' in columns
183
 
184
+ # Handle different schema formats
185
+ if is_new_schema:
186
+ # New schema: single file with analysis_type
187
+ file_path = list_config.get('file')
188
+ analysis_type = list_config.get('analysis_type', 'token')
189
+
190
+ if file_path:
191
+ files_to_process = {analysis_type: file_path}
192
+ else:
193
+ files_to_process = {}
194
+ else:
195
+ # Old schema: files.token/files.lemma
196
+ files_to_process = list_config.get('files', {})
197
+
198
+ for file_type, file_path in files_to_process.items():
199
  if file_path is None:
200
  continue
201
 
web_app/defaults_manager.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smart Defaults Manager for Lexical Sophistication Analysis
3
+ Provides intelligent default configurations based on measure types and analysis context.
4
+ """
5
+
6
+ from typing import Dict, List, Any, Tuple, Optional
7
+ import logging
8
+ from web_app.schema_validator import SchemaValidator
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class DefaultsManager:
14
+ """Manages smart defaults for lexical sophistication analysis."""
15
+
16
+ # Define measure type patterns for intelligent classification
17
+ MEASURE_PATTERNS = {
18
+ 'frequency': ['freq', 'frequency', 'count', 'occurrence'],
19
+ 'association': ['mi', 't_score', 'delta_p', 'ap_collex', 'llr', 'dice'],
20
+ 'psycholinguistic': ['concreteness', 'valence', 'arousal', 'dominance', 'imageability', 'familiarity'],
21
+ 'range': ['range', 'documents', 'texts', 'dispersion'],
22
+ 'rank': ['rank', 'ranking', 'order'],
23
+ 'probability': ['probability', 'prob', 'likelihood']
24
+ }
25
+
26
+ # Define appropriate log transformation rules
27
+ LOG_TRANSFORM_RULES = {
28
+ 'frequency': True, # Always log-transform frequency measures
29
+ 'association': False, # Never log-transform association measures
30
+ 'psycholinguistic': False, # Never log-transform ratings/scales
31
+ 'range': False, # Never log-transform range measures
32
+ 'rank': False, # Never log-transform ranks
33
+ 'probability': False # Never log-transform probabilities
34
+ }
35
+
36
+ # Define default measure priorities (higher = more important/commonly used)
37
+ MEASURE_PRIORITIES = {
38
+ 'frequency': 100,
39
+ 'normalized_freq': 95,
40
+ 'mi': 90,
41
+ 't_score': 85,
42
+ 'concreteness': 80,
43
+ 'range': 75,
44
+ 'dispersion': 70,
45
+ 'delta_p': 65,
46
+ 'rank': 60,
47
+ 'ap_collex': 55
48
+ }
49
+
50
+ @classmethod
51
+ def classify_measure_type(cls, measure_name: str) -> str:
52
+ """
53
+ Classify a measure into its type category.
54
+
55
+ Args:
56
+ measure_name: Name of the measure to classify
57
+
58
+ Returns:
59
+ Category name ('frequency', 'association', 'psycholinguistic', 'range', 'rank', 'unknown')
60
+ """
61
+ measure_lower = measure_name.lower().strip()
62
+
63
+ for category, patterns in cls.MEASURE_PATTERNS.items():
64
+ if any(pattern in measure_lower for pattern in patterns):
65
+ return category
66
+
67
+ return 'unknown'
68
+
69
+ @classmethod
70
+ def get_smart_defaults_for_entry(cls, entry_config: Dict[str, Any]) -> Dict[str, Any]:
71
+ """
72
+ Generate smart defaults for a configuration entry.
73
+
74
+ Args:
75
+ entry_config: Configuration entry (old or new schema format)
76
+
77
+ Returns:
78
+ Dictionary with smart default fields
79
+ """
80
+ # Extract measure names from columns
81
+ columns = entry_config.get('columns', {})
82
+
83
+ # Get all non-word columns as potential measures
84
+ word_columns = {'word', 'surface_form', 'lemma', 'bigram', 'trigram', 'ngram'}
85
+ measure_names = []
86
+
87
+ for col_name, col_index in columns.items():
88
+ if col_name.lower() not in word_columns:
89
+ measure_names.append(col_name)
90
+
91
+ if not measure_names:
92
+ # Fallback: assume all columns except first are measures
93
+ if isinstance(columns, dict) and columns:
94
+ # Skip word column (typically index 0)
95
+ measure_names = [name for name, idx in columns.items() if idx != 0]
96
+
97
+ # Classify measures and generate defaults
98
+ return cls._generate_smart_defaults(measure_names)
99
+
100
+ @classmethod
101
+ def _generate_smart_defaults(cls, measure_names: List[str]) -> Dict[str, Any]:
102
+ """
103
+ Generate smart defaults based on measure classification.
104
+
105
+ Args:
106
+ measure_names: List of available measure names
107
+
108
+ Returns:
109
+ Dictionary with smart default configuration
110
+ """
111
+ # Classify each measure
112
+ measure_classifications = {}
113
+ for measure in measure_names:
114
+ measure_classifications[measure] = cls.classify_measure_type(measure)
115
+
116
+ # Determine log-transformable measures
117
+ log_transformable = []
118
+ for measure, category in measure_classifications.items():
119
+ if cls.LOG_TRANSFORM_RULES.get(category, False):
120
+ log_transformable.append(measure)
121
+
122
+ # Select default measures (prioritize by importance and type)
123
+ default_measures = cls._select_default_measures(measure_names, measure_classifications)
124
+
125
+ # Select default log transforms (intersection of defaults and log-transformable)
126
+ default_log_transforms = [m for m in default_measures if m in log_transformable]
127
+
128
+ return {
129
+ 'log_transformable': log_transformable,
130
+ 'selectable_measures': measure_names,
131
+ 'default_measures': default_measures,
132
+ 'default_log_transforms': default_log_transforms,
133
+ 'measure_classifications': measure_classifications # For debugging/UI display
134
+ }
135
+
136
+ @classmethod
137
+ def _select_default_measures(cls, measure_names: List[str],
138
+ measure_classifications: Dict[str, str]) -> List[str]:
139
+ """
140
+ Select default measures based on priority and balance.
141
+
142
+ Args:
143
+ measure_names: Available measure names
144
+ measure_classifications: Classification of each measure
145
+
146
+ Returns:
147
+ List of default measure names (typically 2-3 measures)
148
+ """
149
+ # Score measures by priority and type balance
150
+ measure_scores = {}
151
+
152
+ for measure in measure_names:
153
+ # Base score from priority list
154
+ base_score = cls.MEASURE_PRIORITIES.get(measure.lower(), 0)
155
+
156
+ # Bonus for common patterns
157
+ if any(pattern in measure.lower() for pattern in ['freq', 'frequency']):
158
+ base_score += 50
159
+ elif any(pattern in measure.lower() for pattern in ['mi', 't_score']):
160
+ base_score += 40
161
+ elif any(pattern in measure.lower() for pattern in ['concreteness', 'range']):
162
+ base_score += 30
163
+
164
+ measure_scores[measure] = base_score
165
+
166
+ # Sort by score and select top measures
167
+ sorted_measures = sorted(measure_scores.items(), key=lambda x: x[1], reverse=True)
168
+
169
+ # Select top measures with type diversity
170
+ selected = []
171
+ selected_types = set()
172
+
173
+ for measure, score in sorted_measures:
174
+ measure_type = measure_classifications[measure]
175
+
176
+ # Always include high-priority measures
177
+ if score >= 90 or len(selected) < 2:
178
+ selected.append(measure)
179
+ selected_types.add(measure_type)
180
+ # Add diverse types up to 3-4 measures
181
+ elif len(selected) < 4 and measure_type not in selected_types:
182
+ selected.append(measure)
183
+ selected_types.add(measure_type)
184
+ # Stop at 4 measures max
185
+ elif len(selected) >= 4:
186
+ break
187
+
188
+ return selected[:4] # Limit to 4 measures max
189
+
190
+ @classmethod
191
+ def get_ui_groupings(cls, config_data: Dict[str, Any]) -> Dict[str, List[str]]:
192
+ """
193
+ Generate UI groupings for reference list entries.
194
+ Groups related token/lemma entries together for display.
195
+
196
+ Args:
197
+ config_data: Full configuration data
198
+
199
+ Returns:
200
+ Dictionary mapping group names to entry lists
201
+ """
202
+ groupings = {}
203
+ processed_entries = set()
204
+
205
+ for language, lang_data in config_data.items():
206
+ if not isinstance(lang_data, dict):
207
+ continue
208
+
209
+ for ngram_type, type_data in lang_data.items():
210
+ if not isinstance(type_data, dict):
211
+ continue
212
+
213
+ for entry_name, entry_config in type_data.items():
214
+ if entry_name in processed_entries:
215
+ continue
216
+
217
+ # Check if this is a new schema entry with analysis_type
218
+ if entry_config.get('analysis_type'):
219
+ # Try to find matching token/lemma pair
220
+ base_name = entry_name.replace('_token', '').replace('_lemma', '')
221
+ token_name = f"{base_name}_token"
222
+ lemma_name = f"{base_name}_lemma"
223
+
224
+ if (token_name in type_data and lemma_name in type_data and
225
+ token_name not in processed_entries and lemma_name not in processed_entries):
226
+ # Group them together
227
+ group_key = f"{language}_{ngram_type}_{base_name}"
228
+ groupings[group_key] = {
229
+ 'display_name': base_name.replace('_', ' ').title(),
230
+ 'entries': [token_name, lemma_name],
231
+ 'type': ngram_type,
232
+ 'language': language
233
+ }
234
+ processed_entries.add(token_name)
235
+ processed_entries.add(lemma_name)
236
+ else:
237
+ # Single entry
238
+ group_key = f"{language}_{ngram_type}_{entry_name}"
239
+ groupings[group_key] = {
240
+ 'display_name': entry_config.get('display_name', entry_name),
241
+ 'entries': [entry_name],
242
+ 'type': ngram_type,
243
+ 'language': language
244
+ }
245
+ processed_entries.add(entry_name)
246
+ else:
247
+ # Old schema entry - single group
248
+ group_key = f"{language}_{ngram_type}_{entry_name}"
249
+ groupings[group_key] = {
250
+ 'display_name': entry_config.get('display_name', entry_name),
251
+ 'entries': [entry_name],
252
+ 'type': ngram_type,
253
+ 'language': language
254
+ }
255
+ processed_entries.add(entry_name)
256
+
257
+ return groupings
258
+
259
+ @classmethod
260
+ def apply_smart_defaults_to_config(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
261
+ """
262
+ Apply smart defaults to configuration entries that don't have them.
263
+
264
+ Args:
265
+ config_data: Configuration data to enhance
266
+
267
+ Returns:
268
+ Enhanced configuration data with smart defaults
269
+ """
270
+ enhanced_config = config_data.copy()
271
+
272
+ for language, lang_data in enhanced_config.items():
273
+ if not isinstance(lang_data, dict):
274
+ continue
275
+
276
+ for ngram_type, type_data in lang_data.items():
277
+ if not isinstance(type_data, dict):
278
+ continue
279
+
280
+ for entry_name, entry_config in type_data.items():
281
+ if not isinstance(entry_config, dict):
282
+ continue
283
+
284
+ # Check if entry needs smart defaults
285
+ needs_defaults = not any(field in entry_config
286
+ for field in SchemaValidator.NEW_SCHEMA_FIELDS)
287
+
288
+ if needs_defaults:
289
+ # Generate and apply smart defaults
290
+ smart_defaults = cls.get_smart_defaults_for_entry(entry_config)
291
+ entry_config.update(smart_defaults)
292
+ logger.info(f"Applied smart defaults to {entry_name}")
293
+
294
+ return enhanced_config
295
+
296
+ @classmethod
297
+ def get_default_analysis_config(cls, selected_entries: List[str],
298
+ config_data: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
299
+ """
300
+ Generate default analysis configuration for selected entries.
301
+
302
+ Args:
303
+ selected_entries: List of selected reference list entries
304
+ config_data: Full configuration data
305
+
306
+ Returns:
307
+ Tuple of (selected_measures, log_transforms) dictionaries
308
+ """
309
+ selected_measures = {}
310
+ log_transforms = {}
311
+
312
+ for entry_name in selected_entries:
313
+ # Find the entry in config
314
+ entry_config = None
315
+ for language, lang_data in config_data.items():
316
+ if not isinstance(lang_data, dict):
317
+ continue
318
+ for ngram_type, type_data in lang_data.items():
319
+ if not isinstance(type_data, dict):
320
+ continue
321
+ if entry_name in type_data:
322
+ entry_config = type_data[entry_name]
323
+ break
324
+ if entry_config:
325
+ break
326
+
327
+ if not entry_config:
328
+ continue
329
+
330
+ # Get defaults from config or generate them
331
+ if entry_config.get('default_measures'):
332
+ selected_measures[entry_name] = entry_config['default_measures']
333
+ else:
334
+ # Generate smart defaults
335
+ defaults = cls.get_smart_defaults_for_entry(entry_config)
336
+ selected_measures[entry_name] = defaults['default_measures']
337
+
338
+ if entry_config.get('default_log_transforms'):
339
+ log_transforms[entry_name] = entry_config['default_log_transforms']
340
+ else:
341
+ # Generate smart defaults
342
+ defaults = cls.get_smart_defaults_for_entry(entry_config)
343
+ log_transforms[entry_name] = defaults['default_log_transforms']
344
+
345
+ return selected_measures, log_transforms
346
+
347
+
348
def test_smart_defaults():
    """Test the smart defaults functionality."""

    print("=== TESTING SMART DEFAULTS ENGINE ===")

    # Exercise the measure classifier on a representative mix of measures.
    print("\n📊 Measure Classification:")
    for name in ('frequency', 'MI', 'concreteness', 'range', 'delta_p', 'normalized_freq'):
        kind = DefaultsManager.classify_measure_type(name)
        log_flag = DefaultsManager.LOG_TRANSFORM_RULES.get(kind, False)
        print(f"  {name} → {kind} (log: {log_flag})")

    # Frequency-style reference list: defaults should favour log transforms.
    print("\n🎯 Smart Defaults Generation:")
    freq_entry = {
        'columns': {
            'word': 0,
            'frequency': 1,
            'normalized_freq': 2,
            'range': 3,
            'dispersion': 4,
        }
    }
    defaults = DefaultsManager.get_smart_defaults_for_entry(freq_entry)
    print(f"  Log transformable: {defaults['log_transformable']}")
    print(f"  Default measures: {defaults['default_measures']}")
    print(f"  Default log transforms: {defaults['default_log_transforms']}")

    # Association-style reference list: defaults should favour MI/T-score.
    print("\n🔗 Association Measures Test:")
    assoc_entry = {
        'columns': {
            'bigram': 0,
            'frequency': 1,
            'MI': 2,
            'T': 3,
            'delta_p': 4,
        }
    }
    assoc_defaults = DefaultsManager.get_smart_defaults_for_entry(assoc_entry)
    print(f"  Log transformable: {assoc_defaults['log_transformable']}")
    print(f"  Default measures: {assoc_defaults['default_measures']}")
    print(f"  Default log transforms: {assoc_defaults['default_log_transforms']}")

    print("\n✅ Smart Defaults Engine working perfectly!")
    return defaults, assoc_defaults
398
+
399
+
400
# Allow running this module directly as a quick smoke test of the
# smart-defaults engine.
if __name__ == "__main__":
    test_smart_defaults()
web_app/handlers/analysis_handlers.py CHANGED
@@ -71,8 +71,17 @@ class AnalysisHandlers:
71
  ReferenceManager.configure_reference_lists(analyzer)
72
  ReferenceManager.render_custom_upload_section()
73
 
74
- # Analysis options
75
- apply_log, word_type_filter = UIComponents.render_analysis_options()
 
 
 
 
 
 
 
 
 
76
 
77
  # Analysis button
78
  if st.button("Analyze Text", type="primary"):
@@ -86,13 +95,41 @@ class AnalysisHandlers:
86
  # Load reference lists
87
  analyzer.load_reference_lists(reference_lists)
88
 
89
- # Perform analysis
90
- results = analyzer.analyze_text(
91
- text_content,
92
- list(reference_lists.keys()),
93
- apply_log,
94
- word_type_filter
95
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Display results
98
  AnalysisHandlers.display_single_text_results(results)
@@ -406,4 +443,83 @@ class AnalysisHandlers:
406
  bargap=0.05
407
  )
408
 
409
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  ReferenceManager.configure_reference_lists(analyzer)
72
  ReferenceManager.render_custom_upload_section()
73
 
74
+ # Enhanced analysis options with smart defaults
75
+ analysis_config = AnalysisHandlers.render_enhanced_analysis_options()
76
+
77
+ # Extract configuration
78
+ token_analysis = analysis_config['token_analysis']
79
+ lemma_analysis = analysis_config['lemma_analysis']
80
+ word_type_filter = analysis_config['word_type_filter']
81
+ use_smart_defaults = analysis_config['use_smart_defaults']
82
+ legacy_log_transform = analysis_config.get('legacy_log_transform', False)
83
+ selected_measures = analysis_config.get('selected_measures', {})
84
+ log_transforms = analysis_config.get('log_transforms', {})
85
 
86
  # Analysis button
87
  if st.button("Analyze Text", type="primary"):
 
95
  # Load reference lists
96
  analyzer.load_reference_lists(reference_lists)
97
 
98
+ # Get analysis configuration
99
+ if use_smart_defaults:
100
+ # Use smart defaults from configuration
101
+ from web_app.defaults_manager import DefaultsManager
102
+ from web_app.config_manager import ConfigManager
103
+
104
+ config = ConfigManager.load_reference_config()
105
+ selected_measures, log_transforms = DefaultsManager.get_default_analysis_config(
106
+ list(reference_lists.keys()), config
107
+ )
108
+
109
+ # Perform enhanced analysis with smart defaults
110
+ results = analyzer.analyze_text(
111
+ text_content,
112
+ list(reference_lists.keys()),
113
+ apply_log=False, # Superseded by log_transforms
114
+ word_type_filter=word_type_filter,
115
+ log_transforms=log_transforms,
116
+ selected_measures=selected_measures
117
+ )
118
+
119
+ st.success("✨ Analysis completed using Smart Defaults!")
120
+ st.info(f"📊 Applied selective log transforms to {sum(len(measures) for measures in log_transforms.values())} measures")
121
+
122
+ else:
123
+ # Legacy mode - use global log transformation
124
+ results = analyzer.analyze_text(
125
+ text_content,
126
+ list(reference_lists.keys()),
127
+ apply_log=legacy_log_transform,
128
+ word_type_filter=word_type_filter
129
+ )
130
+
131
+ if legacy_log_transform:
132
+ st.warning("⚠️ Legacy mode: Log transformation applied to ALL measures")
133
 
134
  # Display results
135
  AnalysisHandlers.display_single_text_results(results)
 
443
  bargap=0.05
444
  )
445
 
446
+ st.plotly_chart(fig, use_container_width=True)
447
+
448
    @staticmethod
    def render_enhanced_analysis_options():
        """Render the enhanced analysis interface with smart defaults and hierarchical display.

        Returns a dict with keys: token_analysis, lemma_analysis,
        word_type_filter, use_smart_defaults, legacy_log_transform,
        selected_measures, log_transforms. The last two are returned empty
        and are expected to be populated later by the smart-defaults path.
        """
        # NOTE(review): DefaultsManager is imported but not referenced in this
        # method — confirm whether it is needed or can be dropped.
        from web_app.defaults_manager import DefaultsManager
        from web_app.config_manager import ConfigManager
        from web_app.session_manager import SessionManager

        st.subheader("🔧 Analysis Configuration")

        # Get current configuration and the reference lists already loaded
        # into the session.
        config = ConfigManager.load_reference_config()
        reference_lists = SessionManager.get_reference_lists()

        # Enhanced Reference Lists & Measures Section
        st.write("### 📋 Reference Lists & Measures")

        # Simple hierarchical display for now (basic implementation):
        # one line per list, annotated when smart defaults are configured.
        if reference_lists:
            st.write("**Selected Reference Lists:**")
            for list_name in reference_lists.keys():
                # Show smart defaults indicator when the YAML entry carries
                # a default_measures field (new schema).
                entry_config = UIComponents._find_entry_config(list_name, config)
                if entry_config and entry_config.get('default_measures'):
                    defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
                    log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
                    st.write(f"├─ **{list_name}** [Token ✓] [Lemma ✓] [ℹ️ Smart defaults]")
                    st.write(f"   {defaults_info}, {log_info}")
                else:
                    st.write(f"├─ **{list_name}** [Legacy configuration]")
        else:
            st.info("No reference lists selected. Please configure reference lists first.")

        # Global Analysis Options: choose token- and/or lemma-based analysis.
        st.write("### 🎯 Analysis Types")
        col1, col2 = st.columns(2)

        with col1:
            token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
        with col2:
            lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")

        # Global Options: optional content-word / function-word filter.
        st.write("### ⚙️ Global Options")
        word_type_filter = st.selectbox(
            "Word Type Filter:",
            options=[None, 'CW', 'FW'],
            format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
            key="word_type_filter"
        )

        # Advanced Configuration Section (collapsed by default; smart
        # defaults are the recommended path).
        with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
            st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
                    "Expand this section only if you need custom control.")

            # Legacy log transformation toggle. The widget writes its value
            # into st.session_state['legacy_log_transform'] via the key.
            legacy_log_toggle = st.checkbox(
                "Apply log₁₀ transformation to ALL measures (Legacy Mode)",
                value=False,
                help="⚠️ Not recommended: This applies log transformation to all measures, "
                "including those where it's scientifically inappropriate (e.g., concreteness ratings).",
                key="legacy_log_transform"
            )

            if legacy_log_toggle:
                st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
                           "This may produce scientifically invalid results for psycholinguistic measures.")

        # Return enhanced configuration. The session-state reads below mirror
        # the checkbox above (same key), so use_smart_defaults is simply the
        # inverse of the legacy toggle.
        return {
            'token_analysis': token_analysis,
            'lemma_analysis': lemma_analysis,
            'word_type_filter': word_type_filter,
            'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
            'legacy_log_transform': st.session_state.get('legacy_log_transform', False),
            'selected_measures': {},  # Will be filled by smart defaults
            'log_transforms': {}  # Will be filled by smart defaults
        }
web_app/reference_manager.py CHANGED
@@ -64,7 +64,7 @@ class ReferenceManager:
64
  def _update_default_reference_lists(selected_lists: List[tuple]):
65
  """Update default reference lists based on selections."""
66
  current_keys = set(SessionManager.get_reference_lists().keys())
67
- new_keys = set(f"{ngram_type}_{list_key}" for ngram_type, list_key, _ in selected_lists)
68
 
69
  # Remove deselected lists (only default lists, not custom ones)
70
  for key in current_keys - new_keys:
@@ -75,14 +75,13 @@ class ReferenceManager:
75
 
76
  # Add newly selected lists
77
  for ngram_type, list_key, list_config in selected_lists:
78
- combined_key = f"{ngram_type}_{list_key}"
79
-
80
- if combined_key not in SessionManager.get_reference_lists():
81
  # Load the actual data
82
  data = ConfigManager.load_reference_list_data(list_config)
83
 
84
  if data:
85
- SessionManager.add_reference_list(combined_key, data)
86
 
87
  @staticmethod
88
  def _display_loaded_lists():
@@ -221,4 +220,4 @@ class ReferenceManager:
221
  'data_size': len(data.get('token', {})) if isinstance(data.get('token'), dict) else 0
222
  }
223
 
224
- return config
 
64
  def _update_default_reference_lists(selected_lists: List[tuple]):
65
  """Update default reference lists based on selections."""
66
  current_keys = set(SessionManager.get_reference_lists().keys())
67
+ new_keys = set(list_key for ngram_type, list_key, _ in selected_lists) # Use list_key directly
68
 
69
  # Remove deselected lists (only default lists, not custom ones)
70
  for key in current_keys - new_keys:
 
75
 
76
  # Add newly selected lists
77
  for ngram_type, list_key, list_config in selected_lists:
78
+ # Use the YAML entry name directly (list_key) instead of combining with ngram_type
79
+ if list_key not in SessionManager.get_reference_lists():
 
80
  # Load the actual data
81
  data = ConfigManager.load_reference_list_data(list_config)
82
 
83
  if data:
84
+ SessionManager.add_reference_list(list_key, data) # Use list_key directly
85
 
86
  @staticmethod
87
  def _display_loaded_lists():
 
220
  'data_size': len(data.get('token', {})) if isinstance(data.get('token'), dict) else 0
221
  }
222
 
223
+ return config
web_app/schema_migrator.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Schema Migration Tool for Reference Lists Configuration
3
+ Converts old schema entries to new schema format.
4
+ """
5
+
6
+ import yaml
7
+ from typing import Dict, Any, List, Tuple
8
+ from pathlib import Path
9
+ import logging
10
+ from web_app.schema_validator import SchemaValidator
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class SchemaMigrator:
    """Handles migration from old schema to new schema format."""

    @classmethod
    def migrate_single_entry(cls, entry_name: str, entry_config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Migrate a single old schema entry to new schema format.

        Args:
            entry_name: Name of the entry to migrate
            entry_config: Old schema configuration

        Returns:
            List of new schema entries (one for each analysis type)
        """
        source_files = entry_config.get('files', {})
        columns = entry_config.get('columns', {})
        word_index = columns.get('word', 0)

        # Every column except the word column is treated as a measure.
        measures = [name for name, idx in columns.items()
                    if name != 'word' and idx != word_index]

        converted = []
        for analysis_type in ('token', 'lemma'):
            if analysis_type not in source_files:
                continue

            # Base fields carried over from the old entry.
            migrated = {
                'display_name': f"{entry_config.get('display_name', entry_name)} ({analysis_type.title()})",
                'description': f"{entry_config.get('description', '')} - {analysis_type}-based analysis",
                'file': source_files[analysis_type],
                'format': entry_config.get('format', 'tsv'),
                'columns': columns.copy(),
                'has_header': entry_config.get('has_header', False),
                'enabled': entry_config.get('enabled', True),
            }

            # Smart defaults supply the new-schema fields (analysis_type,
            # selectable/default measures, log-transform lists).
            migrated.update(
                SchemaValidator.create_default_new_schema_fields(measures, analysis_type))

            # Optional fields are only copied when present in the source.
            if 'header_prefix' in entry_config:
                migrated['header_prefix'] = entry_config['header_prefix']
            if entry_config.get('japanese_corpus', False):
                migrated['japanese_corpus'] = True

            converted.append({
                'name': f"{entry_name}_{analysis_type}",
                'config': migrated,
            })

        return converted

    @classmethod
    def create_test_migration(cls, config_data: Dict[str, Any],
                              entry_path: Tuple[str, str, str]) -> Dict[str, Any]:
        """
        Create a test migration for a specific entry without modifying the original.

        Args:
            config_data: Full configuration data
            entry_path: Tuple of (language, ngram_type, entry_name)

        Returns:
            Dictionary with migrated configuration
        """
        language, ngram_type, entry_name = entry_path
        source_entry = config_data[language][ngram_type][entry_name]

        migrated = cls.migrate_single_entry(entry_name, source_entry)

        return {
            'original_entry': {
                'path': f"{language}/{ngram_type}/{entry_name}",
                'config': source_entry,
            },
            # Map each migrated entry name to its new-schema configuration.
            'migrated_entries': {item['name']: item['config'] for item in migrated},
            'migration_summary': {
                'entries_created': len(migrated),
                'schema_version': 'new',
            },
        }
123
+
124
+
125
def test_migration():
    """Test migration functionality."""
    from web_app.schema_validator import load_and_validate_config

    # Load and validate the current configuration file.
    config_data, validation_results = load_and_validate_config("config/reference_lists.yaml")
    if not validation_results['is_valid']:
        print("❌ Invalid configuration file")
        return

    # Migrate one known old-schema entry without touching the original.
    test_result = SchemaMigrator.create_test_migration(
        config_data, ('english', 'unigrams', 'COCA_spoken_frequency'))

    print("=== MIGRATION TEST RESULTS ===")
    print(f"Original entry: {test_result['original_entry']['path']}")
    print(f"Entries created: {test_result['migration_summary']['entries_created']}")
    print("\n=== MIGRATED ENTRIES ===")

    # Report the key fields of every migrated entry.
    report_fields = (
        ('Display Name', 'display_name'),
        ('Analysis Type', 'analysis_type'),
        ('File', 'file'),
        ('Selectable Measures', 'selectable_measures'),
        ('Default Measures', 'default_measures'),
        ('Log Transformable', 'log_transformable'),
        ('Default Log Transforms', 'default_log_transforms'),
    )
    for entry_name, entry_config in test_result['migrated_entries'].items():
        print(f"\n🔄 {entry_name}:")
        for label, key in report_fields:
            print(f"  - {label}: {entry_config[key]}")

    return test_result
158
+
159
+
160
# Run the migration smoke test when this module is executed directly.
if __name__ == "__main__":
    test_migration()
web_app/schema_validator.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ YAML Schema Validator for Reference Lists Configuration
3
+ Handles detection and validation of old vs new schema formats.
4
+ """
5
+
6
+ import yaml
7
+ from typing import Dict, Any, List, Optional, Tuple
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class SchemaValidator:
    """Validates and detects YAML schema formats for reference lists."""

    # New schema required fields
    NEW_SCHEMA_FIELDS = {
        'analysis_type',
        'log_transformable',
        'selectable_measures',
        'default_measures',
        'default_log_transforms'
    }

    # Old schema indicator fields
    OLD_SCHEMA_FIELDS = {
        'files'  # Old schema uses files.token/files.lemma
    }

    # Association-measure names that are too short for substring matching:
    # 'mi' occurs inside 'dominance', 'familiarity', etc., so these names
    # must be compared with an exact (case-insensitive) match instead.
    _EXACT_ASSOCIATION_NAMES = {'mi', 'mi2', 't', 't_score', 'delta_p', 'ap_collex'}

    @classmethod
    def detect_schema_version(cls, config_data: Dict[str, Any]) -> str:
        """
        Detect whether configuration uses old or new schema.

        Args:
            config_data: Parsed YAML configuration data

        Returns:
            'old', 'new', or 'mixed' schema version
        """
        old_count = 0
        new_count = 0

        # Check all language/type/entry combinations
        for language, lang_data in config_data.items():
            if not isinstance(lang_data, dict):
                continue

            for ngram_type, type_data in lang_data.items():
                if not isinstance(type_data, dict):
                    continue

                for entry_name, entry_config in type_data.items():
                    if not isinstance(entry_config, dict):
                        continue

                    # Check for old schema indicators
                    if any(field in entry_config for field in cls.OLD_SCHEMA_FIELDS):
                        old_count += 1

                    # Check for new schema indicators
                    if any(field in entry_config for field in cls.NEW_SCHEMA_FIELDS):
                        new_count += 1

        if old_count > 0 and new_count == 0:
            return 'old'
        elif new_count > 0 and old_count == 0:
            return 'new'
        elif old_count > 0 and new_count > 0:
            return 'mixed'
        else:
            # Default assumption if no clear indicators
            return 'old'

    @classmethod
    def validate_old_schema(cls, entry_config: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate old schema entry format.

        Args:
            entry_config: Single entry configuration

        Returns:
            Tuple of (is_valid, error_messages)
        """
        errors = []

        # Required fields for old schema
        required_fields = {'display_name', 'description', 'files', 'format', 'columns', 'enabled'}

        for field in required_fields:
            if field not in entry_config:
                errors.append(f"Missing required field: {field}")

        # Validate files structure: must be a dict with at least one of
        # the two supported analysis types.
        if 'files' in entry_config:
            files = entry_config['files']
            if not isinstance(files, dict):
                errors.append("'files' must be a dictionary")
            else:
                if 'token' not in files and 'lemma' not in files:
                    errors.append("'files' must contain at least 'token' or 'lemma'")

        # Validate columns structure
        if 'columns' in entry_config:
            columns = entry_config['columns']
            if not isinstance(columns, dict):
                errors.append("'columns' must be a dictionary")

        return len(errors) == 0, errors

    @classmethod
    def validate_new_schema(cls, entry_config: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate new schema entry format.

        Args:
            entry_config: Single entry configuration

        Returns:
            Tuple of (is_valid, error_messages)
        """
        errors = []

        # Required fields for new schema
        required_fields = {
            'display_name', 'description', 'file', 'format', 'columns',
            'enabled', 'analysis_type', 'log_transformable',
            'selectable_measures', 'default_measures', 'default_log_transforms'
        }

        for field in required_fields:
            if field not in entry_config:
                errors.append(f"Missing required field: {field}")

        # Validate analysis_type
        if 'analysis_type' in entry_config:
            analysis_type = entry_config['analysis_type']
            if analysis_type not in ['token', 'lemma']:
                errors.append(f"'analysis_type' must be 'token' or 'lemma', got: {analysis_type}")

        # Validate list fields
        list_fields = ['log_transformable', 'selectable_measures', 'default_measures', 'default_log_transforms']
        for field in list_fields:
            if field in entry_config:
                value = entry_config[field]
                if not isinstance(value, list):
                    errors.append(f"'{field}' must be a list, got: {type(value).__name__}")

        # Validate file field (single file path instead of files dict)
        if 'file' in entry_config:
            file_path = entry_config['file']
            if not isinstance(file_path, str):
                errors.append("'file' must be a string path")

        return len(errors) == 0, errors

    @classmethod
    def get_schema_migration_plan(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate a migration plan for converting old schema to new schema.

        Args:
            config_data: Current configuration data

        Returns:
            Dictionary containing migration plan details
        """
        schema_version = cls.detect_schema_version(config_data)

        migration_plan = {
            'current_schema': schema_version,
            'requires_migration': schema_version in ['old', 'mixed'],
            'entries_to_migrate': [],
            'entries_to_split': [],
            'new_entries_count': 0
        }

        if not migration_plan['requires_migration']:
            return migration_plan

        # Analyze entries that need migration
        for language, lang_data in config_data.items():
            if not isinstance(lang_data, dict):
                continue

            for ngram_type, type_data in lang_data.items():
                if not isinstance(type_data, dict):
                    continue

                for entry_name, entry_config in type_data.items():
                    if not isinstance(entry_config, dict):
                        continue

                    # Check if this entry uses old schema
                    if 'files' in entry_config:
                        files = entry_config['files']
                        if isinstance(files, dict):
                            # Each token/lemma file becomes its own entry
                            # after migration.
                            file_count = len([k for k in files.keys() if k in ['token', 'lemma']])

                            migration_plan['entries_to_migrate'].append({
                                'language': language,
                                'type': ngram_type,
                                'name': entry_name,
                                'files': list(files.keys()),
                                'will_create': file_count
                            })

                            migration_plan['new_entries_count'] += file_count

        return migration_plan

    @classmethod
    def create_default_new_schema_fields(cls, measure_names: List[str],
                                         analysis_type: str = 'token') -> Dict[str, Any]:
        """
        Create default values for new schema fields based on measure names.

        Args:
            measure_names: List of available measure names from columns
            analysis_type: 'token' or 'lemma'

        Returns:
            Dictionary with default new schema fields
        """
        # Classify measures by name so defaults can be chosen sensibly.
        frequency_measures = []
        association_measures = []
        psycholinguistic_measures = []

        for measure in measure_names:
            measure_lower = measure.lower()
            if any(freq_term in measure_lower for freq_term in ['freq', 'frequency', 'count']):
                frequency_measures.append(measure)
            elif (measure_lower in cls._EXACT_ASSOCIATION_NAMES
                  or any(assoc_term in measure_lower
                         for assoc_term in ['t_score', 'delta_p', 'ap_collex'])):
                # Bug fix: the previous check used `'mi' in measure_lower`,
                # which misclassified names like 'dominance' or 'familiarity'
                # (both contain the substring 'mi') as association measures.
                association_measures.append(measure)
            elif any(psych_term in measure_lower
                     for psych_term in ['concreteness', 'valence', 'arousal', 'dominance']):
                psycholinguistic_measures.append(measure)
            # Unknown measures are left unclassified (no log transform).

        # Only frequency measures should be log-transformed.
        log_transformable = frequency_measures
        selectable_measures = measure_names

        # Smart default selection
        if frequency_measures:
            default_measures = frequency_measures[:2]  # First 2 frequency measures
        elif association_measures:
            # Prefer MI and T-score style measures for associations
            default_measures = [m for m in association_measures
                                if m.lower() in ('mi', 'mi2') or 't_score' in m.lower()][:2]
        else:
            # Fall back to the first two available measures.
            default_measures = measure_names[:2]

        # Default log transforms (only for frequency measures)
        default_log_transforms = [m for m in default_measures if m in frequency_measures]

        return {
            'analysis_type': analysis_type,
            'log_transformable': log_transformable,
            'selectable_measures': selectable_measures,
            'default_measures': default_measures,
            'default_log_transforms': default_log_transforms
        }
267
+
268
+
269
def load_and_validate_config(config_path: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Load and validate YAML configuration file.

    Args:
        config_path: Path to YAML configuration file

    Returns:
        Tuple of (config_data, validation_results)
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            # Bug fix: yaml.safe_load returns None for an empty file, which
            # would crash detect_schema_version (.items() on None). Normalise
            # to an empty dict so validation degrades gracefully.
            config_data = yaml.safe_load(f) or {}

        schema_version = SchemaValidator.detect_schema_version(config_data)
        migration_plan = SchemaValidator.get_schema_migration_plan(config_data)

        validation_results = {
            'schema_version': schema_version,
            'migration_plan': migration_plan,
            'is_valid': True,
            'errors': []
        }

        return config_data, validation_results

    except Exception as e:
        # Broad catch is deliberate: any load failure (missing file, bad
        # YAML, wrong top-level type) is reported through the results dict
        # instead of being raised to the caller.
        logger.error(f"Error loading config file {config_path}: {e}")
        return {}, {
            'schema_version': 'unknown',
            'migration_plan': {},
            'is_valid': False,
            'errors': [str(e)]
        }
303
+
304
+
305
# Smoke test: validate the project's default reference-list configuration
# when this module is executed directly.
if __name__ == "__main__":
    # Test the validator
    config_data, validation_results = load_and_validate_config("config/reference_lists.yaml")
    print(f"Schema version: {validation_results['schema_version']}")
    print(f"Migration plan: {validation_results['migration_plan']}")