egumasa committed on
Commit
9bdec5c
·
1 Parent(s): 5ac114d

updated reference lists

Browse files
config/reference_lists.yaml CHANGED
@@ -67,6 +67,67 @@ english:
67
  range: range
68
  dispersion: range
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  concreteness_ratings_token:
71
  display_name: Concreteness Ratings (Token)
72
  description: Concreteness ratings for English words (1-5 scale) - token-based
@@ -108,6 +169,107 @@ english:
108
  measure_classifications:
109
  concreteness: psycholinguistic
110
  header_prefix: '#'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  academic_words_token:
112
  display_name: Academic Word List (Token)
113
  description: Common academic vocabulary for research writing - token-based analysis
@@ -129,6 +291,7 @@ english:
129
  - frequency
130
  measure_classifications:
131
  frequency: frequency
 
132
  academic_words_lemma:
133
  display_name: Academic Word List (Lemma)
134
  description: Common academic vocabulary for research writing - lemma-based analysis
@@ -148,64 +311,7 @@ english:
148
  - frequency
149
  measure_classifications:
150
  frequency: frequency
151
- COCA_spoken_frequency_token:
152
- display_name: COCA Spoken Frequency (Token)
153
- description: Frequency and range data from COCA spoken subcorpus - token-based
154
- analysis
155
- file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
156
- format: tsv
157
- columns:
158
- word: 0
159
- frequency: 1
160
- normalized_freq: 2
161
- range: 3
162
- dispersion: 4
163
- has_header: false
164
- enabled: true
165
- analysis_type: token
166
- log_transformable:
167
- - frequency
168
- - normalized_freq
169
- selectable_measures:
170
- - frequency
171
- - normalized_freq
172
- - range
173
- - dispersion
174
- default_measures:
175
- - frequency
176
- - normalized_freq
177
- default_log_transforms:
178
- - frequency
179
- - normalized_freq
180
- COCA_spoken_frequency_lemma:
181
- display_name: COCA Spoken Frequency (Lemma)
182
- description: Frequency and range data from COCA spoken subcorpus - lemma-based
183
- analysis
184
- file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
185
- format: tsv
186
- columns:
187
- word: 0
188
- frequency: 1
189
- normalized_freq: 2
190
- range: 3
191
- dispersion: 4
192
- has_header: false
193
- enabled: true
194
- analysis_type: lemma
195
- log_transformable:
196
- - frequency
197
- - normalized_freq
198
- selectable_measures:
199
- - frequency
200
- - normalized_freq
201
- - range
202
- - dispersion
203
- default_measures:
204
- - frequency
205
- - normalized_freq
206
- default_log_transforms:
207
- - frequency
208
- - normalized_freq
209
  bigrams:
210
  COCA_spoken_bigram_frequency_token:
211
  display_name: COCA Spoken Bigram Frequency (Token)
@@ -317,7 +423,7 @@ english:
317
  analysis
318
  file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
319
  format: csv
320
- columns:
321
  bigram: 0
322
  frequency: 1
323
  mi_score: 5
@@ -461,7 +567,7 @@ english:
461
  format: csv
462
  columns: *id007
463
  has_header: true
464
- enabled: true
465
  analysis_type: lemma
466
  log_transformable:
467
  - frequency
@@ -592,7 +698,7 @@ english:
592
  format: csv
593
  columns: *id009
594
  has_header: true
595
- enabled: true
596
  analysis_type: lemma
597
  log_transformable:
598
  - frequency
@@ -659,7 +765,7 @@ english:
659
  format: csv
660
  columns: *id010
661
  has_header: true
662
- enabled: true
663
  analysis_type: lemma
664
  log_transformable:
665
  - frequency
 
67
  range: range
68
  dispersion: range
69
 
70
+ COCA_spoken_frequency_token:
71
+ display_name: COCA Spoken Frequency (Token)
72
+ description: Frequency and range data from COCA spoken subcorpus - token-based
73
+ analysis
74
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
75
+ format: tsv
76
+ columns:
77
+ word: 0
78
+ frequency: 1
79
+ normalized_freq: 2
80
+ range: 3
81
+ dispersion: 4
82
+ has_header: false
83
+ enabled: true
84
+ analysis_type: token
85
+ log_transformable:
86
+ - frequency
87
+ - normalized_freq
88
+ selectable_measures:
89
+ - frequency
90
+ - normalized_freq
91
+ - range
92
+ - dispersion
93
+ default_measures:
94
+ - frequency
95
+ - normalized_freq
96
+ default_log_transforms:
97
+ - frequency
98
+ - normalized_freq
99
+
100
+ COCA_spoken_frequency_lemma:
101
+ display_name: COCA Spoken Frequency (Lemma)
102
+ description: Frequency and range data from COCA spoken subcorpus - lemma-based
103
+ analysis
104
+ file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
105
+ format: tsv
106
+ columns:
107
+ word: 0
108
+ frequency: 1
109
+ normalized_freq: 2
110
+ range: 3
111
+ dispersion: 4
112
+ has_header: false
113
+ enabled: true
114
+ analysis_type: lemma
115
+ log_transformable:
116
+ - frequency
117
+ - normalized_freq
118
+ selectable_measures:
119
+ - frequency
120
+ - normalized_freq
121
+ - range
122
+ - dispersion
123
+ default_measures:
124
+ - frequency
125
+ - normalized_freq
126
+ default_log_transforms:
127
+ - frequency
128
+ - normalized_freq
129
+
130
+ # Psycholinguistic norm
131
  concreteness_ratings_token:
132
  display_name: Concreteness Ratings (Token)
133
  description: Concreteness ratings for English words (1-5 scale) - token-based
 
169
  measure_classifications:
170
  concreteness: psycholinguistic
171
  header_prefix: '#'
172
+
173
+ aoa_ratings_token:
174
+ display_name: Age of Acquisition (AOA) Ratings (Lemma)
175
+ description: Age of Acquisition (AOA) ratings for English words - lemma-based
176
+ analysis
177
+ file: resources/reference_lists/en/AoA_Brysbart.txt
178
+ format: tsv
179
+ columns:
180
+ word: 0
181
+ AOA: 1
182
+ has_header: true
183
+ enabled: true
184
+ analysis_type: lemma
185
+ log_transformable: []
186
+ selectable_measures:
187
+ - AOA
188
+ default_measures:
189
+ - AOA
190
+ default_log_transforms: []
191
+ measure_classifications:
192
+ concreteness: AOA
193
+ header_prefix: '#'
194
+
195
+ aoe_ratings_lemma:
196
+ display_name: Age of Exposure (AOE) (Lemma)
197
+ description: Age of Exposure (AOE) based on K-12 textbooks - lemma-based
198
+ analysis
199
+ file: resources/reference_lists/en/AOE_words_dictionary.csv
200
+ format: tsv
201
+ columns: &id018
202
+ word: 0
203
+ InverseAverage: 1
204
+ InverseLinearRegressionSlope: 2
205
+ IndexAboveThreshold40: 3
206
+ InflectionPointPolynomial: 4
207
+ has_header: true
208
+ enabled: true
209
+ analysis_type: lemma
210
+ log_transformable: []
211
+ selectable_measures:
212
+ - InverseAverage
213
+ - InverseLinearRegressionSlope
214
+ - IndexAboveThreshold40
215
+ - InflectionPointPolynomial
216
+ default_measures:
217
+ - InflectionPointPolynomial
218
+ measure_classifications:
219
+ InverseAverage: psycholinguistic
220
+ InverseLinearRegressionSlope: psycholinguistic
221
+ IndexAboveThreshold40: psycholinguistic
222
+ InflectionPointPolynomial: psycholinguistic
223
+ header_prefix: '#'
224
+ default_log_transforms: []
225
+ measure_classifications:
226
+ InflectionPointPolynomial: AOE
227
+ IndexAboveThreshold40: AOE
228
+ header_prefix: '#'
229
+
230
+ semd_token:
231
+ display_name: SemD (Token)
232
+ description: Semantic Diversity (SemD) for English words - token-based
233
+ analysis
234
+ file: resources/reference_lists/en/SemD.txt
235
+ format: tsv
236
+ columns: &id019
237
+ word: 0
238
+ semd: 1
239
+ has_header: true
240
+ enabled: true
241
+ analysis_type: token
242
+ log_transformable: []
243
+ selectable_measures:
244
+ - semd
245
+ default_measures:
246
+ - semd
247
+ default_log_transforms: []
248
+ measure_classifications:
249
+ semd: contextual distinctiveness
250
+ header_prefix: '#'
251
+
252
+ mcd_cd_token:
253
+ display_name: McD CD (Token)
254
+ description: McDonald Contextual Diversity (McD CD) for English words - token-based
255
+ analysis
256
+ file: resources/reference_lists/en/Mcd_CD.txt
257
+ format: tsv
258
+ columns: &id020
259
+ word: 0
260
+ mcd: 1
261
+ has_header: false
262
+ enabled: true
263
+ analysis_type: token
264
+ log_transformable: []
265
+ selectable_measures:
266
+ - mcd
267
+ default_measures:
268
+ - mcd
269
+ measure_classifications:
270
+ mcd: contextual_diversity
271
+ header_prefix: '#'
272
+
273
  academic_words_token:
274
  display_name: Academic Word List (Token)
275
  description: Common academic vocabulary for research writing - token-based analysis
 
291
  - frequency
292
  measure_classifications:
293
  frequency: frequency
294
+
295
  academic_words_lemma:
296
  display_name: Academic Word List (Lemma)
297
  description: Common academic vocabulary for research writing - lemma-based analysis
 
311
  - frequency
312
  measure_classifications:
313
  frequency: frequency
314
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  bigrams:
316
  COCA_spoken_bigram_frequency_token:
317
  display_name: COCA Spoken Bigram Frequency (Token)
 
423
  analysis
424
  file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
425
  format: csv
426
+ columns: &id021
427
  bigram: 0
428
  frequency: 1
429
  mi_score: 5
 
567
  format: csv
568
  columns: *id007
569
  has_header: true
570
+ enabled: false
571
  analysis_type: lemma
572
  log_transformable:
573
  - frequency
 
698
  format: csv
699
  columns: *id009
700
  has_header: true
701
+ enabled: false
702
  analysis_type: lemma
703
  log_transformable:
704
  - frequency
 
765
  format: csv
766
  columns: *id010
767
  has_header: true
768
+ enabled: false
769
  analysis_type: lemma
770
  log_transformable:
771
  - frequency
resources/reference_lists/en/AOE_words_dictionary.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d43ae65af3e66118b94aa988f3b3840dbe07681872e550f48f63717032a0e43
3
+ size 951099
resources/reference_lists/en/Mcd_CD.txt ADDED
The diff for this file is too large to render. See raw diff
 
resources/reference_lists/en/SemD.txt ADDED
The diff for this file is too large to render. See raw diff
 
resources/reference_lists/en/spoken_bigram_lemma_contingency.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47b5daff8127da415a8ca46a913dc9010ef4f5c9707203d9d6965cc852b8a749
3
- size 19236284
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c63ba4df79fb8e850017e7e5020d3c71b0b71be91433eacb143e27b19a8e29ac
3
+ size 19236676