File size: 11,893 Bytes
7b63815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#### NOTE: for all flags and their descriptions, see the javadoc. Important parameters (in our experience) that you should tune for your dataset are marked with ***. Pay special attention to flags like targetAllowedNERs. Use batchProcessSents and the Lucene-based options if you want low-memory (but slower) runs.

#***Which type of patterns. Options are SURFACE and DEP
patternType=SURFACE

#name for the saved files for the output of the system (useful for comparing results of different experiments with different variables etc.)
identifier=useNERRestriction

#Directory where data lives
DIR=patterns

outDir=SPIEDPatternsout

#If you want output in which each labeled phrase has <label> </label> around it
#markedOutputTextFile=${outDir}/markedtext.txt

#Number of threads available on the machine
numThreads=1
#***Use these options if you are limited by memory
batchProcessSents = false
#This name is a misnomer. Max number of *lines* per batch file. Works only for text file format; ser files cannot be broken down
numMaxSentencesPerBatchFile=100
saveInvertedIndex=false
invertedIndexDirectory=${outDir}/invertedIndex
#Loading index from invertedIndexDirectory
#loadInvertedIndex=true

#Useful for memory heavy apps. 
#invertedIndexClass=edu.stanford.nlp.patterns.LuceneSentenceIndex


### Example for running it on presidents biographies. For more data examples, see the bottom of this file

#Format of the input file(s). Can be text; the code will tokenize it.
fileFormat=text
#Input file(s) (default assumed text). Can be one or more of (concatenated by comma or semi-colon): file, directory, files with regex in the filename (for example: "mydir/health-.*-processed.txt")
file=${DIR}/presidents.txt

#to save the serialized sentences into a file - text split into sentences, processed using ner, parse etc (depending on the flags) and labeled with seed set. Existing files will get deleted (if the fileFormat is not ser).
saveSentencesSerDir=${outDir}/sents

#if you use the flag above to save the file, you can use the saved file like this
#fileFormat=ser
#file=${outDir}/sents

#We are learning names of presidential candidates, places, and other names. In these seed files, all text after a tab on each line is ignored
seedWordsFiles=NAME,${DIR}/names.txt;PLACE,${DIR}/places.txt;OTHER,${DIR}/otherpeople.txt
#Useful for matching lemmas or spelling mistakes
fuzzyMatch=false
#Used if fuzzyMatch is true. minimum length of words to do fuzzy matching. 
minLen4FuzzyForPattern=6

#You can evaluate two ways; both presented here.
evaluate=true
goldEntitiesEvalFiles=NAME,${DIR}/goldnames.txt;PLACE,${DIR}/goldplaces.txt
#evalFileWithGoldLabels=${DIR}/presidents_eval.txt
#true by default; set to false if you want scores per token
evalPerEntity=true


#SAVE n LOAD the model (patterns and phrases) options
patternsWordsDir=${outDir}/${identifier}/model/
#save the learned patterns and learned words in patternsWordsDir directory
savePatternsWordsDir=true
#load the patterns and words from patternsWordsDir directory
#loadSavedPatternsWordsDir=true

#false if you just want to process the text into sents but not do anything with it, or you want to use loadSavedPatternsWordsDir option. Useful for batch processing and saving text as serialized objects,  then running the learning system on all the serialized objects (see saveSentences* and saveEvalSent* flags) or domain adaptation.
learn=true


#posModelPath=<if you want to use a different Stanford NLP group released POS tagger; e.g. caseless etc>

#In case the seeds have overlapping words like "lung" as bodyPart and "lung cancer" as disease. "lung" in "lung cancer" will be labeled as only disease, if the flag is true.
removeOverLappingLabelsFromSeed=true

######## creating patterns flags ##########
#***use context on the left
usePreviousContext=true

#***use context on the right
useNextContext = true

#***the context should be at least this long
minWindow4Pattern = 2

#***the context can be at most this long
maxWindow4Pattern = 4

#if the context consists of only stop words, add only if it's more than these many stop words
numMinStopWordsToAdd = 3

#***use POS tag restriction for the target phrase
usePOS4Pattern = true

#Ignore words {a, an, the} while matching the patterns to text (advisable true)
useFillerWordsInPat = false

#***Specific allowed tags' initials for the target phrase for each label while creating the patterns (if not specified, every tag is acceptable to create a pattern). Tag initials can be written as N or NN or J or N,J etc. E.g.: NAME,N,J;PLACE,N
targetAllowedTagsInitialsStr=NAME,N;OTHER,N

#You can save all possible patterns for all tokens in the directory given by the allPatternsDir flag so you don't need to compute them every time.
computeAllPatterns = true

#Options: MEMORY, DB, LUCENE. If using SQL for storing patterns for each token --- populate SQLConnection class, that is provide those properties!
storePatsForEachToken=MEMORY
#***If your code is running too slow, try to reduce this number. Samples % of sentences for learning patterns
sampleSentencesForSufficientStats=1.0

#Save or read (if computeAllPatterns is false) from here
allPatternsDir= ${DIR}/${identifier}_allpatterns

#***maximum Num of allowed words in the target phrase
numWordsCompound = 3

#***consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

#Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm=false

#Use lemma instead of words of the context tokens
useLemmaContextTokens=true

#make context matching lowercase (advisable)
matchLowerCaseContext=true

#***use named entity tag (predicted using StanfordCoreNLP NER) restriction of the target phrase
useTargetNERRestriction=true

#***If useTargetNERRestriction is true, you can give NER tags that the target phrase can take. Do not mention anything if you don't want any specific restriction
targetAllowedNERs=NAME,PERSON;PLACE,LOCATION;OTHER,PERSON

#use named entity tag restrictions for the context (neighboring) tokens
useContextNERRestriction=false

#***use the parse tag of the grandparent node as restriction (note that parent node is the POS tag of the word)
useTargetParserParentRestriction=false

#do not extract phrase in which any word is labeled with another class (for example, you don't wanna extract 'HIV patients' as disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

#### matching patterns to text ######

#You can mostly ignore this flag and leave it as true. For those who care about the details: for each token, we use the phrase that originally matched that token instead of the token's word (in case you are using fuzzy matching)
useMatchingPhrase=true

#Use only the tokens that get matched by a pattern (advisable as false)
restrictToMatched = false

#Label the learned words in the text (advisable as true)
usePatternResultAsLabel=true

#remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

#Do not learn phrases that have any stop word
removePhrasesWithStopWords = false


### evaluating candidate patterns

#***Minimum number of positive phrases a candidate pattern should extract
minPosPhraseSupportForPat = 1

##### thresholds for selecting patterns and words #####

#***threshold for learning a phrase
thresholdWordExtract=0.01

#***threshold for learning a pattern
thresholdSelectPattern = 0.01

#keep lowering threshold as 0.8*threshold whenever the system doesn't learn any new patterns and phrases
tuneThresholdKeepRunning=false

#***discard phrases that do not have these many patterns extracting it
thresholdNumPatternsApplied = 1

#***max number of words to extract in each iteration
numWordsToAdd = 5

#***max number of patterns to learn in each iteration
numPatterns = 5

#***max number of iterations
numIterationsForPatterns = 8

#Consider words belonging to other labels as negative (advisable as true)
useOtherLabelsWordsasNegative=true

#***Pattern scoring measure. For more details, see the paper. The options are PhEvalInPatLogP, PhEvalInPat, PosNegUnlabOdds, RlogF, RlogFPosNeg, YanGarber02, PosNegOdds, LOGREG, LOGREGlogP, RatioAll, SqrtAllRatio
patternScoring=RatioAll

#Class to be used to score phrases. The valid options are edu.stanford.nlp.patterns.ScorePhrasesAverageFeatures and edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt
phraseScorerClass=edu.stanford.nlp.patterns.ScorePhrasesAverageFeatures
#phraseScorerClass=edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt

#***Club neighboring labeled words (of the same label) when extracting phrases
clubNeighboringLabeledWords=true

#if you want to take the square root of the pattern score
sqrtPatScore = false

#Phrase scoring measure; ignore.
wordScoring=WEIGHTEDNORM

#For scoring phrases that are OOV, a score is the average of the score of individual words (instead of min, which is default)
useAvgInsteadofMinPhraseScoring=true

#*** Which features to use to evaluate phrases. See the paper for more details on each
#only if wordClassClusterFile is provided
usePhraseEvalWordClass=false

#tf-idf scoring w.r.t. the domain
usePhraseEvalDomainNgram=false

#use pattern weights in scoring phrases extracted by them, if usePhraseEvalPatWtByFreq is true. otherwise it's just a tfidf like score
usePatternWeights=true

#basically patwt/log(freq), patwt = 1 if usePatternWeights is false
usePhraseEvalPatWtByFreq=true

#if using multiple label dictionaries etc, freq of the phrase in the label dictionary vs other dictionaries
usePhraseEvalSemanticOdds=true

#edit distance from positive entities
usePhraseEvalEditDistSame=true

#edit distance from the negative entities
usePhraseEvalEditDistOther=true

#if you have googlengrams, you can use googlengrams tf-idf scoring.
usePhraseEvalGoogleNgram=false

#% of positive labeled words with the same word shape (see WordShapeClassifier and chris2 for more details)
usePhraseEvalWordShape=true


#These flags are not valid if patternScoring is not PhEvalInPat* . Similar meaning as for the phrase ones above
usePatternEvalWordClass=false
usePatternEvalGoogleNgram=false
usePatternEvalSemanticOdds=true
usePatternEvalEditDistSame=true
usePatternEvalEditDistOther=true
usePatternEvalDomainNgram=false
usePatternEvalWordShape=true

#Options are LOG, NONE or SQRT
wordFreqNorm = NONE

######For logging

#4 if you wanna print out every single thing happening in the system, 3 if you want fair amount of debug messages and justification, 2 means some debug msgs, 1 means only necessary msgs and 0 means (almost) no msgs
debug = 3



#stop words file
stopWordsPatternFiles=${DIR}/stopwords.txt

englishWordsFiles=${stopWordsPatternFiles}
commonWordsPatternFiles= ${stopWordsPatternFiles}
#You can give some common words like this
#commonWordsPatternFiles =${DIR}/lists/commonEngWords1k.txt

#If you are using Google Ngrams TF-IDF feature
#googleNGramsFile=/u/nlp/scr/google-ngrams/1gms/vocab
#weightDomainFreq=10

#below is optional; comma separated files with lists of phrases that definitely do not belong to any of the labels
#otherSemanticClassesFiles=${DIR}/nondiseases.txt

#The flags below are used when either LOGREG is used for patternScoring or ScorePhrasesLearnFeatWt class is used for phrase scoring
#% unlabeled tokens selected as negative
#perSelectRand=0.5
#% negative tokens selected as negative
#perSelectNeg=1


### Example for running the code on BioMed articles and NCBI corpus (instead of the toy example above)

#fileFormat=text
#file=${DIR}/BioMedSample
#saveSentencesSerFile=${DIR}/biomed_sents.ser

#evalFileWithGoldLabels=${DIR}/NCBI_corpus_testing_processed.txt
#saveEvalSentencesSerFile=${DIR}/ncbi_corpus_testing_sents.ser
#addEvalSentsToTrain=true

#seedWordsFiles=disease,${DIR}/diseases.txt;nondisease,${DIR}/nondiseases.txt

#wordClassClusterFile=${DIR}/ncbi_disease_brownclusters_200_min5.txt

#externalFeatureWeightsFile = ${DIR}/out/wordclass_weights