File size: 11,893 Bytes

7b63815

#### NOTE: for all flags and their descriptions, see the javadoc. Important parameters (in our experience) that you should tune for your dataset are marked with ***. Pay special attention to flags like targetAllowedNERs. Use batchProcessSents and Lucene based options if you want low memory (but slower) runs.

#***Which type of patterns. Options are SURFACE and DEP
patternType=SURFACE

#name for the saved files for the output of the system (useful for comparing results of different experiments with different variables etc.)
identifier=useNERRestriction

#Directory where data lives
DIR=patterns

outDir=SPIEDPatternsout

#If you want output in which each labeled phrase has <label> </label> around it
#markedOutputTextFile=${outDir}/markedtext.txt

#Number of threads available on the machine
numThreads=1
#***Use these options if you are limited by memory
batchProcessSents = false
#This name is a misnomer. Max number of *lines* per batch file. Works only for text file format; ser files cannot be broken down
numMaxSentencesPerBatchFile=100
saveInvertedIndex=false
invertedIndexDirectory=${outDir}/invertedIndex
#Loading index from invertedIndexDirectory
#loadInvertedIndex=true

#Useful for memory heavy apps. 
#invertedIndexClass=edu.stanford.nlp.patterns.LuceneSentenceIndex


### Example for running it on presidents biographies. For more data examples, see the bottom of this file

#can be text. the code will tokenize it.
fileFormat=text
#Input file(s) (default assumed text). Can be one or more of (concatenated by comma or semi-colon): file, directory, files with regex in the filename (for example: "mydir/health-.*-processed.txt")
file=${DIR}/presidents.txt

#to save the serialized sentences into a file - text split into sentences, processed using ner, parse etc (depending on the flags) and labeled with seed set. Existing files will get deleted (if the fileFormat is not ser).
saveSentencesSerDir=${outDir}/sents

#if you use the flag above to save the file, you can use the saved file like this
#fileFormat=ser
#file=${outDir}/sents

#We are learning names of presidential candidates, places, and other names. In each line, all text after tabs are ignored in these seed files
seedWordsFiles=NAME,${DIR}/names.txt;PLACE,${DIR}/places.txt;OTHER,${DIR}/otherpeople.txt
#Useful for matching lemmas or spelling mistakes
fuzzyMatch=false
#Used if fuzzyMatch is true. minimum length of words to do fuzzy matching. 
minLen4FuzzyForPattern=6

#You can evaluate two ways; both presented here.
evaluate=true
goldEntitiesEvalFiles=NAME,${DIR}/goldnames.txt;PLACE,${DIR}/goldplaces.txt
#evalFileWithGoldLabels=${DIR}/presidents_eval.txt
#default as true, false if you want scores per token
evalPerEntity=true


#SAVE n LOAD the model (patterns and phrases) options
patternsWordsDir=${outDir}/${identifier}/model/
#save the learned patterns and learned words in patternsWordsDir directory
savePatternsWordsDir=true
#load the patterns and words from patternsWordsDir directory
#loadSavedPatternsWordsDir=true

#false if you just want to process the text into sents but not do anything with it, or you want to use loadSavedPatternsWordsDir option. Useful for batch processing and saving text as serialized objects,  then running the learning system on all the serialized objects (see saveSentences* and saveEvalSent* flags) or domain adaptation.
learn=true


#posModelPath=<if you want to use a different Stanford NLP group released POS tagger; e.g. caseless etc>

#In case the seeds have overlapping words like "lung" as bodyPart and "lung cancer" as disease. "lung" in "lung cancer" will be labeled as only disease, if the flag is true.
removeOverLappingLabelsFromSeed=true

######## creating patterns flags ##########
#***use context on the left
usePreviousContext=true

#***use context on the right
useNextContext = true

#***the context should be at least this long
minWindow4Pattern = 2

#***the context can be at most this long
maxWindow4Pattern = 4

#if the context consists of only stop words, add only if it's more than these many stop words
numMinStopWordsToAdd = 3

#***use POS tag restriction for the target phrase
usePOS4Pattern = true

#Ignore words {a, an, the} while matching the patterns to text (advisable true)
useFillerWordsInPat = false

#***Specific allowed tags' initials for the target phrase for each label while creating the patterns (if not specified, every tag is acceptable to create a pattern). Tag initials can be written as N or NN or J or N,J etc. E.g.: NAME,N,J;PLACE,N.
targetAllowedTagsInitialsStr=NAME,N;OTHER,N

#You can save all possible patterns for all tokens in the flag allPatternsDir so you wouldn't need to calculate them everytime.
computeAllPatterns = true

#Options: MEMORY, DB, LUCENE. If using SQL for storing patterns for each token --- populate SQLConnection class, that is provide those properties!
storePatsForEachToken=MEMORY
#***If your code is running too slow, try to reduce this number. Samples % of sentences for learning patterns
sampleSentencesForSufficientStats=1.0

#Save or read (if computeAllPatterns is false) from here
allPatternsDir= ${DIR}/${identifier}_allpatterns

#***maximum Num of allowed words in the target phrase
numWordsCompound = 3

#***consider patterns without the POS restriction on the target phrase
addPatWithoutPOS = true

#Ignore common stop words occurring just before the target phrase
useStopWordsBeforeTerm=false

#Use lemma instead of words of the context tokens
useLemmaContextTokens=true

#make context matching lowercase (advisable)
matchLowerCaseContext=true

#***use named entity tag (predicted using StanfordCoreNLP NER) restriction of the target phrase
useTargetNERRestriction=true

#***If useTargetNERRestriction is true, you can give NER tags that the target phrase can take. Do not mention anything if you don't want any specific restriction
targetAllowedNERs=NAME,PERSON;PLACE,LOCATION;OTHER,PERSON

#use named entity tag restrictions for the context (neighboring) tokens
useContextNERRestriction=false

#***use the parse tag of the grandparent node as restriction (note that parent node is the POS tag of the word)
useTargetParserParentRestriction=false

#do not extract phrase in which any word is labeled with another class (for example, you don't wanna extract 'HIV patients' as disease)
doNotExtractPhraseAnyWordLabeledOtherClass = true

#### matching patterns to text ######

#kinda ignore this flag and use it as true. for those who care about the details: for each token, we use the phrase that originally matched that token instead of the token's word (in case you are using fuzzy matching)
useMatchingPhrase=true

#Use only the tokens that get matched by a pattern (advisable as false)
restrictToMatched = false

#Label the learned words in the text (advisable as true)
usePatternResultAsLabel=true

#remove common stop words from phrases to get clean phrases (for example, "disease" instead of "some disease")
removeStopWordsFromSelectedPhrases = true

#Do not learn phrases that have any stop word
removePhrasesWithStopWords = false


### evaluating candidate patterns

#***Minimum number of positive phrases a candidate pattern should extract
minPosPhraseSupportForPat = 1

##### thresholds for selecting patterns and words #####

#***threshold for learning a phrase
thresholdWordExtract=0.01

#***threshold for learning a pattern
thresholdSelectPattern = 0.01

#keep lowering threshold as 0.8*threshold whenever the system doesn't learn any new patterns and phrases
tuneThresholdKeepRunning=false

#***discard phrases that do not have these many patterns extracting it
thresholdNumPatternsApplied = 1

#***max number of words to extract in each iteration
numWordsToAdd = 5

#***max number of patterns to learn in each iteration
numPatterns = 5

#***max number of iterations
numIterationsForPatterns = 8

#Consider words belonging to other labels as negative (advisable as true)
useOtherLabelsWordsasNegative=true

#***Pattern scoring measure. For more details, see the paper. The options are PhEvalInPatLogP, PhEvalInPat, PosNegUnlabOdds, RlogF, RlogFPosNeg, YanGarber02, PosNegOdds, LOGREG, LOGREGlogP, RatioAll, SqrtAllRatio
patternScoring=RatioAll

#Class to be used to score phrases. The valid options are edu.stanford.nlp.patterns.ScorePhrasesAverageFeatures and edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt
phraseScorerClass=edu.stanford.nlp.patterns.ScorePhrasesAverageFeatures
#phraseScorerClass=edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt

#***Club neighboring labeled words (of the same label) when extracting phrases
clubNeighboringLabeledWords=true

#if you want to take the square root of the pattern score
sqrtPatScore = false

#Phrase scoring measure; ignore.
wordScoring=WEIGHTEDNORM

#For scoring phrases that are OOV, a score is the average of the score of individual words (instead of min, which is default)
useAvgInsteadofMinPhraseScoring=true

#*** what all features to use to evaluate phrases. See the paper for more details on each
#only if wordClassClusterFile is provided
usePhraseEvalWordClass=false

#tf-idf scoring w.r.t. the domain
usePhraseEvalDomainNgram=false

#use pattern weights in scoring phrases extracted by them, if usePhraseEvalPatWtByFreq is true. otherwise it's just a tfidf like score
usePatternWeights=true

#basically patwt/log(freq), patwt = 1 if usePatternWeights is false
usePhraseEvalPatWtByFreq=true

#if using multiple label dictionaries etc, freq of the phrase in the label dictionary vs other dictionaries
usePhraseEvalSemanticOdds=true

#edit distance from positive entities
usePhraseEvalEditDistSame=true

#edit distance from the negative entities
usePhraseEvalEditDistOther=true

#if you have googlengrams, you can use googlengrams tf-idf scoring.
usePhraseEvalGoogleNgram=false

#% of positive labeled words with the same word shape (see WordShapeClassifier and its chris2 shape for more details)
usePhraseEvalWordShape=true


#These flags are not valid if patternScoring is not PhEvalInPat* . Similar meaning as for the phrase ones above
usePatternEvalWordClass=false
usePatternEvalGoogleNgram=false
usePatternEvalSemanticOdds=true
usePatternEvalEditDistSame=true
usePatternEvalEditDistOther=true
usePatternEvalDomainNgram=false
usePatternEvalWordShape=true

#Options are LOG, NONE or SQRT
wordFreqNorm = NONE

######For logging

#4 if you wanna print out every single thing happening in the system, 3 if you want fair amount of debug messages and justification, 2 means some debug msgs, 1 means only necessary msgs and 0 means (almost) no msgs
debug = 3



#stop words file
stopWordsPatternFiles=${DIR}/stopwords.txt

englishWordsFiles=${stopWordsPatternFiles}
commonWordsPatternFiles= ${stopWordsPatternFiles}
#You can give some common words like this
#commonWordsPatternFiles =${DIR}/lists/commonEngWords1k.txt

#If you are using Google Ngrams TF-IDF feature
#googleNGramsFile=/u/nlp/scr/google-ngrams/1gms/vocab
#weightDomainFreq=10

#below is optional; comma separated files with list of phrases that def do not belong to any of the labels
#otherSemanticClassesFiles=${DIR}/nondiseases.txt

#The flags below are used when either LOGREG is used for patternScoring or ScorePhrasesLearnFeatWt class is used for phrase scoring
#% unlabeled tokens selected as negative
#perSelectRand=0.5
#% negative tokens selected as negative
#perSelectNeg=1


### Example for running the code on BioMed articles and NCBI corpus (instead of the toy example above)

#fileFormat=text
#file=${DIR}/BioMedSample
#saveSentencesSerFile=${DIR}/biomed_sents.ser

#evalFileWithGoldLabels=${DIR}/NCBI_corpus_testing_processed.txt
#saveEvalSentencesSerFile=${DIR}/ncbi_corpus_testing_sents.ser
#addEvalSentsToTrain=true

#seedWordsFiles=disease,${DIR}/diseases.txt;nondisease,${DIR}/nondiseases.txt

#wordClassClusterFile=${DIR}/ncbi_disease_brownclusters_200_min5.txt

#externalFeatureWeightsFile = ${DIR}/out/wordclass_weights