|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
package edu.siu.sentise.preprocessing;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
|
import java.util.Properties;
|
|
|
|
|
|
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
|
|
|
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
|
|
|
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
|
|
|
import edu.siu.sentise.model.SentimentData;
|
|
|
import edu.stanford.nlp.ling.CoreLabel;
|
|
|
import edu.stanford.nlp.pipeline.Annotation;
|
|
|
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
|
|
|
import edu.stanford.nlp.util.CoreMap;
|
|
|
|
|
|
public class StanfordCoreNLPLemmatizer implements weka.core.stemmers.Stemmer {
|
|
|
|
|
|
private StanfordCoreNLP pipeline = null;
|
|
|
|
|
|
public StanfordCoreNLPLemmatizer () {
|
|
|
|
|
|
Properties props = new Properties();
|
|
|
props.setProperty("annotators","tokenize, ssplit, pos, lemma");
|
|
|
pipeline = new StanfordCoreNLP(props);
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public String stem(String word)
|
|
|
{
|
|
|
StringBuilder lema=new StringBuilder();
|
|
|
|
|
|
Annotation document = new Annotation(word);
|
|
|
|
|
|
pipeline.annotate(document);
|
|
|
|
|
|
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
|
|
|
for(CoreMap sentence: sentences) {
|
|
|
|
|
|
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
|
|
|
|
|
|
|
|
|
|
|
|
if(!token.value().toString().endsWith("s"))
|
|
|
lema.append(token.value().toString());
|
|
|
else
|
|
|
lema.append((token.get(LemmaAnnotation.class)));
|
|
|
|
|
|
}
|
|
|
}
|
|
|
return lema.toString();
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
public String getRevision() {
|
|
|
|
|
|
return "$Revision: 8034 $";
|
|
|
}
|
|
|
}
|
|
|
|