# Hugging Face upload header (web-page residue, kept as a comment so the
# module parses): WildnerveAI — "Upload 20 files" — commit 0861a59 (verified).
"""
Stub implementation of NLTK to avoid dependencies in container environments
"""
import logging

# Module-level logger; the import-time info message makes it easy to spot
# (in logs) that the stub, not the real NLTK, has been loaded.
logger = logging.getLogger(__name__)
logger.info("Using stub NLTK implementation")
# Stub for nltk.download: accept anything, fetch nothing.
def download(*args, **kwargs):
    """No-op stand-in for ``nltk.download``.

    Accepts and ignores any arguments and always reports success, so caller
    code written against the real NLTK keeps working without network access.
    """
    logger.warning("NLTK download stub called - no actual download performed")
    return True
# SimpleTokenizer: minimal object-style tokenizer exposed by the stub.
class SimpleTokenizer:
    """Whitespace tokenizer for the NLTK stub."""

    def __init__(self):
        logger.info("Stub SimpleTokenizer initialized")

    def tokenize(self, text):
        """Break *text* into tokens at runs of whitespace."""
        return [token for token in text.split()]
# Tokenization stubs
class WordTokenizer:
    """Object-style word tokenizer: splits on whitespace."""

    def tokenize(self, text):
        """Return the whitespace-separated tokens of *text*."""
        tokens = text.split()
        return tokens
def word_tokenize(text):
    """Naive whitespace tokenizer standing in for ``nltk.word_tokenize``."""
    tokens = text.split()
    return tokens
class SentenceTokenizer:
    """Object-style sentence tokenizer: crude split on '.' characters."""

    def tokenize(self, text):
        """Split *text* at every '.' (trailing dots yield empty strings)."""
        segments = text.split('.')
        return segments
def sent_tokenize(text):
    """Crude stand-in for ``nltk.sent_tokenize``: splits on '.' only."""
    pieces = text.split('.')
    return pieces
# Stemmer stubs
class PorterStemmer:
    """Tiny suffix-stripping stand-in for ``nltk.stem.PorterStemmer``.

    Only handles the 'ing', 'ed' and plural 's' endings; it is NOT a real
    Porter stemmer.
    """

    def stem(self, word):
        """Return *word* with a common English suffix removed.

        Fix over the original stub: a suffix is only stripped when a
        non-empty stem remains, so inputs equal to a suffix ("ing", "ed",
        "s") are returned unchanged instead of stemming to "".
        """
        if word.endswith('ing') and len(word) > 3:
            return word[:-3]
        if word.endswith('ed') and len(word) > 2:
            return word[:-2]
        # 'ss' exception keeps words like "glass" intact.
        if word.endswith('s') and not word.endswith('ss') and len(word) > 1:
            return word[:-1]
        return word
class LancasterStemmer:
    """Stub Lancaster stemmer; same naive suffix stripping as the other stubs.

    Fixes over the original: no longer constructs a throwaway PorterStemmer
    on every call, and no longer over-stems words equal to a suffix (e.g.
    "ing") down to the empty string.
    """

    def stem(self, word):
        """Strip an 'ing'/'ed'/plural-'s' ending, keeping a non-empty stem."""
        if word.endswith('ing') and len(word) > 3:
            return word[:-3]
        if word.endswith('ed') and len(word) > 2:
            return word[:-2]
        # 'ss' exception keeps words like "glass" intact.
        if word.endswith('s') and not word.endswith('ss') and len(word) > 1:
            return word[:-1]
        return word
class SimpleStemmer:
    """Naive suffix-stripping stemmer used by this NLTK stub."""

    def __init__(self):
        logger.info("SimpleStemmer stub initialized")

    def stem(self, word):
        """Remove a common English ending ('ing', 'ed', plural 's') from *word*.

        Fix over the original stub: a suffix is only stripped when a
        non-empty stem remains, so inputs like "ing" or "s" are returned
        unchanged instead of stemming to "".
        """
        if word.endswith('ing') and len(word) > 3:
            return word[:-3]
        if word.endswith('ed') and len(word) > 2:
            return word[:-2]
        # 'ss' exception keeps words like "glass" intact.
        if word.endswith('s') and not word.endswith('ss') and len(word) > 1:
            return word[:-1]
        return word
# Identity lemmatizer standing in for nltk.stem.WordNetLemmatizer.
class WordNetLemmatizer:
    """Stub lemmatizer: returns every word unchanged."""

    def __init__(self):
        logger.info("Stub WordNetLemmatizer initialized")

    def lemmatize(self, word, pos=None):
        """Return *word* as-is; *pos* is accepted for API compatibility and ignored."""
        return word
# Namespace stubs for import compatibility
class tokenize:
    """Namespace stub mirroring the ``nltk.tokenize`` module layout.

    Accessed as ``nltk.tokenize.word_tokenize(...)`` etc.; never meant to
    be instantiated.
    """
    WordTokenizer = WordTokenizer
    SentenceTokenizer = SentenceTokenizer
    # staticmethod keeps class-level access identical while preventing the
    # functions from being mis-bound as instance methods if this namespace
    # class is ever instantiated.
    word_tokenize = staticmethod(word_tokenize)
    sent_tokenize = staticmethod(sent_tokenize)
class stem:
    """Namespace stub mirroring the ``nltk.stem`` module layout.

    Lets ``nltk.stem.PorterStemmer`` style lookups resolve to the stub
    stemmers defined above. Never meant to be instantiated.
    """
    PorterStemmer = PorterStemmer
    LancasterStemmer = LancasterStemmer
    SimpleStemmer = SimpleStemmer
# Stub for corpus
class _CorpusModule:
class stopwords:
@staticmethod
def words(language="english"):
# Return basic English stopwords
return {
"i", "me", "my", "myself", "we", "our", "ours", "ourselves",
"you", "your", "yours", "yourself", "yourselves", "he", "him",
"his", "himself", "she", "her", "hers", "herself", "it", "its",
"itself", "they", "them", "their", "theirs", "themselves",
"what", "which", "who", "whom", "this", "that", "these",
"those", "am", "is", "are", "was", "were", "be", "been",
"being", "have", "has", "had", "having", "do", "does", "did",
"doing", "a", "an", "the", "and", "but", "if", "or", "because",
"as", "until", "while", "of", "at", "by", "for", "with",
"about", "against", "between", "into", "through", "during",
"before", "after", "above", "below", "to", "from", "up", "down",
"in", "out", "on", "off", "over", "under", "again", "further",
"then", "once", "here", "there", "when", "where", "why", "how",
"all", "any", "both", "each", "few", "more", "most", "other",
"some", "such", "no", "nor", "not", "only", "own", "same", "so",
"than", "too", "very", "s", "t", "can", "will", "just", "don",
"should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain",
"aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven",
}
corpus = _CorpusModule()