Spaces:
Sleeping
Sleeping
File size: 1,668 Bytes
2a89678 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# Run this script to download required NLTK data
import nltk
import ssl
# Work around SSL certificate failures when fetching NLTK data.
# NOTE(security): this is applied unconditionally and replaces the
# process-wide default HTTPS context factory with an unverified one, so any
# later HTTPS connection that relies on the default context (not only the
# NLTK downloads below) skips certificate validation.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose the unverified-context helper;
    # leave the default (verifying) behaviour in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Download required NLTK data
def _download_resource(resource_id):
    """Download one NLTK resource and report whether it succeeded.

    ``nltk.download`` signals most failures (unknown id, network error) by
    returning ``False`` instead of raising, so the return value is checked
    in addition to catching any exception that does escape.

    Args:
        resource_id: NLTK package identifier, e.g. ``'stopwords'``.

    Returns:
        ``True`` if the downloader reported success, ``False`` otherwise.
    """
    try:
        return bool(nltk.download(resource_id))
    except Exception as exc:
        # Keep the script best-effort, but do not hide the reason.
        print(f"Error while downloading {resource_id}: {exc}")
        return False


print("Downloading NLTK data...")

# Prefer the newer punkt_tab tokenizer; fall back to punkt only when the
# punkt_tab download actually failed.
if _download_resource('punkt_tab'):
    print("✓ punkt_tab downloaded successfully")
else:
    print("Failed to download punkt_tab, trying punkt...")
    if _download_resource('punkt'):
        print("✓ punkt downloaded successfully")
    else:
        print("Failed to download punkt tokenizer")

# Download stopwords
if _download_resource('stopwords'):
    print("✓ stopwords downloaded successfully")
else:
    print("Failed to download stopwords")

print("NLTK setup complete!")
# Test the downloads: tokenize a sample text and load the English stopword
# list, so missing or broken NLTK data is reported here rather than later.
try:
    # Imported here so that an import failure is reported by the handler below.
    from nltk.tokenize import sent_tokenize
    from nltk.corpus import stopwords

    test_text = "Hello world. This is a test."
    sentences = sent_tokenize(test_text)
    stop_words = set(stopwords.words('english'))

    print("\nTesting tokenization:")
    print(f"Input: {test_text}")
    print(f"Sentences: {sentences}")
    print(f"Number of English stopwords: {len(stop_words)}")
    print("\n✓ All NLTK components working correctly!")
except Exception as e:
    # Best-effort check: report the problem and tell the user how to fetch
    # the data manually instead of aborting the script.
    print(f"\n❌ Error testing NLTK components: {e}")
    print("You may need to run the downloads manually in Python:")
    print(">>> import nltk")
    print(">>> nltk.download('punkt_tab')")
    print(">>> nltk.download('stopwords')")