File size: 1,668 Bytes
2a89678
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Run this script to download required NLTK data
import nltk
import ssl

# Work around SSL certificate errors during the downloads below: if this
# Python build exposes ssl._create_unverified_context, install it as the
# factory for default HTTPS contexts.
# NOTE(security): this disables certificate verification for default HTTPS
# contexts created afterwards in this process.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Download required NLTK data
def _try_download(resource):
    """Download one NLTK resource and report whether it succeeded.

    ``nltk.download`` signals most failures (unknown package, network
    error) by returning ``False`` instead of raising, so its return value
    must be checked. Wrapping the call in ``try``/``except`` alone would
    report every failed download as a success and never reach a fallback.

    Args:
        resource: NLTK package identifier, e.g. ``'punkt_tab'``.

    Returns:
        ``True`` if NLTK reported success, ``False`` otherwise.
    """
    try:
        return bool(nltk.download(resource))
    except Exception as exc:  # narrow enough that Ctrl-C / SystemExit propagate
        print(f"Error while downloading {resource}: {exc}")
        return False


print("Downloading NLTK data...")

# Prefer the newer punkt_tab tokenizer; fall back to punkt if it is unavailable
if _try_download('punkt_tab'):
    print("✓ punkt_tab downloaded successfully")
else:
    print("Failed to download punkt_tab, trying punkt...")
    if _try_download('punkt'):
        print("✓ punkt downloaded successfully")
    else:
        print("Failed to download punkt tokenizer")

# Download stopwords
if _try_download('stopwords'):
    print("✓ stopwords downloaded successfully")
else:
    print("Failed to download stopwords")

# Printed regardless of the outcomes above; the check that follows in this
# script reports whether the data is actually usable.
print("NLTK setup complete!")

# Smoke-test the downloads: split a sample text into sentences and load the
# English stopword list, then print a short report.
try:
    from nltk.tokenize import sent_tokenize
    from nltk.corpus import stopwords

    sample_text = "Hello world. This is a test."
    split_sentences = sent_tokenize(sample_text)
    english_stopwords = set(stopwords.words('english'))

    report_lines = (
        "\nTesting tokenization:",
        f"Input: {sample_text}",
        f"Sentences: {split_sentences}",
        f"Number of English stopwords: {len(english_stopwords)}",
        "\n✓ All NLTK components working correctly!",
    )
    for report_line in report_lines:
        print(report_line)

except Exception as err:
    # Any failure above (import, tokenizing, loading stopwords, printing)
    # ends up here; show the error plus manual download instructions.
    manual_hints = (
        f"\n❌ Error testing NLTK components: {err}",
        "You may need to run the downloads manually in Python:",
        ">>> import nltk",
        ">>> nltk.download('punkt_tab')",
        ">>> nltk.download('stopwords')",
    )
    for manual_hint in manual_hints:
        print(manual_hint)