File size: 2,316 Bytes
f29d474
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import json
from langdetect import detect
from collections import defaultdict

def _report_raw_data(raw_path):
    """Print entry, character, source and category statistics for the raw dump.

    The file at ``raw_path`` is parsed as JSON and iterated; every item is
    indexed with the keys ``'source'``, ``'category'`` and ``'text'``, so a
    missing key raises ``KeyError``.
    """
    with open(raw_path, 'r', encoding='utf-8') as f:
        web_data = json.load(f)

    sources = defaultdict(int)
    categories = defaultdict(int)
    total_chars = 0

    for item in web_data:
        sources[item['source']] += 1
        categories[item['category']] += 1
        total_chars += len(item['text'])

    # An empty dump is still valid JSON; report 0 instead of dividing by zero.
    average_len = total_chars / len(web_data) if web_data else 0

    print("\n📊 Raw Data Statistics:")
    print(f"Total entries: {len(web_data)}")
    print(f"Total characters: {total_chars:,}")
    print(f"Average entry length: {average_len:,.0f} characters")

    # The per-source tally was collected but never shown; report it alongside
    # the categories.
    print("\n🌐 Sources:")
    for source, count in sources.items():
        print(f"- {source}: {count} entries")

    print("\n📑 Categories:")
    for cat, count in categories.items():
        print(f"- {cat}: {count} entries")


def _report_cleaned_data(cleaned_path, sample_size=5):
    """Print statistics for the cleaned text file and language-check a sample.

    The file is split on blank lines into entries. Up to ``sample_size``
    randomly chosen entries are passed to ``detect`` and reported as
    Macedonian when the detected code is ``'mk'``.
    """
    import random

    with open(cleaned_path, 'r', encoding='utf-8') as f:
        # A trailing separator (or an empty file) would otherwise produce
        # empty pseudo-entries that inflate the entry count.
        cleaned_texts = [
            text for text in f.read().split('\n\n') if text.strip()
        ]

    print("\n🧹 Cleaned Data Statistics:")
    print(f"Total entries: {len(cleaned_texts)}")

    if not cleaned_texts:
        print("No cleaned entries to sample.")
        return

    print("\n📝 Testing random samples:")
    mk_count = 0
    checked = 0
    samples = random.sample(cleaned_texts, min(sample_size, len(cleaned_texts)))
    for i, text in enumerate(samples):
        try:
            is_mk = detect(text) == 'mk'
        except Exception as exc:
            # Detection can fail on some inputs (e.g. text without usable
            # language features); say so instead of silently dropping the
            # sample, and do not swallow KeyboardInterrupt/SystemExit.
            print(f"\nSample {i+1} skipped: language detection failed ({exc})")
            continue
        checked += 1
        if is_mk:
            mk_count += 1
        print(f"\nSample {i+1} ({len(text)} chars):")
        print(f"First 100 chars: {text[:100]}...")
        print(f"Language detected: {'Macedonian ✅' if is_mk else 'Other ❌'}")

    if checked:
        print(f"\nMacedonian samples: {mk_count}/{checked}")

    # Average over ALL entries: the numerator must cover the same population
    # as the denominator, not just the handful of sampled texts.
    total_len = sum(len(text) for text in cleaned_texts)
    print(f"\nAverage text length: {total_len/len(cleaned_texts):,.0f} characters")


def test_collected_data():
    """Print summary statistics for the collected Macedonian data files.

    Looks for ``data/raw/mk_web_data.json`` and
    ``data/cleaned/mk_combined_data.txt`` relative to the current working
    directory. Each file is reported on only if it exists; a missing file is
    skipped without error. Nothing is returned; all results go to stdout.
    """
    print("Testing collected Macedonian data...")

    # Paths
    raw_data = os.path.join("data", "raw", "mk_web_data.json")
    cleaned_data = os.path.join("data", "cleaned", "mk_combined_data.txt")

    # Test raw data
    if os.path.exists(raw_data):
        _report_raw_data(raw_data)

    # Test cleaned data
    if os.path.exists(cleaned_data):
        _report_cleaned_data(cleaned_data)

# Run the data check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_collected_data()