# MK-LLM-Mistral / examples/test_data.py
# ainow-mk's picture
# Upload 65 files
# f29d474 verified
import os
import json
from langdetect import detect
from collections import defaultdict
def _report_raw_data(raw_path):
    """Print entry, character, source and category statistics for the raw dump.

    Args:
        raw_path: Path to a UTF-8 JSON file. The loaded value is iterated and
            every item is indexed with the keys ``'source'``, ``'category'``
            and ``'text'``; a missing key raises ``KeyError``.
    """
    with open(raw_path, 'r', encoding='utf-8') as f:
        web_data = json.load(f)

    sources = defaultdict(int)
    categories = defaultdict(int)
    total_chars = 0
    for item in web_data:
        sources[item['source']] += 1
        categories[item['category']] += 1
        total_chars += len(item['text'])

    # An empty dump must still produce a report instead of a ZeroDivisionError.
    average = total_chars / len(web_data) if web_data else 0

    print("\n📊 Raw Data Statistics:")
    print(f"Total entries: {len(web_data)}")
    print(f"Total characters: {total_chars:,}")
    print(f"Average entry length: {average:,.0f} characters")

    # The per-source tally was computed but never shown before.
    print("\nSources:")
    for source, count in sources.items():
        print(f"- {source}: {count} entries")

    print("\n📑 Categories:")
    for cat, count in categories.items():
        print(f"- {cat}: {count} entries")


def _report_cleaned_data(cleaned_path, sample_size=5):
    """Print entry count and language-detection results for random samples.

    Args:
        cleaned_path: Path to a UTF-8 text file whose entries are separated
            by a blank line (``'\\n\\n'``).
        sample_size: Maximum number of entries to sample without replacement.

    The reported average length covers only the samples that were analysed
    successfully, because those are the only lengths accumulated.
    """
    # Local import: the module header does not import random.
    import random

    with open(cleaned_path, 'r', encoding='utf-8') as f:
        # Drop whitespace-only fragments (e.g. from a trailing blank line) so
        # they are neither counted as entries nor drawn as samples.
        cleaned_texts = [t for t in f.read().split('\n\n') if t.strip()]

    print("\n🧹 Cleaned Data Statistics:")
    print(f"Total entries: {len(cleaned_texts)}")

    print("\n📝 Testing random samples:")
    mk_count = 0
    total_len = 0
    analyzed = 0
    samples = random.sample(cleaned_texts, min(sample_size, len(cleaned_texts)))
    for i, text in enumerate(samples, start=1):
        try:
            is_mk = detect(text) == 'mk'
        except Exception as exc:
            # Best effort: one undetectable sample should not abort the run.
            # Catching Exception (not a bare except) lets Ctrl-C propagate,
            # and the skip is reported instead of being silent.
            print(f"\nSample {i} skipped: language detection failed ({exc})")
            continue

        analyzed += 1
        if is_mk:
            mk_count += 1
        total_len += len(text)
        print(f"\nSample {i} ({len(text)} chars):")
        print(f"First 100 chars: {text[:100]}...")
        print(f"Language detected: {'Macedonian ✅' if is_mk else 'Other ❌'}")

    # Divide by the number of samples actually measured, not by the size of
    # the whole data set, and stay safe when nothing could be analysed.
    average = total_len / analyzed if analyzed else 0
    print(f"\nMacedonian samples: {mk_count}/{analyzed}")
    print(f"\nAverage text length: {average:,.0f} characters")


def test_collected_data():
    """Print summary statistics for the collected Macedonian data files.

    Looks for ``data/raw/mk_web_data.json`` and
    ``data/cleaned/mk_combined_data.txt`` relative to the current working
    directory. Each file that exists is summarised on stdout; a missing file
    is skipped without error. Returns ``None``.
    """
    print("Testing collected Macedonian data...")

    # Paths
    raw_data = os.path.join("data", "raw", "mk_web_data.json")
    cleaned_data = os.path.join("data", "cleaned", "mk_combined_data.txt")

    # Test raw data
    if os.path.exists(raw_data):
        _report_raw_data(raw_data)

    # Test cleaned data
    if os.path.exists(cleaned_data):
        _report_cleaned_data(cleaned_data)
# Script entry point: run the data report only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    test_collected_data()