File size: 5,772 Bytes
f29d474
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
import time
from trafilatura import extract as trafilatura_extract
from trafilatura.settings import use_config
import gcld3
from text_dedup.minhash import MinHashDeduper

def collect_mk_websites_data():
    """Scrape a fixed list of Macedonian websites and return extracted texts.

    For every site in the hard-coded category map the landing page is
    fetched, its main content extracted with trafilatura, and up to five
    same-site internal links are followed one level deep.  Extracted
    texts of 150 characters or fewer are discarded.

    Returns:
        list[dict]: items with keys ``category``, ``source`` (URL) and
        ``text`` (stripped extracted text).
    """
    print("Collecting data from Macedonian websites...")
    start_time = time.time()

    # Define websites to scrape, grouped by domain category.
    websites = {
        'news': [
            'https://time.mk',
            'https://daily.mk',
            'https://www.fakulteti.mk',
            'https://www.akademik.mk',
            'https://www.mkd.mk'
        ],
        'government': [
            'https://mon.gov.mk',
            'http://www.ujp.gov.mk',
            'https://fzo.org.mk',
            'https://uslugi.gov.mk',
            'https://vlada.mk',
            'https://www.sobranie.mk'
        ],
        'education': [
            'https://ukim.edu.mk',
            'https://www.finki.ukim.mk',
            'https://www.feit.ukim.edu.mk',
            'https://www.pmf.ukim.edu.mk'
        ],
        'culture': [
            'https://www.kultura.gov.mk',
            'https://mmc.mk',
            'https://www.mkc.mk'
        ],
        'business': [
            'https://www.mchamber.mk',
            'https://www.nbrm.mk',
            'https://www.stat.gov.mk'
        ],
        'tech': [
            'https://www.ainow.mk/mk',
            'https://it.mk',
            'https://gsix.mk',
            'https://ainow.mk'
        ]
    }

    # Trafilatura config is loop-invariant -- build it once, not per site.
    config = use_config()
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    # One session gives connection pooling and a consistent User-Agent.
    session = requests.Session()
    session.headers.update(
        {'User-Agent': 'Mozilla/5.0 (compatible; mk-data-collector)'})

    collected_texts = []
    seen_urls = set()  # avoid re-fetching duplicate internal links
    total_sites = sum(len(urls) for urls in websites.values())

    with tqdm(total=total_sites, desc="Processing websites") as pbar:
        for category, urls in websites.items():
            print(f"\nProcessing {category} websites...")
            for url in urls:
                try:
                    # NOTE(security): verify=False disables TLS certificate
                    # checking; kept because some target sites have broken
                    # cert chains, but it permits MITM -- review before any
                    # production use.
                    response = session.get(url, timeout=10, verify=False)
                    response.encoding = 'utf-8'
                    # Prefer trafilatura extraction for cleaner text.
                    text = trafilatura_extract(response.text, config=config) or ""
                    if len(text) > 150:
                        collected_texts.append(
                            {'category': category, 'source': url,
                             'text': text.strip()})

                    # Also collect a handful of internal links, one level deep.
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for link in soup.find_all('a', href=True)[:5]:
                        full_url = urljoin(url, link['href'])
                        # Crude same-site check (substring match on base URL),
                        # plus dedup so the same page is not fetched twice.
                        if url not in full_url or full_url in seen_urls:
                            continue
                        seen_urls.add(full_url)
                        try:
                            sub_response = session.get(
                                full_url, timeout=5, verify=False)
                            sub_response.encoding = 'utf-8'
                            sub_text = trafilatura_extract(
                                sub_response.text, config=config) or ""
                            if len(sub_text) > 150:
                                collected_texts.append(
                                    {'category': category, 'source': full_url,
                                     'text': sub_text.strip()})
                        except Exception:
                            continue
                except Exception as e:
                    print(f"Error processing {url}: {e}")
                finally:
                    # Exactly one tick per top-level site, success or failure
                    # (original updated inconsistently on the error path).
                    pbar.update(1)
                    pbar.set_description(f"Processing {url[:30]}...")

    elapsed_time = time.time() - start_time
    print(f"\nTotal collection time: {elapsed_time/60:.2f} minutes")
    return collected_texts

def process_all_data():
    """Run the full pipeline: scrape, merge with Wikipedia, filter, dedup, save.

    Steps:
      1. Scrape the configured Macedonian websites and dump the raw JSON
         to ``data/raw/mk_web_data.json``.
      2. Merge in pre-processed Wikipedia text if present.
      3. Keep only texts reliably identified as Macedonian by gcld3.
      4. Remove near-duplicates with MinHash and write the result to
         ``data/cleaned/mk_combined_data.txt``.
    """
    print("Processing all Macedonian data sources...")

    # Directory layout (wiki_dir is read-only input, never created here).
    raw_dir = os.path.join("data", "raw")
    wiki_dir = os.path.join("data", "wikipedia", "processed")
    output_dir = os.path.join("data", "cleaned")

    # exist_ok avoids the check-then-create race of exists()+makedirs().
    for directory in (raw_dir, output_dir):
        os.makedirs(directory, exist_ok=True)

    # Collect new website data.
    web_texts = collect_mk_websites_data()

    # Persist the raw scrape so the pipeline can be re-run without re-fetching.
    web_file = os.path.join(raw_dir, "mk_web_data.json")
    with open(web_file, 'w', encoding='utf-8') as f:
        json.dump(web_texts, f, ensure_ascii=False, indent=2)

    all_texts = [item['text'] for item in web_texts]

    # Add Wikipedia data if a processed dump exists.
    wiki_file = os.path.join(wiki_dir, "mk_wiki_text.txt")
    if os.path.exists(wiki_file):
        with open(wiki_file, 'r', encoding='utf-8') as f:
            all_texts.extend(f.readlines())

    # Language filter: keep only texts >150 chars that gcld3 reliably
    # identifies as Macedonian ('mk').
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0,
                                            max_num_bytes=10000)
    lang_filtered = []
    for text in all_texts:
        t = text.strip()
        if len(t) <= 150:
            continue
        res = detector.FindLanguage(t)
        if res.language == 'mk' and res.is_reliable:
            lang_filtered.append(t)

    # Near-duplicate removal via MinHash (Jaccard threshold 0.9).
    deduper = MinHashDeduper(num_perm=128, threshold=0.9)
    unique_texts = deduper.dedup(lang_filtered)

    # Save the final, cleaned dataset as blank-line-separated documents.
    output_file = os.path.join(output_dir, "mk_combined_data.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(unique_texts))

    print(f"Successfully processed and saved {len(unique_texts)} text samples")

# Script entry point: run the full collect/clean/deduplicate pipeline.
if __name__ == "__main__":
    process_all_data()