File size: 5,772 Bytes
f29d474
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
import time
from trafilatura import extract as trafilatura_extract
from trafilatura.settings import use_config
import gcld3
from text_dedup.minhash import MinHashDeduper

def collect_mk_websites_data():
    """Scrape a fixed list of Macedonian websites and return extracted texts.

    For every site in the hard-coded category map the landing page is
    fetched, its main content extracted with trafilatura, and up to five
    same-site internal links are followed one level deep.  Extracted
    texts of 150 characters or fewer are discarded.

    Returns:
        list[dict]: items with keys ``category``, ``source`` (URL) and
        ``text`` (stripped extracted text).
    """
    print("Collecting data from Macedonian websites...")
    start_time = time.time()

    # Define websites to scrape, grouped by domain category.
    websites = {
        'news': [
            'https://time.mk',
            'https://daily.mk',
            'https://www.fakulteti.mk',
            'https://www.akademik.mk',
            'https://www.mkd.mk'
        ],
        'government': [
            'https://mon.gov.mk',
            'http://www.ujp.gov.mk',
            'https://fzo.org.mk',
            'https://uslugi.gov.mk',
            'https://vlada.mk',
            'https://www.sobranie.mk'
        ],
        'education': [
            'https://ukim.edu.mk',
            'https://www.finki.ukim.mk',
            'https://www.feit.ukim.edu.mk',
            'https://www.pmf.ukim.edu.mk'
        ],
        'culture': [
            'https://www.kultura.gov.mk',
            'https://mmc.mk',
            'https://www.mkc.mk'
        ],
        'business': [
            'https://www.mchamber.mk',
            'https://www.nbrm.mk',
            'https://www.stat.gov.mk'
        ],
        'tech': [
            'https://www.ainow.mk/mk',
            'https://it.mk',
            'https://gsix.mk',
            'https://ainow.mk'
        ]
    }

    # Trafilatura config is loop-invariant -- build it once, not per site.
    config = use_config()
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    # One session gives connection pooling and a consistent User-Agent.
    session = requests.Session()
    session.headers.update(
        {'User-Agent': 'Mozilla/5.0 (compatible; mk-data-collector)'})

    collected_texts = []
    seen_urls = set()  # avoid re-fetching duplicate internal links
    total_sites = sum(len(urls) for urls in websites.values())

    with tqdm(total=total_sites, desc="Processing websites") as pbar:
        for category, urls in websites.items():
            print(f"\nProcessing {category} websites...")
            for url in urls:
                try:
                    # NOTE(security): verify=False disables TLS certificate
                    # checking; kept because some target sites have broken
                    # cert chains, but it permits MITM -- review before any
                    # production use.
                    response = session.get(url, timeout=10, verify=False)
                    response.encoding = 'utf-8'
                    # Prefer trafilatura extraction for cleaner text.
                    text = trafilatura_extract(response.text, config=config) or ""
                    if len(text) > 150:
                        collected_texts.append(
                            {'category': category, 'source': url,
                             'text': text.strip()})

                    # Also collect a handful of internal links, one level deep.
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for link in soup.find_all('a', href=True)[:5]:
                        full_url = urljoin(url, link['href'])
                        # Crude same-site check (substring match on base URL),
                        # plus dedup so the same page is not fetched twice.
                        if url not in full_url or full_url in seen_urls:
                            continue
                        seen_urls.add(full_url)
                        try:
                            sub_response = session.get(
                                full_url, timeout=5, verify=False)
                            sub_response.encoding = 'utf-8'
                            sub_text = trafilatura_extract(
                                sub_response.text, config=config) or ""
                            if len(sub_text) > 150:
                                collected_texts.append(
                                    {'category': category, 'source': full_url,
                                     'text': sub_text.strip()})
                        except Exception:
                            continue
                except Exception as e:
                    print(f"Error processing {url}: {e}")
                finally:
                    # Exactly one tick per top-level site, success or failure
                    # (original updated inconsistently on the error path).
                    pbar.update(1)
                    pbar.set_description(f"Processing {url[:30]}...")

    elapsed_time = time.time() - start_time
    print(f"\nTotal collection time: {elapsed_time/60:.2f} minutes")
    return collected_texts

def process_all_data():
    """Run the full pipeline: scrape, merge with Wikipedia, filter, dedup, save.

    Steps:
      1. Scrape the configured Macedonian websites and dump the raw JSON
         to ``data/raw/mk_web_data.json``.
      2. Merge in pre-processed Wikipedia text if present.
      3. Keep only texts reliably identified as Macedonian by gcld3.
      4. Remove near-duplicates with MinHash and write the result to
         ``data/cleaned/mk_combined_data.txt``.
    """
    print("Processing all Macedonian data sources...")

    # Directory layout (wiki_dir is read-only input, never created here).
    raw_dir = os.path.join("data", "raw")
    wiki_dir = os.path.join("data", "wikipedia", "processed")
    output_dir = os.path.join("data", "cleaned")

    # exist_ok avoids the check-then-create race of exists()+makedirs().
    for directory in (raw_dir, output_dir):
        os.makedirs(directory, exist_ok=True)

    # Collect new website data.
    web_texts = collect_mk_websites_data()

    # Persist the raw scrape so the pipeline can be re-run without re-fetching.
    web_file = os.path.join(raw_dir, "mk_web_data.json")
    with open(web_file, 'w', encoding='utf-8') as f:
        json.dump(web_texts, f, ensure_ascii=False, indent=2)

    all_texts = [item['text'] for item in web_texts]

    # Add Wikipedia data if a processed dump exists.
    wiki_file = os.path.join(wiki_dir, "mk_wiki_text.txt")
    if os.path.exists(wiki_file):
        with open(wiki_file, 'r', encoding='utf-8') as f:
            all_texts.extend(f.readlines())

    # Language filter: keep only texts >150 chars that gcld3 reliably
    # identifies as Macedonian ('mk').
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0,
                                            max_num_bytes=10000)
    lang_filtered = []
    for text in all_texts:
        t = text.strip()
        if len(t) <= 150:
            continue
        res = detector.FindLanguage(t)
        if res.language == 'mk' and res.is_reliable:
            lang_filtered.append(t)

    # Near-duplicate removal via MinHash (Jaccard threshold 0.9).
    deduper = MinHashDeduper(num_perm=128, threshold=0.9)
    unique_texts = deduper.dedup(lang_filtered)

    # Save the final, cleaned dataset as blank-line-separated documents.
    output_file = os.path.join(output_dir, "mk_combined_data.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(unique_texts))

    print(f"Successfully processed and saved {len(unique_texts)} text samples")

# Script entry point: run the full collect/clean/deduplicate pipeline.
if __name__ == "__main__":
    process_all_data()