"""Collect Macedonian web text, filter it by language and de-duplicate it.

Running this module as a script scrapes a fixed list of Macedonian
websites, merges the result with an optional pre-processed Wikipedia dump,
keeps only texts detected as Macedonian, removes near-duplicates and writes
the final corpus to ``data/cleaned/mk_combined_data.txt``.
"""
import json
import os
import time
from urllib.parse import urldefrag, urljoin, urlsplit

import gcld3
import requests
from bs4 import BeautifulSoup
from text_dedup.minhash import MinHashDeduper
from tqdm import tqdm
from trafilatura import extract as trafilatura_extract
from trafilatura.settings import use_config

# Texts with this many characters or fewer are discarded as boilerplate.
MIN_TEXT_LENGTH = 150
# Only the first N anchors of each home page are considered as sub-pages.
MAX_LINKS_PER_SITE = 5
# Request timeouts, in seconds, for home pages and for sub-pages.
SITE_TIMEOUT = 10
SUBPAGE_TIMEOUT = 5

# Websites to scrape, grouped by category.
WEBSITES = {
    'news': [
        'https://time.mk',
        'https://daily.mk',
        'https://www.fakulteti.mk',
        'https://www.akademik.mk',
        'https://www.mkd.mk',
    ],
    'government': [
        'https://mon.gov.mk',
        'http://www.ujp.gov.mk',
        'https://fzo.org.mk',
        'https://uslugi.gov.mk',
        'https://vlada.mk',
        'https://www.sobranie.mk',
    ],
    'education': [
        'https://ukim.edu.mk',
        'https://www.finki.ukim.mk',
        'https://www.feit.ukim.edu.mk',
        'https://www.pmf.ukim.edu.mk',
    ],
    'culture': [
        'https://www.kultura.gov.mk',
        'https://mmc.mk',
        'https://www.mkc.mk',
    ],
    'business': [
        'https://www.mchamber.mk',
        'https://www.nbrm.mk',
        'https://www.stat.gov.mk',
    ],
    'tech': [
        'https://www.ainow.mk/mk',
        'https://it.mk',
        'https://gsix.mk',
        'https://ainow.mk',
    ],
}


def _host(url):
    """Return the lower-cased host of *url* without a leading ``www.``."""
    host = urlsplit(url).hostname or ''
    return host[4:] if host.startswith('www.') else host


def _same_site(base_url, candidate_url):
    """Return True if *candidate_url* is an http(s) URL on *base_url*'s host."""
    if urlsplit(candidate_url).scheme not in ('http', 'https'):
        return False
    return _host(candidate_url) == _host(base_url)


def _internal_links(base_url, html, limit):
    """Return unique same-site URLs among the first *limit* anchors of *html*.

    Fragments are dropped and the base page itself is excluded, so ``#``
    anchors and repeated links do not trigger redundant downloads.
    """
    soup = BeautifulSoup(html, 'html.parser')
    seen = {urldefrag(base_url).url.rstrip('/')}
    links = []
    for anchor in soup.find_all('a', href=True)[:limit]:
        try:
            candidate = urldefrag(urljoin(base_url, anchor['href'])).url
            if not _same_site(base_url, candidate):
                continue
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link only.
            continue
        key = candidate.rstrip('/')
        if key not in seen:
            seen.add(key)
            links.append(candidate)
    return links


def _fetch_text(url, timeout, config):
    """Download *url* and return ``(html, extracted_text)``.

    Raises:
        requests.RequestException: on connection problems or a non-2xx status.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept so sites with broken certificates are still scraped; confirm that
    # this trade-off is acceptable for the corpus.
    response = requests.get(url, timeout=timeout, verify=False)
    # Do not harvest the body of 4xx/5xx error pages as corpus text.
    response.raise_for_status()
    response.encoding = 'utf-8'
    html = response.text
    text = trafilatura_extract(html, config=config) or ""
    return html, text.strip()


def _iter_site_records(url, category, config):
    """Yield text records for the home page at *url* and its sub-pages.

    Records are yielded one by one so that whatever was extracted before a
    later failure is still kept by the caller.
    """
    html, text = _fetch_text(url, SITE_TIMEOUT, config)
    if len(text) > MIN_TEXT_LENGTH:
        yield {'category': category, 'source': url, 'text': text}

    for link in _internal_links(url, html, MAX_LINKS_PER_SITE):
        try:
            _, sub_text = _fetch_text(link, SUBPAGE_TIMEOUT, config)
        except Exception as exc:
            # Sub-pages are best-effort: report the failure and move on.
            print(f"Skipping {link}: {exc}")
            continue
        if len(sub_text) > MIN_TEXT_LENGTH:
            yield {'category': category, 'source': link, 'text': sub_text}


def collect_mk_websites_data():
    """Scrape the configured Macedonian websites.

    For every site the home page is downloaded, its main text is extracted
    with trafilatura, and the same is attempted for the same-site links found
    among the first ``MAX_LINKS_PER_SITE`` anchors of the page.

    Returns:
        A list of dicts with the keys ``'category'``, ``'source'`` (the page
        URL) and ``'text'`` (stripped text longer than ``MIN_TEXT_LENGTH``
        characters). Sites that fail are reported and skipped.
    """
    print("Collecting data from Macedonian websites...")
    start_time = time.time()

    # Built once and shared by every extraction call.
    # NOTE(review): "0" presumably disables trafilatura's extraction timeout;
    # confirm against the trafilatura settings documentation.
    config = use_config()
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    collected_texts = []
    total_sites = sum(len(urls) for urls in WEBSITES.values())

    with tqdm(total=total_sites, desc="Processing websites") as pbar:
        for category, urls in WEBSITES.items():
            print(f"\nProcessing {category} websites...")
            for url in urls:
                pbar.set_description(f"Processing {url[:30]}...")
                try:
                    for record in _iter_site_records(url, category, config):
                        collected_texts.append(record)
                except Exception as e:
                    # Per-site boundary: one broken site must not stop the run.
                    print(f"Error processing {url}: {e}")
                finally:
                    pbar.update(1)

    elapsed_time = time.time() - start_time
    print(f"\nTotal collection time: {elapsed_time/60:.2f} minutes")
    return collected_texts


def _filter_macedonian(texts):
    """Return the stripped *texts* that gcld3 reliably identifies as 'mk'.

    Texts of ``MIN_TEXT_LENGTH`` characters or fewer (after stripping) are
    dropped before language detection.
    """
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=10000)
    kept = []
    for text in texts:
        stripped = text.strip()
        if len(stripped) <= MIN_TEXT_LENGTH:
            continue
        result = detector.FindLanguage(stripped)
        if result.language == 'mk' and result.is_reliable:
            kept.append(stripped)
    return kept


def process_all_data():
    """Build the cleaned Macedonian corpus from web and Wikipedia sources.

    Steps:
        1. Scrape the configured websites and save the raw records to
           ``data/raw/mk_web_data.json``.
        2. Append the lines of ``data/wikipedia/processed/mk_wiki_text.txt``
           when that file exists.
        3. Keep only texts reliably detected as Macedonian.
        4. Remove near-duplicates with MinHash.
        5. Write the result, separated by blank lines, to
           ``data/cleaned/mk_combined_data.txt``.
    """
    print("Processing all Macedonian data sources...")

    raw_dir = os.path.join("data", "raw")
    wiki_dir = os.path.join("data", "wikipedia", "processed")
    output_dir = os.path.join("data", "cleaned")
    for directory in (raw_dir, output_dir):
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(directory, exist_ok=True)

    # Collect new website data and keep a raw copy with its metadata.
    web_texts = collect_mk_websites_data()
    web_file = os.path.join(raw_dir, "mk_web_data.json")
    with open(web_file, 'w', encoding='utf-8') as f:
        json.dump(web_texts, f, ensure_ascii=False, indent=2)

    all_texts = [item['text'] for item in web_texts]

    # The Wikipedia dump is optional; every line is treated as one text.
    wiki_file = os.path.join(wiki_dir, "mk_wiki_text.txt")
    if os.path.exists(wiki_file):
        with open(wiki_file, 'r', encoding='utf-8') as f:
            all_texts.extend(f)

    lang_filtered = _filter_macedonian(all_texts)

    # Skip the deduper entirely when nothing survived the language filter.
    if lang_filtered:
        deduper = MinHashDeduper(num_perm=128, threshold=0.9)
        unique_texts = deduper.dedup(lang_filtered)
    else:
        unique_texts = []

    output_file = os.path.join(output_dir, "mk_combined_data.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(unique_texts))

    print(f"Successfully processed and saved {len(unique_texts)} text samples")


if __name__ == "__main__":
    process_all_data()