|
|
import os
|
|
|
import json
|
|
|
import requests
|
|
|
from bs4 import BeautifulSoup
|
|
|
from urllib.parse import urljoin
|
|
|
from tqdm import tqdm
|
|
|
import time
|
|
|
from trafilatura import extract as trafilatura_extract
|
|
|
from trafilatura.settings import use_config
|
|
|
import gcld3
|
|
|
from text_dedup.minhash import MinHashDeduper
|
|
|
|
|
|
def collect_mk_websites_data():
    """Scrape text content from a curated list of Macedonian websites.

    For each site the landing page is fetched and its main text extracted
    with trafilatura; up to five outgoing links that stay on the same site
    are then followed one level deep and extracted the same way. Only
    extractions longer than 150 characters are kept.

    Returns:
        list[dict]: records with keys ``'category'`` (site group),
        ``'source'`` (URL fetched) and ``'text'`` (stripped extraction).
    """
    print("Collecting data from Macedonian websites...")
    start_time = time.time()

    # Curated seed URLs, grouped by content category.
    websites = {
        'news': [
            'https://time.mk',
            'https://daily.mk',
            'https://www.fakulteti.mk',
            'https://www.akademik.mk',
            'https://www.mkd.mk'
        ],
        'government': [
            'https://mon.gov.mk',
            'http://www.ujp.gov.mk',
            'https://fzo.org.mk',
            'https://uslugi.gov.mk',
            'https://vlada.mk',
            'https://www.sobranie.mk'
        ],
        'education': [
            'https://ukim.edu.mk',
            'https://www.finki.ukim.mk',
            'https://www.feit.ukim.edu.mk',
            'https://www.pmf.ukim.edu.mk'
        ],
        'culture': [
            'https://www.kultura.gov.mk',
            'https://mmc.mk',
            'https://www.mkc.mk'
        ],
        'business': [
            'https://www.mchamber.mk',
            'https://www.nbrm.mk',
            'https://www.stat.gov.mk'
        ],
        'tech': [
            'https://www.ainow.mk/mk',
            'https://it.mk',
            'https://gsix.mk',
            'https://ainow.mk'
        ]
    }

    collected_texts = []
    total_sites = sum(len(urls) for urls in websites.values())

    # The trafilatura config is identical for every request, so build it
    # once instead of once per URL (the original rebuilt it in the loop).
    config = use_config()
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

    # A shared session reuses TCP connections between a site's landing
    # page and its sub-links, instead of opening a fresh one per request.
    session = requests.Session()

    with tqdm(total=total_sites, desc="Processing websites") as pbar:
        for category, urls in websites.items():
            print(f"\nProcessing {category} websites...")
            for url in urls:
                try:
                    # SECURITY NOTE: verify=False disables TLS certificate
                    # validation. Tolerable for best-effort public scraping
                    # (some .mk sites have broken cert chains), but do not
                    # copy this into security-sensitive code.
                    response = session.get(url, timeout=10, verify=False)
                    # Force UTF-8: several of these sites mis-declare their
                    # encoding, which garbles Cyrillic text otherwise.
                    response.encoding = 'utf-8'

                    text = trafilatura_extract(response.text, config=config) or ""
                    # 150-char floor drops navigation stubs / empty pages.
                    if len(text) > 150:
                        collected_texts.append({'category': category, 'source': url, 'text': text.strip()})

                    # Follow a handful of links one level deep.
                    soup = BeautifulSoup(response.text, 'html.parser')
                    links = soup.find_all('a', href=True)
                    for link in links[:5]:
                        full_url = urljoin(url, link['href'])
                        # Substring test keeps only same-site links
                        # (relative hrefs are resolved by urljoin above).
                        if url in full_url:
                            try:
                                sub_response = session.get(full_url, timeout=5, verify=False)
                                sub_response.encoding = 'utf-8'
                                sub_text = trafilatura_extract(sub_response.text, config=config) or ""
                                if len(sub_text) > 150:
                                    collected_texts.append({'category': category, 'source': full_url, 'text': sub_text.strip()})
                            except Exception:
                                # Best-effort: a broken sub-link must not
                                # abort the whole site.
                                continue

                    pbar.update(1)
                    pbar.set_description(f"Processing {url[:30]}...")
                except Exception as e:
                    # Best-effort: report and keep the progress bar honest.
                    print(f"Error processing {url}: {e}")
                    pbar.update(1)
                    continue

    elapsed_time = time.time() - start_time
    print(f"\nTotal collection time: {elapsed_time/60:.2f} minutes")
    return collected_texts
|
|
|
|
|
|
def process_all_data():
    """Collect, language-filter, deduplicate and persist Macedonian text.

    Pipeline:
      1. Scrape seed websites via :func:`collect_mk_websites_data` and
         cache the raw records to ``data/raw/mk_web_data.json``.
      2. Merge in pre-processed Wikipedia text if it exists on disk.
      3. Keep only texts reliably identified as Macedonian (``'mk'``).
      4. Remove near-duplicates with MinHash (Jaccard threshold 0.9).
      5. Write the surviving texts, blank-line separated, to
         ``data/cleaned/mk_combined_data.txt``.
    """
    print("Processing all Macedonian data sources...")

    raw_dir = os.path.join("data", "raw")
    wiki_dir = os.path.join("data", "wikipedia", "processed")
    output_dir = os.path.join("data", "cleaned")

    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    for directory in (raw_dir, output_dir):
        os.makedirs(directory, exist_ok=True)

    web_texts = collect_mk_websites_data()

    # Cache raw scrape results so a failed later stage can be rerun
    # without re-crawling.
    web_file = os.path.join(raw_dir, "mk_web_data.json")
    with open(web_file, 'w', encoding='utf-8') as f:
        json.dump(web_texts, f, ensure_ascii=False, indent=2)

    all_texts = [item['text'] for item in web_texts]

    # Wikipedia text is optional — produced by a separate preprocessing
    # step; skip silently when absent.
    wiki_file = os.path.join(wiki_dir, "mk_wiki_text.txt")
    if os.path.exists(wiki_file):
        with open(wiki_file, 'r', encoding='utf-8') as f:
            all_texts.extend(f.readlines())

    lang_filtered = _filter_macedonian(all_texts)

    # Near-duplicate removal: 128 MinHash permutations, 0.9 similarity.
    deduper = MinHashDeduper(num_perm=128, threshold=0.9)
    unique_texts = deduper.dedup(lang_filtered)

    output_file = os.path.join(output_dir, "mk_combined_data.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(unique_texts))

    print(f"Successfully processed and saved {len(unique_texts)} text samples")


def _filter_macedonian(texts, min_length=150):
    """Return stripped texts longer than *min_length* that gcld3 reliably
    identifies as Macedonian.

    Args:
        texts: iterable of raw text strings.
        min_length: exclusive minimum length after stripping.

    Returns:
        list[str]: the surviving, stripped texts in input order.
    """
    # max_num_bytes caps how much of each text the classifier inspects.
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=10000)
    kept = []
    for text in texts:
        t = text.strip()
        if len(t) <= min_length:
            continue
        res = detector.FindLanguage(t)
        # Require both the 'mk' label and the classifier's own
        # reliability flag to cut false positives from similar
        # Cyrillic languages.
        if res.language == 'mk' and res.is_reliable:
            kept.append(t)
    return kept
|
|
|
|
|
|
# Run the full collection/cleaning pipeline only when executed as a
# script, not when this module is imported.
if __name__ == "__main__":
    process_all_data()