# MK-LLM-Mistral — data/process_all_data.py
# Collects Macedonian-language text from the web and Wikipedia, then
# language-filters, deduplicates, and saves a combined training corpus.
# (Provenance: uploaded by ainow-mk, commit f29d474.)
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
import time
from trafilatura import extract as trafilatura_extract
from trafilatura.settings import use_config
import gcld3
from text_dedup.minhash import MinHashDeduper
def _fetch_extracted_text(url, config, timeout):
    """Fetch *url* and return its trafilatura-extracted main text ('' if none).

    NOTE(review): verify=False preserves the original behavior (several .mk
    sites appear to have broken TLS chains) but disables certificate
    validation — treat the fetched content as untrusted.
    """
    response = requests.get(url, timeout=timeout, verify=False)
    response.encoding = 'utf-8'
    return trafilatura_extract(response.text, config=config) or ""


def collect_mk_websites_data():
    """Collect main-article text from a curated set of Macedonian websites.

    For each site, the landing page is fetched and cleaned with trafilatura;
    up to 5 internal links per page are followed one level deep. Extracted
    texts of 150 characters or fewer are discarded as likely boilerplate.

    Returns:
        list[dict]: items with keys 'category', 'source' (URL) and 'text'.
    """
    print("Collecting data from Macedonian websites...")
    start_time = time.time()
    # Curated seed URLs, grouped by content category.
    websites = {
        'news': [
            'https://time.mk',
            'https://daily.mk',
            'https://www.fakulteti.mk',
            'https://www.akademik.mk',
            'https://www.mkd.mk'
        ],
        'government': [
            'https://mon.gov.mk',
            'http://www.ujp.gov.mk',
            'https://fzo.org.mk',
            'https://uslugi.gov.mk',
            'https://vlada.mk',
            'https://www.sobranie.mk'
        ],
        'education': [
            'https://ukim.edu.mk',
            'https://www.finki.ukim.mk',
            'https://www.feit.ukim.edu.mk',
            'https://www.pmf.ukim.edu.mk'
        ],
        'culture': [
            'https://www.kultura.gov.mk',
            'https://mmc.mk',
            'https://www.mkc.mk'
        ],
        'business': [
            'https://www.mchamber.mk',
            'https://www.nbrm.mk',
            'https://www.stat.gov.mk'
        ],
        'tech': [
            'https://www.ainow.mk/mk',
            'https://it.mk',
            'https://gsix.mk',
            'https://ainow.mk'
        ]
    }
    # Build the extraction config once; the original re-created it per URL.
    config = use_config()
    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
    collected_texts = []
    visited = set()  # avoid re-fetching pages via self-referencing links (e.g. href="/")
    total_sites = sum(len(urls) for urls in websites.values())
    with tqdm(total=total_sites, desc="Processing websites") as pbar:
        for category, urls in websites.items():
            print(f"\nProcessing {category} websites...")
            for url in urls:
                try:
                    visited.add(url)
                    response = requests.get(url, timeout=10, verify=False)
                    response.encoding = 'utf-8'
                    # Prefer trafilatura extraction for cleaner text
                    text = trafilatura_extract(response.text, config=config) or ""
                    if len(text) > 150:
                        collected_texts.append({'category': category, 'source': url, 'text': text.strip()})
                    # Follow up to 5 internal links one level deep.
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for link in soup.find_all('a', href=True)[:5]:
                        full_url = urljoin(url, link['href'])
                        # Crude same-site substring check, kept from the original.
                        if url not in full_url or full_url in visited:
                            continue
                        visited.add(full_url)
                        try:
                            sub_text = _fetch_extracted_text(full_url, config, timeout=5)
                        except Exception:
                            continue  # best-effort: skip unreachable sub-pages
                        if len(sub_text) > 150:
                            collected_texts.append({'category': category, 'source': full_url, 'text': sub_text.strip()})
                    pbar.update(1)
                    pbar.set_description(f"Processing {url[:30]}...")
                except Exception as e:
                    # Best-effort scrape: report and keep the progress bar honest.
                    print(f"Error processing {url}: {e}")
                    pbar.update(1)
                    continue
    elapsed_time = time.time() - start_time
    print(f"\nTotal collection time: {elapsed_time/60:.2f} minutes")
    return collected_texts
def process_all_data():
    """End-to-end pipeline: scrape web → merge Wikipedia → filter mk → dedup → save.

    Reads (optionally): data/wikipedia/processed/mk_wiki_text.txt
    Writes: data/raw/mk_web_data.json (raw scrape) and
            data/cleaned/mk_combined_data.txt (final corpus, paragraphs
            separated by blank lines).
    """
    print("Processing all Macedonian data sources...")
    raw_dir = os.path.join("data", "raw")
    wiki_dir = os.path.join("data", "wikipedia", "processed")
    output_dir = os.path.join("data", "cleaned")
    for directory in (raw_dir, output_dir):
        # exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(directory, exist_ok=True)
    # Collect new website data and keep a raw JSON snapshot for provenance.
    web_texts = collect_mk_websites_data()
    web_file = os.path.join(raw_dir, "mk_web_data.json")
    with open(web_file, 'w', encoding='utf-8') as f:
        json.dump(web_texts, f, ensure_ascii=False, indent=2)
    all_texts = [item['text'] for item in web_texts]
    # Add Wikipedia data if it exists.
    wiki_file = os.path.join(wiki_dir, "mk_wiki_text.txt")
    if os.path.exists(wiki_file):
        with open(wiki_file, 'r', encoding='utf-8') as f:
            # Split on blank lines: the original used readlines(), which fed
            # individual lines into the >150-char filter below and silently
            # dropped most of the Wikipedia corpus. Paragraph-sized units
            # survive the filter. (Assumes the dump is paragraph-per-block;
            # TODO confirm against the Wikipedia preprocessing step.)
            all_texts.extend(f.read().split('\n\n'))
    # Language filter (mk) with gcld3: keep only reliably-detected Macedonian.
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=10000)
    lang_filtered = []
    for text in all_texts:
        t = text.strip()
        if len(t) <= 150:
            continue  # too short to be useful or to detect reliably
        res = detector.FindLanguage(t)
        if res.language == 'mk' and res.is_reliable:
            lang_filtered.append(t)
    # Near-duplicate removal with MinHash (Jaccard threshold 0.9).
    deduper = MinHashDeduper(num_perm=128, threshold=0.9)
    unique_texts = deduper.dedup(lang_filtered)
    # Save final dataset, one paragraph per blank-line-separated block.
    output_file = os.path.join(output_dir, "mk_combined_data.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(unique_texts))
    print(f"Successfully processed and saved {len(unique_texts)} text samples")
# Script entry point: run the full collection + cleaning pipeline.
if __name__ == "__main__":
    process_all_data()