| # Script to clean and preprocess Macedonian Wikipedia text | |
| import re | |
# Innermost-only patterns: the negative lookahead forbids another opener or
# closer inside a match, so repeated substitution peels nesting from the
# inside out instead of stopping at the first closer it meets.
_WIKI_LINK = re.compile(r'\[\[(?:(?!\[\[|\]\]).)*\]\]')
# Templates (e.g. infoboxes) routinely span several lines, hence DOTALL.
_TEMPLATE = re.compile(r'\{\{(?:(?!\{\{|\}\}).)*\}\}', re.DOTALL)
_BRACKETED = re.compile(r'\[.*?\]')
_HTML_TAG = re.compile(r'<.*?>')


def _remove_until_stable(pattern, text):
    """Delete every match of *pattern*, repeating until none is left."""
    while True:
        text, removed = pattern.subn('', text)
        if not removed:
            return text


def clean_text(text):
    """Strip wiki markup and HTML tags from *text* and flatten it to one line.

    Removed, in this order:

    1. ``[[...]]`` wiki links / category links, including nested ones
       (the whole link is dropped, label included).
    2. Any remaining single-bracket ``[...]`` span on one line.
    3. ``{{...}}`` templates, including nested and multi-line ones.
    4. ``<...>`` HTML-style tags on one line.

    Newlines are then replaced by single spaces and leading/trailing
    whitespace is stripped. Whitespace left behind by removed markup is
    not collapsed.

    Args:
        text: Raw wikitext as a ``str``.

    Returns:
        The cleaned text as a ``str``.
    """
    # Double-bracket links must go first: the single-bracket pattern would
    # stop at the first "]" of "[[x]]" and leave a stray "]" behind.
    text = _remove_until_stable(_WIKI_LINK, text)
    text = _BRACKETED.sub('', text)
    text = _remove_until_stable(_TEMPLATE, text)
    text = _HTML_TAG.sub('', text)
    return text.replace("\n", " ").strip()
# Script entry point: runs only when the file is executed directly, not when
# it is imported. It just reports that the cleaning helpers are available.
if __name__ == "__main__":
    ready_message = "Data cleaning script ready to process text!"
    print(ready_message)