Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
# Ordered (compiled pattern, replacement) rules applied by clean_markdown().
# Order matters: the linked "Read More" rule must run before the bare-text one,
# and blank-line collapsing must run last so it tidies the gaps left behind by
# the earlier removals.
_CLEANUP_RULES = (
    # Top navbar: from the linked Locus logo image through the "Sponsors" link.
    (re.compile(r'\[!\[Locus.*?\]\(.*?\)\].*?\[Sponsors\]\(.*?\)', re.DOTALL), ''),
    # Lines that consist solely of the word "More".
    (re.compile(r'^More$', re.MULTILINE), ''),
    # Footer: from the "[Prospectus]" link through the copyright notice and
    # everything after it (DOTALL makes the trailing ".*" run to end of text).
    (re.compile(r'\[Prospectus\].*?©COPYRIGHT.*', re.DOTALL | re.IGNORECASE), ''),
    # Footer credit line and everything after it.
    (re.compile(r'MADE WITH ❤ BY LOST.*', re.DOTALL | re.IGNORECASE), ''),
    # Label-less links pointing at social media sites.
    (re.compile(r'\[\]\(https://(www\.)?(instagram|facebook|linkedin|twitter)\.com/.*?\)'), ''),
    # "Read More" links. Case-insensitive so that "[read more](url)" is removed
    # as a whole; otherwise the bare-text rule below would strip only the label
    # and leave an empty "[](url)" link behind.
    (re.compile(r'\[Read More\]\(.*?\)', re.IGNORECASE), ''),
    # Any remaining bare "Read More" text, in any letter case.
    (re.compile(r'Read More', re.IGNORECASE), ''),
    # Collapse runs of three or more newlines into a single blank line.
    (re.compile(r'\n{3,}'), '\n\n'),
)


def clean_markdown(content: str) -> str:
    """Strip site boilerplate from a scraped Markdown page.

    Removes the top navbar, standalone "More" lines, the footer (from the
    "[Prospectus]" link or the "MADE WITH ❤ BY LOST" credit to the end of the
    text), label-less social media links, and "Read More" links/text, then
    collapses runs of blank lines.

    Args:
        content: Raw Markdown text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    for pattern, replacement in _CLEANUP_RULES:
        content = pattern.sub(replacement, content)
    return content.strip()
def main():
    """Clean every ``.md`` file in ``data`` and save the result under ``data/cleaned``.

    Prints a hint and returns early when the ``data`` directory is missing.
    Each cleaned file keeps its original filename; progress is reported on
    stdout for every file processed.
    """
    source_dir = "data"
    output_dir = "data/cleaned"

    source_missing = not os.path.exists(source_dir)
    if source_missing:
        print(f"Directory {source_dir} does not exist. Run ingest.py first.")
        return

    os.makedirs(output_dir, exist_ok=True)

    for entry in os.listdir(source_dir):
        # Only Markdown files are of interest.
        if not entry.endswith(".md"):
            continue

        source_path = os.path.join(source_dir, entry)
        # A directory whose name happens to end in ".md" is not a page.
        if os.path.isdir(source_path):
            continue

        with open(source_path, "r", encoding="utf-8") as src:
            raw_text = src.read()

        print(f"Cleaning {entry}...")
        cleaned_text = clean_markdown(raw_text)

        target_path = os.path.join(output_dir, entry)
        with open(target_path, "w", encoding="utf-8") as dst:
            dst.write(cleaned_text)

        print(f"Saved cleaned version to {target_path}")
# Run the cleaning pass only when this file is executed as a script, so that
# importing the module (e.g. to reuse clean_markdown) has no side effects.
if __name__ == "__main__":
    main()