"""Strip site boilerplate from scraped Markdown files and save cleaned copies."""

import os
import re


def clean_markdown(content):
    """Remove navbar, footer and other boilerplate from a Markdown string.

    The substitutions are applied in a fixed order; each one deletes the text
    it matches. Finally, runs of three or more newlines are collapsed to a
    single blank line and surrounding whitespace is stripped.

    Args:
        content: The raw Markdown text.

    Returns:
        The cleaned Markdown text.
    """
    # Top navbar: from the linked Locus logo image through the "Sponsors"
    # link. DOTALL lets the match span several lines.
    content = re.sub(
        r'\[!\[Locus.*?\]\(.*?\)\].*?\[Sponsors\]\(.*?\)',
        '',
        content,
        flags=re.DOTALL,
    )

    # Lines that consist solely of the word "More".
    content = re.sub(r'^More$', '', content, flags=re.MULTILINE)

    # Footer boilerplate: from the "[Prospectus]" link through the copyright
    # notice, and on to the end of the text (the trailing ".*" under DOTALL).
    content = re.sub(
        r'\[Prospectus\].*?©COPYRIGHT.*',
        '',
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Credits line and everything after it.
    content = re.sub(
        r'MADE WITH ❤ BY LOST.*',
        '',
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Label-less links that point at social media sites.
    content = re.sub(
        r'\[\]\(https://(www\.)?(instagram|facebook|linkedin|twitter)\.com/.*?\)',
        '',
        content,
    )

    # "Read More" links. The match is case-insensitive so that it agrees with
    # the bare-text pass below; otherwise a link such as "[READ MORE](url)"
    # would lose only its label and leave an empty "[](url)" behind.
    content = re.sub(r'\[Read More\]\(.*?\)', '', content, flags=re.IGNORECASE)
    # Any remaining bare "Read More" text.
    content = re.sub(r'Read More', '', content, flags=re.IGNORECASE)

    # Collapse the gaps left behind by the removals above.
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()


def main(data_dir="data", cleaned_dir="data/cleaned"):
    """Clean every ``.md`` file in ``data_dir`` and write it to ``cleaned_dir``.

    Each cleaned file keeps its original filename. Progress is reported on
    standard output. If ``data_dir`` does not exist, a message is printed and
    nothing else happens.

    Args:
        data_dir: Directory holding the raw Markdown files.
        cleaned_dir: Directory that receives the cleaned files; created if
            it does not exist.
    """
    if not os.path.exists(data_dir):
        print(f"Directory {data_dir} does not exist. Run ingest.py first.")
        return

    os.makedirs(cleaned_dir, exist_ok=True)

    # Sorted so that files are processed in a deterministic order.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".md"):
            continue

        filepath = os.path.join(data_dir, filename)
        # A directory may also carry a ".md" suffix; skip it.
        if os.path.isdir(filepath):
            continue

        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        print(f"Cleaning {filename}...")
        cleaned_content = clean_markdown(content)

        cleaned_filepath = os.path.join(cleaned_dir, filename)
        with open(cleaned_filepath, "w", encoding="utf-8") as f:
            f.write(cleaned_content)
        print(f"Saved cleaned version to {cleaned_filepath}")


if __name__ == "__main__":
    main()