# File: locus-rag-bot/scripts/clean_data.py
# Repository page residue: uploader avatar "khagu", commit 4b55bd6 ("add all files").
import os
import re
def clean_markdown(content):
    """
    Basic cleaning to remove navbars, footers, and other boilerplate.

    Args:
        content: Raw markdown text of a single scraped page.

    Returns:
        The text with the navbar, standalone "More" lines, footer, empty
        social-media links and "Read More" buttons removed, runs of three or
        more newlines collapsed to one blank line, and surrounding whitespace
        stripped.
    """
    # Remove top navbar (Locus Logo + Home/About/Events/etc.)
    content = re.sub(r'\[!\[Locus.*?\]\(.*?\)\].*?\[Sponsors\]\(.*?\)', '', content, flags=re.DOTALL)

    # Remove "More" lines (only lines consisting of exactly "More")
    content = re.sub(r'^More$', '', content, flags=re.MULTILINE)

    # Remove footer boilerplate.
    # Each pattern ends in a DOTALL `.*`, so everything from the marker to the
    # end of the text is dropped. The first pattern only fires when a
    # "©COPYRIGHT" marker follows "[Prospectus]".
    content = re.sub(r'\[Prospectus\].*?©COPYRIGHT.*', '', content, flags=re.DOTALL | re.IGNORECASE)
    content = re.sub(r'MADE WITH ❤ BY LOST.*', '', content, flags=re.DOTALL | re.IGNORECASE)

    # Remove any remaining social media links with an empty label
    content = re.sub(r'\[\]\(https://(www\.)?(instagram|facebook|linkedin|twitter)\.com/.*?\)', '', content)

    # Remove "Read More" buttons or links.
    # The link form is matched case-insensitively so that "[READ MORE](url)"
    # is removed as a whole; otherwise the bare-text pass below would strip
    # only the label and leave an empty "[](url)" link behind.
    content = re.sub(r'\[Read More\]\(.*?\)', '', content, flags=re.IGNORECASE)
    # Word boundaries keep the bare-text pass from eating the inside of other
    # words (e.g. "Spread more" must not become "Sp").
    content = re.sub(r'\bRead More\b', '', content, flags=re.IGNORECASE)

    # Remove multiple empty lines
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()
def main():
    """
    Clean every top-level ``.md`` file in ``data`` and write the result,
    under the same file name, into ``data/cleaned``.

    Prints a hint and returns early when ``data`` does not exist.
    """
    data_dir = "data"
    cleaned_dir = "data/cleaned"

    # Nothing to clean until the data directory has been created.
    if not os.path.exists(data_dir):
        print(f"Directory {data_dir} does not exist. Run ingest.py first.")
        return

    os.makedirs(cleaned_dir, exist_ok=True)

    for filename in os.listdir(data_dir):
        source_path = os.path.join(data_dir, filename)

        # Only markdown files are processed; a directory whose name happens
        # to end in ".md" is skipped.
        if not filename.endswith(".md") or os.path.isdir(source_path):
            continue

        with open(source_path, "r", encoding="utf-8") as src:
            raw_text = src.read()

        print(f"Cleaning {filename}...")
        cleaned_text = clean_markdown(raw_text)

        cleaned_filepath = os.path.join(cleaned_dir, filename)
        with open(cleaned_filepath, "w", encoding="utf-8") as dst:
            dst.write(cleaned_text)

        print(f"Saved cleaned version to {cleaned_filepath}")
# Run the cleaning pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()