Spaces:
Sleeping
Sleeping
File size: 2,197 Bytes
4b55bd6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import os
import re
def clean_markdown(content: str) -> str:
    """Strip site boilerplate from a scraped Markdown page.

    The following are removed, in this order:

    1. The top navigation bar, from the Locus logo image-link through the
       "Sponsors" link.
    2. Lines consisting solely of the word "More".
    3. The footer: everything from the "[Prospectus]" link onward when a
       copyright notice follows it, and everything from the
       "MADE WITH ... BY LOST" credit onward (both case-insensitive).
    4. Empty-text links pointing at Instagram, Facebook, LinkedIn or Twitter.
    5. "[Read More](...)" links, then any remaining "Read More" text
       (case-insensitive, anywhere in the text).
    6. Runs of blank lines, which are collapsed to a single blank line.

    Args:
        content: Raw Markdown text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Top navbar (Locus logo + Home/About/Events/etc.); DOTALL because the
    # navbar links may be spread over several lines.
    content = re.sub(
        r'\[!\[Locus.*?\]\(.*?\)\].*?\[Sponsors\]\(.*?\)',
        '',
        content,
        flags=re.DOTALL,
    )

    # Standalone "More" lines (exact match on the whole line).
    content = re.sub(r'^More$', '', content, flags=re.MULTILINE)

    # Footer boilerplate. The trailing ".*" with DOTALL deliberately eats
    # everything up to the end of the text once the marker is found.
    content = re.sub(
        r'\[Prospectus\].*?©COPYRIGHT.*',
        '',
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )
    content = re.sub(
        r'MADE WITH ❤ BY LOST.*',
        '',
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Icon-only social media links (empty link text).
    content = re.sub(
        r'\[\]\(https://(www\.)?(instagram|facebook|linkedin|twitter)\.com/.*?\)',
        '',
        content,
    )

    # "Read More" buttons: first the Markdown links, then any bare text left.
    # NOTE: the bare-text pass also removes the phrase inside ordinary prose.
    content = re.sub(r'\[Read More\]\(.*?\)', '', content)
    content = re.sub(r'Read More', '', content, flags=re.IGNORECASE)

    # Collapse runs of blank lines. Lines holding only spaces/tabs count as
    # blank too: the removals above routinely leave such lines behind (e.g.
    # several social links separated by spaces), and a plain "\n{3,}" would
    # not see through them. Indentation of the next content line is kept.
    content = re.sub(r'\n(?:[ \t]*\n){2,}', '\n\n', content)

    return content.strip()
def main(data_dir: str = "data", cleaned_dir: str = "data/cleaned") -> None:
    """Clean every Markdown file in *data_dir* and save the results.

    Each regular file in ``data_dir`` whose name ends with ``.md`` is read as
    UTF-8, passed through ``clean_markdown``, and written under the same file
    name into ``cleaned_dir`` (created if missing). Sub-directories are not
    descended into. Progress is reported on standard output.

    Args:
        data_dir: Directory containing the raw ``.md`` files.
        cleaned_dir: Directory that receives the cleaned copies.

    Returns:
        None. If ``data_dir`` does not exist a hint is printed and nothing
        else happens.
    """
    if not os.path.exists(data_dir):
        print(f"Directory {data_dir} does not exist. Run ingest.py first.")
        return

    os.makedirs(cleaned_dir, exist_ok=True)

    # os.listdir() yields entries in arbitrary order; sort so that runs (and
    # their log output) are reproducible.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".md"):
            continue

        filepath = os.path.join(data_dir, filename)
        # A directory may carry a ".md" suffix too; skip it.
        if os.path.isdir(filepath):
            continue

        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        print(f"Cleaning {filename}...")
        cleaned_content = clean_markdown(content)

        cleaned_filepath = os.path.join(cleaned_dir, filename)
        with open(cleaned_filepath, "w", encoding="utf-8") as f:
            f.write(cleaned_content)
        print(f"Saved cleaned version to {cleaned_filepath}")
# Run the cleaning pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|