File size: 2,197 Bytes
4b55bd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import re

def clean_markdown(content: str) -> str:
    """
    Strip navbar, footer, and other boilerplate from scraped Markdown text.

    Args:
        content: The raw Markdown text to clean.

    Returns:
        The cleaned text, with runs of three or more newlines collapsed to a
        single blank line and leading/trailing whitespace stripped.
    """
    # Top navbar: everything from the linked "Locus" logo image through the
    # "[Sponsors](...)" link. DOTALL lets the match span multiple lines.
    content = re.sub(r'\[!\[Locus.*?\]\(.*?\)\].*?\[Sponsors\]\(.*?\)', '', content, flags=re.DOTALL)

    # Lines consisting solely of the word "More".
    content = re.sub(r'^More$', '', content, flags=re.MULTILINE)

    # Footer boilerplate: from "[Prospectus]" through the copyright notice,
    # plus everything after it (the trailing `.*` with DOTALL runs to the end
    # of the text).
    content = re.sub(r'\[Prospectus\].*?©COPYRIGHT.*', '', content, flags=re.DOTALL | re.IGNORECASE)
    # Credit line and everything following it.
    content = re.sub(r'MADE WITH ❤ BY LOST.*', '', content, flags=re.DOTALL | re.IGNORECASE)

    # Social media links that have an empty link label.
    content = re.sub(r'\[\]\(https://(www\.)?(instagram|facebook|linkedin|twitter)\.com/.*?\)', '', content)

    # "Read More" links. This must be case-insensitive like the bare-text
    # pass below; otherwise "[read more](url)" would lose only its label and
    # leave a dangling "[](url)" behind.
    content = re.sub(r'\[Read More\]\(.*?\)', '', content, flags=re.IGNORECASE)
    # Any remaining bare "Read More" text.
    content = re.sub(r'Read More', '', content, flags=re.IGNORECASE)

    # The removals above leave gaps; collapse them to a single blank line.
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()

def main():
    """
    Clean every ".md" file found directly in "data" and write the result,
    under the same file name, into "data/cleaned".

    Prints a message and returns early when the "data" directory is missing.
    """
    data_dir = "data"
    cleaned_dir = "data/cleaned"

    # Nothing to clean if the ingest step has not produced its output yet.
    if not os.path.exists(data_dir):
        print(f"Directory {data_dir} does not exist. Run ingest.py first.")
        return

    os.makedirs(cleaned_dir, exist_ok=True)

    for filename in os.listdir(data_dir):
        # Only Markdown files are processed.
        if not filename.endswith(".md"):
            continue

        source_path = os.path.join(data_dir, filename)
        # A directory whose name happens to end in ".md" is ignored.
        if os.path.isdir(source_path):
            continue

        with open(source_path, "r", encoding="utf-8") as src:
            raw_text = src.read()

        print(f"Cleaning {filename}...")
        cleaned_text = clean_markdown(raw_text)

        cleaned_filepath = os.path.join(cleaned_dir, filename)
        with open(cleaned_filepath, "w", encoding="utf-8") as dst:
            dst.write(cleaned_text)
        print(f"Saved cleaned version to {cleaned_filepath}")

# Run the cleaning pass only when this file is executed as a script, so that
# importing the module (e.g. to reuse clean_markdown) has no side effects.
if __name__ == "__main__":
    main()