Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
iamkoder001 committed on
Commit
5648af1
·
verified ·
1 Parent(s): c7c6bc0

Create data/scripts/cleaner.py

Browse files
Files changed (1) hide show
  1. data/scripts/cleaner.py +53 -0
data/scripts/cleaner.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pymupdf4llm
3
+ import pathlib
4
+ import hashlib
5
+
6
class SovereignCleaner:
    """Convert raw PDFs into Markdown text files stamped with a source hash.

    Every PDF found directly inside ``raw_dir`` is converted to Markdown with
    ``pymupdf4llm.to_markdown`` and written to ``clean_dir`` under the same
    base name with a ``.md`` extension. The first line of each output file is
    a ``SOURCE_HASH`` header carrying the SHA-256 digest of the original PDF
    bytes, so the text can be traced back to the exact input file.
    """

    def __init__(self, raw_dir="data/raw/", clean_dir="data/processed/texts/"):
        """Remember the input/output directories and create the output one.

        Args:
            raw_dir: Directory scanned (non-recursively) for PDF files.
            clean_dir: Directory that receives the generated ``.md`` files;
                created, including parents, if it does not exist yet.
        """
        self.raw_dir = raw_dir
        self.clean_dir = clean_dir
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.clean_dir, exist_ok=True)

    def _get_file_hash(self, filepath):
        """Return the hex SHA-256 digest of the file at ``filepath``.

        The file is read in 4096-byte blocks so large PDFs are never loaded
        into memory in one piece.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    @staticmethod
    def _clean_name(filename):
        """Return ``filename`` with its trailing ``.pdf`` swapped for ``.md``.

        Only the final extension is touched (case-insensitively); a ``.pdf``
        appearing earlier in the name is left alone. A name without a
        ``.pdf`` extension simply gets ``.md`` appended.
        """
        if filename.lower().endswith(".pdf"):
            filename = filename[: -len(".pdf")]
        return filename + ".md"

    def clean_all(self):
        """Convert every PDF in ``raw_dir`` and write the results to ``clean_dir``.

        Files are processed in sorted name order. A failure on one document
        (unreadable file, extraction error, write error) is reported on
        stdout and does not stop the remaining documents.

        Note: extension matching is case-insensitive, so on a case-sensitive
        filesystem ``a.pdf`` and ``a.PDF`` would both map to ``a.md``.
        """
        # Keep regular files only: a directory that happens to be named
        # "*.pdf" cannot be hashed or parsed.
        files = sorted(
            name
            for name in os.listdir(self.raw_dir)
            if name.lower().endswith(".pdf")
            and os.path.isfile(os.path.join(self.raw_dir, name))
        )
        print(f"Cleaning {len(files)} documents for ARAVALLI-1...")

        for file in files:
            raw_path = os.path.join(self.raw_dir, file)

            # Best-effort batch: hashing lives inside the try as well, so one
            # unreadable file is logged instead of aborting the whole run.
            try:
                file_hash = self._get_file_hash(raw_path)

                # Use PyMuPDF4LLM for Markdown extraction (keeps tables/headings)
                md_text = pymupdf4llm.to_markdown(raw_path)

                # Metadata injection for the model's context
                header = f"--- SOURCE_HASH: {file_hash} ---\n"
                final_text = header + md_text

                clean_path = os.path.join(self.clean_dir, self._clean_name(file))

                with open(clean_path, "w", encoding="utf-8") as f:
                    f.write(final_text)
                print(f"Verified & Cleaned: {file}")
            except Exception as e:
                print(f"Failed to clean {file}: {e}")
50
+
51
# Script entry point: clean every raw PDF using the default directories.
if __name__ == "__main__":
    SovereignCleaner().clean_all()