iamkoder001
/

ARAVALLI-1

ecological-intelligence

environmental-protection

Model card Files Files and versions

iamkoder001 committed 5 days ago

Commit

5648af1

·

verified ·

1 Parent(s): c7c6bc0

Create data/scripts/cleaner.py

Files changed (1) hide show

data/scripts/cleaner.py +53 -0

data/scripts/cleaner.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import os
+import pymupdf4llm
+import pathlib
+import hashlib
class SovereignCleaner:
    """Convert raw PDFs into Markdown text files tagged with a source hash.

    Each ``.pdf`` file found in ``raw_dir`` is hashed with SHA-256, converted
    to Markdown via ``pymupdf4llm.to_markdown``, and written to ``clean_dir``
    as ``<name>.md``. The first line of every output file is a
    ``--- SOURCE_HASH: <hex digest> ---`` header so the text can be traced
    back to the exact bytes it was extracted from (the original author refers
    to this as the "GOEC Audit Trail").
    """

    def __init__(self, raw_dir="data/raw/", clean_dir="data/processed/texts/"):
        """Store the input/output directories and create the output one.

        Args:
            raw_dir: Directory that is scanned for ``.pdf`` files.
            clean_dir: Directory that receives the generated ``.md`` files;
                created (including parents) if it does not exist yet.
        """
        self.raw_dir = raw_dir
        self.clean_dir = clean_dir
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.clean_dir, exist_ok=True)

    def _get_file_hash(self, filepath):
        """Return the SHA-256 hex digest of the file at ``filepath``.

        The file is read in fixed-size chunks so large PDFs are never loaded
        into memory in one piece.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    @staticmethod
    def _markdown_name(filename):
        """Return ``filename`` with its trailing ``.pdf`` swapped for ``.md``.

        Only the final suffix is touched, so ``"a.pdf.v2.pdf"`` maps to
        ``"a.pdf.v2.md"`` rather than having every ``.pdf`` occurrence
        rewritten.
        """
        suffix = ".pdf"
        stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename
        return stem + ".md"

    def clean_all(self):
        """Convert every ``.pdf`` entry in ``raw_dir`` to a hashed Markdown file.

        Entries are processed in sorted name order so runs are reproducible.
        Processing is best-effort per file: any failure while hashing,
        converting, or writing one document is reported on stdout and the
        loop moves on to the next document.

        Raises:
            OSError: If ``raw_dir`` itself cannot be listed.
        """
        # os.listdir() order is arbitrary; sort for a stable processing order.
        files = sorted(f for f in os.listdir(self.raw_dir) if f.endswith(".pdf"))
        print(f"Cleaning {len(files)} documents for ARAVALLI-1...")
        for file in files:
            raw_path = os.path.join(self.raw_dir, file)
            # Per-file boundary: one unreadable or malformed PDF must not
            # abort the rest of the batch, so hashing sits inside the try too.
            try:
                file_hash = self._get_file_hash(raw_path)
                # Use PyMuPDF4LLM for Markdown extraction (keeps tables/headings)
                md_text = pymupdf4llm.to_markdown(raw_path)
                # Metadata injection for the model's context
                header = f"--- SOURCE_HASH: {file_hash} ---\n"
                clean_path = os.path.join(self.clean_dir, self._markdown_name(file))
                with open(clean_path, "w", encoding="utf-8") as f:
                    f.write(header + md_text)
                print(f"Verified & Cleaned: {file}")
            except Exception as e:
                print(f"Failed to clean {file}: {e}")
if __name__ == "__main__":
    # Script entry point: build a cleaner with the default directories
    # and run the full conversion pass.
    SovereignCleaner().clean_all()