Create data/scripts/cleaner.py
Browse files- data/scripts/cleaner.py +53 -0
data/scripts/cleaner.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pymupdf4llm
|
| 3 |
+
import pathlib
|
| 4 |
+
import hashlib
|
| 5 |
+
|
| 6 |
+
class SovereignCleaner:
    """Convert raw PDFs into Markdown files stamped with a source hash.

    Every PDF found in ``raw_dir`` is hashed with SHA-256, converted to
    Markdown via ``pymupdf4llm.to_markdown`` and written to ``clean_dir``
    under the same base name with a ``.md`` extension.  The first line of
    each output file is ``--- SOURCE_HASH: <hex digest> ---`` so the text can
    be traced back to the exact input bytes (the "GOEC Audit Trail").
    """

    def __init__(self, raw_dir="data/raw/", clean_dir="data/processed/texts/"):
        """Store the input/output directories and create the output one.

        Args:
            raw_dir: Directory that is scanned for PDF files.
            clean_dir: Directory that receives the generated ``.md`` files;
                created (including parents) when it does not exist yet.
        """
        self.raw_dir = raw_dir
        self.clean_dir = clean_dir
        # exist_ok=True replaces the racy "check, then create" sequence.
        os.makedirs(self.clean_dir, exist_ok=True)

    def _get_file_hash(self, filepath, chunk_size=65536):
        """Return the SHA-256 hex digest of the file at *filepath*.

        The file is read in blocks of *chunk_size* bytes so large PDFs are
        never loaded into memory at once.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for byte_block in iter(lambda: f.read(chunk_size), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    @staticmethod
    def _output_name(filename):
        """Return *filename* with a trailing ``.pdf`` swapped for ``.md``.

        Only the final suffix is touched (case-insensitively); any other
        ``.pdf`` inside the name is preserved.
        """
        if filename[-4:].lower() == ".pdf":
            filename = filename[:-4]
        return filename + ".md"

    def clean_all(self):
        """Convert every PDF in ``raw_dir`` to a hash-stamped Markdown file.

        Files are handled in sorted order so runs are reproducible.  A failure
        on one document (unreadable file, conversion error, write error) is
        reported on stdout and does not stop the remaining documents.

        Raises:
            OSError: If ``raw_dir`` itself cannot be listed.
        """
        files = sorted(
            name
            for name in os.listdir(self.raw_dir)
            if name[-4:].lower() == ".pdf"
        )
        print(f"Cleaning {len(files)} documents for ARAVALLI-1...")

        for file in files:
            raw_path = os.path.join(self.raw_dir, file)
            try:
                # Hash inside the try so one unreadable entry cannot abort
                # the whole batch.
                file_hash = self._get_file_hash(raw_path)

                # Markdown extraction (intended to keep tables/headings).
                md_text = pymupdf4llm.to_markdown(raw_path)

                # Metadata line that ties the text to its source bytes.
                header = f"--- SOURCE_HASH: {file_hash} ---\n"
                clean_path = os.path.join(
                    self.clean_dir, self._output_name(file)
                )
                with open(clean_path, "w", encoding="utf-8") as f:
                    f.write(header + md_text)
            except Exception as e:
                # Per-document boundary: report and continue with the rest.
                print(f"Failed to clean {file}: {e}")
            else:
                print(f"Verified & Cleaned: {file}")
|
| 50 |
+
|
| 51 |
+
# Script entry point: build a cleaner with its default directories and
# convert every PDF it finds.
if __name__ == "__main__":
    SovereignCleaner().clean_all()
|