Spaces:
Sleeping
Sleeping
File size: 1,732 Bytes
ea9ca44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | import os
from typing import Optional
from .cleaner import clean_text
def preprocess_jobs(raw_dir: str = "data/jobs/raw", out_dir: str = "data/jobs/preprocessed") -> Optional[int]:
    """Clean every ``.txt`` file in `raw_dir` and write the result to `out_dir`.

    Each file whose name ends in ``.txt`` (case-insensitive) is read as UTF-8,
    passed through `clean_text`, and written to `out_dir` under the same
    filename. A file is skipped when a non-empty output with that name already
    exists. Read and write failures are reported and do not stop the batch.

    Output is written to a temporary sibling file and then moved into place,
    so an interrupted or failed write never leaves a truncated, non-empty
    output that a later run would mistake for an already preprocessed file.

    Args:
        raw_dir: Directory containing the raw ``.txt`` job files.
        out_dir: Directory receiving the cleaned files; created if missing.

    Returns:
        ``0`` if `raw_dir` does not exist; otherwise the number of matching
        ``.txt`` files found in `raw_dir` (this count includes files that were
        skipped or that failed to read/write); ``None`` if an unexpected error
        aborted the run.
    """
    try:
        if not os.path.isdir(raw_dir):
            print(f"⚠️ Raw jobs dir does not exist: {raw_dir}")
            return 0

        os.makedirs(out_dir, exist_ok=True)

        # Sorted so that processing (and log) order is deterministic.
        files = sorted(f for f in os.listdir(raw_dir) if f.lower().endswith(".txt"))

        for fname in files:
            src_path = os.path.join(raw_dir, fname)
            dst_path = os.path.join(out_dir, fname)

            # Skip if already preprocessed; an empty output is redone.
            if os.path.exists(dst_path) and os.path.getsize(dst_path) > 0:
                print(f"Skipping already preprocessed file: {dst_path}")
                continue

            # Per-file handling stays deliberately broad: one bad file must
            # not abort the rest of the batch.
            try:
                with open(src_path, "r", encoding="utf-8") as rf:
                    text = rf.read()
            except Exception as e:
                print(f"⚠️ Failed to read {src_path}: {e}")
                continue

            cleaned = clean_text(text)

            # Write to a temp file first, then publish it in one step.
            tmp_path = dst_path + ".tmp"
            try:
                with open(tmp_path, "w", encoding="utf-8") as wf:
                    wf.write(cleaned)
                os.replace(tmp_path, dst_path)
            except Exception as e:
                print(f"⚠️ Failed to write {dst_path}: {e}")
                # Best-effort cleanup of the partial temp file; it may not
                # exist if open() itself failed.
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
                continue

            print(f"Preprocessed job file: {src_path} -> {dst_path}")

        return len(files)
    except Exception as e:
        print(f"⚠️ preprocess_jobs failed: {e}")
        return None
|