jayyd commited on
Commit
d4ab4e4
·
verified ·
1 Parent(s): d113e1a

Update utils/data_preprocessing.py

Browse files
Files changed (1) hide show
  1. utils/data_preprocessing.py +68 -0
utils/data_preprocessing.py CHANGED
@@ -1 +1,69 @@
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ import os
3
+ import pdfplumber
4
+ import re
5
+ from pathlib import Path
6
+ from utils.chunking import smart_chunk_text
7
+
8
# Directory layout: raw PDFs come in, cleaned text and chunk files go out.
RAW_DIR = "data/raw"
PROCESSED_DIR = "data/processed"
CHUNKS_DIR = "data/chunks"

# Ensure both output directories exist before any file is written.
for _output_dir in (CHUNKS_DIR, PROCESSED_DIR):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)
14
+
15
def extract_text_from_pdf(pdf_path):
    """Extract the concatenated text of all pages of a PDF.

    Pages with no extractable text (e.g. image-only pages) are skipped.
    Each extracted page is terminated with a newline.

    Args:
        pdf_path: path to the PDF file to read.

    Returns:
        The extracted text as a single string ("" if no page had text).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Collect page texts and join once: the original `text += ...` loop
        # builds the string quadratically on large documents.
        pages = []
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # skip empty pages
                pages.append(page_text + "\n")
        return "".join(pages)
23
+
24
+
25
def clean_text(text: str) -> str:
    """Clean raw PDF-extracted text for downstream chunking.

    Removes boilerplate headers/footers, repairs common PDF extraction
    artifacts (numbers split around commas, letter-spaced words), normalizes
    whitespace, and drops lines that carry no prose content.

    Args:
        text: raw text as returned by the PDF extractor.

    Returns:
        The cleaned text, lines joined with "\n" and outer whitespace stripped.
    """
    # Remove common headers/footers.
    text = re.sub(r'Allstate.*?\n', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Page \d+ of \d+', '', text)

    # Fix broken numbers: "57 , 094" -> "57,094"
    text = re.sub(r'(\d)\s*,\s*(\d)', r'\1,\2', text)

    # Fix broken words like "T o t a l" -> "Total" (only when letters are isolated)
    text = re.sub(r'(?<=\b\w) (?=\w\b)', '', text)

    # Normalize whitespace. BUG FIX: the original used r'\s+' here, which
    # also swallowed newlines — the subsequent '\n+' sub became a no-op and
    # splitlines() below only ever saw one giant line, so the stray-line
    # filtering never ran. Collapse only spaces/tabs, then squeeze newlines.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)

    # Remove stray lines: pure digits, year-only, or too short.
    lines = []
    for line in text.splitlines():
        line = line.strip()
        if len(line) <= 5:
            continue
        if re.fullmatch(r"\d{4}", line):  # year like 2023
            continue
        if re.fullmatch(r"[\d,\. ]+", line):  # only numbers
            continue
        lines.append(line)

    return "\n".join(lines).strip()
53
+
54
+
55
# Process every PDF found in the raw-data directory: extract, clean,
# persist the cleaned text, then chunk and persist the chunks.
for pdf_name in os.listdir(RAW_DIR):
    if not pdf_name.endswith(".pdf"):
        continue

    raw_text = extract_text_from_pdf(os.path.join(RAW_DIR, pdf_name))
    cleaned = clean_text(raw_text)

    # Save the cleaned full text alongside the raw data.
    txt_name = pdf_name.replace(".pdf", ".txt")
    with open(os.path.join(PROCESSED_DIR, txt_name), "w", encoding="utf-8") as out:
        out.write(cleaned)

    # Chunk the cleaned text and save the chunks, separated by '---' markers.
    chunks = smart_chunk_text([cleaned], chunk_size=300, overlap=50)
    chunks_name = pdf_name.replace(".pdf", "_chunks.txt")
    with open(os.path.join(CHUNKS_DIR, chunks_name), "w", encoding="utf-8") as out:
        out.writelines(chunk + "\n---\n" for chunk in chunks)