Spaces:

jayyd
/

financial_qa_rag

Sleeping

App Files Files Community

jayyd committed on Aug 24, 2025

Commit

d4ab4e4

verified ·

1 Parent(s): d113e1a

Update utils/data_preprocessing.py

Browse files

Files changed (1) hide show

utils/data_preprocessing.py +68 -0

utils/data_preprocessing.py CHANGED Viewed

	@@ -1 +1,69 @@
1

+import os
+import pdfplumber
+import re
+from pathlib import Path
+from utils.chunking import smart_chunk_text
# Directory layout used by this module; all paths are relative to the current
# working directory.
RAW_DIR = "data/raw"              # scanned for input PDF files
PROCESSED_DIR = "data/processed"  # receives one cleaned .txt per PDF
CHUNKS_DIR = "data/chunks"        # receives one *_chunks.txt per PDF
# Make sure both output directories exist before anything is written to them.
os.makedirs(CHUNKS_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)
def extract_text_from_pdf(pdf_path) -> str:
    """Return the concatenated text of every non-empty page of a PDF.

    Args:
        pdf_path: Location of the PDF file, passed straight to
            ``pdfplumber.open``.

    Returns:
        The text of each page that yielded any text, each followed by a
        newline, in page order. An empty string if no page yielded text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Build the result with a single join instead of repeated "+=", and
        # consume the generator while the PDF is still open. Pages whose
        # extraction result is falsy (no text) are skipped.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
def clean_text(text: str) -> str:
    """Clean raw extracted text and drop lines that carry no prose.

    Steps, in order:
      1. Remove header/footer fragments: everything from "Allstate"
         (case-insensitive) to the end of its line, and "Page N of M" markers.
      2. Re-join numbers split around a comma ("57 , 094" -> "57,094").
      3. Re-join single characters separated by one space
         ("T o t a l" -> "Total").
      4. Collapse every run of whitespace inside a line to a single space.
      5. Drop lines of five characters or fewer and lines consisting only of
         digits, commas, periods and spaces.

    Args:
        text: Raw text, for example the output of a PDF text extractor.

    Returns:
        The surviving lines joined by newlines, with no leading or trailing
        whitespace. Empty string if nothing survives.
    """
    # Remove common headers/footers.
    # NOTE(review): the first pattern deletes from "Allstate" to the end of
    # the line wherever the word occurs, not only in page headers - confirm
    # that this is intended.
    text = re.sub(r'Allstate.*?\n', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Page \d+ of \d+', '', text)
    # Fix broken numbers: "57 , 094" -> "57,094". Only horizontal whitespace
    # is consumed so that two separate lines are never glued together.
    text = re.sub(r'(\d)[^\S\r\n]*,[^\S\r\n]*(\d)', r'\1,\2', text)
    # Fix broken words like "T o t a l" -> "Total" (only isolated characters).
    text = re.sub(r'(?<=\b\w) (?=\w\b)', '', text)

    kept = []
    for line in text.splitlines():
        # Normalize whitespace per line. Collapsing it across the whole text
        # would also swallow the newlines and defeat the line filter below.
        line = re.sub(r'\s+', ' ', line).strip()
        # Too short to be meaningful; this also covers bare years like "2023".
        if len(line) <= 5:
            continue
        # Only numbers and number punctuation (e.g. a stray table row).
        if re.fullmatch(r"[\d,\. ]+", line):
            continue
        kept.append(line)
    return "\n".join(kept)
# Process all PDFs in RAW_DIR: extract and clean the text, save the cleaned
# text to PROCESSED_DIR, then chunk it and save the chunks to CHUNKS_DIR.
# This still runs at import time, as it did before.
# Sorted so that the processing order does not depend on the file system.
for fname in sorted(os.listdir(RAW_DIR)):
    stem, ext = os.path.splitext(fname)
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if ext.lower() != ".pdf":
        continue
    raw_text = extract_text_from_pdf(os.path.join(RAW_DIR, fname))
    clean = clean_text(raw_text)
    # Save cleaned text. Output names are built from the stem rather than
    # str.replace, which would also rewrite a ".pdf" inside the file name.
    with open(os.path.join(PROCESSED_DIR, stem + ".txt"), "w", encoding="utf-8") as f:
        f.write(clean)
    # Chunk and save, with a "---" separator line after every chunk.
    chunks = smart_chunk_text([clean], chunk_size=300, overlap=50)
    with open(os.path.join(CHUNKS_DIR, stem + "_chunks.txt"), "w", encoding="utf-8") as f:
        f.writelines(chunk + "\n---\n" for chunk in chunks)