Shubham170793 committed on
Commit
85242e3
·
verified ·
1 Parent(s): e9faa78

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +118 -63
src/ingestion.py CHANGED
@@ -1,112 +1,167 @@
1
  import re
2
  import fitz # PyMuPDF
 
3
 
4
- # -----------------------------
5
- # TEXT EXTRACTION (Robust)
6
- # -----------------------------
7
  def extract_text_from_pdf(file_path: str) -> str:
8
  """
9
  Extracts and cleans text from a PDF using PyMuPDF.
10
- Handles both textual and scanned PDFs gracefully.
11
 
12
  Args:
13
  file_path (str): Path to the PDF file.
14
  Returns:
15
- str: Combined extracted text.
16
  """
17
  text = ""
18
  try:
19
  with fitz.open(file_path) as pdf:
20
- for page in pdf:
21
  page_text = page.get_text("text").strip()
 
 
22
  if not page_text:
23
- # Fallback: extract raw blocks (helps with weird PDFs)
24
- blocks = pdf.get_text("blocks")
25
  page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
 
 
 
 
 
26
  text += page_text + "\n"
 
27
  except Exception as e:
28
  raise RuntimeError(f"❌ PDF extraction failed: {e}")
29
 
30
- # Clean out any extra whitespace or control characters
31
- text = re.sub(r'\s+', ' ', text).strip()
32
  return text
33
 
34
 
35
- # -----------------------------
36
- # SMART CHUNKING (Step-Aware + Context Aware)
37
- # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
39
  """
40
  Splits text into overlapping, structured chunks.
41
  Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps them intact.
42
  Falls back to sentence-based chunking for normal paragraphs.
43
-
44
- Args:
45
- text (str): Input text.
46
- chunk_size (int): Max characters per chunk (default: 800).
47
- overlap (int): Overlapping characters for continuity (default: 200).
48
-
49
- Returns:
50
- list[str]: Chunked text segments.
51
  """
52
- # Clean and normalize text
 
53
  text = re.sub(r'\s+', ' ', text.strip())
54
 
55
- # Try to detect “Step” patterns
56
  step_splits = re.split(r'(?=(?:Step\s*\d+[:.\s]))', text, flags=re.IGNORECASE)
57
  step_splits = [s.strip() for s in step_splits if s.strip()]
58
 
59
  chunks = []
60
 
61
- # Case 1️⃣: Document has visible “Step” structure
62
  if len(step_splits) > 1:
63
  for step in step_splits:
64
  if len(step) > chunk_size:
65
- # If a step is too long → split by sentences within that step
66
- sentences = re.split(r'(?<=[.!?])\s+', step)
67
- current = ""
68
- for sent in sentences:
69
- if len(current) + len(sent) + 1 <= chunk_size:
70
- current += " " + sent
71
- else:
72
- if current.strip():
73
- chunks.append(current.strip())
74
- overlap_part = current[-overlap:] if overlap > 0 else ""
75
- current = overlap_part + " " + sent
76
- if current.strip():
77
- chunks.append(current.strip())
78
  else:
79
  chunks.append(step.strip())
80
 
81
- # Case 2️⃣: No “Step” keywords fall back to sentence-based chunking
82
  else:
83
- sentences = re.split(r'(?<=[.!?])\s+', text)
84
- current = ""
85
- for sent in sentences:
86
- if len(current) + len(sent) + 1 <= chunk_size:
87
- current += " " + sent
88
- else:
89
- if current.strip():
90
- chunks.append(current.strip())
91
- overlap_part = current[-overlap:] if overlap > 0 else ""
92
- current = overlap_part + " " + sent
93
- if current.strip():
94
- chunks.append(current.strip())
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  return chunks
97
 
98
 
99
- # -----------------------------
100
- # DEBUGGING (Manual Run)
101
- # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  if __name__ == "__main__":
103
- sample_text = """
104
- Step 1: Open the application.
105
- Step 2: Navigate to the dashboard.
106
- Step 3: Review the summary and click ‘Export’.
107
- If the steps are missing, the function should still chunk by sentences.
108
- """
109
- chunks = chunk_text(sample_text, chunk_size=100, overlap=20)
110
- print(f"✅ Chunks created: {len(chunks)}")
111
- for i, c in enumerate(chunks, 1):
112
- print(f"\n--- Chunk {i} ---\n{c}")
 
1
  import re
2
  import fitz # PyMuPDF
3
+ import unicodedata
4
 
5
# ==========================================================
# 1️⃣ TEXT EXTRACTION (Clean + Layout Normalization)
# ==========================================================
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract and clean text from a PDF using PyMuPDF.

    For each page, plain-text extraction is tried first; pages that
    yield nothing fall back to block extraction. Repeating page
    headers/footers are stripped per page, then the combined text is
    run through the shared cleaning pipeline.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Cleaned, normalized text.

    Raises:
        RuntimeError: If the PDF cannot be opened or read.
    """
    pages = []
    try:
        with fitz.open(file_path) as pdf:
            # Plain iteration: the page number was collected before but never used.
            for page in pdf:
                page_text = page.get_text("text").strip()

                # Fallback: handle scanned or weirdly structured pages
                if not page_text:
                    blocks = page.get_text("blocks")
                    page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))

                # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
                page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)

                pages.append(page_text)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # join() instead of repeated += avoids quadratic string building;
    # clean_text() strips leading/trailing whitespace, so the previous
    # trailing "\n" is immaterial.
    text = "\n".join(pages)

    # --- Cleaning pipeline ---
    text = clean_text(text)
    return text
41
 
42
 
43
# ==========================================================
# 2️⃣ ADVANCED CLEANING PIPELINE (SAP / Enterprise PDFs)
# ==========================================================
def clean_text(text: str) -> str:
    """
    Clean noisy extracted PDF text before chunking and embedding.

    Handles TOC artifacts, broken lines, bullets, hyphenated page
    wraps, repeating headers/footers, and special characters.

    Args:
        text (str): Raw extracted text.

    Returns:
        str: Normalized text safe to feed into chunk_text().
    """

    # Normalize Unicode (e.g., weird quotes, ligatures)
    text = unicodedata.normalize("NFKD", text)

    # Remove TOC or numbering noise (e.g., “6.3.1 Prerequisites .............. 53”)
    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)

    # Replace bullet symbols and dots with consistent spacing
    text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")

    # Remove excessive dots and hyphenated page wraps
    text = re.sub(r"\.{3,}", ". ", text)
    text = re.sub(r"-\s*\n", "", text)

    # Remove page headers/footers (common in SAP docs)
    text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)

    # Normalize newlines → paragraph breaks
    text = text.replace("\r", " ")
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Remove leftover special chars / artifacts.
    # FIX: keep '!' and '?' — the sentence splitter in chunk_text splits on
    # (?<=[.!?]), so stripping them here silently broke sentence chunking.
    text = re.sub(r"[^A-Za-z0-9,;:.!?\-\(\)/&\n\s]", "", text)

    # Remove multiple section dots from TOC lines
    text = re.sub(r"(\s*\.\s*){3,}", " ", text)

    # Trim and normalize spacing
    text = text.strip()

    return text
84
+
85
+
86
# ==========================================================
# 3️⃣ SMART CHUNKING (Step-Aware + Sentence Backup)
# ==========================================================
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Split text into overlapping, structured chunks.

    Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps them
    intact; falls back to sentence-based chunking for normal paragraphs.
    Runs of undersized chunks are merged afterwards for semantic
    completeness.

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 1000).
        overlap (int): Overlapping characters carried between adjacent
            chunks for continuity (default: 80).

    Returns:
        list[str]: Chunked text segments.
    """

    # Normalize whitespace first
    text = re.sub(r'\s+', ' ', text.strip())

    # Try to detect “Step” patterns (case-insensitive)
    step_splits = re.split(r'(?=(?:Step\s*\d+[:.\s]))', text, flags=re.IGNORECASE)
    step_splits = [s.strip() for s in step_splits if s.strip()]

    chunks = []

    # Case 1️⃣: “Step” sections present
    if len(step_splits) > 1:
        for step in step_splits:
            if len(step) > chunk_size:
                # Oversized step → split it by sentences within the step
                chunks.extend(_split_by_sentence(step, chunk_size, overlap))
            else:
                chunks.append(step.strip())

    # Case 2️⃣: No “Step” pattern fallback
    else:
        chunks.extend(_split_by_sentence(text, chunk_size, overlap))

    # Merge tiny chunks for semantic completeness.
    # NOTE: the debug print() was removed — a library function should not
    # write to stdout on every call; callers can log len(chunks) themselves.
    chunks = _merge_small_chunks(chunks, min_len=150)
    return chunks
+
 
 
 
 
 
122
 
123
+ # ==========================================================
124
+ # 4️⃣ Helper Functions
125
+ # ==========================================================
126
+ def _split_by_sentence(text, chunk_size=800, overlap=80):
127
+ """Split by sentence punctuation to preserve semantics."""
128
+ sentences = re.split(r'(?<=[.!?])\s+', text)
129
+ chunks, current = [], ""
130
+ for sent in sentences:
131
+ if len(current) + len(sent) + 1 <= chunk_size:
132
+ current += " " + sent
133
+ else:
134
+ if current.strip():
135
+ chunks.append(current.strip())
136
+ overlap_part = current[-overlap:] if overlap > 0 else ""
137
+ current = overlap_part + " " + sent
138
+ if current.strip():
139
+ chunks.append(current.strip())
140
  return chunks
141
 
142
 
143
+ def _merge_small_chunks(chunks, min_len=150):
144
+ """Merge undersized chunks with the next one."""
145
+ merged, buffer = [], ""
146
+ for ch in chunks:
147
+ if len(ch) < min_len:
148
+ buffer += " " + ch
149
+ else:
150
+ if buffer:
151
+ merged.append(buffer.strip())
152
+ buffer = ""
153
+ merged.append(ch.strip())
154
+ if buffer:
155
+ merged.append(buffer.strip())
156
+ return merged
157
+
158
+
159
# ==========================================================
# 5️⃣ DEBUGGING (Manual Run)
# ==========================================================
if __name__ == "__main__":
    # Manual smoke test: extract a local PDF, chunk it, and preview
    # the first few chunks (truncated to 500 characters each).
    source_pdf = "sample.pdf"
    extracted = extract_text_from_pdf(source_pdf)
    preview = chunk_text(extracted, chunk_size=600, overlap=100)
    for index, chunk in enumerate(preview[:5], 1):
        print(f"\n--- Chunk {index} ---\n{chunk[:500]}...\n")