Shubham170793 committed on
Commit
e11a9ad
·
verified ·
1 Parent(s): 2315af4

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +67 -13
src/ingestion.py CHANGED
@@ -1,6 +1,7 @@
1
  import re
2
  import fitz # PyMuPDF
3
  import unicodedata
 
4
 
5
  # ==========================================================
6
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
@@ -9,7 +10,7 @@ def extract_text_from_pdf(file_path: str):
9
  """
10
  Extracts and cleans text from a PDF using PyMuPDF.
11
  Handles layout artifacts, numbered sections, and TOC.
12
- Returns both clean text and detected TOC (if any).
13
  """
14
  text = ""
15
  try:
@@ -17,7 +18,7 @@ def extract_text_from_pdf(file_path: str):
17
  for page_num, page in enumerate(pdf, start=1):
18
  page_text = page.get_text("text").strip()
19
 
20
- # Fallback: for scanned/weird layouts
21
  if not page_text:
22
  blocks = page.get_text("blocks")
23
  page_text = " ".join(
@@ -47,14 +48,11 @@ def extract_text_from_pdf(file_path: str):
47
  # --- Cleaning pipeline ---
48
  text = clean_text(text)
49
 
50
- # --- TOC extraction ---
51
- toc = extract_table_of_contents(text)
52
- if toc:
53
- print(f"📘 TOC detected with {len(toc)} entries.")
54
- else:
55
- print("⚠️ No Table of Contents detected.")
56
 
57
- return text, toc
58
 
59
 
60
  # ==========================================================
@@ -91,7 +89,7 @@ def clean_text(text: str) -> str:
91
 
92
 
93
  # ==========================================================
94
- # 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
95
  # ==========================================================
96
  def extract_table_of_contents(text: str):
97
  """
@@ -107,14 +105,14 @@ def extract_table_of_contents(text: str):
107
  line_count = len(lines)
108
 
109
  for i, line in enumerate(lines):
110
- # --- Step 1️⃣: Detect possible TOC header variants ---
111
  if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
112
  next_lines = lines[i + 1 : i + 8]
113
  if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
114
  toc_started = True
115
  continue
116
 
117
- # --- Step 2️⃣: Smart fallback — detect implicit TOC without header ---
118
  if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
119
  numbered_lines = 0
120
  for j in range(i, min(i + 5, line_count)):
@@ -152,6 +150,62 @@ def extract_table_of_contents(text: str):
152
  return deduped
153
 
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # ==========================================================
156
  # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
157
  # ==========================================================
@@ -251,7 +305,7 @@ def _merge_small_chunks(chunks, min_len=150):
251
  # ==========================================================
252
  if __name__ == "__main__":
253
  pdf_path = "sample.pdf"
254
- text, toc = extract_text_from_pdf(pdf_path)
255
  print("\n📚 TOC Preview:", toc[:5])
256
  chunks = chunk_text(text)
257
  print(f"\n✅ {len(chunks)} chunks created.")
 
1
  import re
2
  import fitz # PyMuPDF
3
  import unicodedata
4
+ from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
5
 
6
  # ==========================================================
7
  # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 
10
  """
11
  Extracts and cleans text from a PDF using PyMuPDF.
12
  Handles layout artifacts, numbered sections, and TOC.
13
+ Returns clean text + TOC list + source label.
14
  """
15
  text = ""
16
  try:
 
18
  for page_num, page in enumerate(pdf, start=1):
19
  page_text = page.get_text("text").strip()
20
 
21
+ # Fallback: for scanned or weird layouts
22
  if not page_text:
23
  blocks = page.get_text("blocks")
24
  page_text = " ".join(
 
48
  # --- Cleaning pipeline ---
49
  text = clean_text(text)
50
 
51
+ # --- TOC extraction (Hybrid) ---
52
+ toc, toc_source = get_hybrid_toc(text)
53
+ print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
 
 
 
54
 
55
+ return text, toc, toc_source
56
 
57
 
58
  # ==========================================================
 
89
 
90
 
91
  # ==========================================================
92
+ # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
93
  # ==========================================================
94
  def extract_table_of_contents(text: str):
95
  """
 
105
  line_count = len(lines)
106
 
107
  for i, line in enumerate(lines):
108
+ # --- Step 1️⃣: Detect TOC header variants ---
109
  if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
110
  next_lines = lines[i + 1 : i + 8]
111
  if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
112
  toc_started = True
113
  continue
114
 
115
+ # --- Step 2️⃣: Smart fallback — detect implicit TOC ---
116
  if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
117
  numbered_lines = 0
118
  for j in range(i, min(i + 5, line_count)):
 
150
  return deduped
151
 
152
 
153
+ # ==========================================================
154
+ # 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred)
155
+ # ==========================================================
156
def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
    """
    Uses an LLM to infer a Table of Contents from the document text.
    Called only when no TOC is found via regex parsing.

    Args:
        text: Document text; only the first ``max_chars`` characters are
            placed in the prompt.
        model: Model name handed to ``ChatOpenAI``.
        max_chars: Maximum number of characters of ``text`` sent to the model.

    Returns:
        A list of ``(number, title)`` tuples where ``number`` is a string
        counting consecutively from "1". Returns an empty list when the
        client cannot be created, the request fails, or no usable title
        (longer than 3 characters) is found in the reply.
    """
    snippet = text[:max_chars]
    prompt = f"""
    You are a document structure analyzer.
    Read the following text and infer its main section titles.
    Output a clean, numbered list (1., 2., 3.) with 5–10 entries max.

    TEXT SAMPLE:
    {snippet}
    """
    try:
        # Build the client inside the guard so configuration/credential
        # errors degrade to "no TOC" exactly like a failed request does.
        llm = ChatOpenAI(model=model, temperature=0)
        response = llm.invoke(prompt)

        titles = []
        for raw_line in response.content.splitlines():
            # Strip leading list markers (digits, dots, dashes, bullets) and
            # whitespace. Note: `\s` must be a single-backslash escape in a
            # raw string; `\\s` would match a literal backslash and the
            # letter "s", eating the first letter of titles such as "summary".
            title = re.sub(r"^[0-9.\-•\s]+", "", raw_line.strip())
            if len(title) > 3:
                titles.append(title)

        # Number only the titles that survived filtering so the sequence has
        # no gaps when short/noise lines were dropped.
        return [(str(position), title) for position, title in enumerate(titles, start=1)]
    except Exception as e:
        print(f"⚠️ AI TOC fallback failed: {e}")
        return []
183
+
184
+
185
+ # ==========================================================
186
+ # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
187
+ # ==========================================================
188
def get_hybrid_toc(text: str):
    """
    Resolve a Table of Contents for ``text`` using two strategies in order.

    The heuristic parser (``extract_table_of_contents``) runs first. Only
    when it yields nothing is the LLM-based ``adaptive_fallback_toc`` tried.
    A status line is printed for whichever path is taken.

    Returns:
        A ``(toc_entries, source_label)`` tuple where ``source_label`` is
        "heuristic", "ai_inferred", or "none" (with an empty list).
    """
    heuristic_entries = extract_table_of_contents(text)
    if not heuristic_entries:
        # Heuristic parsing found nothing: fall back to the LLM.
        print("⚠️ No TOC detected — invoking adaptive AI fallback...")
        inferred_entries = adaptive_fallback_toc(text)
        if not inferred_entries:
            print("❌ No TOC could be detected or inferred.")
            return [], "none"

        print(f"✨ AI-inferred TOC generated with {len(inferred_entries)} entries.")
        return inferred_entries, "ai_inferred"

    print(f"📘 TOC detected with {len(heuristic_entries)} entries (heuristic).")
    return heuristic_entries, "heuristic"
207
+
208
+
209
  # ==========================================================
210
  # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
211
  # ==========================================================
 
305
  # ==========================================================
306
  if __name__ == "__main__":
307
  pdf_path = "sample.pdf"
308
+ text, toc, source = extract_text_from_pdf(pdf_path)
309
  print("\n📚 TOC Preview:", toc[:5])
310
  chunks = chunk_text(text)
311
  print(f"\n✅ {len(chunks)} chunks created.")