Shubham170793 committed on
Commit
6d87461
·
verified ·
1 Parent(s): df1d611

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +15 -7
src/ingestion.py CHANGED
@@ -93,27 +93,34 @@ def clean_text(text: str) -> str:
93
  # ==========================================================
94
  # 3️⃣ TABLE OF CONTENTS DETECTION
95
  # ==========================================================
 
 
 
96
  def extract_table_of_contents(text: str):
97
  """
98
  Detects Table of Contents (TOC) in PDFs.
 
99
  Returns list of (section_number, section_title).
100
  """
101
  toc_entries = []
102
  lines = text.split("\n")
103
  toc_started = False
104
 
105
- for line in lines:
106
- # Detect start of TOC
107
- if not toc_started and re.search(r"table\s*of\s*contents", line, re.IGNORECASE):
108
- toc_started = True
109
- continue
 
 
 
110
 
111
  if toc_started:
112
- # Stop scanning when we reach main content
113
  if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
114
  break
115
 
116
- # Match TOC patterns like "3.2 Configure Endpoints ........ 13"
117
  match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
118
  if match:
119
  section = match.group(1).strip()
@@ -124,6 +131,7 @@ def extract_table_of_contents(text: str):
124
  return toc_entries
125
 
126
 
 
127
  # ==========================================================
128
  # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
129
  # ==========================================================
 
93
  # ==========================================================
94
  # 3️⃣ TABLE OF CONTENTS DETECTION
95
  # ==========================================================
96
+ # ==========================================================
97
+ # 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
98
+ # ==========================================================
99
  def extract_table_of_contents(text: str):
100
  """
101
  Detects Table of Contents (TOC) in PDFs.
102
+ Supports variants like 'Contents', 'Index', or 'Overview'.
103
  Returns list of (section_number, section_title).
104
  """
105
  toc_entries = []
106
  lines = text.split("\n")
107
  toc_started = False
108
 
109
+ for i, line in enumerate(lines):
110
+ # Detect possible TOC header variants
111
+ if not toc_started and re.search(r"\b(table\s*of\s*contents|contents|index|overview)\b", line, re.IGNORECASE):
112
+ # Confidence check — look ahead a few lines
113
+ next_lines = lines[i + 1 : i + 6]
114
+ if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
115
+ toc_started = True
116
+ continue
117
 
118
  if toc_started:
119
+ # Stop scanning when main content starts (e.g., "Step 1:" or "1. Introduction")
120
  if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
121
  break
122
 
123
+ # Match lines like "3.2 Configure Endpoints ........ 13"
124
  match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
125
  if match:
126
  section = match.group(1).strip()
 
131
  return toc_entries
132
 
133
 
134
+
135
  # ==========================================================
136
  # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
137
  # ==========================================================