File size: 11,976 Bytes
35646e4 275cb5c 85242e3 a537c9b d36c8e6 275cb5c 29e4ac0 85242e3 df1d611 85242e3 df1d611 a537c9b 275cb5c 6b0c8b8 85242e3 6b0c8b8 a537c9b d36c8e6 6b0c8b8 85242e3 b61a150 a537c9b 32f64de a537c9b 6b0c8b8 85242e3 6b0c8b8 a537c9b 85242e3 a537c9b e11a9ad df1d611 e11a9ad 275cb5c 29e4ac0 85242e3 5c1a3d7 85242e3 5c1a3d7 a537c9b 5c1a3d7 a537c9b 5c1a3d7 85242e3 5c1a3d7 85242e3 c0ebdcb 5c1a3d7 c0ebdcb b61a150 85242e3 5c1a3d7 6d87461 e11a9ad 6d87461 df1d611 499bde3 df1d611 6d87461 499bde3 29e4ac0 6d87461 df1d611 499bde3 2f812ee 499bde3 85242e3 499bde3 df1d611 499bde3 df1d611 a537c9b 499bde3 df1d611 6d87461 e11a9ad 8afec0a e11a9ad 8afec0a 29e4ac0 0dc8e87 cb020cf 0dc8e87 29e4ac0 cb020cf 0dc8e87 cb020cf 0dc8e87 e11a9ad 8afec0a cb020cf 29e4ac0 ee4a18f a537c9b 8afec0a a537c9b ee4a18f 8afec0a cb020cf e11a9ad 8afec0a e11a9ad ee4a18f e11a9ad 8afec0a e11a9ad a537c9b e11a9ad 8afec0a e11a9ad 29e4ac0 e11a9ad a537c9b e11a9ad df1d611 00eb202 df1d611 b61a150 f2fb7ac 35646e4 29e4ac0 68c8d89 00eb202 29e4ac0 00eb202 68c8d89 f2fb7ac 68c8d89 f2fb7ac 85242e3 00eb202 f2fb7ac 85242e3 35646e4 d36c8e6 35646e4 85242e3 a537c9b 85242e3 35646e4 a537c9b df1d611 b61a150 df1d611 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 |
import re
import fitz # PyMuPDF
import unicodedata
import os
import json
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
# ==========================================================
# 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
# ==========================================================
def extract_text_from_pdf(file_path: str):
    """
    Extract and clean text from a PDF using PyMuPDF.

    Handles layout artifacts, numbered sections, and TOC detection.
    Falls back to block-level extraction on pages where plain-text
    extraction yields nothing (scanned/odd layouts).

    Args:
        file_path: Path to the PDF file.

    Returns:
        Tuple of (clean_text, toc_entries, toc_source_label).

    Raises:
        RuntimeError: If the PDF cannot be opened or read.
    """
    pages = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                # Fallback for scanned/weird layouts: stitch text blocks together.
                if not page_text:
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                # Clean structural noise (bullets, section numbers, page footers,
                # PUBLIC/Confidential/copyright boilerplate).
                page_text = page_text.replace("• ", "\n• ")
                page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                pages.append(page_text)
    except Exception as e:
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e
    # Join once instead of repeated `text +=` (avoids O(n^2) string growth).
    text = "".join(p + "\n" for p in pages)
    # --- Cleaning pipeline ---
    text = clean_text(text)
    # --- TOC extraction (Hybrid) ---
    toc, toc_source = get_hybrid_toc(text)
    print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
    return text, toc, toc_source
# ==========================================================
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
# ==========================================================
def clean_text(text: str) -> str:
    """Clean noisy PDF text while preserving Unicode (Hindi, multilingual).

    Normalizes the text (NFKD), strips TOC artifacts, boilerplate
    (PUBLIC / Confidential / copyright lines), hyphenated line breaks,
    and any character outside the allowed word/punctuation/Devanagari set.

    Args:
        text: Raw extracted PDF text.

    Returns:
        Cleaned, whitespace-collapsed text.
    """
    # NOTE: `re` and `unicodedata` come from the module-level imports;
    # the previous duplicate function-local imports were removed.
    # Normalize to handle combined Devanagari characters properly.
    text = unicodedata.normalize("NFKD", text)
    # Remove common TOC-like artifacts (page dots, numbering, etc.)
    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
    # Normalize bullets, dots, and spacing
    text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
    text = re.sub(r"\.{3,}", ". ", text)
    # Re-join words hyphenated across line breaks.
    text = re.sub(r"-\s*\n", "", text)
    text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
    # Copyright footer lines.
    text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
    text = text.replace("\r", " ")
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Preserve Hindi (Devanagari Unicode \u0900–\u097F): keep word chars,
    # whitespace, Devanagari, and basic punctuation; drop everything else.
    text = re.sub(r"[^\w\s\u0900-\u097F,;:.\-\(\)/&]", "", text, flags=re.UNICODE)
    # Collapse runs of repeated dots/spaces left by the removals above.
    text = re.sub(r"(\s*\.\s*){3,}", " ", text)
    return text.strip()
# ==========================================================
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
# ==========================================================
def extract_table_of_contents(text: str):
    """Heuristically pull a table of contents out of raw document text.

    Capture begins either after an explicit "Contents"-style heading that
    is followed by numbered entries, or when a run of at least three
    consecutive numbered lines is seen. Capture stops at the first line
    that looks like body content (e.g. "Step 1 ..." or a title-cased
    sentence). Returns de-duplicated (section_number, title) tuples in
    document order.
    """
    header_re = re.compile(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", re.IGNORECASE)
    numbered_re = re.compile(r"^\s*\d+(\.\d+)*\s+[A-Za-z]")
    body_re = re.compile(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])")
    entry_re = re.compile(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\.+\s*\d+)?$")

    rows = text.split("\n")
    total = len(rows)
    capturing = False
    entries = []

    for idx, row in enumerate(rows):
        # Explicit "Contents" heading confirmed by numbered lines right after it.
        if not capturing and header_re.search(row):
            lookahead = rows[idx + 1: idx + 8]
            if any(numbered_re.match(candidate) for candidate in lookahead):
                capturing = True
                continue
        # Implicit start: a run of >= 3 numbered lines in a 5-line window.
        if not capturing and numbered_re.match(row):
            window = rows[idx: min(idx + 5, total)]
            if sum(1 for candidate in window if numbered_re.match(candidate)) >= 3:
                capturing = True
        # First body-looking line ends the TOC region.
        if capturing and body_re.match(row):
            break
        if capturing:
            hit = entry_re.match(row.strip())
            if hit:
                number = hit.group(1).strip()
                heading = hit.group(2).strip()
                if len(heading) > 3 and not re.fullmatch(r"\d+", heading):
                    entries.append((number, heading))

    # De-duplicate case-insensitively on (number, title), keeping first occurrence.
    unique, seen = [], set()
    for number, heading in entries:
        marker = (number, heading.lower())
        if marker in seen:
            continue
        seen.add(marker)
        unique.append((number, heading))
    return unique
# ==========================================================
# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
# ==========================================================
def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
    """Infer a Table of Contents with an LLM via the SAP GenAI Hub proxy.

    Used when heuristic TOC detection finds nothing. Reads proxy
    credentials from "GEN AI HUB PROXY.json" next to this module, exports
    them as AICORE_* environment variables, and asks the model to list
    5-10 section titles for the first 7000 characters of the text.

    Args:
        text: Document text to analyze.
        model_name: Proxy model name to invoke (default "gpt-4o").

    Returns:
        List of (number_string, title) tuples, or [] when credentials are
        missing/invalid or the proxy call fails.
    """
    # Only a prefix of the document is sent, to keep the prompt small.
    snippet = text[:7000]
    creds = {}
    base_url = ""
    # Credentials file is expected alongside this module.
    creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
    if os.path.exists(creds_path):
        try:
            with open(creds_path, "r") as f:
                creds = json.load(f)
            # Accept any of the known key layouts for the API base URL.
            base_url = (
                creds.get("base_url")
                or creds.get("serviceurls", {}).get("AI_API_URL", "")
                or creds.get("AICORE_BASE_URL", "")
            )
        except Exception as e:
            # Best-effort: a bad credentials file just disables the fallback below.
            print(f"⚠️ Could not read GenAI proxy credentials: {e}")
    else:
        print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
        return []
    if not base_url:
        print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
        return []
    # The gen_ai_hub SDK reads its configuration from these env vars.
    os.environ.update({
        "AICORE_AUTH_URL": creds.get("url", ""),
        "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
        "AICORE_CLIENT_SECRET": creds.get("clientsecret") or creds.get("client_secret", ""),
        "AICORE_RESOURCE_GROUP": "default",
        "AICORE_BASE_URL": base_url
    })
    try:
        print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
        proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
        # temperature=0.0 for deterministic output; max_tokens caps the list length.
        llm = ChatOpenAI(proxy_model_name=model_name, proxy_client=proxy_client, temperature=0.0, max_tokens=700)
        prompt = f"""
You are a document structure analyzer.
Read the following text and infer its main section titles.
Output a numbered list of 5–10 clean section names that could appear in a Table of Contents.
TEXT SAMPLE:
{snippet}
"""
        response = llm.invoke(prompt)
        # LangChain responses expose .content; fall back to str() otherwise.
        response_text = getattr(response, "content", str(response))
        # Strip list markers/numbering the model may have prefixed to each line.
        lines = [
            re.sub(r"^[0-9.\-•\s]+", "", l.strip())
            for l in response_text.splitlines()
            if l.strip()
        ]
        # Renumber sequentially; drop very short (likely junk) titles.
        toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
        print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
        return toc_ai
    except Exception as e:
        # Any proxy/network/model failure degrades gracefully to "no TOC".
        print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
        return []
# ==========================================================
# 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
# ==========================================================
def get_hybrid_toc(text: str):
    """Resolve a TOC via heuristics first, then the GenAI fallback.

    Returns a tuple (entries, source) where source is "heuristic",
    "ai_inferred", or "none".
    """
    heuristic_entries = extract_table_of_contents(text)
    if heuristic_entries:
        print(f"📘 TOC detected with {len(heuristic_entries)} entries (heuristic).")
        return heuristic_entries, "heuristic"

    print("⚠️ No TOC detected — invoking GenAI fallback...")
    inferred = adaptive_fallback_toc(text)
    if inferred:
        print(f"✨ AI-inferred TOC generated with {len(inferred)} entries.")
        return inferred, "ai_inferred"

    print("❌ No TOC could be detected or inferred.")
    return [], "none"
# ==========================================================
# 4️⃣ SMART CHUNKING (hierarchical + procedure-aware)
# ==========================================================
def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
    """
    Split document text into retrieval-friendly chunks.

    Chunking is hierarchical: the text is first split on numbered section
    headings, then on procedure-style sub-headings (Create/Configure/...),
    and finally oversized blocks are split by sentence. Small fragments
    are merged and adjacent chunks share an overlap window for context.

    Args:
        text: Cleaned document text.
        chunk_size: Target chunk length in characters; auto-selected from
            the text length when None.
        overlap: Overlap window in characters; defaults to 150 when only
            chunk_size is supplied.

    Returns:
        List of chunk strings.
    """
    text_length = len(text)
    # Auto-tune chunk size / overlap from document length when unspecified.
    if chunk_size is None:
        if text_length > 200000:
            chunk_size, overlap = 2000, 250
        elif text_length > 50000:
            chunk_size, overlap = 1500, 200
        else:
            chunk_size, overlap = 1000, 150
    elif overlap is None:
        overlap = 150
    print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
    text = re.sub(r"\s+", " ", text.strip())

    # Compile the split patterns once per call instead of re-compiling on
    # every re.split invocation inside the loop.
    section_re = re.compile(r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})")
    procedure_re = re.compile(
        r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
        re.IGNORECASE,
    )

    # Split on section headings, then on procedure-style sub-headings.
    procedure_blocks = []
    for sec in section_re.split(text):
        if not sec.strip():
            continue
        procedure_blocks.extend(procedure_re.split(sec))

    chunks = []
    for block in procedure_blocks:
        if not block.strip():
            continue
        # Tolerate blocks up to 1.5x the target size to avoid needless splits.
        if len(block) < chunk_size * 1.5:
            chunks.append(block.strip())
        else:
            chunks.extend(_split_by_sentence(block, chunk_size, overlap))

    chunks = _merge_small_chunks(chunks, min_len=200)

    # Prepend the tail of the previous chunk so neighbours share context.
    final_chunks = []
    for i, ch in enumerate(chunks):
        if i == 0:
            final_chunks.append(ch)
        else:
            prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
            final_chunks.append((prev_tail + " " + ch).strip())
    print(f"✅ Final chunks created (section-aware + procedure-aware): {len(final_chunks)}")
    return final_chunks
# ==========================================================
# 🔹 Helper Functions
# ==========================================================
def _split_by_sentence(text, chunk_size=800, overlap=80):
sentences = re.split(r"(?<=[.!?])\s+", text)
chunks, current = [], ""
for sent in sentences:
if len(current) + len(sent) + 1 <= chunk_size:
current += " " + sent
else:
if current.strip():
chunks.append(current.strip())
overlap_part = current[-overlap:] if overlap > 0 else ""
current = overlap_part + " " + sent
if current.strip():
chunks.append(current.strip())
return chunks
def _merge_small_chunks(chunks, min_len=150):
merged, buffer = [], ""
for ch in chunks:
if len(ch) < min_len:
buffer += " " + ch
else:
if buffer:
merged.append(buffer.strip())
buffer = ""
merged.append(ch.strip())
if buffer:
merged.append(buffer.strip())
return merged
# ==========================================================
# 5️⃣ DEBUGGING (Manual Test)
# ==========================================================
if __name__ == "__main__":
    # Manual smoke test: run the full extract -> TOC -> chunk pipeline
    # against a sample document and print a short summary.
    sample_pdf = "sample_ai_resume_structured.pdf"
    doc_text, doc_toc, _toc_source = extract_text_from_pdf(sample_pdf)
    print("\n📚 TOC Preview:", doc_toc[:5])
    doc_chunks = chunk_text(doc_text)
    print(f"\n✅ {len(doc_chunks)} chunks created.")
|