import re
import fitz # PyMuPDF
import unicodedata
import os
import json
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI

# ==========================================================
# 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
# ==========================================================
def extract_text_from_pdf(file_path: str):
    """
    Extracts and cleans text from a PDF using PyMuPDF.
    Handles layout artifacts, numbered sections, and TOC.
    Returns clean text + TOC list + source label.
    """
    text = ""
    try:
        with fitz.open(file_path) as pdf:
            for page_num, page in enumerate(pdf, start=1):
                page_text = page.get_text("text").strip()

                # Fallback for scanned/weird layouts
                if not page_text:
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
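                    # Note: PyMuPDF "blocks" items are tuples of
                    # (x0, y0, x1, y1, text, block_no, block_type); block[4] above is the block's text.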
                # Clean structural noise
                page_text = page_text.replace("• ", "\n• ")
                page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                text += page_text + "\n"
    except Exception as e:
        raise RuntimeError(f"❌ PDF extraction failed: {e}")

    # --- Cleaning pipeline ---
    text = clean_text(text)

    # --- TOC extraction (Hybrid) ---
    toc, toc_source = get_hybrid_toc(text)
    print(f"📑 TOC Source: {toc_source} | Entries: {len(toc)}")
    return text, toc, toc_source
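
# Typical call (file name is illustrative):
#   text, toc, toc_source = extract_text_from_pdf("integration_guide.pdf")
# toc is a list of (section_number, title) tuples and toc_source is one of
# "heuristic", "ai_inferred", or "none".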

# ==========================================================
# 2️⃣ ADVANCED CLEANING PIPELINE (Unicode-Safe for Hindi + English)
# ==========================================================
def clean_text(text: str) -> str:
    """Cleans noisy PDF text while preserving Unicode (Hindi, multilingual)."""
    text = unicodedata.normalize("NFKD", text)

    # Remove TOC-like noise
    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)

    # Normalize bullets, dots, and spacing
    text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
    text = re.sub(r"\.{3,}", ". ", text)
    text = re.sub(r"-\s*\n", "", text)
    text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
    text = text.replace("\r", " ")
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)

    # 🌐 Keep Unicode letters (no ASCII-only restriction).
    # \w under re.UNICODE keeps Hindi & other scripts, safe for embeddings
    text = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", text, flags=re.UNICODE)

    # Trim repetitive punctuation and stray spaces
    text = re.sub(r"(\s*\.\s*){3,}", " ", text)
    return text.strip()
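
# Illustrative effect: a leftover TOC line such as "2.1 Overview ........ 14"
# is dropped as noise, and a bulleted line like "• Install the agent"
# comes out as "- Install the agent".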

# ==========================================================
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
# ==========================================================
def extract_table_of_contents(text: str):
    toc_entries = []
    lines = text.split("\n")
    toc_started = False
    toc_ended = False
    line_count = len(lines)

    for i, line in enumerate(lines):
        if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
            next_lines = lines[i + 1 : i + 8]
            if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                toc_started = True
                continue
        if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
            numbered_lines = 0
            for j in range(i, min(i + 5, line_count)):
                if re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", lines[j]):
                    numbered_lines += 1
            if numbered_lines >= 3:
                toc_started = True
        if toc_started and re.match(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])", line):
            toc_ended = True
            break
        if toc_started and not toc_ended:
            match = re.match(
                r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\.+\s*\d+)?$",
                line.strip()
            )
            if match:
                section = match.group(1).strip()
                title = match.group(2).strip()
                if len(title) > 3 and not re.match(r"^\d+$", title):
                    toc_entries.append((section, title))

    deduped, seen = [], set()
    for sec, title in toc_entries:
        key = (sec, title.lower())
        if key not in seen:
            deduped.append((sec, title))
            seen.add(key)
    return deduped
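
# The heuristic returns (section_number, title) tuples, e.g.
# [("1", "Introduction"), ("1.1", "Scope")]. It relies on numbered
# "N.N Title .... page" lines, so documents without such a TOC fall
# through to the AI fallback below.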

# ==========================================================
# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
# ==========================================================
def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
    """
    Uses SAP GenAI Hub proxy (same as QA pipeline) to infer a Table of Contents.
    This ensures consistent credentials, no manual token handling, and safe reuse
    of your existing GEN AI HUB PROXY.json configuration.
    """
    snippet = text[:7000]  # ✅ Simple, fast fallback: first 7000 chars only
    creds = {}
    base_url = ""

    # ✅ Load credentials from same JSON as QA pipeline
    creds_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
    if os.path.exists(creds_path):
        try:
            with open(creds_path, "r") as f:
                creds = json.load(f)
            base_url = (
                creds.get("base_url")
                or creds.get("serviceurls", {}).get("AI_API_URL", "")
                or creds.get("AICORE_BASE_URL", "")
            )
        except Exception as e:
            print(f"⚠️ Could not read GenAI proxy credentials: {e}")
    else:
        print("⚠️ No SAP GenAI credentials file found; skipping AI fallback.")
        return []

    if not base_url:
        print("⚠️ Missing AI_API_URL or base_url in credentials; skipping fallback.")
        return []

    # ✅ Inject credentials into environment (matches QA setup)
    os.environ.update({
        "AICORE_AUTH_URL": creds.get("url", ""),
        "AICORE_CLIENT_ID": creds.get("clientid") or creds.get("client_id", ""),
        "AICORE_CLIENT_SECRET": creds.get("clientsecret") or creds.get("client_secret", ""),
        "AICORE_RESOURCE_GROUP": "default",
        "AICORE_BASE_URL": base_url
    })
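    # The gen_ai_hub SDK reads these AICORE_* variables to authenticate against
    # SAP AI Core, so no manual token handling is needed here.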

    try:
        print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
        proxy_client = get_proxy_client("gen-ai-hub", base_url=base_url)
        llm = ChatOpenAI(
            proxy_model_name=model_name,
            proxy_client=proxy_client,
            temperature=0.0,
            max_tokens=700
        )
        prompt = f"""
You are a document structure analyzer.
Read the following text and infer its main section titles.
Output a numbered list of 5-10 clean section names that could appear in a Table of Contents.
TEXT SAMPLE:
{snippet}
"""
        response = llm.invoke(prompt)
        response_text = getattr(response, "content", str(response))

        # ✅ Extract clean TOC-like lines
        lines = [
            re.sub(r"^[0-9.\-•\s]+", "", l.strip())
            for l in response_text.splitlines()
            if l.strip()
        ]
        toc_ai = [(str(i + 1), l) for i, l in enumerate(lines) if len(l) > 3]
        print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries (proxy-based).")
        return toc_ai
    except Exception as e:
        print(f"⚠️ AI TOC fallback failed via GenAI proxy: {e}")
        return []
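
# The fallback returns the same (number, title) tuple shape as the heuristic,
# so callers of get_hybrid_toc() never need to care which path produced the TOC.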

# ==========================================================
# 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
# ==========================================================
def get_hybrid_toc(text: str):
    toc_entries = extract_table_of_contents(text)
    if toc_entries:
        print(f"📑 TOC detected with {len(toc_entries)} entries (heuristic).")
        return toc_entries, "heuristic"

    print("⚠️ No TOC detected; invoking GenAI fallback...")
    toc_ai = adaptive_fallback_toc(text)
    if toc_ai:
        print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
        return toc_ai, "ai_inferred"

    print("❌ No TOC could be detected or inferred.")
    return [], "none"

# ==========================================================
# 4️⃣ SMART CHUNKING (hierarchical + procedure-aware)
# ==========================================================
def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
    text_length = len(text)
    if chunk_size is None:
        if text_length > 200000:
            chunk_size, overlap = 2000, 250
        elif text_length > 50000:
            chunk_size, overlap = 1500, 200
        else:
            chunk_size, overlap = 1000, 150
    elif overlap is None:
        overlap = 150
    print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")

    text = re.sub(r"\s+", " ", text.strip())

    # --- Step 1: Split by major numbered section headers
    section_blocks = re.split(
        r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
        text
    )
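    # re.split() with a pure lookahead consumes nothing, so each matched header
    # stays attached to the text that follows it and section titles remain
    # inside their own chunks.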

    # --- Step 2: Detect procedural subsections within each section
    procedure_blocks = []
    for sec in section_blocks:
        if not sec.strip():
            continue
        sub_blocks = re.split(
            r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
            sec,
            flags=re.IGNORECASE
        )
        procedure_blocks.extend(sub_blocks)

    # --- Step 3: Build final chunks
    chunks = []
    for block in procedure_blocks:
        if not block.strip():
            continue
        if len(block) < chunk_size * 1.5:
            chunks.append(block.strip())
        else:
            chunks.extend(_split_by_sentence(block, chunk_size, overlap))

    chunks = _merge_small_chunks(chunks, min_len=200)

    final_chunks = []
    for i, ch in enumerate(chunks):
        if i == 0:
            final_chunks.append(ch)
        else:
            prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
            final_chunks.append((prev_tail + " " + ch).strip())

    print(f"✅ Final chunks created (section-aware + procedure-aware): {len(final_chunks)}")
    return final_chunks
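
# Typical calls (arguments are illustrative):
#   chunks = chunk_text(text)             # sizes picked from document length
#   chunks = chunk_text(text, 1200, 100)  # explicit chunk_size / overlap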

# ==========================================================
# 🔹 Helper Functions
# ==========================================================
def _split_by_sentence(text, chunk_size=800, overlap=80):
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for sent in sentences:
        if len(current) + len(sent) + 1 <= chunk_size:
            current += " " + sent
        else:
            if current.strip():
                chunks.append(current.strip())
            overlap_part = current[-overlap:] if overlap > 0 else ""
            current = overlap_part + " " + sent
    if current.strip():
        chunks.append(current.strip())
    return chunks
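
# Sentences are never split mid-way, so a chunk can exceed chunk_size when a
# single sentence is longer than the limit; the trailing `overlap` characters
# of each chunk are carried into the start of the next one.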

def _merge_small_chunks(chunks, min_len=150):
    merged, buffer = [], ""
    for ch in chunks:
        if len(ch) < min_len:
            buffer += " " + ch
        else:
            if buffer:
                merged.append(buffer.strip())
                buffer = ""
            merged.append(ch.strip())
    if buffer:
        merged.append(buffer.strip())
    return merged
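
# Runs of short fragments (stray headings, list stubs) are concatenated and
# emitted as a single chunk just before the next full-size chunk, rather than
# surviving as tiny standalone chunks.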

# ==========================================================
# 5️⃣ DEBUGGING (Manual Test)
# ==========================================================
if __name__ == "__main__":
    pdf_path = "sample_ai_resume_structured.pdf"
    text, toc, toc_source = extract_text_from_pdf(pdf_path)
    print("\n📖 TOC Preview:", toc[:5])
    chunks = chunk_text(text)
    print(f"\n✅ {len(chunks)} chunks created.")