File size: 11,599 Bytes
35646e4 275cb5c 85242e3 a537c9b d36c8e6 275cb5c 85242e3 df1d611 85242e3 df1d611 a537c9b 6241bc0 a537c9b 275cb5c 6b0c8b8 85242e3 12787fa 6b0c8b8 a537c9b 6241bc0 12787fa 85242e3 b61a150 a537c9b 6241bc0 32f64de a537c9b 12787fa 6b0c8b8 85242e3 6b0c8b8 6241bc0 85242e3 a537c9b 12787fa 6241bc0 e11a9ad df1d611 e11a9ad 275cb5c 85242e3 6241bc0 85242e3 6241bc0 5c1a3d7 6241bc0 5c1a3d7 a537c9b 6241bc0 85242e3 5c1a3d7 85242e3 c0ebdcb 6241bc0 5c1a3d7 b61a150 85242e3 6d87461 e11a9ad 6d87461 df1d611 499bde3 df1d611 6d87461 499bde3 29e4ac0 6d87461 df1d611 499bde3 2f812ee 499bde3 85242e3 499bde3 df1d611 499bde3 df1d611 a537c9b 499bde3 df1d611 e11a9ad 8afec0a e11a9ad 8afec0a 29e4ac0 6241bc0 0dc8e87 cb020cf 29e4ac0 cb020cf 0dc8e87 cb020cf 0dc8e87 e11a9ad 8afec0a cb020cf 29e4ac0 ee4a18f a537c9b 8afec0a a537c9b ee4a18f 8afec0a cb020cf e11a9ad 8afec0a e11a9ad ee4a18f e11a9ad 8afec0a e11a9ad a537c9b e11a9ad 8afec0a e11a9ad a537c9b e11a9ad df1d611 6241bc0 df1d611 b61a150 f2fb7ac 35646e4 29e4ac0 68c8d89 00eb202 29e4ac0 00eb202 68c8d89 f2fb7ac 68c8d89 f2fb7ac 85242e3 6241bc0 f2fb7ac 85242e3 d36c8e6 85242e3 a537c9b 85242e3 35646e4 a537c9b df1d611 b61a150 df1d611 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 |
import re
import fitz # PyMuPDF
import unicodedata
import os
import json
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
# ==========================================================
# 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
# ==========================================================
def extract_text_from_pdf(file_path: str):
    """Extract and clean text from a PDF using PyMuPDF.

    Handles layout artifacts, numbered sections, and TOC detection.
    English-only version.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        Tuple of (clean_text, toc_entries, toc_source) where toc_source is
        "heuristic", "ai_inferred", or "none".

    Raises:
        RuntimeError: If the PDF cannot be opened or read (original
            exception is chained as the cause).
    """
    pages = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                # Primary text extraction.
                page_text = page.get_text("text").strip()
                # Fallback for PDFs with minimal extractable text:
                # stitch raw layout blocks together instead.
                if not page_text or len(page_text) < 10:
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                # Structural cleanup: isolate bullets and x.y.z headings onto
                # their own lines; strip page numbers and boilerplate footers.
                page_text = page_text.replace("• ", "\n• ")
                page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                pages.append(page_text)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e
    # Single join instead of quadratic string concatenation per page;
    # clean_text() collapses/strips whitespace so the result is unchanged.
    text = "\n".join(pages)
    # Clean text (English only).
    text = clean_text(text)
    print("🧾 TEXT SAMPLE (first 400 chars):", text[:400])
    # TOC detection.
    toc, toc_source = get_hybrid_toc(text)
    print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
    return text, toc, toc_source
# ==========================================================
# 2️⃣ CLEANING PIPELINE (English Only)
# ==========================================================
def clean_text(text: str) -> str:
    """Normalize and de-noise English PDF text.

    Removes TOC dot-leader artifacts, boilerplate markers, copyright lines,
    and odd symbols; normalizes bullets, hyphenated line breaks, and
    whitespace. Returns the stripped result.
    """
    cleaned = unicodedata.normalize("NFKC", text)
    # Drop TOC-like lines: "1.2 Title....... 34".
    cleaned = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", cleaned)
    # Unify the various bullet glyphs into a plain dash.
    for glyph in ("•", "▪", "‣"):
        cleaned = cleaned.replace(glyph, "- ")
    # Ordered regex passes (order matters: hyphen-joins before whitespace
    # collapsing, boilerplate before the symbol filter).
    passes = (
        (r"\.{3,}", ". ", 0),
        (r"-\s*\n", "", 0),                                      # re-join hyphenated breaks
        (r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", re.IGNORECASE),
        (r"©\s*[A-Z].*?\d{4}", "", 0),                           # copyright lines
    )
    for pattern, replacement, flags in passes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    cleaned = cleaned.replace("\r", " ")
    cleaned = re.sub(r"\n{2,}", "\n", cleaned)
    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    # English-safe symbol filter (strips anything outside this whitelist).
    cleaned = re.sub(r"[^\w\s,;:.\-\(\)/&]", "", cleaned)
    cleaned = re.sub(r"(\s*\.\s*){3,}", " ", cleaned)
    return cleaned.strip()
# ==========================================================
# 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
# ==========================================================
def extract_table_of_contents(text: str):
    """Heuristically locate a Table of Contents region in *text*.

    The TOC start is detected either by a "Contents"-style header followed
    by numbered lines, or by a dense run (>= 3 of 5 lines) of numbered
    entries. Scanning stops at the first body-looking line ("Step N" or a
    word pair like "The Quick").

    Returns:
        De-duplicated list of (section_number, title) tuples; empty list
        when no TOC-like region is found.
    """
    entry_start = re.compile(r"^\s*\d+(\.\d+)*\s+[A-Za-z]")
    toc_header = re.compile(
        r"\b(table\s*of\s*contents?|contents?|index|overview)\b", re.IGNORECASE
    )
    body_start = re.compile(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])")
    full_entry = re.compile(
        r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\.+\s*\d+)?$"
    )
    lines = text.split("\n")
    total = len(lines)
    in_toc = False
    collected = []
    for idx, raw in enumerate(lines):
        if not in_toc and toc_header.search(raw):
            # Confirm a header only if numbered lines follow shortly after.
            if any(entry_start.match(cand) for cand in lines[idx + 1: idx + 8]):
                in_toc = True
                continue
        if not in_toc and entry_start.match(raw):
            # No header: require a dense run of numbered lines to start.
            window = lines[idx: min(idx + 5, total)]
            if sum(1 for cand in window if entry_start.match(cand)) >= 3:
                in_toc = True
        if in_toc and body_start.match(raw):
            break  # document body reached; stop collecting
        if in_toc:
            hit = full_entry.match(raw.strip())
            if hit:
                number = hit.group(1).strip()
                title = hit.group(2).strip()
                if len(title) > 3 and not re.fullmatch(r"\d+", title):
                    collected.append((number, title))
    # Preserve order while dropping case-insensitive duplicates.
    seen = set()
    unique = []
    for number, title in collected:
        fingerprint = (number, title.lower())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append((number, title))
    return unique
# ==========================================================
# 3A️⃣ HYBRID TOC FALLBACK (AI-Inferred using SAP GenAI Hub Proxy)
# ==========================================================
def adaptive_fallback_toc(text: str, model_name: str = "gpt-4o"):
    """Infer a TOC via the SAP GenAI Hub proxy when heuristics find none.

    Loads proxy credentials from "GEN AI HUB PROXY.json" next to this file,
    exports them as AICORE_* environment variables, then asks the chat model
    to propose section titles for the first 7000 characters of *text*.

    Returns:
        List of (index_str, title) tuples, or an empty list when credentials
        are missing/unreadable or the proxy call fails.
    """
    sample = text[:7000]
    credentials_path = os.path.join(os.path.dirname(__file__), "GEN AI HUB PROXY.json")
    if not os.path.exists(credentials_path):
        print("⚠️ No SAP GenAI credentials file found — skipping AI fallback.")
        return []
    credentials = {}
    endpoint = ""
    try:
        with open(credentials_path, "r") as handle:
            credentials = json.load(handle)
        # Accept any of the known key spellings for the API base URL.
        endpoint = (
            credentials.get("base_url")
            or credentials.get("serviceurls", {}).get("AI_API_URL", "")
            or credentials.get("AICORE_BASE_URL", "")
        )
    except Exception as err:
        print(f"⚠️ Could not read GenAI proxy credentials: {err}")
    if not endpoint:
        print("⚠️ Missing AI_API_URL or base_url in credentials — skipping fallback.")
        return []
    # The proxy SDK reads its configuration from these environment variables.
    os.environ.update({
        "AICORE_AUTH_URL": credentials.get("url", ""),
        "AICORE_CLIENT_ID": credentials.get("clientid") or credentials.get("client_id", ""),
        "AICORE_CLIENT_SECRET": credentials.get("clientsecret") or credentials.get("client_secret", ""),
        "AICORE_RESOURCE_GROUP": "default",
        "AICORE_BASE_URL": endpoint
    })
    try:
        print(f"⚙️ Invoking GenAI proxy for TOC inference using model: {model_name}")
        client = get_proxy_client("gen-ai-hub", base_url=endpoint)
        chat = ChatOpenAI(proxy_model_name=model_name, proxy_client=client, temperature=0.0, max_tokens=700)
        prompt = f"""
You are a document structure analyzer.
Read the following text and infer its main section titles.
Output a numbered list of 5–10 clean section names that could appear in a Table of Contents.
TEXT SAMPLE:
{sample}
"""
        reply = chat.invoke(prompt)
        reply_text = getattr(reply, "content", str(reply))
        # Strip list markers ("1.", "-", "•") from each non-empty reply line.
        titles = []
        for raw_line in reply_text.splitlines():
            stripped = raw_line.strip()
            if stripped:
                titles.append(re.sub(r"^[0-9.\-•\s]+", "", stripped))
        inferred = [(str(pos + 1), title) for pos, title in enumerate(titles) if len(title) > 3]
        print(f"✨ AI-inferred TOC generated with {len(inferred)} entries (proxy-based).")
        return inferred
    except Exception as err:
        print(f"⚠️ AI TOC fallback failed via GenAI proxy: {err}")
        return []
# ==========================================================
# 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
# ==========================================================
def get_hybrid_toc(text: str):
    """Resolve a TOC for *text*: heuristic scan first, AI inference second.

    Returns:
        Tuple (entries, source) where source is "heuristic", "ai_inferred",
        or "none"; entries is a list of (section, title) tuples (possibly
        empty).
    """
    heuristic_entries = extract_table_of_contents(text)
    if heuristic_entries:
        print(f"📘 TOC detected with {len(heuristic_entries)} entries (heuristic).")
        return heuristic_entries, "heuristic"
    print("⚠️ No TOC detected — invoking GenAI fallback...")
    inferred_entries = adaptive_fallback_toc(text)
    if inferred_entries:
        print(f"✨ AI-inferred TOC generated with {len(inferred_entries)} entries.")
        return inferred_entries, "ai_inferred"
    print("❌ No TOC could be detected or inferred.")
    return [], "none"
# ==========================================================
# 4️⃣ SMART CHUNKING (Section + Procedure Aware)
# ==========================================================
def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
    """Split *text* into section- and procedure-aware chunks with overlap.

    Args:
        text: Cleaned document text.
        chunk_size: Target chunk length in characters; auto-selected from
            the document length when None.
        overlap: Tail characters of the previous chunk prepended to each
            chunk; auto-selected (or 150) when None.

    Returns:
        List of chunk strings with inter-chunk overlap applied.
    """
    text_length = len(text)
    if chunk_size is None:
        # Auto-size by document length.
        if text_length > 200000:
            chunk_size, auto_overlap = 2000, 250
        elif text_length > 50000:
            chunk_size, auto_overlap = 1500, 200
        else:
            chunk_size, auto_overlap = 1000, 150
        # BUGFIX: a caller-supplied overlap was previously clobbered by the
        # auto value whenever chunk_size was None; only default it when unset.
        if overlap is None:
            overlap = auto_overlap
    elif overlap is None:
        overlap = 150
    print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
    text = re.sub(r"\s+", " ", text.strip())
    # Pass 1: split on numbered section headings (e.g. "3.2 Title ...").
    section_blocks = re.split(r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})", text)
    # Pass 2: split each section on procedure-style sub-headings.
    procedure_blocks = []
    for sec in section_blocks:
        if not sec.strip():
            continue
        sub_blocks = re.split(
            r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
            sec, flags=re.IGNORECASE
        )
        procedure_blocks.extend(sub_blocks)
    # Pass 3: sentence-split any block still well over the target size.
    chunks = []
    for block in procedure_blocks:
        if not block.strip():
            continue
        if len(block) < chunk_size * 1.5:
            chunks.append(block.strip())
        else:
            chunks.extend(_split_by_sentence(block, chunk_size, overlap))
    chunks = _merge_small_chunks(chunks, min_len=200)
    # Pass 4: prepend each chunk with the previous chunk's tail for context.
    final_chunks = []
    for i, ch in enumerate(chunks):
        if i == 0:
            final_chunks.append(ch)
        else:
            prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
            final_chunks.append((prev_tail + " " + ch).strip())
    print(f"✅ Final chunks created: {len(final_chunks)}")
    return final_chunks
# ==========================================================
# 🔹 Helper Functions
# ==========================================================
def _split_by_sentence(text, chunk_size=800, overlap=80):
sentences = re.split(r"(?<=[.!?])\s+", text)
chunks, current = [], ""
for sent in sentences:
if len(current) + len(sent) + 1 <= chunk_size:
current += " " + sent
else:
if current.strip():
chunks.append(current.strip())
overlap_part = current[-overlap:] if overlap > 0 else ""
current = overlap_part + " " + sent
if current.strip():
chunks.append(current.strip())
return chunks
def _merge_small_chunks(chunks, min_len=150):
merged, buffer = [], ""
for ch in chunks:
if len(ch) < min_len:
buffer += " " + ch
else:
if buffer:
merged.append(buffer.strip())
buffer = ""
merged.append(ch.strip())
if buffer:
merged.append(buffer.strip())
return merged
# ==========================================================
# 5️⃣ DEBUGGING (Manual Test)
# ==========================================================
if __name__ == "__main__":
    # Manual smoke test: extract a PDF, preview its TOC, then chunk it.
    sample_pdf = "sample_ai_resume_structured.pdf"
    doc_text, doc_toc, doc_toc_source = extract_text_from_pdf(sample_pdf)
    print("\n📚 TOC Preview:", doc_toc[:5])
    doc_chunks = chunk_text(doc_text)
    print(f"\n✅ {len(doc_chunks)} chunks created.")
|