Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,10 +3,12 @@
|
|
| 3 |
|
| 4 |
"""
|
| 5 |
VMware On-Prem → Azure Local Migration Assistant (Gradio)
|
| 6 |
-
-
|
| 7 |
- Upload design/migration docs (PDF/DOCX/TXT/MD).
|
| 8 |
-
- Ask questions; get
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
Run locally:
|
| 11 |
pip install gradio PyPDF2 python-docx
|
| 12 |
python app.py
|
|
@@ -16,14 +18,16 @@ import os
|
|
| 16 |
import io
|
| 17 |
import re
|
| 18 |
import math
|
| 19 |
-
from typing import List, Tuple, Dict, Any
|
| 20 |
from collections import Counter, defaultdict
|
| 21 |
|
| 22 |
import gradio as gr
|
| 23 |
|
| 24 |
-
#
|
|
|
|
|
|
|
| 25 |
try:
|
| 26 |
-
import PyPDF2 #
|
| 27 |
except Exception:
|
| 28 |
PyPDF2 = None
|
| 29 |
|
|
@@ -38,6 +42,9 @@ except Exception:
|
|
| 38 |
# =========================
|
| 39 |
|
| 40 |
TRUSTED_SOURCES: List[Tuple[str, str]] = [
|
|
|
|
|
|
|
|
|
|
| 41 |
("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
|
| 42 |
("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
|
| 43 |
("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
|
|
@@ -82,9 +89,7 @@ FAQ_SEEDS: List[Dict[str, Any]] = [
|
|
| 82 |
_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
|
| 83 |
|
| 84 |
def tokenize(text: str) -> List[str]:
|
| 85 |
-
|
| 86 |
-
return []
|
| 87 |
-
return [t.lower() for t in _WORD_RE.findall(text)]
|
| 88 |
|
| 89 |
def list_refs(ref_names: List[str]) -> str:
|
| 90 |
links = []
|
|
@@ -96,7 +101,7 @@ def list_refs(ref_names: List[str]) -> str:
|
|
| 96 |
|
| 97 |
|
| 98 |
# =========================
|
| 99 |
-
# Tiny TF-IDF
|
| 100 |
# =========================
|
| 101 |
|
| 102 |
class TinyTfidfIndex:
|
|
@@ -109,14 +114,12 @@ class TinyTfidfIndex:
|
|
| 109 |
|
| 110 |
def add_documents(self, tokenized_docs: List[List[str]]):
|
| 111 |
self.docs = tokenized_docs[:]
|
| 112 |
-
# document frequency
|
| 113 |
self.df = Counter()
|
| 114 |
for toks in self.docs:
|
| 115 |
self.df.update(set(toks))
|
| 116 |
N = max(1, len(self.docs))
|
| 117 |
self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
|
| 118 |
self.voc_size = len(self.idf)
|
| 119 |
-
# precompute norms
|
| 120 |
self.doc_norms = []
|
| 121 |
for toks in self.docs:
|
| 122 |
tf = Counter(toks)
|
|
@@ -137,7 +140,7 @@ class TinyTfidfIndex:
|
|
| 137 |
v[term] = (cnt / total) * idf
|
| 138 |
return v
|
| 139 |
|
| 140 |
-
def query(self, text: str, k: int =
|
| 141 |
if not self.docs:
|
| 142 |
return []
|
| 143 |
qv = self._vec(tokenize(text))
|
|
@@ -157,52 +160,16 @@ class TinyTfidfIndex:
|
|
| 157 |
|
| 158 |
|
| 159 |
# =========================
|
| 160 |
-
#
|
| 161 |
# =========================
|
| 162 |
|
| 163 |
CHECKS = [
|
| 164 |
-
{
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
},
|
| 171 |
-
{
|
| 172 |
-
"id": "connectivity",
|
| 173 |
-
"desc": "Connectivity planned (ExpressRoute/VPN), DNS, MTU validated for HCX.",
|
| 174 |
-
"fix": "Verify ER/VPN, DNS resolution, and HCX MTU/mobility settings.",
|
| 175 |
-
"keywords": ["expressroute", "vpn", "dns", "mtu", "hcx", "connectivity"],
|
| 176 |
-
"pillar": "networking",
|
| 177 |
-
},
|
| 178 |
-
{
|
| 179 |
-
"id": "migrate_tooling",
|
| 180 |
-
"desc": "Discovery/assessment and tooling chosen (Azure Migrate or HCX).",
|
| 181 |
-
"fix": "Run Azure Migrate discovery; select HCX or Azure Migrate per downtime.",
|
| 182 |
-
"keywords": ["azure", "migrate", "discovery", "assessment", "hcx", "replication"],
|
| 183 |
-
"pillar": "operations",
|
| 184 |
-
},
|
| 185 |
-
{
|
| 186 |
-
"id": "security",
|
| 187 |
-
"desc": "Security/identity configured (Key Vault, Defender, Sentinel, PIM/MFA).",
|
| 188 |
-
"fix": "Centralize secrets in Key Vault; enable Defender/Sentinel; enforce PIM/MFA.",
|
| 189 |
-
"keywords": ["key", "vault", "defender", "sentinel", "pim", "mfa", "entra", "aad", "identity"],
|
| 190 |
-
"pillar": "security",
|
| 191 |
-
},
|
| 192 |
-
{
|
| 193 |
-
"id": "dr_backup",
|
| 194 |
-
"desc": "Backups, DR, RTO/RPO defined; ASR drills planned.",
|
| 195 |
-
"fix": "Set RTO/RPO; immutability & soft-delete; test ASR failover/failback.",
|
| 196 |
-
"keywords": ["backup", "rto", "rpo", "dr", "asr", "failover", "restore"],
|
| 197 |
-
"pillar": "reliability",
|
| 198 |
-
},
|
| 199 |
-
{
|
| 200 |
-
"id": "cost",
|
| 201 |
-
"desc": "Cost optimization plan (right-sizing, reservations, tagging).",
|
| 202 |
-
"fix": "Use reservations/Savings Plans, rightsizing, and enforce tags.",
|
| 203 |
-
"keywords": ["cost", "reservation", "savings", "right", "tag"],
|
| 204 |
-
"pillar": "cost",
|
| 205 |
-
},
|
| 206 |
]
|
| 207 |
|
| 208 |
def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
|
|
@@ -230,7 +197,119 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
|
|
| 230 |
|
| 231 |
|
| 232 |
# =========================
|
| 233 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
# =========================
|
| 235 |
|
| 236 |
def read_pdf_bytes(b: bytes) -> str:
|
|
@@ -238,13 +317,7 @@ def read_pdf_bytes(b: bytes) -> str:
|
|
| 238 |
return ""
|
| 239 |
try:
|
| 240 |
reader = PyPDF2.PdfReader(io.BytesIO(b))
|
| 241 |
-
|
| 242 |
-
for page in reader.pages:
|
| 243 |
-
try:
|
| 244 |
-
out.append(page.extract_text() or "")
|
| 245 |
-
except Exception:
|
| 246 |
-
pass
|
| 247 |
-
return "\n".join(out)
|
| 248 |
except Exception:
|
| 249 |
return ""
|
| 250 |
|
|
@@ -267,7 +340,6 @@ def read_text_bytes(b: bytes) -> str:
|
|
| 267 |
return ""
|
| 268 |
|
| 269 |
def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
|
| 270 |
-
"""Returns {"file": <name>, "text": <extracted_text>}"""
|
| 271 |
name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
|
| 272 |
data = file_obj.get("data")
|
| 273 |
if data is None:
|
|
@@ -277,72 +349,38 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
|
|
| 277 |
data = fh.read()
|
| 278 |
if data is None:
|
| 279 |
return {"file": name, "text": ""}
|
| 280 |
-
|
| 281 |
low = name.lower()
|
| 282 |
if low.endswith(".pdf"):
|
| 283 |
text = read_pdf_bytes(data)
|
| 284 |
elif low.endswith((".docx", ".doc")):
|
| 285 |
text = read_docx_bytes(data)
|
| 286 |
-
elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
|
| 287 |
-
text = read_text_bytes(data)
|
| 288 |
else:
|
| 289 |
text = read_text_bytes(data)
|
| 290 |
return {"file": os.path.basename(name), "text": text or ""}
|
| 291 |
|
| 292 |
|
| 293 |
# =========================
|
| 294 |
-
# Detailed Answer Composer
|
| 295 |
# =========================
|
| 296 |
|
| 297 |
def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
for g in glist[:limit]:
|
| 305 |
-
out.append(f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_")
|
| 306 |
-
return "\n".join(out) if out else "- No major issues detected in the sampled excerpts."
|
| 307 |
-
|
| 308 |
-
refs = list_refs([
|
| 309 |
-
"Azure VMware Solution (AVS)",
|
| 310 |
-
"Azure Migrate",
|
| 311 |
-
"Cloud Adoption Framework (CAF)",
|
| 312 |
-
"Azure Well-Architected Framework (WAF)",
|
| 313 |
-
"VMware HCX Docs",
|
| 314 |
-
])
|
| 315 |
-
|
| 316 |
-
pillar_lines = []
|
| 317 |
-
for k_, v_ in scores.items():
|
| 318 |
-
if k_ == "overall":
|
| 319 |
-
continue
|
| 320 |
-
pillar_lines.append(f"- **{k_.capitalize()}**: {v_}")
|
| 321 |
-
pillar_md = "\n".join(pillar_lines) if pillar_lines else "- (no signals)"
|
| 322 |
-
|
| 323 |
-
md = (
|
| 324 |
f"### Answer (detailed)\n"
|
| 325 |
f"**Your question:** {query}\n\n"
|
| 326 |
-
f"**
|
| 327 |
-
f"
|
| 328 |
-
f"####
|
| 329 |
-
"
|
| 330 |
-
"2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
|
| 331 |
-
"3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
|
| 332 |
-
"4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
|
| 333 |
-
"5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
|
| 334 |
-
"6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
|
| 335 |
-
"7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
|
| 336 |
-
f"#### What your documents emphasize (auto-scored)\n"
|
| 337 |
-
f"**Overall score:** {scores.get('overall', 0)} / 5.0\n\n"
|
| 338 |
-
f"**Per-pillar signals:**\n{pillar_md}\n\n"
|
| 339 |
-
f"#### Gaps & quick fixes\n{_mk_gaps(gaps, limit=8)}\n\n"
|
| 340 |
-
f"#### Supporting excerpts\n"
|
| 341 |
)
|
| 342 |
for s in snippets:
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
return
|
| 346 |
|
| 347 |
|
| 348 |
# =========================
|
|
@@ -354,33 +392,27 @@ def answer_faq_or_approach_detailed(
|
|
| 354 |
use_uploaded_docs: bool,
|
| 355 |
index_obj: Any,
|
| 356 |
_matrix_unused: Any,
|
| 357 |
-
corpus: List[Dict[str, str]]
|
| 358 |
) -> str:
|
| 359 |
q = (question or "").strip()
|
| 360 |
if not q:
|
| 361 |
return "Please enter a question."
|
| 362 |
|
| 363 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
q_tokens = set(tokenize(q))
|
| 365 |
for item in FAQ_SEEDS:
|
| 366 |
seed_tokens = set(tokenize(item["q"]))
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
base = (
|
| 371 |
-
f"### Answer (detailed)\n"
|
| 372 |
f"{item['a']}\n\n"
|
| 373 |
-
"
|
| 374 |
-
"1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
|
| 375 |
-
"2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
|
| 376 |
-
"3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
|
| 377 |
-
"4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
|
| 378 |
-
"5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
|
| 379 |
-
"6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
|
| 380 |
-
"7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
|
| 381 |
-
f"**Trusted sources:** {refs}"
|
| 382 |
)
|
| 383 |
-
return base
|
| 384 |
|
| 385 |
# 2) Use uploaded docs (RAG) → detailed synthesized answer
|
| 386 |
if use_uploaded_docs and index_obj is not None and corpus:
|
|
@@ -394,60 +426,43 @@ def answer_faq_or_approach_detailed(
|
|
| 394 |
snippets.append({
|
| 395 |
"file": item["file"],
|
| 396 |
"relevance": float(sim),
|
| 397 |
-
"excerpt": excerpt
|
| 398 |
})
|
| 399 |
if snippets:
|
| 400 |
return _compose_detailed_from_snippets(q, snippets)
|
| 401 |
|
| 402 |
-
# 3) Fallback (no docs) → generic
|
| 403 |
-
refs = list_refs([
|
| 404 |
-
|
| 405 |
-
"Azure Migrate",
|
| 406 |
-
"Cloud Adoption Framework (CAF)",
|
| 407 |
-
"Azure Well-Architected Framework (WAF)",
|
| 408 |
-
"VMware HCX Docs",
|
| 409 |
-
])
|
| 410 |
-
generic = (
|
| 411 |
"### Answer (detailed)\n"
|
| 412 |
-
"
|
| 413 |
-
"
|
| 414 |
-
"
|
| 415 |
-
"
|
| 416 |
-
"
|
| 417 |
-
"
|
| 418 |
-
"4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
|
| 419 |
-
"5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
|
| 420 |
-
"6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
|
| 421 |
-
"7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
|
| 422 |
f"**Trusted sources:** {refs}"
|
| 423 |
)
|
| 424 |
-
return generic
|
| 425 |
|
| 426 |
|
| 427 |
# =========================
|
| 428 |
-
# Build
|
| 429 |
# =========================
|
| 430 |
|
| 431 |
-
def build_index(files: List[Dict[str, Any]])
|
| 432 |
-
"""Returns: (index_obj, matrix_placeholder, corpus, status_message)"""
|
| 433 |
if not files:
|
| 434 |
return None, None, [], "No files uploaded yet."
|
| 435 |
-
|
| 436 |
corpus: List[Dict[str, str]] = []
|
| 437 |
for f in files:
|
| 438 |
rec = parse_file(f)
|
| 439 |
if rec["text"]:
|
| 440 |
corpus.append(rec)
|
| 441 |
-
|
| 442 |
if not corpus:
|
| 443 |
-
return None, None, [], "
|
| 444 |
-
|
| 445 |
tokenized = [tokenize(c["text"]) for c in corpus]
|
| 446 |
idx = TinyTfidfIndex()
|
| 447 |
idx.add_documents(tokenized)
|
| 448 |
-
|
| 449 |
-
status = f"Indexed {len(corpus)} document(s). Vocabulary size ≈ {idx.voc_size}."
|
| 450 |
-
return idx, None, corpus, status
|
| 451 |
|
| 452 |
|
| 453 |
# =========================
|
|
@@ -457,38 +472,26 @@ def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, s
|
|
| 457 |
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
|
| 458 |
gr.Markdown(
|
| 459 |
"## VMware On-Prem → Azure Local Migration Assistant\n"
|
| 460 |
-
"
|
| 461 |
-
"- Ask questions. Toggle **Use uploaded docs** for RAG-based answers\n"
|
| 462 |
-
"- Answers are **detailed** by default, with structured steps and trusted references\n"
|
| 463 |
)
|
| 464 |
|
| 465 |
with gr.Row():
|
| 466 |
with gr.Column(scale=2):
|
| 467 |
-
file_in = gr.Files(
|
| 468 |
-
label="Upload documents (PDF/DOCX/TXT/MD)",
|
| 469 |
-
file_count="multiple",
|
| 470 |
-
type="filepath" # we will open paths ourselves
|
| 471 |
-
)
|
| 472 |
index_status = gr.Markdown("No index yet.")
|
| 473 |
-
|
| 474 |
-
# Hidden/State to hold in-memory data
|
| 475 |
st_index = gr.State()
|
| 476 |
-
st_matrix = gr.State()
|
| 477 |
st_corpus = gr.State()
|
| 478 |
-
|
| 479 |
build_btn = gr.Button("Build Index", variant="primary")
|
|
|
|
| 480 |
with gr.Column(scale=3):
|
| 481 |
-
question = gr.Textbox(
|
| 482 |
-
label="Ask a question",
|
| 483 |
-
placeholder="e.g., How do I minimize downtime for our VMware migration?",
|
| 484 |
-
lines=3
|
| 485 |
-
)
|
| 486 |
use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
|
| 487 |
ask_btn = gr.Button("Ask", variant="primary")
|
| 488 |
answer_box = gr.Markdown("")
|
| 489 |
|
| 490 |
-
# Convert gr.Files (paths)
|
| 491 |
-
def _collect_files(paths: List[str])
|
| 492 |
out = []
|
| 493 |
for p in paths or []:
|
| 494 |
try:
|
|
@@ -501,19 +504,18 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
|
|
| 501 |
|
| 502 |
def _build(files_paths: List[str]):
|
| 503 |
files = _collect_files(files_paths)
|
| 504 |
-
|
| 505 |
-
return status, idx, mat, corpus
|
| 506 |
|
| 507 |
build_btn.click(
|
| 508 |
_build,
|
| 509 |
inputs=[file_in],
|
| 510 |
-
outputs=[index_status, st_index, st_matrix, st_corpus]
|
| 511 |
)
|
| 512 |
|
| 513 |
ask_btn.click(
|
| 514 |
answer_faq_or_approach_detailed,
|
| 515 |
inputs=[question, use_docs, st_index, st_matrix, st_corpus],
|
| 516 |
-
outputs=[answer_box]
|
| 517 |
)
|
| 518 |
|
| 519 |
if __name__ == "__main__":
|
|
|
|
| 3 |
|
| 4 |
"""
|
| 5 |
VMware On-Prem → Azure Local Migration Assistant (Gradio)
|
| 6 |
+
- No external API calls. No scikit-learn.
|
| 7 |
- Upload design/migration docs (PDF/DOCX/TXT/MD).
|
| 8 |
+
- Ask questions; get RELIABLE, DETAILED answers:
|
| 9 |
+
• Concept KB (for definitions like “What is Azure Arc-enabled SDN?”)
|
| 10 |
+
• RAG on uploaded docs (excerpts + gaps/fixes)
|
| 11 |
+
• Seeded FAQs (migration flows)
|
| 12 |
Run locally:
|
| 13 |
pip install gradio PyPDF2 python-docx
|
| 14 |
python app.py
|
|
|
|
| 18 |
import io
|
| 19 |
import re
|
| 20 |
import math
|
| 21 |
+
from typing import List, Tuple, Dict, Any, Optional
|
| 22 |
from collections import Counter, defaultdict
|
| 23 |
|
| 24 |
import gradio as gr
|
| 25 |
|
| 26 |
+
# -------------------------
|
| 27 |
+
# Optional parsers (graceful fallback)
|
| 28 |
+
# -------------------------
|
| 29 |
try:
|
| 30 |
+
import PyPDF2 # often present on Spaces
|
| 31 |
except Exception:
|
| 32 |
PyPDF2 = None
|
| 33 |
|
|
|
|
| 42 |
# =========================
|
| 43 |
|
| 44 |
TRUSTED_SOURCES: List[Tuple[str, str]] = [
|
| 45 |
+
("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
|
| 46 |
+
("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
|
| 47 |
+
("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
|
| 48 |
("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
|
| 49 |
("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
|
| 50 |
("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
|
|
|
|
| 89 |
# A token is a maximal run of ASCII letters, digits, or any of `_ . : / -`;
# every other character (whitespace, "?", ",", ...) acts as a separator.
_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")


def tokenize(text: str) -> List[str]:
    """Split *text* into lowercase tokens.

    Args:
        text: Raw text to tokenize. ``None`` or an empty string is accepted
            and treated as empty input.

    Returns:
        The tokens matched by ``_WORD_RE``, lowercased, in order of
        appearance. Empty input yields an empty list.
    """
    source = text or ""
    return list(map(str.lower, _WORD_RE.findall(source)))
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def list_refs(ref_names: List[str]) -> str:
|
| 95 |
links = []
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
# =========================
|
| 104 |
+
# Tiny TF-IDF Index (no sklearn)
|
| 105 |
# =========================
|
| 106 |
|
| 107 |
class TinyTfidfIndex:
|
|
|
|
| 114 |
|
| 115 |
def add_documents(self, tokenized_docs: List[List[str]]):
|
| 116 |
self.docs = tokenized_docs[:]
|
|
|
|
| 117 |
self.df = Counter()
|
| 118 |
for toks in self.docs:
|
| 119 |
self.df.update(set(toks))
|
| 120 |
N = max(1, len(self.docs))
|
| 121 |
self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
|
| 122 |
self.voc_size = len(self.idf)
|
|
|
|
| 123 |
self.doc_norms = []
|
| 124 |
for toks in self.docs:
|
| 125 |
tf = Counter(toks)
|
|
|
|
| 140 |
v[term] = (cnt / total) * idf
|
| 141 |
return v
|
| 142 |
|
| 143 |
+
def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
|
| 144 |
if not self.docs:
|
| 145 |
return []
|
| 146 |
qv = self._vec(tokenize(text))
|
|
|
|
| 160 |
|
| 161 |
|
| 162 |
# =========================
|
| 163 |
+
# Rubric for RAG-tailoring
|
| 164 |
# =========================
|
| 165 |
|
| 166 |
# Rubric entries. Each check carries an identifier, a short description, a
# suggested fix, the keywords associated with it, and the pillar it belongs to.
CHECKS = [
    {
        "id": "landing_zone",
        "desc": "Landing zone defined.",
        "fix": "Use CAF blueprints.",
        "keywords": ["landing", "hub", "spoke", "policy", "rbac"],
        "pillar": "governance",
    },
    {
        "id": "connectivity",
        "desc": "Connectivity planned.",
        "fix": "Verify ER/VPN, DNS, MTU.",
        "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx"],
        "pillar": "networking",
    },
    {
        "id": "migrate_tooling",
        "desc": "Tooling chosen.",
        "fix": "Run Azure Migrate discovery.",
        "keywords": ["migrate", "discovery", "assessment", "hcx"],
        "pillar": "operations",
    },
    {
        "id": "security",
        "desc": "Security configured.",
        "fix": "Enable Key Vault, Defender, Sentinel, MFA.",
        "keywords": ["vault", "defender", "sentinel", "mfa", "identity"],
        "pillar": "security",
    },
    {
        "id": "dr_backup",
        "desc": "Backups/DR defined.",
        "fix": "Set RTO/RPO; test ASR.",
        "keywords": ["backup", "rto", "rpo", "dr", "asr"],
        "pillar": "reliability",
    },
    {
        "id": "cost",
        "desc": "Cost optimization.",
        "fix": "Use reservations, rightsizing, tags.",
        "keywords": ["cost", "reservation", "savings", "tag"],
        "pillar": "cost",
    },
]
|
| 174 |
|
| 175 |
def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
# =========================
|
| 200 |
+
# Built-in Concept KB (for definitional questions)
|
| 201 |
+
# =========================
|
| 202 |
+
|
| 203 |
+
class Concept:
    """A knowledge-base entry: a named concept, its aliases, and an answer builder."""

    def __init__(self, name: str, aliases: List[str], builder):
        """Store the concept and pre-tokenize its aliases.

        Args:
            name: Name of the concept.
            aliases: Alternative phrasings; each one is run through
                ``tokenize`` once here and kept as a list of tokens.
            builder: Callable taking the query string and returning the
                answer text.
        """
        tokenized_aliases = []
        for phrase in aliases:
            tokenized_aliases.append(tokenize(phrase))

        self.name = name
        # One token list per alias, in the order the aliases were given.
        self.aliases = tokenized_aliases
        self.builder = builder  # function(query: str) -> str
|
| 208 |
+
|
| 209 |
+
def _kb_ans_azure_sdn(_: str) -> str:
    """Build the fixed markdown answer describing Azure SDN.

    The query argument is accepted for the builder signature but ignored;
    the text is static apart from the reference links from ``list_refs``.

    Returns:
        Markdown with definition, capabilities, mechanics, use cases and a
        trailing "Trusted sources" line.
    """
    sources = list_refs(["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"])

    definition = (
        "### Azure SDN — What it is and why it matters\n"
        "**Definition:** Azure SDN is Microsoft's software-defined networking stack that centralizes network control in software, "
        "decoupling policy and management from physical hardware. It lets you programmatically create and secure virtual networks, "
        "subnets, microsegmentation (ACL/NSG-like policies), load balancers and gateways across Azure and Azure Local (Azure Stack HCI) environments.\n\n"
    )
    capabilities = (
        "**Key capabilities**\n"
        "- Central, policy-driven control plane for virtual networking resources.\n"
        "- Automation & GitOps-friendly configuration for repeatable environments.\n"
        "- Microsegmentation and traffic filtering for east–west security.\n"
        "- Software load balancing and gateway services for app connectivity.\n"
        "- Consistent constructs across cloud and on-prem (with Azure Local).\n\n"
    )
    mechanics = (
        "**How it works (high level)**\n"
        "- A software control plane programs host virtual switches and network functions.\n"
        "- Network intent (VNets, subnets, policies) is applied consistently across hosts.\n"
        "- Integrates with Azure identity/management for RBAC and governance.\n\n"
    )
    use_cases = (
        "**Common use cases**\n"
        "- Rapidly provisioning isolated app environments.\n"
        "- Enforcing zero-trust style segmentation between tiers.\n"
        "- Hybrid apps spanning Azure and Azure Local.\n\n"
    )
    footer = f"**Trusted sources:** {sources}"

    return "".join((definition, capabilities, mechanics, use_cases, footer))
|
| 232 |
+
|
| 233 |
+
def _kb_ans_arc_enabled_sdn(_: str) -> str:
    """Build the fixed markdown answer describing Azure Arc-enabled SDN.

    The query argument is accepted for the builder signature but ignored;
    the text is static apart from the reference links from ``list_refs``.

    Returns:
        Markdown with definition, rationale, capabilities, components,
        prerequisites, use cases, limitations and a trailing
        "Trusted sources" line.
    """
    sources = list_refs(["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"])

    sections = [
        # Heading and definition
        "### Azure Arc-enabled SDN — Definition & details\n"
        "**Definition:** Azure Arc-enabled SDN brings Azure's software-defined networking to on-premises Azure Local (Azure Stack HCI) clusters, "
        "managed through Azure Arc. It decouples network control from hardware so you can centrally define, automate, and secure "
        "virtual networks, subnets, and policies in your datacenter using Azure-consistent tools.\n\n",
        # Rationale
        "**Why it matters**\n"
        "- Gives you Azure-like VNet constructs and policy management on-prem.\n"
        "- Enables consistent security and segmentation across hybrid estates.\n"
        "- Supports rapid, software-driven changes without touching physical fabric.\n\n",
        # Capabilities
        "**Key capabilities**\n"
        "- Create/modify on-prem VNets, subnets, and routing policies from Azure.\n"
        "- Apply microsegmentation rules (policy/ACL-style) for east–west security.\n"
        "- Software load balancing and gateway services for north–south/east–west flows.\n"
        "- Integration with Azure RBAC, tagging, and governance for change control.\n\n",
        # Components
        "**Core components (conceptual)**\n"
        "- **Arc resource bridge & agents** — connect your HCI cluster to Azure control.\n"
        "- **SDN controller & host agents** — program the Hyper-V vSwitch and network functions.\n"
        "- **Azure portal/CLI/GitOps** — define intent (VNets, subnets, policies) and deploy.\n\n",
        # Prerequisites
        "**Prerequisites (typical)**\n"
        "- Azure Local (Azure Stack HCI) cluster connected to Azure Arc.\n"
        "- Arc resource bridge onboarded; network requirements met.\n"
        "- Appropriate RBAC roles to manage networking resources.\n\n",
        # Use cases
        "**Use cases**\n"
        "- Host Azure-consistent app networks on-prem for data locality/regulatory needs.\n"
        "- Hybrid deployments with identical network constructs across Azure and HCI.\n"
        "- Rapid rollout of segmented networks for dev/test/prod without hardware changes.\n\n",
        # Limitations
        "**Notes & limitations (high level)**\n"
        "- Physical underlay still matters (IP design, routing, bandwidth, HA).\n"
        "- Feature parity with public Azure services may vary; validate per release.\n\n",
        # References
        f"**Trusted sources:** {sources}",
    ]
    return "".join(sections)
|
| 266 |
+
|
| 267 |
+
# Registry of built-in concepts. Each entry is
# Concept(name, aliases, builder); aliases are the phrasings matched
# against a user's question.
KB_CONCEPTS: List[Concept] = [
    Concept(
        "azure sdn",
        [
            "azure sdn",
            "software defined networking azure",
            "sdn in azure",
            "azure local sdn",
            "azure stack hci sdn",
        ],
        _kb_ans_azure_sdn,
    ),
    Concept(
        "azure arc enabled sdn",
        [
            "azure arc enabled sdn",
            "azure arc-enabled sdn",
            "arc enabled sdn",
            "arc-enabled sdn",
            "arc sdn",
            "azure local arc sdn",
            "azure stack hci arc sdn",
        ],
        _kb_ans_arc_enabled_sdn,
    ),
]
|
| 293 |
+
|
| 294 |
+
def lookup_concept(query: str) -> Optional[Concept]:
    """Return the KB concept whose alias best matches *query*, or ``None``.

    Each alias is scored by coverage: the fraction of its distinct tokens
    that also occur in the query. A concept is returned only if its best
    alias reaches a coverage of at least 0.5.

    Two safeguards apply on top of the plain coverage score:

    * An alias counts only if the query shares at least one *distinctive*
      token with it. Platform and filler words ("azure", "in", "local",
      "stack", "hci") appear in the aliases as context; on their own they
      would let any question that mentions Azure hit the 0.5 threshold of a
      two-word alias such as "azure sdn".
    * When coverage ties, the alias with more matched tokens wins, so the
      more specific concept is chosen. "What is Azure Arc-enabled SDN?"
      fully covers both "azure sdn" and "azure arc-enabled sdn"; the
      three-token match must win.

    Args:
        query: The user's question.

    Returns:
        The best-matching ``Concept``, or ``None`` if nothing qualifies.
    """
    # Context words that must not trigger a match by themselves.
    generic_tokens = frozenset({"azure", "in", "local", "stack", "hci"})

    q_tokens = set(tokenize(query))
    best: Optional[Concept] = None
    best_rank = (0.0, 0)  # (coverage, number of matched tokens)

    for concept in KB_CONCEPTS:
        for alias_tokens in concept.aliases:
            alias_set = set(alias_tokens)
            if not alias_set:
                continue
            shared = q_tokens & alias_set
            if not (shared - generic_tokens):
                # Nothing in common, or only context words in common.
                continue
            rank = (len(shared) / len(alias_set), len(shared))
            # Tuple comparison: coverage first, matched-token count second.
            if rank > best_rank:
                best_rank = rank
                best = concept

    # threshold: intentional but tolerant
    return best if best_rank[0] >= 0.5 else None
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
# =========================
|
| 312 |
+
# File Parsing
|
| 313 |
# =========================
|
| 314 |
|
| 315 |
def read_pdf_bytes(b: bytes) -> str:
|
|
|
|
| 317 |
return ""
|
| 318 |
try:
|
| 319 |
reader = PyPDF2.PdfReader(io.BytesIO(b))
|
| 320 |
+
return "\n".join([page.extract_text() or "" for page in reader.pages])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
except Exception:
|
| 322 |
return ""
|
| 323 |
|
|
|
|
| 340 |
return ""
|
| 341 |
|
| 342 |
def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
|
|
|
|
| 343 |
name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
|
| 344 |
data = file_obj.get("data")
|
| 345 |
if data is None:
|
|
|
|
| 349 |
data = fh.read()
|
| 350 |
if data is None:
|
| 351 |
return {"file": name, "text": ""}
|
|
|
|
| 352 |
low = name.lower()
|
| 353 |
if low.endswith(".pdf"):
|
| 354 |
text = read_pdf_bytes(data)
|
| 355 |
elif low.endswith((".docx", ".doc")):
|
| 356 |
text = read_docx_bytes(data)
|
|
|
|
|
|
|
| 357 |
else:
|
| 358 |
text = read_text_bytes(data)
|
| 359 |
return {"file": os.path.basename(name), "text": text or ""}
|
| 360 |
|
| 361 |
|
| 362 |
# =========================
|
| 363 |
+
# Detailed Answer Composer (for RAG path)
|
| 364 |
# =========================
|
| 365 |
|
| 366 |
def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
    """Render the detailed markdown answer for the document-backed path.

    The excerpts of all snippets are joined and passed to
    ``score_text_against_checks``; the resulting overall score and gaps are
    shown ahead of the excerpts themselves.

    Args:
        query: The user's question, echoed back in the answer.
        snippets: Retrieved excerpts. Each entry is read for the keys
            ``"file"``, ``"relevance"`` (formatted to two decimals) and
            ``"excerpt"``.

    Returns:
        Markdown containing the question, a summary line, the overall
        score, one bullet per gap (or a "no major issues" bullet), one
        bullet per snippet, and links to every entry of ``TRUSTED_SOURCES``.
    """
    excerpts = [snip.get("excerpt", "") for snip in snippets]
    scores, gaps = score_text_against_checks("\n\n".join(excerpts))
    refs = list_refs([entry[0] for entry in TRUSTED_SOURCES])

    parts = [
        "### Answer (detailed)\n",
        f"**Your question:** {query}\n\n",
        "**Summary:** Migration planning must cover landing zone, connectivity, tooling, security, DR, and cost.\n\n",
        f"#### Scores\nOverall: {scores.get('overall', 0)}/5.0\n\n",
    ]

    gap_bullets = [f"- ({gap['severity']}) {gap['id']}: {gap['fix']}" for gap in gaps]
    # An empty gap list renders as a single reassuring bullet.
    gaps_md = "\n".join(gap_bullets) or "- No major issues detected."
    parts.append(f"#### Gaps & Fixes\n{gaps_md}\n\n")

    parts.append("#### Supporting Excerpts\n")
    parts.extend(
        f"- {snip['file']} (rel {snip['relevance']:.2f}): {snip['excerpt']}\n"
        for snip in snippets
    )
    parts.append(f"\n**Trusted sources:** {refs}")
    return "".join(parts)
|
| 384 |
|
| 385 |
|
| 386 |
# =========================
|
|
|
|
| 392 |
use_uploaded_docs: bool,
|
| 393 |
index_obj: Any,
|
| 394 |
_matrix_unused: Any,
|
| 395 |
+
corpus: List[Dict[str, str]]
|
| 396 |
) -> str:
|
| 397 |
q = (question or "").strip()
|
| 398 |
if not q:
|
| 399 |
return "Please enter a question."
|
| 400 |
|
| 401 |
+
# 0) Concept KB for definitional questions (e.g., "What is Azure Arc-enabled SDN?")
|
| 402 |
+
concept = lookup_concept(q)
|
| 403 |
+
if concept is not None:
|
| 404 |
+
return concept.builder(q)
|
| 405 |
+
|
| 406 |
+
# 1) Seeded FAQs → detailed plan when relevant (>=50% overlap with seed)
|
| 407 |
q_tokens = set(tokenize(q))
|
| 408 |
for item in FAQ_SEEDS:
|
| 409 |
seed_tokens = set(tokenize(item["q"]))
|
| 410 |
+
if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
|
| 411 |
+
return (
|
| 412 |
+
"### Answer (detailed)\n"
|
|
|
|
|
|
|
| 413 |
f"{item['a']}\n\n"
|
| 414 |
+
f"**Trusted sources:** {list_refs(item.get('refs', []))}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
)
|
|
|
|
| 416 |
|
| 417 |
# 2) Use uploaded docs (RAG) → detailed synthesized answer
|
| 418 |
if use_uploaded_docs and index_obj is not None and corpus:
|
|
|
|
| 426 |
snippets.append({
|
| 427 |
"file": item["file"],
|
| 428 |
"relevance": float(sim),
|
| 429 |
+
"excerpt": excerpt
|
| 430 |
})
|
| 431 |
if snippets:
|
| 432 |
return _compose_detailed_from_snippets(q, snippets)
|
| 433 |
|
| 434 |
+
# 3) Fallback (no docs) → generic, but structured overview (not migration-only)
|
| 435 |
+
refs = list_refs(["Azure Arc (overview)", "Azure Stack HCI (Azure Local)", "Azure SDN concepts (HCI)"])
|
| 436 |
+
return (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
"### Answer (detailed)\n"
|
| 438 |
+
"I couldn't match a specific concept or supporting excerpts, so here's a structured overview you can refine:\n\n"
|
| 439 |
+
"**Definition:** Describe what the service/feature is, what problems it solves, and where it runs (Azure / Azure Local).\n\n"
|
| 440 |
+
"**Key capabilities:** automation, policy-driven control, security segmentation, connectivity services.\n\n"
|
| 441 |
+
"**How it works:** control plane programs host/network functions; policies applied consistently; integrates with RBAC/governance.\n\n"
|
| 442 |
+
"**Prerequisites:** identity/RBAC, connectivity to Azure (for Arc), supported host/cluster versions.\n\n"
|
| 443 |
+
"**Use cases:** hybrid deployments, zero-trust segmentation, rapid environment provisioning.\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
f"**Trusted sources:** {refs}"
|
| 445 |
)
|
|
|
|
| 446 |
|
| 447 |
|
| 448 |
# =========================
|
| 449 |
+
# Build Index
|
| 450 |
# =========================
|
| 451 |
|
| 452 |
+
def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
    """Parse the uploaded files and build a TF-IDF index over their text.

    Args:
        files: File descriptors to hand to ``parse_file``. May be empty or
            ``None``.

    Returns:
        A 4-tuple ``(index, matrix_placeholder, corpus, status)``:
        the populated ``TinyTfidfIndex`` (``None`` if nothing was indexed),
        an always-``None`` placeholder, the parsed records that had
        non-empty text, and a human-readable status message.
    """
    if not files:
        return None, None, [], "No files uploaded yet."

    # Keep only files from which some text could be extracted.
    corpus: List[Dict[str, str]] = [
        record for record in map(parse_file, files) if record["text"]
    ]
    if not corpus:
        return None, None, [], "No text extracted."

    token_lists = [tokenize(record["text"]) for record in corpus]
    index = TinyTfidfIndex()
    index.add_documents(token_lists)

    status = f"Indexed {len(corpus)} docs, vocab {index.voc_size}."
    return index, None, corpus, status
|
|
|
|
|
|
|
| 466 |
|
| 467 |
|
| 468 |
# =========================
|
|
|
|
| 472 |
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
|
| 473 |
gr.Markdown(
|
| 474 |
"## VMware On-Prem → Azure Local Migration Assistant\n"
|
| 475 |
+
"Upload documents and ask questions. Detailed answers will be provided."
|
|
|
|
|
|
|
| 476 |
)
|
| 477 |
|
| 478 |
with gr.Row():
|
| 479 |
with gr.Column(scale=2):
|
| 480 |
+
file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
index_status = gr.Markdown("No index yet.")
|
|
|
|
|
|
|
| 482 |
st_index = gr.State()
|
| 483 |
+
st_matrix = gr.State()
|
| 484 |
st_corpus = gr.State()
|
|
|
|
| 485 |
build_btn = gr.Button("Build Index", variant="primary")
|
| 486 |
+
|
| 487 |
with gr.Column(scale=3):
|
| 488 |
+
question = gr.Textbox(label="Ask a question", placeholder="e.g., What is Azure Arc-enabled SDN, and why would I use it?")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
|
| 490 |
ask_btn = gr.Button("Ask", variant="primary")
|
| 491 |
answer_box = gr.Markdown("")
|
| 492 |
|
| 493 |
+
# Convert gr.Files (paths) to expected dicts
|
| 494 |
+
def _collect_files(paths: List[str]):
|
| 495 |
out = []
|
| 496 |
for p in paths or []:
|
| 497 |
try:
|
|
|
|
| 504 |
|
| 505 |
def _build(files_paths: List[str]):
    """Click handler for "Build Index": index the uploaded files.

    Args:
        files_paths: File paths supplied by the upload component.

    Returns:
        ``(status, index, matrix, corpus)``, in the order of the click
        handler's outputs ``[index_status, st_index, st_matrix, st_corpus]``.
    """
    files = _collect_files(files_paths)
    # build_index returns (index, matrix, corpus, status). Passing that tuple
    # through unchanged would put the index object into the status label and
    # the status string into the corpus state, so reorder it for the outputs.
    idx, mat, corpus, status = build_index(files)
    return status, idx, mat, corpus
|
|
|
|
| 508 |
|
| 509 |
build_btn.click(
|
| 510 |
_build,
|
| 511 |
inputs=[file_in],
|
| 512 |
+
outputs=[index_status, st_index, st_matrix, st_corpus]
|
| 513 |
)
|
| 514 |
|
| 515 |
ask_btn.click(
|
| 516 |
answer_faq_or_approach_detailed,
|
| 517 |
inputs=[question, use_docs, st_index, st_matrix, st_corpus],
|
| 518 |
+
outputs=[answer_box]
|
| 519 |
)
|
| 520 |
|
| 521 |
if __name__ == "__main__":
|