Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
VMware On-Prem → Azure Local Migration Assistant (Gradio)
|
| 6 |
- Upload design/migration docs (PDF/DOCX/TXT/MD).
|
| 7 |
- Ask questions; get reliable, detailed, and relevant answers.
|
| 8 |
-
- Intent-aware (definitions
|
| 9 |
- No external APIs. No scikit-learn.
|
| 10 |
|
| 11 |
Run locally:
|
|
@@ -18,7 +18,7 @@ import io
|
|
| 18 |
import re
|
| 19 |
import math
|
| 20 |
from typing import List, Tuple, Dict, Any
|
| 21 |
-
from collections import Counter
|
| 22 |
|
| 23 |
import gradio as gr
|
| 24 |
|
|
@@ -44,7 +44,7 @@ TRUSTED_SOURCES: List[Tuple[str, str]] = [
|
|
| 44 |
# Core guidance
|
| 45 |
("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
|
| 46 |
("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
|
| 47 |
-
# Networking
|
| 48 |
("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
|
| 49 |
("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
|
| 50 |
("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
|
|
@@ -110,7 +110,7 @@ def list_refs(ref_names: List[str]) -> str:
|
|
| 110 |
|
| 111 |
|
| 112 |
# =========================
|
| 113 |
-
# Intent &
|
| 114 |
# =========================
|
| 115 |
|
| 116 |
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
|
|
@@ -149,7 +149,7 @@ def topic_refs(topic: str) -> List[str]:
|
|
| 149 |
|
| 150 |
|
| 151 |
# =========================
|
| 152 |
-
# Tiny TF-IDF Index
|
| 153 |
# =========================
|
| 154 |
|
| 155 |
class TinyTfidfIndex:
|
|
@@ -208,44 +208,7 @@ class TinyTfidfIndex:
|
|
| 208 |
|
| 209 |
|
| 210 |
# =========================
|
| 211 |
-
#
|
| 212 |
-
# =========================
|
| 213 |
-
|
| 214 |
-
# Readiness checks consumed by score_text_against_checks().
# Each entry carries:
#   id       - short identifier copied into a reported gap
#   desc     - one-line description of the check
#   fix      - remediation hint copied into a reported gap
#   keywords - tokens looked up in the tokenized text; one hit marks the check as matched
#   pillar   - score bucket; "security" and "reliability" gaps are reported with severity "high"
CHECKS = [
    {"id": "landing_zone", "desc": "Landing zone defined.", "fix": "Use CAF blueprints.", "keywords": ["landing", "hub", "spoke", "policy", "rbac"], "pillar": "governance"},
    {"id": "connectivity", "desc": "Connectivity planned.", "fix": "Verify ER/VPN, DNS, MTU.", "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx"], "pillar": "networking"},
    {"id": "migrate_tooling","desc": "Tooling chosen.", "fix": "Run Azure Migrate discovery.", "keywords": ["migrate", "discovery", "assessment", "hcx"], "pillar": "operations"},
    {"id": "security", "desc": "Security configured.", "fix": "Enable Key Vault, Defender, Sentinel, MFA.", "keywords": ["vault", "defender", "sentinel", "mfa", "identity"], "pillar": "security"},
    {"id": "dr_backup", "desc": "Backups/DR defined.", "fix": "Set RTO/RPO; test ASR.", "keywords": ["backup", "rto", "rpo", "dr", "asr"], "pillar": "reliability"},
    {"id": "cost", "desc": "Cost optimization.", "fix": "Use reservations, rightsizing, tags.", "keywords": ["cost", "reservation", "savings", "tag"], "pillar": "cost"},
]
|
| 222 |
-
|
| 223 |
-
def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
    """
    Score `text` against every entry in CHECKS.

    A check counts as matched when at least one of its keywords appears among
    the tokens of `text`.

    Returns:
        (scores, gaps) where
        - scores["overall"] is 5 x (matched checks / total checks), rounded to 2 places;
          each pillar with at least one matched check maps to its match count.
        - gaps holds one dict per unmatched check ("id", "desc", "fix", "severity");
          severity is "high" for the security and reliability pillars, "medium" otherwise.
    """
    vocabulary = set(tokenize(text))
    scores = defaultdict(float)
    gaps = []

    for check in CHECKS:
        pillar = check["pillar"]
        if vocabulary.isdisjoint(check["keywords"]):
            # No keyword present: report the check as a gap.
            severity = "high" if pillar in ("security", "reliability") else "medium"
            gaps.append({
                "id": check["id"],
                "desc": check["desc"],
                "fix": check["fix"],
                "severity": severity,
            })
            continue
        scores["overall"] += 1.0
        scores[pillar] += 1.0

    # Rescale the raw match count onto a 0-5 range.
    total_checks = float(len(CHECKS))
    scores["overall"] = round(5.0 * (scores["overall"] / total_checks), 2)

    # Snapshot the keys first so the mapping is not mutated while being iterated.
    for name in [key for key in scores if key != "overall"]:
        scores[name] = round(scores[name], 2)
    return scores, gaps
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
# =========================
|
| 248 |
-
# File parsing
|
| 249 |
# =========================
|
| 250 |
|
| 251 |
def read_pdf_bytes(b: bytes) -> str:
|
|
@@ -296,214 +259,163 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
|
|
| 296 |
|
| 297 |
|
| 298 |
# =========================
|
| 299 |
-
#
|
| 300 |
# =========================
|
| 301 |
|
|
|
|
|
|
|
| 302 |
def _extract_subject_from_question(q: str) -> str:
|
| 303 |
-
""
|
| 304 |
-
Pulls the likely subject (e.g., 'Azure SDN') from 'what is/define/explain ...' questions.
|
| 305 |
-
Simple heuristic: remove leading interrogatives and trailing punctuation.
|
| 306 |
-
"""
|
| 307 |
-
s = re.sub(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", "", q, flags=re.I).strip()
|
| 308 |
s = re.sub(r"[?.!]+$", "", s).strip()
|
| 309 |
-
# Trim leading 'an', 'a', 'the'
|
| 310 |
s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
|
| 311 |
-
|
| 312 |
-
return " ".join(w.capitalize() if w.isalpha() else w for w in s.split()) or "the topic"
|
| 313 |
-
|
| 314 |
-
def _extract_key_points(text: str, max_points: int = 6) -> List[str]:
|
| 315 |
-
parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
|
| 316 |
-
points = []
|
| 317 |
-
for p in parts:
|
| 318 |
-
p = p.strip()
|
| 319 |
-
if 40 <= len(p) <= 300 and p not in points:
|
| 320 |
-
points.append(p)
|
| 321 |
-
if len(points) >= max_points:
|
| 322 |
-
break
|
| 323 |
-
return points
|
| 324 |
-
|
| 325 |
-
def _topic_steps(topic: str) -> List[str]:
|
| 326 |
-
if topic == "sdn":
|
| 327 |
-
return [
|
| 328 |
-
"Define VNets/subnets and segmentation policy.",
|
| 329 |
-
"Automate configuration (ARM/Bicep/Terraform/GitOps).",
|
| 330 |
-
"Harden east–west flows with policy-based filtering.",
|
| 331 |
-
"Plan ingress/egress with load balancers and gateways.",
|
| 332 |
-
"Integrate with RBAC, logging, and change control.",
|
| 333 |
-
]
|
| 334 |
-
if topic == "migration":
|
| 335 |
-
return [
|
| 336 |
-
"Establish governed landing zone (Policy, RBAC, logging).",
|
| 337 |
-
"Connect networks (ExpressRoute/VPN), validate DNS/MTU.",
|
| 338 |
-
"Discover/assess with Azure Migrate; classify apps.",
|
| 339 |
-
"Pilot 2–3 VMs; choose HCX or Azure Migrate cutover.",
|
| 340 |
-
"Migrate in waves; document rollback and success criteria.",
|
| 341 |
-
]
|
| 342 |
-
if topic == "dr":
|
| 343 |
-
return [
|
| 344 |
-
"Define business RTO/RPO per workload.",
|
| 345 |
-
"Enable ASR where applicable; set up replication.",
|
| 346 |
-
"Run planned/unplanned failover drills; validate runbooks.",
|
| 347 |
-
"Harden backups (immutability, soft-delete).",
|
| 348 |
-
"Document recovery steps and responsibilities.",
|
| 349 |
-
]
|
| 350 |
-
if topic == "security":
|
| 351 |
-
return [
|
| 352 |
-
"Centralize secrets in Key Vault; enable RBAC/PIM/MFA.",
|
| 353 |
-
"Enable Defender for Cloud and configure policies.",
|
| 354 |
-
"Collect/monitor logs; set alerts and playbooks.",
|
| 355 |
-
"Segment networks; restrict lateral movement.",
|
| 356 |
-
"Review identity hygiene and conditional access.",
|
| 357 |
-
]
|
| 358 |
-
if topic == "cost":
|
| 359 |
-
return [
|
| 360 |
-
"Right-size compute/storage based on metrics.",
|
| 361 |
-
"Use reservations or Savings Plans where stable.",
|
| 362 |
-
"Automate tagging for showback/chargeback.",
|
| 363 |
-
"Schedule shutdowns for non-prod.",
|
| 364 |
-
"Monitor cost anomalies and budgets.",
|
| 365 |
-
]
|
| 366 |
-
return [
|
| 367 |
-
"Clarify objective, constraints, and success criteria.",
|
| 368 |
-
"Assess current state and dependencies.",
|
| 369 |
-
"Choose an MVP approach; pilot and iterate.",
|
| 370 |
-
"Define rollout plan, rollback, and verification.",
|
| 371 |
-
"Measure results and continuously improve.",
|
| 372 |
-
]
|
| 373 |
|
| 374 |
-
def
|
| 375 |
"""
|
| 376 |
-
|
| 377 |
-
|
| 378 |
"""
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
"- Programmatic virtual networking (VNets, subnets, routing).",
|
| 395 |
-
"- Microsegmentation and traffic filtering for east–west security.",
|
| 396 |
-
"- Software load balancing and gateway services for connectivity.",
|
| 397 |
-
"- Hybrid consistency across Azure and Azure Local (Azure Stack HCI).",
|
| 398 |
-
]
|
| 399 |
-
elif topic == "migration":
|
| 400 |
-
md += [
|
| 401 |
-
"- Discovery and assessment of on-prem workloads.",
|
| 402 |
-
"- Replication, cutover orchestration (e.g., HCX or Azure Migrate).",
|
| 403 |
-
"- Wave-based moves with rollback and validation.",
|
| 404 |
-
"- Governance hooks for tagging, RBAC, policy.",
|
| 405 |
]
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
"
|
| 409 |
-
"
|
| 410 |
-
"- Testing and non-disruptive drills.",
|
| 411 |
-
"- Integration with backup immutability and soft-delete.",
|
| 412 |
]
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
"-
|
| 416 |
-
"
|
| 417 |
-
"- Detection and response (alerts, analytics, playbooks).",
|
| 418 |
-
"- Compliance reporting and governance integration.",
|
| 419 |
]
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
"-
|
| 423 |
-
"
|
| 424 |
-
"- Rightsizing and purchase optimizations (Reservations/Savings Plans).",
|
| 425 |
-
"- Tagging for showback/chargeback and accountability.",
|
| 426 |
-
]
|
| 427 |
-
else:
|
| 428 |
-
md += [
|
| 429 |
-
"- Policy-driven management and automation.",
|
| 430 |
-
"- Consistent APIs/CLI/portal and GitOps-friendly workflows.",
|
| 431 |
-
"- Observability (logs/metrics) and compliance integration.",
|
| 432 |
]
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
"
|
| 440 |
-
""
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
"-
|
| 444 |
-
"
|
| 445 |
-
"",
|
| 446 |
-
f"**Trusted sources:** {refs}",
|
| 447 |
]
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
scores, gaps = score_text_against_checks(combined)
|
| 453 |
-
points = _extract_key_points(combined, max_points=6)
|
| 454 |
-
refs = list_refs(topic_refs(topic))
|
| 455 |
-
|
| 456 |
-
md = [
|
| 457 |
-
"### Answer (detailed)",
|
| 458 |
-
f"**Your question:** {query}",
|
| 459 |
-
"",
|
| 460 |
-
"**Executive summary:**",
|
| 461 |
]
|
| 462 |
-
|
| 463 |
-
for
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
md += ["", f"**Trusted sources:** {refs}"]
|
| 477 |
return "\n".join(md)
|
| 478 |
|
| 479 |
-
def _compose_topic_fallback(query: str, topic: str, intent: str) -> str:
|
| 480 |
-
# Use a topic-relevant fallback, with more detail than a plain template.
|
| 481 |
-
if intent == "define":
|
| 482 |
-
subject = _extract_subject_from_question(query)
|
| 483 |
-
return _compose_definition(subject, topic)
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
refs = list_refs(topic_refs(topic))
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
"
|
| 489 |
-
|
| 490 |
-
"
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
"
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
return "\n".join(md)
|
| 508 |
|
| 509 |
|
|
@@ -511,13 +423,7 @@ def _compose_topic_fallback(query: str, topic: str, intent: str) -> str:
|
|
| 511 |
# Main Answer Function
|
| 512 |
# =========================
|
| 513 |
|
| 514 |
-
def answer_faq_or_approach_detailed(
|
| 515 |
-
question: str,
|
| 516 |
-
use_uploaded_docs: bool,
|
| 517 |
-
index_obj: Any,
|
| 518 |
-
_matrix_unused: Any,
|
| 519 |
-
corpus: List[Dict[str, str]]
|
| 520 |
-
) -> str:
|
| 521 |
q = (question or "").strip()
|
| 522 |
if not q:
|
| 523 |
return "Please enter a question."
|
|
@@ -525,20 +431,24 @@ def answer_faq_or_approach_detailed(
|
|
| 525 |
intent = detect_intent(q)
|
| 526 |
topic = detect_topic(q)
|
| 527 |
|
| 528 |
-
#
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
if not ({"migrate", "migration", "hcx", "avs"} & q_tokens):
|
| 533 |
-
continue
|
| 534 |
-
if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
|
| 535 |
-
return (
|
| 536 |
-
"### Answer (detailed)\n"
|
| 537 |
-
f"{item['a']}\n\n"
|
| 538 |
-
f"**Trusted sources:** {list_refs(item.get('refs', []))}"
|
| 539 |
-
)
|
| 540 |
|
| 541 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
if use_uploaded_docs and index_obj is not None and corpus:
|
| 543 |
top = index_obj.query(q, k=6)
|
| 544 |
snippets = []
|
|
@@ -547,35 +457,24 @@ def answer_faq_or_approach_detailed(
|
|
| 547 |
excerpt = (item["text"] or "").strip()
|
| 548 |
if len(excerpt) > 700:
|
| 549 |
excerpt = excerpt[:700] + "..."
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
"relevance": float(sim),
|
| 553 |
-
"excerpt": excerpt
|
| 554 |
-
})
|
| 555 |
if snippets:
|
| 556 |
-
return
|
| 557 |
|
| 558 |
-
#
|
| 559 |
-
if intent
|
| 560 |
-
|
| 561 |
-
return _compose_definition(subject, topic)
|
| 562 |
-
|
| 563 |
-
# 4) Topic-aware fallback for other intents
|
| 564 |
-
return _compose_topic_fallback(q, topic, intent)
|
| 565 |
|
| 566 |
|
| 567 |
# =========================
|
| 568 |
-
#
|
| 569 |
# =========================
|
| 570 |
|
| 571 |
def build_index(files: List[Dict[str, Any]]):
|
| 572 |
if not files:
|
| 573 |
return None, None, [], "No files uploaded yet."
|
| 574 |
-
corpus
|
| 575 |
-
for f in files:
|
| 576 |
-
rec = parse_file(f)
|
| 577 |
-
if rec["text"]:
|
| 578 |
-
corpus.append(rec)
|
| 579 |
if not corpus:
|
| 580 |
return None, None, [], "No text extracted."
|
| 581 |
tokenized = [tokenize(c["text"]) for c in corpus]
|
|
@@ -595,16 +494,12 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
|
|
| 595 |
"- Click **Build Index**\n"
|
| 596 |
"- Ask a question. Answers are **detailed** and **topic-relevant**\n"
|
| 597 |
)
|
| 598 |
-
|
| 599 |
with gr.Row():
|
| 600 |
with gr.Column(scale=2):
|
| 601 |
file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
|
| 602 |
index_status = gr.Markdown("No index yet.")
|
| 603 |
-
st_index = gr.State()
|
| 604 |
-
st_matrix = gr.State()
|
| 605 |
-
st_corpus = gr.State()
|
| 606 |
build_btn = gr.Button("Build Index", variant="primary")
|
| 607 |
-
|
| 608 |
with gr.Column(scale=3):
|
| 609 |
question = gr.Textbox(
|
| 610 |
label="Ask a question",
|
|
@@ -614,7 +509,6 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
|
|
| 614 |
ask_btn = gr.Button("Ask", variant="primary")
|
| 615 |
answer_box = gr.Markdown("")
|
| 616 |
|
| 617 |
-
# Convert gr.Files (paths) to expected dicts
|
| 618 |
def _collect_files(paths: List[str]):
|
| 619 |
out = []
|
| 620 |
for p in paths or []:
|
|
@@ -630,11 +524,7 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
|
|
| 630 |
files = _collect_files(files_paths)
|
| 631 |
return build_index(files)
|
| 632 |
|
| 633 |
-
build_btn.click(
|
| 634 |
-
_build,
|
| 635 |
-
inputs=[file_in],
|
| 636 |
-
outputs=[index_status, st_index, st_matrix, st_corpus]
|
| 637 |
-
)
|
| 638 |
|
| 639 |
ask_btn.click(
|
| 640 |
answer_faq_or_approach_detailed,
|
|
|
|
| 5 |
VMware On-Prem → Azure Local Migration Assistant (Gradio)
|
| 6 |
- Upload design/migration docs (PDF/DOCX/TXT/MD).
|
| 7 |
- Ask questions; get reliable, detailed, and relevant answers.
|
| 8 |
+
- Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details.
|
| 9 |
- No external APIs. No scikit-learn.
|
| 10 |
|
| 11 |
Run locally:
|
|
|
|
| 18 |
import re
|
| 19 |
import math
|
| 20 |
from typing import List, Tuple, Dict, Any
|
| 21 |
+
from collections import Counter
|
| 22 |
|
| 23 |
import gradio as gr
|
| 24 |
|
|
|
|
| 44 |
# Core guidance
|
| 45 |
("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
|
| 46 |
("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
|
| 47 |
+
# Networking / SDN (used when question is about SDN)
|
| 48 |
("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
|
| 49 |
("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
|
| 50 |
("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
|
|
|
|
| 110 |
|
| 111 |
|
| 112 |
# =========================
|
| 113 |
+
# Intent & topic detection
|
| 114 |
# =========================
|
| 115 |
|
| 116 |
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
# =========================
|
| 152 |
+
# Tiny TF-IDF Index
|
| 153 |
# =========================
|
| 154 |
|
| 155 |
class TinyTfidfIndex:
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
# =========================
|
| 211 |
+
# File Parsing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
# =========================
|
| 213 |
|
| 214 |
def read_pdf_bytes(b: bytes) -> str:
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
# =========================
|
| 262 |
+
# Strong definition composer (for “what is …”)
|
| 263 |
# =========================
|
| 264 |
|
| 265 |
+
_DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", re.I)
|
| 266 |
+
|
| 267 |
def _extract_subject_from_question(q: str) -> str:
|
| 268 |
+
s = _DEF_RE_LEAD.sub("", q).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
s = re.sub(r"[?.!]+$", "", s).strip()
|
|
|
|
| 270 |
s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
|
| 271 |
+
return s if s else "the topic"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """
    Assemble the parts of a definition-style answer for `subject`.

    Returns a 6-tuple:
        (definition, capabilities, how, best_practices, use_cases, refs_list)

    SDN-specific content is used when `topic` is "sdn" or the subject text
    contains "sdn" (case-insensitive). Every other subject gets a generic
    scaffold whose definition sentence is built around the subject name.
    """
    wants_sdn = topic == "sdn" or "sdn" in subject.lower()

    if not wants_sdn:
        # Generic scaffold for any non-SDN subject.
        name = subject.strip()
        generic_definition = (
            f"{name} is a service/technology that centralizes control through software and policy so teams can "
            "create, operate, and secure resources consistently across environments."
        )
        generic_capabilities = [
            "Automation and policy-driven configuration to reduce manual effort and errors.",
            "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
            "Observability hooks (logs/metrics) for reliability and performance tuning.",
        ]
        generic_how = [
            "A control plane captures intent (configuration/policies) and applies it to managed resources.",
            "Providers/agents on the platform translate intent into changes at runtime.",
            "Feedback loops via telemetry inform continuous improvement.",
        ]
        generic_best = [
            "Adopt Infrastructure-as-Code and peer reviews for change control.",
            "Define tagging, RBAC roles, and policy baselines early.",
            "Pilot in a non-prod environment before broad rollout.",
        ]
        generic_uses = [
            "Faster, repeatable environment provisioning.",
            "Improved security posture through standardized controls.",
            "Hybrid scenarios requiring consistent management across sites.",
        ]
        # NOTE(review): references come from re-detecting the topic on the subject
        # text; the `topic` argument is not consulted on this path - confirm intent.
        return (
            generic_definition,
            generic_capabilities,
            generic_how,
            generic_best,
            generic_uses,
            topic_refs(detect_topic(name)),
        )

    # SDN-specific content; the subject is interpolated exactly as passed in.
    sdn_definition = (
        f"{subject} is Microsoft's implementation of software-defined networking: "
        "a model that shifts network control into software so you can centrally design, automate, "
        "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
        "By separating the control plane from underlying hardware, it enables programmability and "
        "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
        "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
    )
    sdn_capabilities = [
        "Programmatic creation of VNets, subnets, routing, and address spaces.",
        "Micro-segmentation and policy enforcement for east–west traffic.",
        "Software load balancing and gateway services for app connectivity.",
        "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
    ]
    sdn_how = [
        "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
        "Agents/controllers translate intent into concrete configuration on each host.",
        "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
    ]
    sdn_best = [
        "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
        "Apply least-privilege and RBAC; review segmentation policies regularly.",
        "Integrate with logging/monitoring; alert on drift and policy violations.",
    ]
    sdn_uses = [
        "Rapidly provisioning isolated app environments and tiers.",
        "Zero-trust segmentation between workloads and environments.",
        "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
    ]
    return sdn_definition, sdn_capabilities, sdn_how, sdn_best, sdn_uses, topic_refs("sdn")
|
| 340 |
+
|
| 341 |
+
def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """
    Render a definition-style answer for `subject` as a Markdown string.

    Layout: a heading, the user's question, the definition, then four bulleted
    sections (key capabilities, how it works, best practices, common use cases)
    and a closing "Trusted sources" line. Content comes from
    _definition_for_subject(); the source list is formatted by list_refs().
    """
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    sources = list_refs(refs_list)

    lines = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
    ]

    sections = (
        ("**Key capabilities:**", capabilities),
        ("**How it works:**", how),
        ("**Best practices:**", best),
        ("**Common use cases:**", uses),
    )
    for heading, bullets in sections:
        # A blank line separates each section from what precedes it.
        lines.append("")
        lines.append(heading)
        lines.extend(f"- {item}" for item in bullets)

    lines.extend(["", f"**Trusted sources:** {sources}"])
    return "\n".join(lines)
|
| 357 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
+
# =========================
|
| 360 |
+
# RAG: build a detailed answer from uploaded docs
|
| 361 |
+
# =========================
|
| 362 |
+
|
| 363 |
+
def _extract_points(text: str, max_points: int = 6) -> List[str]:
|
| 364 |
+
parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
|
| 365 |
+
pts = []
|
| 366 |
+
for p in parts:
|
| 367 |
+
p = p.strip()
|
| 368 |
+
if 40 <= len(p) <= 280 and p not in pts:
|
| 369 |
+
pts.append(p)
|
| 370 |
+
if len(pts) >= max_points:
|
| 371 |
+
break
|
| 372 |
+
return pts
|
| 373 |
+
|
| 374 |
+
def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """
    Build a Markdown answer from retrieved document snippets.

    The snippets are joined with spaces and reduced to at most six summary
    sentences via _extract_points(); a placeholder bullet is used when none
    qualify. A short checklist chosen by `topic` follows, then the
    "Trusted sources" line produced by list_refs(topic_refs(topic)).
    """
    summary_points = _extract_points(" ".join(snippets), max_points=6)
    sources = list_refs(topic_refs(topic))

    # Used both for the "general" topic and for any topic not listed below.
    generic_steps = [
        "Clarify objectives and constraints.",
        "Pilot changes; define rollback and verification.",
    ]
    steps_by_topic = {
        "sdn": [
            "Define VNets/subnets and segmentation policy.",
            "Automate with IaC (Bicep/Terraform) and GitOps.",
            "Harden east–west traffic with micro-segmentation.",
            "Plan ingress/egress with LBs and gateways.",
        ],
        "migration": [
            "Establish landing zone (Policy, RBAC, logging).",
            "Connect networks (ER/VPN), validate DNS/MTU.",
            "Discover/assess with Azure Migrate; pilot a few VMs.",
            "Choose HCX or Azure Migrate for cutover; migrate in waves.",
        ],
        "dr": [
            "Define RTO/RPO; choose replication targets.",
            "Run planned/unplanned failover drills.",
            "Ensure immutable backups and soft-delete.",
        ],
        "security": [
            "Enable RBAC/PIM/MFA and Key Vault.",
            "Turn on Defender for Cloud; set policies and alerts.",
            "Collect logs; restrict lateral movement.",
        ],
        "cost": [
            "Right-size; use Reservations/Savings Plans.",
            "Tag resources; set budgets/alerts.",
            "Automate non-prod shutdowns.",
        ],
        "general": generic_steps,
    }
    steps = steps_by_topic.get(topic, generic_steps)

    # Fall back to a single placeholder bullet when no sentence qualified.
    bullets = summary_points or ["Here are key considerations synthesized from your documents."]

    lines = ["### Answer (detailed)", f"**Your question:** {query}", "", "**Executive summary:**"]
    lines.extend(f"- {point}" for point in bullets)
    lines.append("")
    lines.append("**Recommended steps:**")
    lines.extend(f"- {step}" for step in steps)
    lines.append("")
    lines.append(f"**Trusted sources:** {sources}")
    return "\n".join(lines)
|
| 420 |
|
| 421 |
|
|
|
|
| 423 |
# Main Answer Function
|
| 424 |
# =========================
|
| 425 |
|
| 426 |
+
def answer_faq_or_approach_detailed(question: str, use_uploaded_docs: bool, index_obj: Any, _matrix_unused: Any, corpus: List[Dict[str,str]]) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
q = (question or "").strip()
|
| 428 |
if not q:
|
| 429 |
return "Please enter a question."
|
|
|
|
| 431 |
intent = detect_intent(q)
|
| 432 |
topic = detect_topic(q)
|
| 433 |
|
| 434 |
+
# A) Definitions: build a strong, subject-specific definition (e.g., "What is Azure SDN?")
|
| 435 |
+
if intent == "define":
|
| 436 |
+
subject = _extract_subject_from_question(q)
|
| 437 |
+
return _compose_definition_markdown(q, subject, topic)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
+
# B) Migration FAQs (only if the question is migration-like to avoid hijacking)
|
| 440 |
+
q_tokens = set(tokenize(q))
|
| 441 |
+
if {"migrate", "migration", "hcx", "avs"} & q_tokens:
|
| 442 |
+
for item in FAQ_SEEDS:
|
| 443 |
+
seed_tokens = set(tokenize(item["q"]))
|
| 444 |
+
if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
|
| 445 |
+
return (
|
| 446 |
+
"### Answer (detailed)\n"
|
| 447 |
+
f"{item['a']}\n\n"
|
| 448 |
+
f"**Trusted sources:** {list_refs(item.get('refs', []))}"
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
# C) RAG over uploaded docs → detailed synthesized answer
|
| 452 |
if use_uploaded_docs and index_obj is not None and corpus:
|
| 453 |
top = index_obj.query(q, k=6)
|
| 454 |
snippets = []
|
|
|
|
| 457 |
excerpt = (item["text"] or "").strip()
|
| 458 |
if len(excerpt) > 700:
|
| 459 |
excerpt = excerpt[:700] + "..."
|
| 460 |
+
if excerpt:
|
| 461 |
+
snippets.append(excerpt)
|
|
|
|
|
|
|
|
|
|
| 462 |
if snippets:
|
| 463 |
+
return _compose_rag_answer(q, snippets, topic)
|
| 464 |
|
| 465 |
+
# D) Topic-aware fallback (short but relevant)
|
| 466 |
+
subject = _extract_subject_from_question(q) if intent in {"how", "plan", "compare"} else q
|
| 467 |
+
return _compose_definition_markdown(q, subject, topic)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
|
| 470 |
# =========================
|
| 471 |
+
# Index Builder
|
| 472 |
# =========================
|
| 473 |
|
| 474 |
def build_index(files: List[Dict[str, Any]]):
|
| 475 |
if not files:
|
| 476 |
return None, None, [], "No files uploaded yet."
|
| 477 |
+
corpus = [parse_file(f) for f in files if parse_file(f)["text"]]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
if not corpus:
|
| 479 |
return None, None, [], "No text extracted."
|
| 480 |
tokenized = [tokenize(c["text"]) for c in corpus]
|
|
|
|
| 494 |
"- Click **Build Index**\n"
|
| 495 |
"- Ask a question. Answers are **detailed** and **topic-relevant**\n"
|
| 496 |
)
|
|
|
|
| 497 |
with gr.Row():
|
| 498 |
with gr.Column(scale=2):
|
| 499 |
file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
|
| 500 |
index_status = gr.Markdown("No index yet.")
|
| 501 |
+
st_index = gr.State(); st_matrix = gr.State(); st_corpus = gr.State()
|
|
|
|
|
|
|
| 502 |
build_btn = gr.Button("Build Index", variant="primary")
|
|
|
|
| 503 |
with gr.Column(scale=3):
|
| 504 |
question = gr.Textbox(
|
| 505 |
label="Ask a question",
|
|
|
|
| 509 |
ask_btn = gr.Button("Ask", variant="primary")
|
| 510 |
answer_box = gr.Markdown("")
|
| 511 |
|
|
|
|
| 512 |
def _collect_files(paths: List[str]):
|
| 513 |
out = []
|
| 514 |
for p in paths or []:
|
|
|
|
| 524 |
files = _collect_files(files_paths)
|
| 525 |
return build_index(files)
|
| 526 |
|
| 527 |
+
build_btn.click(_build, inputs=[file_in], outputs=[index_status, st_index, st_matrix, st_corpus])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
|
| 529 |
ask_btn.click(
|
| 530 |
answer_faq_or_approach_detailed,
|