ajayinsac committed on
Commit
6660a36
·
verified ·
1 Parent(s): e5a16a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -61
app.py CHANGED
@@ -3,12 +3,10 @@
3
 
4
  """
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
- - No external API calls. No scikit-learn.
7
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
8
- - Ask questions; get RELIABLE, DETAILED, and RELEVANT answers:
9
- RAG on uploaded docs (excerpts + topic-tailored structure)
10
- Seeded FAQs (for migration flows)
11
- • Topic-aware fallbacks (no more SDN leakage into unrelated topics)
12
 
13
  Run locally:
14
  pip install gradio PyPDF2 python-docx
@@ -28,7 +26,7 @@ import gradio as gr
28
  # Optional parsers (graceful fallback)
29
  # -------------------------
30
  try:
31
- import PyPDF2 # often present on Spaces
32
  except Exception:
33
  PyPDF2 = None
34
 
@@ -43,14 +41,14 @@ except Exception:
43
  # =========================
44
 
45
  TRUSTED_SOURCES: List[Tuple[str, str]] = [
46
- # Core Azure landing/ops
47
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
48
  ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
49
- # Networking / SDN (used ONLY when topic == 'sdn')
 
50
  ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
51
  ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
52
  ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
53
- ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
54
  # Migration
55
  ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
56
  ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
@@ -112,29 +110,33 @@ def list_refs(ref_names: List[str]) -> str:
112
 
113
 
114
  # =========================
115
- # Topic detection (keeps answers relevant)
116
  # =========================
117
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def detect_topic(q: str) -> str:
119
- """
120
- Returns one of: 'sdn', 'migration', 'dr', 'security', 'cost', 'general'
121
- """
122
  toks = set(tokenize(q))
123
- if "sdn" in toks or "software-defined" in toks or "softwaredefined" in toks:
124
- return "sdn"
125
- if {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"} & toks:
126
- return "migration"
127
- if {"dr", "disaster", "asr", "rto", "rpo", "failover"} & toks:
128
- return "dr"
129
- if {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"} & toks:
130
- return "security"
131
- if {"cost", "reservation", "savings", "rightsizing", "tagging"} & toks:
132
- return "cost"
133
  return "general"
134
 
135
  def topic_refs(topic: str) -> List[str]:
136
  if topic == "sdn":
137
- return ["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)", "Azure Virtual Network"]
138
  if topic == "migration":
139
  return ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs", "Cloud Adoption Framework (CAF)"]
140
  if topic == "dr":
@@ -206,7 +208,7 @@ class TinyTfidfIndex:
206
 
207
 
208
  # =========================
209
- # Rubric for tailoring RAG output
210
  # =========================
211
 
212
  CHECKS = [
@@ -243,7 +245,7 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
243
 
244
 
245
  # =========================
246
- # File Parsing
247
  # =========================
248
 
249
  def read_pdf_bytes(b: bytes) -> str:
@@ -297,13 +299,24 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
297
  # Helpers for composing detailed answers
298
  # =========================
299
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  def _extract_key_points(text: str, max_points: int = 6) -> List[str]:
301
- # naive sentence splitter
302
- parts = re.split(r"(?<=[.!?])\s+", text.strip())
303
  points = []
304
  for p in parts:
305
  p = p.strip()
306
- if 30 <= len(p) <= 300 and p not in points:
307
  points.append(p)
308
  if len(points) >= max_points:
309
  break
@@ -353,10 +366,86 @@ def _topic_steps(topic: str) -> List[str]:
353
  return [
354
  "Clarify objective, constraints, and success criteria.",
355
  "Assess current state and dependencies.",
356
- "Choose the minimal viable approach first; pilot.",
357
  "Define rollout plan, rollback, and verification.",
358
- "Measure results and iterate.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  ]
 
360
 
361
  def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]], topic: str) -> str:
362
  combined = "\n\n".join([s.get("excerpt", "") for s in snippets])
@@ -376,28 +465,23 @@ def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]],
376
  else:
377
  md.append("- Based on your documents, here is a structured plan and key considerations.")
378
 
379
- md += [
380
- "",
381
- "#### Recommended steps",
382
- ]
383
  for step in _topic_steps(topic):
384
  md.append(f"- {step}")
385
 
386
- md += [
387
- "",
388
- "#### Supporting excerpts",
389
- ]
390
  for s in snippets:
391
  md.append(f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}")
392
 
393
- md += [
394
- "",
395
- f"**Trusted sources:** {refs}"
396
- ]
397
-
398
  return "\n".join(md)
399
 
400
- def _compose_topic_fallback(query: str, topic: str) -> str:
 
 
 
 
 
401
  refs = list_refs(topic_refs(topic))
402
  headline = {
403
  "sdn": "Azure SDN — Overview",
@@ -412,20 +496,11 @@ def _compose_topic_fallback(query: str, topic: str) -> str:
412
  f"### {headline}",
413
  f"**Your question:** {query}",
414
  "",
415
- "**Definition/Context:**",
416
- "- What it is, the problem it solves, and where it runs (Azure / Azure Local).",
417
- "",
418
- "**Key capabilities / success factors:**",
419
  ]
420
  for step in _topic_steps(topic):
421
  md.append(f"- {step}")
422
-
423
  md += [
424
- "",
425
- "**Notes & caveats:**",
426
- "- Validate limits and prerequisites for your environment.",
427
- "- Align with governance and security baselines.",
428
- "- Pilot before broad rollout.",
429
  "",
430
  f"**Trusted sources:** {refs}",
431
  ]
@@ -433,7 +508,7 @@ def _compose_topic_fallback(query: str, topic: str) -> str:
433
 
434
 
435
  # =========================
436
- # Main Answer Function (no SDN bias)
437
  # =========================
438
 
439
  def answer_faq_or_approach_detailed(
@@ -447,13 +522,13 @@ def answer_faq_or_approach_detailed(
447
  if not q:
448
  return "Please enter a question."
449
 
 
450
  topic = detect_topic(q)
451
 
452
- # 1) Seeded FAQs detailed plan (only for migration-like questions)
453
  q_tokens = set(tokenize(q))
454
  for item in FAQ_SEEDS:
455
  seed_tokens = set(tokenize(item["q"]))
456
- # require at least one migration-specific token to avoid hijacking definitional questions
457
  if not ({"migrate", "migration", "hcx", "avs"} & q_tokens):
458
  continue
459
  if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
@@ -480,8 +555,13 @@ def answer_faq_or_approach_detailed(
480
  if snippets:
481
  return _compose_detailed_from_snippets(q, snippets, topic)
482
 
483
- # 3) Topic-aware fallback (no SDN unless you asked about SDN)
484
- return _compose_topic_fallback(q, topic)
 
 
 
 
 
485
 
486
 
487
  # =========================
@@ -526,7 +606,10 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
526
  build_btn = gr.Button("Build Index", variant="primary")
527
 
528
  with gr.Column(scale=3):
529
- question = gr.Textbox(label="Ask a question", placeholder="e.g., What is Azure Arc-enabled SDN? or What's the best way to minimize downtime for our AVS migration?")
 
 
 
530
  use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
531
  ask_btn = gr.Button("Ask", variant="primary")
532
  answer_box = gr.Markdown("")
 
3
 
4
  """
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
 
6
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
7
+ - Ask questions; get reliable, detailed, and relevant answers.
8
+ - Intent-aware (definitions, how-tos, comparisons, plans), topic-aware (sdn/migration/dr/security/cost).
9
+ - No external APIs. No scikit-learn.
 
10
 
11
  Run locally:
12
  pip install gradio PyPDF2 python-docx
 
26
  # Optional parsers (graceful fallback)
27
  # -------------------------
28
  try:
29
+ import PyPDF2
30
  except Exception:
31
  PyPDF2 = None
32
 
 
41
  # =========================
42
 
43
  TRUSTED_SOURCES: List[Tuple[str, str]] = [
44
+ # Core guidance
45
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
46
  ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
47
+ # Networking
48
+ ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
49
  ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
50
  ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
51
  ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
 
52
  # Migration
53
  ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
54
  ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
 
110
 
111
 
112
  # =========================
113
+ # Intent & Topic detection
114
  # =========================
115
 
116
# Leading phrases that mark a definition-style question ("What is X?").
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
# Leading phrases that mark a how-to question.
_HOW_RE = re.compile(r"^\s*(how\s+do|how\s+to|how\s+does|how\s+can)\b", re.I)
# Comparison cues anywhere in the question. Inflected forms ("differences",
# "comparison", "compared") are listed explicitly because the trailing \b
# would otherwise reject them.
_CMP_RE = re.compile(
    r"\b(vs\.?|versus|compar(?:e[sd]?|ing|isons?)|differences?|differs?)\b", re.I
)
# Planning cues anywhere in the question.
_PLAN_RE = re.compile(r"\b(plan|approach|steps|roadmap|sequence|strategy)\b", re.I)


def detect_intent(q: str) -> str:
    """Classify a question as 'compare', 'define', 'plan', 'how', or 'general'.

    The first matching pattern wins, in this order: compare, define, plan, how.
    Comparison is tested before definition so that a question such as
    "What is the difference between A and B?" is treated as a comparison
    rather than as a request to define "the difference between A and B".

    Args:
        q: The user's question text.

    Returns:
        One of "compare", "define", "plan", "how", or "general".
    """
    if _CMP_RE.search(q):
        return "compare"
    if _DEF_RE.search(q):
        return "define"
    if _PLAN_RE.search(q):
        return "plan"
    if _HOW_RE.search(q):
        return "how"
    return "general"
127
+
128
def detect_topic(q: str) -> str:
    """Map a question to a topic label based on the tokens it contains.

    The question is tokenized with the module's ``tokenize`` helper and the
    token set is checked against each keyword group in the order listed
    below; the first group that shares at least one token decides the topic.

    Returns:
        One of "sdn", "migration", "dr", "security", "cost", or "general"
        when no keyword group matches.
    """
    question_tokens = set(tokenize(q))
    # Order matters: earlier rows take precedence over later ones.
    keyword_table = (
        ("sdn", {"sdn", "software-defined", "softwaredefined"}),
        ("migration", {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"}),
        ("dr", {"dr", "disaster", "asr", "rto", "rpo", "failover"}),
        ("security", {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"}),
        ("cost", {"cost", "reservation", "savings", "rightsizing", "tagging"}),
    )
    for label, keywords in keyword_table:
        if keywords & question_tokens:
            return label
    return "general"
136
 
137
  def topic_refs(topic: str) -> List[str]:
138
  if topic == "sdn":
139
+ return ["Azure Virtual Network", "Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"]
140
  if topic == "migration":
141
  return ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs", "Cloud Adoption Framework (CAF)"]
142
  if topic == "dr":
 
208
 
209
 
210
  # =========================
211
+ # Rubric (used to tailor RAG summaries)
212
  # =========================
213
 
214
  CHECKS = [
 
245
 
246
 
247
  # =========================
248
+ # File parsing
249
  # =========================
250
 
251
  def read_pdf_bytes(b: bytes) -> str:
 
299
  # Helpers for composing detailed answers
300
  # =========================
301
 
302
+ def _extract_subject_from_question(q: str) -> str:
303
+ """
304
+ Pulls the likely subject (e.g., 'Azure SDN') from 'what is/define/explain ...' questions.
305
+ Simple heuristic: remove leading interrogatives and trailing punctuation.
306
+ """
307
+ s = re.sub(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", "", q, flags=re.I).strip()
308
+ s = re.sub(r"[?.!]+$", "", s).strip()
309
+ # Trim leading 'an', 'a', 'the'
310
+ s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
311
+ # Capitalize first letter of each word heuristically
312
+ return " ".join(w.capitalize() if w.isalpha() else w for w in s.split()) or "the topic"
313
+
314
  def _extract_key_points(text: str, max_points: int = 6) -> List[str]:
315
+ parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
 
316
  points = []
317
  for p in parts:
318
  p = p.strip()
319
+ if 40 <= len(p) <= 300 and p not in points:
320
  points.append(p)
321
  if len(points) >= max_points:
322
  break
 
366
  return [
367
  "Clarify objective, constraints, and success criteria.",
368
  "Assess current state and dependencies.",
369
+ "Choose an MVP approach; pilot and iterate.",
370
  "Define rollout plan, rollback, and verification.",
371
+ "Measure results and continuously improve.",
372
+ ]
373
+
374
def _compose_definition(subject: str, topic: str) -> str:
    """Build a Markdown overview for a 'define / what is' question.

    The output has a fixed scaffold (definition sentence, why it matters,
    how it works, common use cases) with a "Core capabilities" list chosen
    by ``topic`` and a trailing "Trusted sources" line rendered from
    ``list_refs(topic_refs(topic))``.

    Args:
        subject: Display name of the thing being defined; interpolated into
            the heading and the definition sentence.
        topic: Topic label used to pick the capability bullets and the
            references. Labels other than sdn/migration/dr/security/cost
            fall back to a generic capability list.

    Returns:
        The Markdown text, one section line per row, joined with newlines.
    """
    refs = list_refs(topic_refs(topic))

    # Capability bullets per topic; anything not listed uses the generic set.
    capabilities_by_topic = {
        "sdn": [
            "- Programmatic virtual networking (VNets, subnets, routing).",
            "- Microsegmentation and traffic filtering for east–west security.",
            "- Software load balancing and gateway services for connectivity.",
            "- Hybrid consistency across Azure and Azure Local (Azure Stack HCI).",
        ],
        "migration": [
            "- Discovery and assessment of on-prem workloads.",
            "- Replication, cutover orchestration (e.g., HCX or Azure Migrate).",
            "- Wave-based moves with rollback and validation.",
            "- Governance hooks for tagging, RBAC, policy.",
        ],
        "dr": [
            "- Replication and recovery planning (RPO/RTO).",
            "- Failover/failback workflows and runbooks.",
            "- Testing and non-disruptive drills.",
            "- Integration with backup immutability and soft-delete.",
        ],
        "security": [
            "- Posture management, policy, and workload protections.",
            "- Identity controls (RBAC, PIM/MFA), secrets management.",
            "- Detection and response (alerts, analytics, playbooks).",
            "- Compliance reporting and governance integration.",
        ],
        "cost": [
            "- Visibility into spend and resource utilization.",
            "- Budgeting, alerts, and anomaly detection.",
            "- Rightsizing and purchase optimizations (Reservations/Savings Plans).",
            "- Tagging for showback/chargeback and accountability.",
        ],
    }
    generic_capabilities = [
        "- Policy-driven management and automation.",
        "- Consistent APIs/CLI/portal and GitOps-friendly workflows.",
        "- Observability (logs/metrics) and compliance integration.",
    ]

    intro = [
        f"### {subject} — Detailed overview",
        f"**Definition:** {subject} is a service/technology that centralizes control through software and policy so you can create, operate, and secure resources consistently across environments.",
        "",
        "**Why it matters:**",
        "- Reduces manual configuration and errors with automation and governance.",
        "- Improves security through consistent, policy-driven controls.",
        "- Accelerates delivery with repeatable, programmable workflows.",
        "",
        "**Core capabilities:**",
    ]
    outro = [
        "",
        "**How it works (high-level):**",
        "- A control plane applies intent (configuration/policy) to managed resources.",
        "- Agents/providers translate intent into concrete changes.",
        "- Telemetry feeds monitoring and governance for continuous improvement.",
        "",
        "**Common use cases:**",
        "- Standardized environments across dev/test/prod.",
        "- Stronger security posture via segmentation and least-privilege.",
        "- Hybrid scenarios spanning Azure and Azure Local where relevant.",
        "",
        f"**Trusted sources:** {refs}",
    ]

    capabilities = capabilities_by_topic.get(topic, generic_capabilities)
    return "\n".join(intro + capabilities + outro)
449
 
450
  def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]], topic: str) -> str:
451
  combined = "\n\n".join([s.get("excerpt", "") for s in snippets])
 
465
  else:
466
  md.append("- Based on your documents, here is a structured plan and key considerations.")
467
 
468
+ md += ["", "#### Recommended steps"]
 
 
 
469
  for step in _topic_steps(topic):
470
  md.append(f"- {step}")
471
 
472
+ md += ["", "#### Supporting excerpts"]
 
 
 
473
  for s in snippets:
474
  md.append(f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}")
475
 
476
+ md += ["", f"**Trusted sources:** {refs}"]
 
 
 
 
477
  return "\n".join(md)
478
 
479
+ def _compose_topic_fallback(query: str, topic: str, intent: str) -> str:
480
+ # Use a topic-relevant fallback, with more detail than a plain template.
481
+ if intent == "define":
482
+ subject = _extract_subject_from_question(query)
483
+ return _compose_definition(subject, topic)
484
+
485
  refs = list_refs(topic_refs(topic))
486
  headline = {
487
  "sdn": "Azure SDN — Overview",
 
496
  f"### {headline}",
497
  f"**Your question:** {query}",
498
  "",
499
+ "**Key points:**",
 
 
 
500
  ]
501
  for step in _topic_steps(topic):
502
  md.append(f"- {step}")
 
503
  md += [
 
 
 
 
 
504
  "",
505
  f"**Trusted sources:** {refs}",
506
  ]
 
508
 
509
 
510
  # =========================
511
+ # Main Answer Function
512
  # =========================
513
 
514
  def answer_faq_or_approach_detailed(
 
522
  if not q:
523
  return "Please enter a question."
524
 
525
+ intent = detect_intent(q)
526
  topic = detect_topic(q)
527
 
528
+ # 1) Restrict FAQ route to migration-like queries only (prevents hijacking)
529
  q_tokens = set(tokenize(q))
530
  for item in FAQ_SEEDS:
531
  seed_tokens = set(tokenize(item["q"]))
 
532
  if not ({"migrate", "migration", "hcx", "avs"} & q_tokens):
533
  continue
534
  if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
 
555
  if snippets:
556
  return _compose_detailed_from_snippets(q, snippets, topic)
557
 
558
+ # 3) Intent-aware fallback (especially for definitions like "What is Azure SDN?")
559
+ if intent == "define":
560
+ subject = _extract_subject_from_question(q)
561
+ return _compose_definition(subject, topic)
562
+
563
+ # 4) Topic-aware fallback for other intents
564
+ return _compose_topic_fallback(q, topic, intent)
565
 
566
 
567
  # =========================
 
606
  build_btn = gr.Button("Build Index", variant="primary")
607
 
608
  with gr.Column(scale=3):
609
+ question = gr.Textbox(
610
+ label="Ask a question",
611
+ placeholder="e.g., What is Azure SDN? • How do I minimize downtime for our AVS migration?"
612
+ )
613
  use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
614
  ask_btn = gr.Button("Ask", variant="primary")
615
  answer_box = gr.Markdown("")