ajayinsac committed on
Commit
5038afa
·
verified ·
1 Parent(s): 6660a36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +172 -282
app.py CHANGED
@@ -5,7 +5,7 @@
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
7
  - Ask questions; get reliable, detailed, and relevant answers.
8
- - Intent-aware (definitions, how-tos, comparisons, plans), topic-aware (sdn/migration/dr/security/cost).
9
  - No external APIs. No scikit-learn.
10
 
11
  Run locally:
@@ -18,7 +18,7 @@ import io
18
  import re
19
  import math
20
  from typing import List, Tuple, Dict, Any
21
- from collections import Counter, defaultdict
22
 
23
  import gradio as gr
24
 
@@ -44,7 +44,7 @@ TRUSTED_SOURCES: List[Tuple[str, str]] = [
44
  # Core guidance
45
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
46
  ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
47
- # Networking
48
  ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
49
  ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
50
  ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
@@ -110,7 +110,7 @@ def list_refs(ref_names: List[str]) -> str:
110
 
111
 
112
  # =========================
113
- # Intent & Topic detection
114
  # =========================
115
 
116
  _DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
@@ -149,7 +149,7 @@ def topic_refs(topic: str) -> List[str]:
149
 
150
 
151
  # =========================
152
- # Tiny TF-IDF Index (no sklearn)
153
  # =========================
154
 
155
  class TinyTfidfIndex:
@@ -208,44 +208,7 @@ class TinyTfidfIndex:
208
 
209
 
210
  # =========================
211
- # Rubric (used to tailor RAG summaries)
212
- # =========================
213
-
214
- CHECKS = [
215
- {"id": "landing_zone", "desc": "Landing zone defined.", "fix": "Use CAF blueprints.", "keywords": ["landing", "hub", "spoke", "policy", "rbac"], "pillar": "governance"},
216
- {"id": "connectivity", "desc": "Connectivity planned.", "fix": "Verify ER/VPN, DNS, MTU.", "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx"], "pillar": "networking"},
217
- {"id": "migrate_tooling","desc": "Tooling chosen.", "fix": "Run Azure Migrate discovery.", "keywords": ["migrate", "discovery", "assessment", "hcx"], "pillar": "operations"},
218
- {"id": "security", "desc": "Security configured.", "fix": "Enable Key Vault, Defender, Sentinel, MFA.", "keywords": ["vault", "defender", "sentinel", "mfa", "identity"], "pillar": "security"},
219
- {"id": "dr_backup", "desc": "Backups/DR defined.", "fix": "Set RTO/RPO; test ASR.", "keywords": ["backup", "rto", "rpo", "dr", "asr"], "pillar": "reliability"},
220
- {"id": "cost", "desc": "Cost optimization.", "fix": "Use reservations, rightsizing, tags.", "keywords": ["cost", "reservation", "savings", "tag"], "pillar": "cost"},
221
- ]
222
-
223
- def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
224
- toks = set(tokenize(text))
225
- scores = defaultdict(float)
226
- gaps = []
227
- for chk in CHECKS:
228
- matched = any(kw in toks for kw in chk["keywords"])
229
- if matched:
230
- scores["overall"] += 1.0
231
- scores[chk["pillar"]] += 1.0
232
- else:
233
- gaps.append({
234
- "id": chk["id"],
235
- "desc": chk["desc"],
236
- "fix": chk["fix"],
237
- "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
238
- })
239
- max_possible = float(len(CHECKS))
240
- scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
241
- for k in list(scores.keys()):
242
- if k != "overall":
243
- scores[k] = round(scores[k], 2)
244
- return scores, gaps
245
-
246
-
247
- # =========================
248
- # File parsing
249
  # =========================
250
 
251
  def read_pdf_bytes(b: bytes) -> str:
@@ -296,214 +259,163 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
296
 
297
 
298
  # =========================
299
- # Helpers for composing detailed answers
300
  # =========================
301
 
 
 
302
  def _extract_subject_from_question(q: str) -> str:
303
- """
304
- Pulls the likely subject (e.g., 'Azure SDN') from 'what is/define/explain ...' questions.
305
- Simple heuristic: remove leading interrogatives and trailing punctuation.
306
- """
307
- s = re.sub(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", "", q, flags=re.I).strip()
308
  s = re.sub(r"[?.!]+$", "", s).strip()
309
- # Trim leading 'an', 'a', 'the'
310
  s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
311
- # Capitalize first letter of each word heuristically
312
- return " ".join(w.capitalize() if w.isalpha() else w for w in s.split()) or "the topic"
313
-
314
- def _extract_key_points(text: str, max_points: int = 6) -> List[str]:
315
- parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
316
- points = []
317
- for p in parts:
318
- p = p.strip()
319
- if 40 <= len(p) <= 300 and p not in points:
320
- points.append(p)
321
- if len(points) >= max_points:
322
- break
323
- return points
324
-
325
- def _topic_steps(topic: str) -> List[str]:
326
- if topic == "sdn":
327
- return [
328
- "Define VNets/subnets and segmentation policy.",
329
- "Automate configuration (ARM/Bicep/Terraform/GitOps).",
330
- "Harden east–west flows with policy-based filtering.",
331
- "Plan ingress/egress with load balancers and gateways.",
332
- "Integrate with RBAC, logging, and change control.",
333
- ]
334
- if topic == "migration":
335
- return [
336
- "Establish governed landing zone (Policy, RBAC, logging).",
337
- "Connect networks (ExpressRoute/VPN), validate DNS/MTU.",
338
- "Discover/assess with Azure Migrate; classify apps.",
339
- "Pilot 2–3 VMs; choose HCX or Azure Migrate cutover.",
340
- "Migrate in waves; document rollback and success criteria.",
341
- ]
342
- if topic == "dr":
343
- return [
344
- "Define business RTO/RPO per workload.",
345
- "Enable ASR where applicable; set up replication.",
346
- "Run planned/unplanned failover drills; validate runbooks.",
347
- "Harden backups (immutability, soft-delete).",
348
- "Document recovery steps and responsibilities.",
349
- ]
350
- if topic == "security":
351
- return [
352
- "Centralize secrets in Key Vault; enable RBAC/PIM/MFA.",
353
- "Enable Defender for Cloud and configure policies.",
354
- "Collect/monitor logs; set alerts and playbooks.",
355
- "Segment networks; restrict lateral movement.",
356
- "Review identity hygiene and conditional access.",
357
- ]
358
- if topic == "cost":
359
- return [
360
- "Right-size compute/storage based on metrics.",
361
- "Use reservations or Savings Plans where stable.",
362
- "Automate tagging for showback/chargeback.",
363
- "Schedule shutdowns for non-prod.",
364
- "Monitor cost anomalies and budgets.",
365
- ]
366
- return [
367
- "Clarify objective, constraints, and success criteria.",
368
- "Assess current state and dependencies.",
369
- "Choose an MVP approach; pilot and iterate.",
370
- "Define rollout plan, rollback, and verification.",
371
- "Measure results and continuously improve.",
372
- ]
373
 
374
- def _compose_definition(subject: str, topic: str) -> str:
375
  """
376
- Produces a clear, detailed definition for 'define/what is' questions
377
- using the detected subject and topic to pick references.
378
  """
379
- refs = list_refs(topic_refs(topic))
380
- # Definition scaffold tailored to topic, but generic enough for any subject.
381
- md = [
382
- f"### {subject} Detailed overview",
383
- f"**Definition:** {subject} is a service/technology that centralizes control through software and policy so you can create, operate, and secure resources consistently across environments.",
384
- "",
385
- "**Why it matters:**",
386
- "- Reduces manual configuration and errors with automation and governance.",
387
- "- Improves security through consistent, policy-driven controls.",
388
- "- Accelerates delivery with repeatable, programmable workflows.",
389
- "",
390
- "**Core capabilities:**",
391
- ]
392
- if topic == "sdn":
393
- md += [
394
- "- Programmatic virtual networking (VNets, subnets, routing).",
395
- "- Microsegmentation and traffic filtering for east–west security.",
396
- "- Software load balancing and gateway services for connectivity.",
397
- "- Hybrid consistency across Azure and Azure Local (Azure Stack HCI).",
398
- ]
399
- elif topic == "migration":
400
- md += [
401
- "- Discovery and assessment of on-prem workloads.",
402
- "- Replication, cutover orchestration (e.g., HCX or Azure Migrate).",
403
- "- Wave-based moves with rollback and validation.",
404
- "- Governance hooks for tagging, RBAC, policy.",
405
  ]
406
- elif topic == "dr":
407
- md += [
408
- "- Replication and recovery planning (RPO/RTO).",
409
- "- Failover/failback workflows and runbooks.",
410
- "- Testing and non-disruptive drills.",
411
- "- Integration with backup immutability and soft-delete.",
412
  ]
413
- elif topic == "security":
414
- md += [
415
- "- Posture management, policy, and workload protections.",
416
- "- Identity controls (RBAC, PIM/MFA), secrets management.",
417
- "- Detection and response (alerts, analytics, playbooks).",
418
- "- Compliance reporting and governance integration.",
419
  ]
420
- elif topic == "cost":
421
- md += [
422
- "- Visibility into spend and resource utilization.",
423
- "- Budgeting, alerts, and anomaly detection.",
424
- "- Rightsizing and purchase optimizations (Reservations/Savings Plans).",
425
- "- Tagging for showback/chargeback and accountability.",
426
- ]
427
- else:
428
- md += [
429
- "- Policy-driven management and automation.",
430
- "- Consistent APIs/CLI/portal and GitOps-friendly workflows.",
431
- "- Observability (logs/metrics) and compliance integration.",
432
  ]
433
-
434
- md += [
435
- "",
436
- "**How it works (high-level):**",
437
- "- A control plane applies intent (configuration/policy) to managed resources.",
438
- "- Agents/providers translate intent into concrete changes.",
439
- "- Telemetry feeds monitoring and governance for continuous improvement.",
440
- "",
441
- "**Common use cases:**",
442
- "- Standardized environments across dev/test/prod.",
443
- "- Stronger security posture via segmentation and least-privilege.",
444
- "- Hybrid scenarios spanning Azure and Azure Local where relevant.",
445
- "",
446
- f"**Trusted sources:** {refs}",
447
  ]
448
- return "\n".join(md)
449
-
450
- def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]], topic: str) -> str:
451
- combined = "\n\n".join([s.get("excerpt", "") for s in snippets])
452
- scores, gaps = score_text_against_checks(combined)
453
- points = _extract_key_points(combined, max_points=6)
454
- refs = list_refs(topic_refs(topic))
455
-
456
- md = [
457
- "### Answer (detailed)",
458
- f"**Your question:** {query}",
459
- "",
460
- "**Executive summary:**",
461
  ]
462
- if points:
463
- for p in points:
464
- md.append(f"- {p}")
465
- else:
466
- md.append("- Based on your documents, here is a structured plan and key considerations.")
467
-
468
- md += ["", "#### Recommended steps"]
469
- for step in _topic_steps(topic):
470
- md.append(f"- {step}")
471
-
472
- md += ["", "#### Supporting excerpts"]
473
- for s in snippets:
474
- md.append(f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}")
475
-
 
 
 
 
 
 
 
 
 
 
 
 
 
476
  md += ["", f"**Trusted sources:** {refs}"]
477
  return "\n".join(md)
478
 
479
- def _compose_topic_fallback(query: str, topic: str, intent: str) -> str:
480
- # Use a topic-relevant fallback, with more detail than a plain template.
481
- if intent == "define":
482
- subject = _extract_subject_from_question(query)
483
- return _compose_definition(subject, topic)
484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  refs = list_refs(topic_refs(topic))
486
- headline = {
487
- "sdn": "Azure SDN — Overview",
488
- "migration": "Azure VMware / Azure Migrate — Overview",
489
- "dr": "Azure Site Recovery (DR) — Overview",
490
- "security": "Security & Governance in Azure Overview",
491
- "cost": "Cost Optimization in Azure — Overview",
492
- "general": "Overview",
493
- }[topic]
494
-
495
- md = [
496
- f"### {headline}",
497
- f"**Your question:** {query}",
498
- "",
499
- "**Key points:**",
500
- ]
501
- for step in _topic_steps(topic):
502
- md.append(f"- {step}")
503
- md += [
504
- "",
505
- f"**Trusted sources:** {refs}",
506
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  return "\n".join(md)
508
 
509
 
@@ -511,13 +423,7 @@ def _compose_topic_fallback(query: str, topic: str, intent: str) -> str:
511
  # Main Answer Function
512
  # =========================
513
 
514
- def answer_faq_or_approach_detailed(
515
- question: str,
516
- use_uploaded_docs: bool,
517
- index_obj: Any,
518
- _matrix_unused: Any,
519
- corpus: List[Dict[str, str]]
520
- ) -> str:
521
  q = (question or "").strip()
522
  if not q:
523
  return "Please enter a question."
@@ -525,20 +431,24 @@ def answer_faq_or_approach_detailed(
525
  intent = detect_intent(q)
526
  topic = detect_topic(q)
527
 
528
- # 1) Restrict FAQ route to migration-like queries only (prevents hijacking)
529
- q_tokens = set(tokenize(q))
530
- for item in FAQ_SEEDS:
531
- seed_tokens = set(tokenize(item["q"]))
532
- if not ({"migrate", "migration", "hcx", "avs"} & q_tokens):
533
- continue
534
- if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
535
- return (
536
- "### Answer (detailed)\n"
537
- f"{item['a']}\n\n"
538
- f"**Trusted sources:** {list_refs(item.get('refs', []))}"
539
- )
540
 
541
- # 2) Use uploaded docs (RAG) detailed synthesized answer
 
 
 
 
 
 
 
 
 
 
 
 
542
  if use_uploaded_docs and index_obj is not None and corpus:
543
  top = index_obj.query(q, k=6)
544
  snippets = []
@@ -547,35 +457,24 @@ def answer_faq_or_approach_detailed(
547
  excerpt = (item["text"] or "").strip()
548
  if len(excerpt) > 700:
549
  excerpt = excerpt[:700] + "..."
550
- snippets.append({
551
- "file": item["file"],
552
- "relevance": float(sim),
553
- "excerpt": excerpt
554
- })
555
  if snippets:
556
- return _compose_detailed_from_snippets(q, snippets, topic)
557
 
558
- # 3) Intent-aware fallback (especially for definitions like "What is Azure SDN?")
559
- if intent == "define":
560
- subject = _extract_subject_from_question(q)
561
- return _compose_definition(subject, topic)
562
-
563
- # 4) Topic-aware fallback for other intents
564
- return _compose_topic_fallback(q, topic, intent)
565
 
566
 
567
  # =========================
568
- # Build Index
569
  # =========================
570
 
571
  def build_index(files: List[Dict[str, Any]]):
572
  if not files:
573
  return None, None, [], "No files uploaded yet."
574
- corpus: List[Dict[str, str]] = []
575
- for f in files:
576
- rec = parse_file(f)
577
- if rec["text"]:
578
- corpus.append(rec)
579
  if not corpus:
580
  return None, None, [], "No text extracted."
581
  tokenized = [tokenize(c["text"]) for c in corpus]
@@ -595,16 +494,12 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
595
  "- Click **Build Index**\n"
596
  "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
597
  )
598
-
599
  with gr.Row():
600
  with gr.Column(scale=2):
601
  file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
602
  index_status = gr.Markdown("No index yet.")
603
- st_index = gr.State()
604
- st_matrix = gr.State()
605
- st_corpus = gr.State()
606
  build_btn = gr.Button("Build Index", variant="primary")
607
-
608
  with gr.Column(scale=3):
609
  question = gr.Textbox(
610
  label="Ask a question",
@@ -614,7 +509,6 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
614
  ask_btn = gr.Button("Ask", variant="primary")
615
  answer_box = gr.Markdown("")
616
 
617
- # Convert gr.Files (paths) to expected dicts
618
  def _collect_files(paths: List[str]):
619
  out = []
620
  for p in paths or []:
@@ -630,11 +524,7 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
630
  files = _collect_files(files_paths)
631
  return build_index(files)
632
 
633
- build_btn.click(
634
- _build,
635
- inputs=[file_in],
636
- outputs=[index_status, st_index, st_matrix, st_corpus]
637
- )
638
 
639
  ask_btn.click(
640
  answer_faq_or_approach_detailed,
 
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
7
  - Ask questions; get reliable, detailed, and relevant answers.
8
+ - Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details.
9
  - No external APIs. No scikit-learn.
10
 
11
  Run locally:
 
18
  import re
19
  import math
20
  from typing import List, Tuple, Dict, Any
21
+ from collections import Counter
22
 
23
  import gradio as gr
24
 
 
44
  # Core guidance
45
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
46
  ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
47
+ # Networking / SDN (used when question is about SDN)
48
  ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
49
  ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
50
  ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
 
110
 
111
 
112
  # =========================
113
+ # Intent & topic detection
114
  # =========================
115
 
116
  _DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
 
149
 
150
 
151
  # =========================
152
+ # Tiny TF-IDF Index
153
  # =========================
154
 
155
  class TinyTfidfIndex:
 
208
 
209
 
210
  # =========================
211
+ # File Parsing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  # =========================
213
 
214
  def read_pdf_bytes(b: bytes) -> str:
 
259
 
260
 
261
  # =========================
262
+ # Strong definition composer (for “what is …”)
263
  # =========================
264
 
265
# Leading question phrase ("what is", "define", ...). The keyword only has to
# end on a word boundary; separators after it (whitespace or a colon) are
# optional, so a bare "Explain." or "What is?" is stripped as well instead of
# the keyword itself being mistaken for the subject.
_DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b[\s:]*", re.I)
# Sentence-ending punctuation at the very end of the question.
_TRAILING_PUNCT_RE = re.compile(r"[?.!]+$")
# A single leading article ("a", "an", "the") followed by whitespace.
_LEADING_ARTICLE_RE = re.compile(r"^(an?|the)\s+", re.I)


def _extract_subject_from_question(q: str) -> str:
    """Return the subject a question is asking about.

    The leading question phrase, trailing ``?``/``.``/``!`` characters and
    one leading article are removed, in that order. The original casing of
    the subject is kept.

    Args:
        q: The raw question text. ``None`` and empty strings are tolerated.

    Returns:
        The remaining subject text, or ``"the topic"`` when nothing is left.
    """
    s = _DEF_RE_LEAD.sub("", q or "").strip()
    s = _TRAILING_PUNCT_RE.sub("", s).strip()
    s = _LEADING_ARTICLE_RE.sub("", s)
    return s if s else "the topic"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """Build the content sections of a definition-style answer.

    An SDN-specific definition is produced when ``topic`` is ``"sdn"`` or the
    subject text contains "sdn"; every other subject gets a generic scaffold
    that only interpolates the subject name.

    Args:
        subject: Name of the thing being defined. Surrounding whitespace is
            ignored; ``None`` or a blank value falls back to ``"the topic"``.
        topic: Topic label already detected for the question by the caller.

    Returns:
        A 6-tuple ``(definition, capabilities, how, best_practices,
        use_cases, refs_list)`` where ``refs_list`` is whatever
        ``topic_refs`` returns for the resolved topic.
    """
    # Normalize once so both branches see the same subject text.
    sub = (subject or "").strip() or "the topic"

    # NOTE(review): this branch describes *any* subject as Microsoft's SDN
    # implementation whenever the topic is "sdn" — confirm that is intended
    # for questions about individual SDN components.
    if topic == "sdn" or "sdn" in sub.lower():
        definition = (
            f"{sub} is Microsoft's implementation of software-defined networking: "
            "a model that shifts network control into software so you can centrally design, automate, "
            "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
            "By separating the control plane from underlying hardware, it enables programmability and "
            "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
            "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
        )
        capabilities = [
            "Programmatic creation of VNets, subnets, routing, and address spaces.",
            "Micro-segmentation and policy enforcement for east–west traffic.",
            "Software load balancing and gateway services for app connectivity.",
            "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
        ]
        how = [
            "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
            "Agents/controllers translate intent into concrete configuration on each host.",
            "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
        ]
        best = [
            "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
            "Apply least-privilege and RBAC; review segmentation policies regularly.",
            "Integrate with logging/monitoring; alert on drift and policy violations.",
        ]
        uses = [
            "Rapidly provisioning isolated app environments and tiers.",
            "Zero-trust segmentation between workloads and environments.",
            "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
        ]
        refs_list = topic_refs("sdn")
        return definition, capabilities, how, best, uses, refs_list

    # Generic detailed definition for other subjects.
    definition = (
        f"{sub} is a service/technology that centralizes control through software and policy so teams can "
        f"create, operate, and secure resources consistently across environments."
    )
    capabilities = [
        "Automation and policy-driven configuration to reduce manual effort and errors.",
        "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
        "Observability hooks (logs/metrics) for reliability and performance tuning.",
    ]
    how = [
        "A control plane captures intent (configuration/policies) and applies it to managed resources.",
        "Providers/agents on the platform translate intent into changes at runtime.",
        "Feedback loops via telemetry inform continuous improvement.",
    ]
    best = [
        "Adopt Infrastructure-as-Code and peer reviews for change control.",
        "Define tagging, RBAC roles, and policy baselines early.",
        "Pilot in a non-prod environment before broad rollout.",
    ]
    uses = [
        "Faster, repeatable environment provisioning.",
        "Improved security posture through standardized controls.",
        "Hybrid scenarios requiring consistent management across sites.",
    ]
    # Honor the topic the caller detected from the full question; only
    # re-detect from the subject when no specific topic was supplied.
    # Assumes "general" is the catch-all label used by detect_topic — TODO confirm.
    resolved_topic = topic if topic and topic != "general" else detect_topic(sub)
    refs_list = topic_refs(resolved_topic)
    return definition, capabilities, how, best, uses, refs_list
340
+
341
def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """Render a definition-style answer as Markdown.

    The section content comes from ``_definition_for_subject``; this function
    only lays it out: title, the echoed question, the definition, four
    bulleted sections and a closing "Trusted sources" line.

    Args:
        query: The question as asked; echoed back in the answer.
        subject: Subject name used in the heading.
        topic: Topic label forwarded to ``_definition_for_subject``.

    Returns:
        The Markdown document as one newline-joined string.
    """
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    refs = list_refs(refs_list)

    # (heading, bullet items) in display order.
    sections = (
        ("**Key capabilities:**", capabilities),
        ("**How it works:**", how),
        ("**Best practices:**", best),
        ("**Common use cases:**", uses),
    )

    lines = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
        "",
    ]
    for heading, items in sections:
        lines.append(heading)
        lines.extend(f"- {item}" for item in items)
        lines.append("")  # blank separator after every section
    lines.append(f"**Trusted sources:** {refs}")
    return "\n".join(lines)
357
 
 
 
 
 
 
358
 
359
+ # =========================
360
+ # RAG: build a detailed answer from uploaded docs
361
+ # =========================
362
+
363
def _extract_points(text: str, max_points: int = 6) -> List[str]:
    """Pick up to ``max_points`` distinct sentences from ``text``.

    The text is split on whitespace that follows ``.``, ``!`` or ``?``.
    A sentence is kept when its stripped length is between 40 and 280
    characters inclusive and it has not been kept already; order of first
    appearance is preserved.

    Args:
        text: Source text. ``None`` and empty strings yield no points.
        max_points: Maximum number of sentences to return. Values of zero
            or less yield an empty list.

    Returns:
        The selected sentences, at most ``max_points`` of them.
    """
    # Guard first: the limit below is only checked after an append, so a
    # non-positive limit would otherwise still let one sentence through.
    if max_points <= 0:
        return []

    pts: List[str] = []
    for part in re.split(r"(?<=[.!?])\s+", (text or "").strip()):
        sentence = part.strip()
        if not 40 <= len(sentence) <= 280 or sentence in pts:
            continue
        pts.append(sentence)
        if len(pts) >= max_points:
            break
    return pts
373
+
374
def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """Render a Markdown answer synthesized from document snippets.

    The snippets are joined with single spaces and passed to
    ``_extract_points`` to obtain summary bullets; a topic-specific
    checklist and the topic's trusted sources are then appended.

    Args:
        query: The question as asked; echoed back in the answer.
        snippets: Excerpts of text to summarize.
        topic: Topic label selecting the checklist and the references.
            Labels without a checklist of their own get the "general" steps.

    Returns:
        The Markdown document as one newline-joined string.
    """
    summary_points = _extract_points(" ".join(snippets), max_points=6)
    refs = list_refs(topic_refs(topic))

    # Short checklists keyed by topic label.
    steps_by_topic = {
        "sdn": [
            "Define VNets/subnets and segmentation policy.",
            "Automate with IaC (Bicep/Terraform) and GitOps.",
            "Harden east–west traffic with micro-segmentation.",
            "Plan ingress/egress with LBs and gateways.",
        ],
        "migration": [
            "Establish landing zone (Policy, RBAC, logging).",
            "Connect networks (ER/VPN), validate DNS/MTU.",
            "Discover/assess with Azure Migrate; pilot a few VMs.",
            "Choose HCX or Azure Migrate for cutover; migrate in waves.",
        ],
        "dr": [
            "Define RTO/RPO; choose replication targets.",
            "Run planned/unplanned failover drills.",
            "Ensure immutable backups and soft-delete.",
        ],
        "security": [
            "Enable RBAC/PIM/MFA and Key Vault.",
            "Turn on Defender for Cloud; set policies and alerts.",
            "Collect logs; restrict lateral movement.",
        ],
        "cost": [
            "Right-size; use Reservations/Savings Plans.",
            "Tag resources; set budgets/alerts.",
            "Automate non-prod shutdowns.",
        ],
        "general": [
            "Clarify objectives and constraints.",
            "Pilot changes; define rollback and verification.",
        ],
    }
    checklist = steps_by_topic.get(topic, steps_by_topic["general"])

    # When no sentence qualified as a summary point, show one stock bullet.
    if summary_points:
        summary_bullets = [f"- {point}" for point in summary_points]
    else:
        summary_bullets = ["- Here are key considerations synthesized from your documents."]

    out = ["### Answer (detailed)", f"**Your question:** {query}", ""]
    out.append("**Executive summary:**")
    out.extend(summary_bullets)
    out.extend(["", "**Recommended steps:**"])
    out.extend(f"- {step}" for step in checklist)
    out.extend(["", f"**Trusted sources:** {refs}"])
    return "\n".join(out)
420
 
421
 
 
423
  # Main Answer Function
424
  # =========================
425
 
426
+ def answer_faq_or_approach_detailed(question: str, use_uploaded_docs: bool, index_obj: Any, _matrix_unused: Any, corpus: List[Dict[str,str]]) -> str:
 
 
 
 
 
 
427
  q = (question or "").strip()
428
  if not q:
429
  return "Please enter a question."
 
431
  intent = detect_intent(q)
432
  topic = detect_topic(q)
433
 
434
+ # A) Definitions: build a strong, subject-specific definition (e.g., "What is Azure SDN?")
435
+ if intent == "define":
436
+ subject = _extract_subject_from_question(q)
437
+ return _compose_definition_markdown(q, subject, topic)
 
 
 
 
 
 
 
 
438
 
439
+ # B) Migration FAQs (only if the question is migration-like to avoid hijacking)
440
+ q_tokens = set(tokenize(q))
441
+ if {"migrate", "migration", "hcx", "avs"} & q_tokens:
442
+ for item in FAQ_SEEDS:
443
+ seed_tokens = set(tokenize(item["q"]))
444
+ if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
445
+ return (
446
+ "### Answer (detailed)\n"
447
+ f"{item['a']}\n\n"
448
+ f"**Trusted sources:** {list_refs(item.get('refs', []))}"
449
+ )
450
+
451
+ # C) RAG over uploaded docs → detailed synthesized answer
452
  if use_uploaded_docs and index_obj is not None and corpus:
453
  top = index_obj.query(q, k=6)
454
  snippets = []
 
457
  excerpt = (item["text"] or "").strip()
458
  if len(excerpt) > 700:
459
  excerpt = excerpt[:700] + "..."
460
+ if excerpt:
461
+ snippets.append(excerpt)
 
 
 
462
  if snippets:
463
+ return _compose_rag_answer(q, snippets, topic)
464
 
465
+ # D) Topic-aware fallback (short but relevant)
466
+ subject = _extract_subject_from_question(q) if intent in {"how", "plan", "compare"} else q
467
+ return _compose_definition_markdown(q, subject, topic)
 
 
 
 
468
 
469
 
470
  # =========================
471
+ # Index Builder
472
  # =========================
473
 
474
  def build_index(files: List[Dict[str, Any]]):
475
  if not files:
476
  return None, None, [], "No files uploaded yet."
477
+ corpus = [parse_file(f) for f in files if parse_file(f)["text"]]
 
 
 
 
478
  if not corpus:
479
  return None, None, [], "No text extracted."
480
  tokenized = [tokenize(c["text"]) for c in corpus]
 
494
  "- Click **Build Index**\n"
495
  "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
496
  )
 
497
  with gr.Row():
498
  with gr.Column(scale=2):
499
  file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
500
  index_status = gr.Markdown("No index yet.")
501
+ st_index = gr.State(); st_matrix = gr.State(); st_corpus = gr.State()
 
 
502
  build_btn = gr.Button("Build Index", variant="primary")
 
503
  with gr.Column(scale=3):
504
  question = gr.Textbox(
505
  label="Ask a question",
 
509
  ask_btn = gr.Button("Ask", variant="primary")
510
  answer_box = gr.Markdown("")
511
 
 
512
  def _collect_files(paths: List[str]):
513
  out = []
514
  for p in paths or []:
 
524
  files = _collect_files(files_paths)
525
  return build_index(files)
526
 
527
+ build_btn.click(_build, inputs=[file_in], outputs=[index_status, st_index, st_matrix, st_corpus])
 
 
 
 
528
 
529
  ask_btn.click(
530
  answer_faq_or_approach_detailed,