ajayinsac committed on
Commit
bdd12dc
·
verified ·
1 Parent(s): c76f040

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -184
app.py CHANGED
@@ -3,10 +3,12 @@
3
 
4
  """
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
- - Works on Hugging Face Spaces (no external API calls, no sklearn).
7
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
8
- - Ask questions; get reliable, detailed answers with excerpts + trusted refs.
9
-
 
 
10
  Run locally:
11
  pip install gradio PyPDF2 python-docx
12
  python app.py
@@ -16,14 +18,16 @@ import os
16
  import io
17
  import re
18
  import math
19
- from typing import List, Tuple, Dict, Any
20
  from collections import Counter, defaultdict
21
 
22
  import gradio as gr
23
 
24
- # Optional parsers (gracefully degrade if not installed on Spaces)
 
 
25
  try:
26
- import PyPDF2 # lightweight; often available on Spaces
27
  except Exception:
28
  PyPDF2 = None
29
 
@@ -38,6 +42,9 @@ except Exception:
38
  # =========================
39
 
40
  TRUSTED_SOURCES: List[Tuple[str, str]] = [
 
 
 
41
  ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
42
  ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
43
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
@@ -82,9 +89,7 @@ FAQ_SEEDS: List[Dict[str, Any]] = [
82
  _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
83
 
84
  def tokenize(text: str) -> List[str]:
85
- if not text:
86
- return []
87
- return [t.lower() for t in _WORD_RE.findall(text)]
88
 
89
  def list_refs(ref_names: List[str]) -> str:
90
  links = []
@@ -96,7 +101,7 @@ def list_refs(ref_names: List[str]) -> str:
96
 
97
 
98
  # =========================
99
- # Tiny TF-IDF implementation (no sklearn)
100
  # =========================
101
 
102
  class TinyTfidfIndex:
@@ -109,14 +114,12 @@ class TinyTfidfIndex:
109
 
110
  def add_documents(self, tokenized_docs: List[List[str]]):
111
  self.docs = tokenized_docs[:]
112
- # document frequency
113
  self.df = Counter()
114
  for toks in self.docs:
115
  self.df.update(set(toks))
116
  N = max(1, len(self.docs))
117
  self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
118
  self.voc_size = len(self.idf)
119
- # precompute norms
120
  self.doc_norms = []
121
  for toks in self.docs:
122
  tf = Counter(toks)
@@ -137,7 +140,7 @@ class TinyTfidfIndex:
137
  v[term] = (cnt / total) * idf
138
  return v
139
 
140
- def query(self, text: str, k: int = 6) -> List[Tuple[int, float]]:
141
  if not self.docs:
142
  return []
143
  qv = self._vec(tokenize(text))
@@ -157,52 +160,16 @@ class TinyTfidfIndex:
157
 
158
 
159
  # =========================
160
- # Scoring rubric to tailor the detailed output
161
  # =========================
162
 
163
  CHECKS = [
164
- {
165
- "id": "landing_zone",
166
- "desc": "Landing zone defined (hub/spoke, Policy, RBAC, logging).",
167
- "fix": "Use CAF blueprints; enforce Policy for guardrails and RBAC.",
168
- "keywords": ["landing", "hub", "spoke", "policy", "rbac", "log", "monitor"],
169
- "pillar": "governance",
170
- },
171
- {
172
- "id": "connectivity",
173
- "desc": "Connectivity planned (ExpressRoute/VPN), DNS, MTU validated for HCX.",
174
- "fix": "Verify ER/VPN, DNS resolution, and HCX MTU/mobility settings.",
175
- "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx", "connectivity"],
176
- "pillar": "networking",
177
- },
178
- {
179
- "id": "migrate_tooling",
180
- "desc": "Discovery/assessment and tooling chosen (Azure Migrate or HCX).",
181
- "fix": "Run Azure Migrate discovery; select HCX or Azure Migrate per downtime.",
182
- "keywords": ["azure", "migrate", "discovery", "assessment", "hcx", "replication"],
183
- "pillar": "operations",
184
- },
185
- {
186
- "id": "security",
187
- "desc": "Security/identity configured (Key Vault, Defender, Sentinel, PIM/MFA).",
188
- "fix": "Centralize secrets in Key Vault; enable Defender/Sentinel; enforce PIM/MFA.",
189
- "keywords": ["key", "vault", "defender", "sentinel", "pim", "mfa", "entra", "aad", "identity"],
190
- "pillar": "security",
191
- },
192
- {
193
- "id": "dr_backup",
194
- "desc": "Backups, DR, RTO/RPO defined; ASR drills planned.",
195
- "fix": "Set RTO/RPO; immutability & soft-delete; test ASR failover/failback.",
196
- "keywords": ["backup", "rto", "rpo", "dr", "asr", "failover", "restore"],
197
- "pillar": "reliability",
198
- },
199
- {
200
- "id": "cost",
201
- "desc": "Cost optimization plan (right-sizing, reservations, tagging).",
202
- "fix": "Use reservations/Savings Plans, rightsizing, and enforce tags.",
203
- "keywords": ["cost", "reservation", "savings", "right", "tag"],
204
- "pillar": "cost",
205
- },
206
  ]
207
 
208
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
@@ -230,7 +197,119 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
230
 
231
 
232
  # =========================
233
- # File parsing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  # =========================
235
 
236
  def read_pdf_bytes(b: bytes) -> str:
@@ -238,13 +317,7 @@ def read_pdf_bytes(b: bytes) -> str:
238
  return ""
239
  try:
240
  reader = PyPDF2.PdfReader(io.BytesIO(b))
241
- out = []
242
- for page in reader.pages:
243
- try:
244
- out.append(page.extract_text() or "")
245
- except Exception:
246
- pass
247
- return "\n".join(out)
248
  except Exception:
249
  return ""
250
 
@@ -267,7 +340,6 @@ def read_text_bytes(b: bytes) -> str:
267
  return ""
268
 
269
  def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
270
- """Returns {"file": <name>, "text": <extracted_text>}"""
271
  name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
272
  data = file_obj.get("data")
273
  if data is None:
@@ -277,72 +349,38 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
277
  data = fh.read()
278
  if data is None:
279
  return {"file": name, "text": ""}
280
-
281
  low = name.lower()
282
  if low.endswith(".pdf"):
283
  text = read_pdf_bytes(data)
284
  elif low.endswith((".docx", ".doc")):
285
  text = read_docx_bytes(data)
286
- elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
287
- text = read_text_bytes(data)
288
  else:
289
  text = read_text_bytes(data)
290
  return {"file": os.path.basename(name), "text": text or ""}
291
 
292
 
293
  # =========================
294
- # Detailed Answer Composer
295
  # =========================
296
 
297
  def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
298
- collected = [s.get("excerpt", "") for s in snippets]
299
- combined = "\n\n".join(collected)
300
- scores, gaps = score_text_against_checks(combined) if combined.strip() else ({"overall": 0.0}, [])
301
-
302
- def _mk_gaps(glist, limit=8):
303
- out = []
304
- for g in glist[:limit]:
305
- out.append(f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_")
306
- return "\n".join(out) if out else "- No major issues detected in the sampled excerpts."
307
-
308
- refs = list_refs([
309
- "Azure VMware Solution (AVS)",
310
- "Azure Migrate",
311
- "Cloud Adoption Framework (CAF)",
312
- "Azure Well-Architected Framework (WAF)",
313
- "VMware HCX Docs",
314
- ])
315
-
316
- pillar_lines = []
317
- for k_, v_ in scores.items():
318
- if k_ == "overall":
319
- continue
320
- pillar_lines.append(f"- **{k_.capitalize()}**: {v_}")
321
- pillar_md = "\n".join(pillar_lines) if pillar_lines else "- (no signals)"
322
-
323
- md = (
324
  f"### Answer (detailed)\n"
325
  f"**Your question:** {query}\n\n"
326
- f"**TL;DR:** Here’s a concrete plan across landing zone, connectivity, migration method, security, DR, and cost. "
327
- f"Address the highest-risk gaps first.\n\n"
328
- f"#### Step-by-step plan\n"
329
- "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
330
- "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
331
- "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
332
- "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
333
- "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
334
- "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
335
- "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
336
- f"#### What your documents emphasize (auto-scored)\n"
337
- f"**Overall score:** {scores.get('overall', 0)} / 5.0\n\n"
338
- f"**Per-pillar signals:**\n{pillar_md}\n\n"
339
- f"#### Gaps & quick fixes\n{_mk_gaps(gaps, limit=8)}\n\n"
340
- f"#### Supporting excerpts\n"
341
  )
342
  for s in snippets:
343
- md += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
344
- md += f"**Trusted sources:** {refs}"
345
- return md
346
 
347
 
348
  # =========================
@@ -354,33 +392,27 @@ def answer_faq_or_approach_detailed(
354
  use_uploaded_docs: bool,
355
  index_obj: Any,
356
  _matrix_unused: Any,
357
- corpus: List[Dict[str, str]],
358
  ) -> str:
359
  q = (question or "").strip()
360
  if not q:
361
  return "Please enter a question."
362
 
363
- # 1) Seeded FAQs detailed plan (looser match to trigger more often)
 
 
 
 
 
364
  q_tokens = set(tokenize(q))
365
  for item in FAQ_SEEDS:
366
  seed_tokens = set(tokenize(item["q"]))
367
- overlap = len(seed_tokens & q_tokens)
368
- if overlap >= max(1, len(seed_tokens) // 2): # >=50% overlap
369
- refs = list_refs(item.get("refs", []))
370
- base = (
371
- f"### Answer (detailed)\n"
372
  f"{item['a']}\n\n"
373
- "#### Step-by-step plan\n"
374
- "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
375
- "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
376
- "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
377
- "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
378
- "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
379
- "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
380
- "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
381
- f"**Trusted sources:** {refs}"
382
  )
383
- return base
384
 
385
  # 2) Use uploaded docs (RAG) → detailed synthesized answer
386
  if use_uploaded_docs and index_obj is not None and corpus:
@@ -394,60 +426,43 @@ def answer_faq_or_approach_detailed(
394
  snippets.append({
395
  "file": item["file"],
396
  "relevance": float(sim),
397
- "excerpt": excerpt,
398
  })
399
  if snippets:
400
  return _compose_detailed_from_snippets(q, snippets)
401
 
402
- # 3) Fallback (no docs) → generic detailed plan with citations
403
- refs = list_refs([
404
- "Azure VMware Solution (AVS)",
405
- "Azure Migrate",
406
- "Cloud Adoption Framework (CAF)",
407
- "Azure Well-Architected Framework (WAF)",
408
- "VMware HCX Docs",
409
- ])
410
- generic = (
411
  "### Answer (detailed)\n"
412
- "**TL;DR:** Use AVS/HCX or Azure Migrate depending on downtime needs; build landing zone and connectivity first, "
413
- "then migrate in waves with rollback and DR drills.\n\n"
414
- "#### Step-by-step plan\n"
415
- "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
416
- "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
417
- "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
418
- "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
419
- "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
420
- "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
421
- "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
422
  f"**Trusted sources:** {refs}"
423
  )
424
- return generic
425
 
426
 
427
  # =========================
428
- # Build index from uploaded files
429
  # =========================
430
 
431
- def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
432
- """Returns: (index_obj, matrix_placeholder, corpus, status_message)"""
433
  if not files:
434
  return None, None, [], "No files uploaded yet."
435
-
436
  corpus: List[Dict[str, str]] = []
437
  for f in files:
438
  rec = parse_file(f)
439
  if rec["text"]:
440
  corpus.append(rec)
441
-
442
  if not corpus:
443
- return None, None, [], "Uploaded files could not be parsed (no text extracted)."
444
-
445
  tokenized = [tokenize(c["text"]) for c in corpus]
446
  idx = TinyTfidfIndex()
447
  idx.add_documents(tokenized)
448
-
449
- status = f"Indexed {len(corpus)} document(s). Vocabulary size ≈ {idx.voc_size}."
450
- return idx, None, corpus, status
451
 
452
 
453
  # =========================
@@ -457,38 +472,26 @@ def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, s
457
  with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
458
  gr.Markdown(
459
  "## VMware On-Prem → Azure Local Migration Assistant\n"
460
- "- Upload your **design/migration documents** (PDF, DOCX, TXT, MD)\n"
461
- "- Ask questions. Toggle **Use uploaded docs** for RAG-based answers\n"
462
- "- Answers are **detailed** by default, with structured steps and trusted references\n"
463
  )
464
 
465
  with gr.Row():
466
  with gr.Column(scale=2):
467
- file_in = gr.Files(
468
- label="Upload documents (PDF/DOCX/TXT/MD)",
469
- file_count="multiple",
470
- type="filepath" # we will open paths ourselves
471
- )
472
  index_status = gr.Markdown("No index yet.")
473
-
474
- # Hidden/State to hold in-memory data
475
  st_index = gr.State()
476
- st_matrix = gr.State() # placeholder for API compatibility
477
  st_corpus = gr.State()
478
-
479
  build_btn = gr.Button("Build Index", variant="primary")
 
480
  with gr.Column(scale=3):
481
- question = gr.Textbox(
482
- label="Ask a question",
483
- placeholder="e.g., How do I minimize downtime for our VMware migration?",
484
- lines=3
485
- )
486
  use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
487
  ask_btn = gr.Button("Ask", variant="primary")
488
  answer_box = gr.Markdown("")
489
 
490
- # Convert gr.Files (paths) into dicts our parser expects
491
- def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
492
  out = []
493
  for p in paths or []:
494
  try:
@@ -501,19 +504,18 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
501
 
502
  def _build(files_paths: List[str]):
503
  files = _collect_files(files_paths)
504
- idx, mat, corpus, status = build_index(files)
505
- return status, idx, mat, corpus
506
 
507
  build_btn.click(
508
  _build,
509
  inputs=[file_in],
510
- outputs=[index_status, st_index, st_matrix, st_corpus],
511
  )
512
 
513
  ask_btn.click(
514
  answer_faq_or_approach_detailed,
515
  inputs=[question, use_docs, st_index, st_matrix, st_corpus],
516
- outputs=[answer_box],
517
  )
518
 
519
  if __name__ == "__main__":
 
3
 
4
  """
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
+ - No external API calls. No scikit-learn.
7
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
8
+ - Ask questions; get RELIABLE, DETAILED answers:
9
+ • Concept KB (for definitions like “What is Azure Arc-enabled SDN?”)
10
+ • RAG on uploaded docs (excerpts + gaps/fixes)
11
+ • Seeded FAQs (migration flows)
12
  Run locally:
13
  pip install gradio PyPDF2 python-docx
14
  python app.py
 
18
  import io
19
  import re
20
  import math
21
+ from typing import List, Tuple, Dict, Any, Optional
22
  from collections import Counter, defaultdict
23
 
24
  import gradio as gr
25
 
26
+ # -------------------------
27
+ # Optional parsers (graceful fallback)
28
+ # -------------------------
29
  try:
30
+ import PyPDF2 # often present on Spaces
31
  except Exception:
32
  PyPDF2 = None
33
 
 
42
  # =========================
43
 
44
  TRUSTED_SOURCES: List[Tuple[str, str]] = [
45
+ ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
46
+ ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
47
+ ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
48
  ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
49
  ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
50
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
 
89
  _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
90
 
91
  def tokenize(text: str) -> List[str]:
92
+ return [t.lower() for t in _WORD_RE.findall(text or "")]
 
 
93
 
94
  def list_refs(ref_names: List[str]) -> str:
95
  links = []
 
101
 
102
 
103
  # =========================
104
+ # Tiny TF-IDF Index (no sklearn)
105
  # =========================
106
 
107
  class TinyTfidfIndex:
 
114
 
115
  def add_documents(self, tokenized_docs: List[List[str]]):
116
  self.docs = tokenized_docs[:]
 
117
  self.df = Counter()
118
  for toks in self.docs:
119
  self.df.update(set(toks))
120
  N = max(1, len(self.docs))
121
  self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
122
  self.voc_size = len(self.idf)
 
123
  self.doc_norms = []
124
  for toks in self.docs:
125
  tf = Counter(toks)
 
140
  v[term] = (cnt / total) * idf
141
  return v
142
 
143
+ def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
144
  if not self.docs:
145
  return []
146
  qv = self._vec(tokenize(text))
 
160
 
161
 
162
  # =========================
163
+ # Rubric for RAG-tailoring
164
  # =========================
165
 
166
  CHECKS = [
167
+ {"id": "landing_zone", "desc": "Landing zone defined.", "fix": "Use CAF blueprints.", "keywords": ["landing", "hub", "spoke", "policy", "rbac"], "pillar": "governance"},
168
+ {"id": "connectivity", "desc": "Connectivity planned.", "fix": "Verify ER/VPN, DNS, MTU.", "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx"], "pillar": "networking"},
169
+ {"id": "migrate_tooling","desc": "Tooling chosen.", "fix": "Run Azure Migrate discovery.", "keywords": ["migrate", "discovery", "assessment", "hcx"], "pillar": "operations"},
170
+ {"id": "security", "desc": "Security configured.", "fix": "Enable Key Vault, Defender, Sentinel, MFA.", "keywords": ["vault", "defender", "sentinel", "mfa", "identity"], "pillar": "security"},
171
+ {"id": "dr_backup", "desc": "Backups/DR defined.", "fix": "Set RTO/RPO; test ASR.", "keywords": ["backup", "rto", "rpo", "dr", "asr"], "pillar": "reliability"},
172
+ {"id": "cost", "desc": "Cost optimization.", "fix": "Use reservations, rightsizing, tags.", "keywords": ["cost", "reservation", "savings", "tag"], "pillar": "cost"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  ]
174
 
175
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
 
197
 
198
 
199
  # =========================
200
+ # Built-in Concept KB (for definitional questions)
201
+ # =========================
202
+
203
class Concept:
    """A knowledge-base entry: a name, its alias phrases, and an answer builder.

    Attributes:
        name: Label of the concept.
        aliases: One token list per alias phrase, produced by ``tokenize``.
        builder: Callable taking the query string and returning the answer text.
    """

    def __init__(self, name: str, aliases: List[str], builder):
        token_lists = []
        for phrase in aliases:
            # Aliases are stored pre-tokenized rather than as raw phrases.
            token_lists.append(tokenize(phrase))
        self.name = name
        self.aliases = token_lists
        self.builder = builder  # function(query: str) -> str
208
+
209
def _kb_ans_azure_sdn(_: str) -> str:
    """Return the built-in Markdown answer describing Azure SDN.

    The query argument is accepted for builder-signature compatibility and is
    not used. The answer ends with links produced by ``list_refs``.
    """
    sources = list_refs(["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"])
    sections = [
        # Title and definition
        "### Azure SDN — What it is and why it matters\n",
        "**Definition:** Azure SDN is Microsoft's software-defined networking stack that centralizes network control in software, ",
        "decoupling policy and management from physical hardware. It lets you programmatically create and secure virtual networks, ",
        "subnets, microsegmentation (ACL/NSG-like policies), load balancers and gateways across Azure and Azure Local (Azure Stack HCI) environments.\n\n",
        # Capabilities
        "**Key capabilities**\n",
        "- Central, policy-driven control plane for virtual networking resources.\n",
        "- Automation & GitOps-friendly configuration for repeatable environments.\n",
        "- Microsegmentation and traffic filtering for east–west security.\n",
        "- Software load balancing and gateway services for app connectivity.\n",
        "- Consistent constructs across cloud and on-prem (with Azure Local).\n\n",
        # Mechanics
        "**How it works (high level)**\n",
        "- A software control plane programs host virtual switches and network functions.\n",
        "- Network intent (VNets, subnets, policies) is applied consistently across hosts.\n",
        "- Integrates with Azure identity/management for RBAC and governance.\n\n",
        # Use cases
        "**Common use cases**\n",
        "- Rapidly provisioning isolated app environments.\n",
        "- Enforcing zero-trust style segmentation between tiers.\n",
        "- Hybrid apps spanning Azure and Azure Local.\n\n",
        f"**Trusted sources:** {sources}",
    ]
    return "".join(sections)
232
+
233
def _kb_ans_arc_enabled_sdn(_: str) -> str:
    """Return the built-in Markdown answer describing Azure Arc-enabled SDN.

    The query argument is accepted for builder-signature compatibility and is
    not used. The answer ends with links produced by ``list_refs``.
    """
    sources = list_refs(["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"])
    sections = [
        # Title and definition
        "### Azure Arc-enabled SDN — Definition & details\n",
        "**Definition:** Azure Arc-enabled SDN brings Azure's software-defined networking to on-premises Azure Local (Azure Stack HCI) clusters, ",
        "managed through Azure Arc. It decouples network control from hardware so you can centrally define, automate, and secure ",
        "virtual networks, subnets, and policies in your datacenter using Azure-consistent tools.\n\n",
        # Motivation
        "**Why it matters**\n",
        "- Gives you Azure-like VNet constructs and policy management on-prem.\n",
        "- Enables consistent security and segmentation across hybrid estates.\n",
        "- Supports rapid, software-driven changes without touching physical fabric.\n\n",
        # Capabilities
        "**Key capabilities**\n",
        "- Create/modify on-prem VNets, subnets, and routing policies from Azure.\n",
        "- Apply microsegmentation rules (policy/ACL-style) for east–west security.\n",
        "- Software load balancing and gateway services for north–south/east–west flows.\n",
        "- Integration with Azure RBAC, tagging, and governance for change control.\n\n",
        # Components
        "**Core components (conceptual)**\n",
        "- **Arc resource bridge & agents** — connect your HCI cluster to Azure control.\n",
        "- **SDN controller & host agents** — program the Hyper-V vSwitch and network functions.\n",
        "- **Azure portal/CLI/GitOps** — define intent (VNets, subnets, policies) and deploy.\n\n",
        # Prerequisites
        "**Prerequisites (typical)**\n",
        "- Azure Local (Azure Stack HCI) cluster connected to Azure Arc.\n",
        "- Arc resource bridge onboarded; network requirements met.\n",
        "- Appropriate RBAC roles to manage networking resources.\n\n",
        # Use cases
        "**Use cases**\n",
        "- Host Azure-consistent app networks on-prem for data locality/regulatory needs.\n",
        "- Hybrid deployments with identical network constructs across Azure and HCI.\n",
        "- Rapid rollout of segmented networks for dev/test/prod without hardware changes.\n\n",
        # Caveats
        "**Notes & limitations (high level)**\n",
        "- Physical underlay still matters (IP design, routing, bandwidth, HA).\n",
        "- Feature parity with public Azure services may vary; validate per release.\n\n",
        f"**Trusted sources:** {sources}",
    ]
    return "".join(sections)
266
+
267
# Concept registry searched by lookup_concept. Each entry pairs a set of alias
# phrases with the function that renders the answer for that concept.
KB_CONCEPTS: List[Concept] = [
    Concept(
        "azure sdn",
        [
            "azure sdn",
            "software defined networking azure",
            "sdn in azure",
            "azure local sdn",
            "azure stack hci sdn",
        ],
        _kb_ans_azure_sdn,
    ),
    Concept(
        "azure arc enabled sdn",
        [
            "azure arc enabled sdn",
            "azure arc-enabled sdn",
            "arc enabled sdn",
            "arc-enabled sdn",
            "arc sdn",
            "azure local arc sdn",
            "azure stack hci arc sdn",
        ],
        _kb_ans_arc_enabled_sdn,
    ),
]
293
+
294
def lookup_concept(query: str) -> Optional[Concept]:
    """Return the KB concept whose alias best matches *query*, or ``None``.

    Every alias of every concept in ``KB_CONCEPTS`` is compared with the
    query's token set. An alias is a candidate only when:

    * at least half of its distinct tokens occur in the query, and
    * at least two of its tokens occur in the query (or its only token,
      for a single-token alias).

    The second rule stops a lone generic word such as "azure" from
    selecting a concept. Candidates are ranked by coverage first and by the
    number of matched tokens second, so a fully matched longer alias
    ("azure arc-enabled sdn") beats a fully matched shorter one
    ("azure sdn") instead of losing to whichever concept is listed first.

    Args:
        query: Free-text user question.

    Returns:
        The best-matching ``Concept``, or ``None`` when nothing qualifies.
    """
    q_tokens = set(tokenize(query))
    if not q_tokens:
        return None

    best: Optional[Concept] = None
    best_rank = (0.0, 0)  # (coverage of the alias, number of matched tokens)
    for concept in KB_CONCEPTS:
        for alias_tokens in concept.aliases:
            alias_set = set(alias_tokens)
            if not alias_set:
                continue
            overlap = len(q_tokens & alias_set)
            # One shared word is not enough evidence for a multi-word alias.
            if overlap < min(2, len(alias_set)):
                continue
            rank = (overlap / float(len(alias_set)), overlap)
            if rank > best_rank:
                best_rank = rank
                best = concept
    # Same coverage threshold as before: at least half of the alias matched.
    return best if best_rank[0] >= 0.5 else None
309
+
310
+
311
+ # =========================
312
+ # File Parsing
313
  # =========================
314
 
315
  def read_pdf_bytes(b: bytes) -> str:
 
317
  return ""
318
  try:
319
  reader = PyPDF2.PdfReader(io.BytesIO(b))
320
+ return "\n".join([page.extract_text() or "" for page in reader.pages])
 
 
 
 
 
 
321
  except Exception:
322
  return ""
323
 
 
340
  return ""
341
 
342
  def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
 
343
  name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
344
  data = file_obj.get("data")
345
  if data is None:
 
349
  data = fh.read()
350
  if data is None:
351
  return {"file": name, "text": ""}
 
352
  low = name.lower()
353
  if low.endswith(".pdf"):
354
  text = read_pdf_bytes(data)
355
  elif low.endswith((".docx", ".doc")):
356
  text = read_docx_bytes(data)
 
 
357
  else:
358
  text = read_text_bytes(data)
359
  return {"file": os.path.basename(name), "text": text or ""}
360
 
361
 
362
  # =========================
363
+ # Detailed Answer Composer (for RAG path)
364
  # =========================
365
 
366
  def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
367
+ combined = "\n\n".join([s.get("excerpt", "") for s in snippets])
368
+ scores, gaps = score_text_against_checks(combined)
369
+ def _mk_gaps(glist):
370
+ return "\n".join([f"- ({g['severity']}) {g['id']}: {g['fix']}" for g in glist]) or "- No major issues detected."
371
+ refs = list_refs([s[0] for s in TRUSTED_SOURCES])
372
+ details = (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  f"### Answer (detailed)\n"
374
  f"**Your question:** {query}\n\n"
375
+ f"**Summary:** Migration planning must cover landing zone, connectivity, tooling, security, DR, and cost.\n\n"
376
+ f"#### Scores\nOverall: {scores.get('overall', 0)}/5.0\n\n"
377
+ f"#### Gaps & Fixes\n{_mk_gaps(gaps)}\n\n"
378
+ f"#### Supporting Excerpts\n"
 
 
 
 
 
 
 
 
 
 
 
379
  )
380
  for s in snippets:
381
+ details += f"- {s['file']} (rel {s['relevance']:.2f}): {s['excerpt']}\n"
382
+ details += f"\n**Trusted sources:** {refs}"
383
+ return details
384
 
385
 
386
  # =========================
 
392
  use_uploaded_docs: bool,
393
  index_obj: Any,
394
  _matrix_unused: Any,
395
+ corpus: List[Dict[str, str]]
396
  ) -> str:
397
  q = (question or "").strip()
398
  if not q:
399
  return "Please enter a question."
400
 
401
+ # 0) Concept KB for definitional questions (e.g., "What is Azure Arc-enabled SDN?")
402
+ concept = lookup_concept(q)
403
+ if concept is not None:
404
+ return concept.builder(q)
405
+
406
+ # 1) Seeded FAQs → detailed plan when relevant (>=50% overlap with seed)
407
  q_tokens = set(tokenize(q))
408
  for item in FAQ_SEEDS:
409
  seed_tokens = set(tokenize(item["q"]))
410
+ if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
411
+ return (
412
+ "### Answer (detailed)\n"
 
 
413
  f"{item['a']}\n\n"
414
+ f"**Trusted sources:** {list_refs(item.get('refs', []))}"
 
 
 
 
 
 
 
 
415
  )
 
416
 
417
  # 2) Use uploaded docs (RAG) → detailed synthesized answer
418
  if use_uploaded_docs and index_obj is not None and corpus:
 
426
  snippets.append({
427
  "file": item["file"],
428
  "relevance": float(sim),
429
+ "excerpt": excerpt
430
  })
431
  if snippets:
432
  return _compose_detailed_from_snippets(q, snippets)
433
 
434
+ # 3) Fallback (no docs) → generic, but structured overview (not migration-only)
435
+ refs = list_refs(["Azure Arc (overview)", "Azure Stack HCI (Azure Local)", "Azure SDN concepts (HCI)"])
436
+ return (
 
 
 
 
 
 
437
  "### Answer (detailed)\n"
438
+ "I couldn't match a specific concept or supporting excerpts, so here's a structured overview you can refine:\n\n"
439
+ "**Definition:** Describe what the service/feature is, what problems it solves, and where it runs (Azure / Azure Local).\n\n"
440
+ "**Key capabilities:** automation, policy-driven control, security segmentation, connectivity services.\n\n"
441
+ "**How it works:** control plane programs host/network functions; policies applied consistently; integrates with RBAC/governance.\n\n"
442
+ "**Prerequisites:** identity/RBAC, connectivity to Azure (for Arc), supported host/cluster versions.\n\n"
443
+ "**Use cases:** hybrid deployments, zero-trust segmentation, rapid environment provisioning.\n\n"
 
 
 
 
444
  f"**Trusted sources:** {refs}"
445
  )
 
446
 
447
 
448
  # =========================
449
+ # Build Index
450
  # =========================
451
 
452
def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
    """Parse uploaded files and build a TF-IDF index over their text.

    Args:
        files: File dicts in the form accepted by ``parse_file``.

    Returns:
        ``(index, None, corpus, status)``. ``index`` is a ``TinyTfidfIndex``
        (or ``None`` when nothing could be indexed), the second slot is always
        ``None``, ``corpus`` holds the parsed records that contained text, and
        ``status`` is a human-readable message.
    """
    if not files:
        return None, None, [], "No files uploaded yet."

    # Keep only the records from which some text was extracted.
    corpus: List[Dict[str, str]] = [
        record for record in (parse_file(item) for item in files) if record["text"]
    ]
    if not corpus:
        return None, None, [], "No text extracted."

    token_lists = [tokenize(record["text"]) for record in corpus]
    index = TinyTfidfIndex()
    index.add_documents(token_lists)
    status = f"Indexed {len(corpus)} docs, vocab {index.voc_size}."
    return index, None, corpus, status
 
 
466
 
467
 
468
  # =========================
 
472
  with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
473
  gr.Markdown(
474
  "## VMware On-Prem → Azure Local Migration Assistant\n"
475
+ "Upload documents and ask questions. Detailed answers will be provided."
 
 
476
  )
477
 
478
  with gr.Row():
479
  with gr.Column(scale=2):
480
+ file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
 
 
 
 
481
  index_status = gr.Markdown("No index yet.")
 
 
482
  st_index = gr.State()
483
+ st_matrix = gr.State()
484
  st_corpus = gr.State()
 
485
  build_btn = gr.Button("Build Index", variant="primary")
486
+
487
  with gr.Column(scale=3):
488
+ question = gr.Textbox(label="Ask a question", placeholder="e.g., What is Azure Arc-enabled SDN, and why would I use it?")
 
 
 
 
489
  use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
490
  ask_btn = gr.Button("Ask", variant="primary")
491
  answer_box = gr.Markdown("")
492
 
493
+ # Convert gr.Files (paths) to expected dicts
494
+ def _collect_files(paths: List[str]):
495
  out = []
496
  for p in paths or []:
497
  try:
 
504
 
505
  def _build(files_paths: List[str]):
506
  files = _collect_files(files_paths)
507
+ return build_index(files)
 
508
 
509
  build_btn.click(
510
  _build,
511
  inputs=[file_in],
512
+ outputs=[index_status, st_index, st_matrix, st_corpus]
513
  )
514
 
515
  ask_btn.click(
516
  answer_faq_or_approach_detailed,
517
  inputs=[question, use_docs, st_index, st_matrix, st_corpus],
518
+ outputs=[answer_box]
519
  )
520
 
521
  if __name__ == "__main__":