ajayinsac committed
Commit f2aed85 · verified · 1 Parent(s): 32e167d

Update app.py

Files changed (1): app.py +396 -611
app.py CHANGED
@@ -1,22 +1,20 @@
 #!/usr/bin/env python3
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)

-Update: Removed scikit-learn dependency. Includes a minimal pure-Python TF-IDF
-and cosine similarity so it runs on Hugging Face Spaces without sklearn.
-
-Features
-- FAQ / approach Q&A with trusted-source citations (links)
-- Upload & index PDF/DOCX/TXT (session-local)
-- Lightweight RAG (pure-Python TF-IDF over chunks)
-- Design/Runbook auto-review with rubric (0–5) + gaps + fixes
-- All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
 """

 import os
 import io
 import re
-import json
 import math
 import time
 from typing import List, Tuple, Dict, Any
@@ -24,584 +22,342 @@ from collections import Counter, defaultdict

 import gradio as gr

-# -------- Optional, small footprint parsers --------
-# PDF
 try:
-    from pypdf import PdfReader
 except Exception:
-    PdfReader = None

-# DOCX
 try:
-    import docx
 except Exception:
     docx = None


 # =========================
-# Trusted Sources (Allowlist)
 # =========================
-TRUSTED_SOURCES = [
-    # Microsoft Learn / Docs
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
-    ("Azure Stack HCI / Azure Local", "https://learn.microsoft.com/azure-stack/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
-    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/well-architected/"),
-    # VMware
-    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/"),
-    ("VMware vSphere Docs", "https://docs.vmware.com/en/VMware-vSphere/index.html"),
-    # Security & Compliance
-    ("NIST SP 800-53", "https://csrc.nist.gov/publications/sp800-53"),
-    ("FedRAMP Baselines", "https://www.fedramp.gov/"),
-    ("IRS Publication 1075 (FTI)", "https://www.irs.gov/pub/irs-pdf/p1075.pdf"),
 ]

-# =========================
-# Ontology (Domains/Subdomains)
-# =========================
-ONTOLOGY = {
-    "Assessment": ["Inventory", "Dependencies", "Performance", "Criticality", "Readiness"],
-    "Architecture": ["Landing Zone", "Azure Local Footprint", "AVS", "Environments"],
-    "Networking": ["ExpressRoute", "VPN", "IP Plan", "DNS", "Load Balancing", "Private Link", "HCX Network"],
-    "Identity": ["Entra ID", "AD DS", "PIM", "MFA", "RBAC", "Break-Glass"],
-    "Migration": ["HCX", "Azure Migrate", "Cutover", "Rollback", "Data Sync"],
-    "Data": ["Storage", "Backup", "Snapshots", "Immutability", "Residency"],
-    "Security": ["Defender", "Sentinel", "Policy", "Purview", "Key Vault"],
-    "DR": ["ASR", "Failover", "RTO/RPO", "Runbooks", "Tests"],
-    "Ops": ["Monitor", "Log Analytics", "Patching", "Change Mgmt", "ITIL"],
-    "Cost": ["Right-Sizing", "Reservations", "Tagging", "Budgets"],
-    "Program": ["RAID", "Comms", "Training", "RACI", "Gates"],
-    "Troubleshooting": ["HCX Failures", "DNS Drift", "Identity Tokens", "Latency"],
-}
-
-# =========================
-# Heuristic Design Checks (keywords → rubric mapping)
-# =========================
-CHECKS = {
-    "security": {
-        "weight": 1.0,
-        "keywords": [
-            "Defender for Cloud", "Microsoft Defender", "Sentinel", "Key Vault", "encryption",
-            "TLS", "KMS", "HSM", "Just-In-Time", "JIT", "PIM", "MFA", "Conditional Access",
-            "Azure Policy", "Purview", "classification", "DLP", "RBAC", "least privilege"
-        ],
-        "controls": ["NIST-AC-2", "NIST-SC-13", "IRS1075 §9.3"]
-    },
-    "reliability": {
-        "weight": 1.0,
-        "keywords": [
-            "Availability Zone", "zonal", "ASR", "Site Recovery", "backup", "failover",
-            "failback", "DR drill", "runbook", "immutable", "soft delete", "RTO", "RPO"
-        ],
-    },
-    "performance": {
-        "weight": 1.0,
-        "keywords": [
-            "right-size", "IOPS", "latency", "throughput", "benchmark", "autoscale",
-            "SKU", "Managed Disks", "Premium SSD", "Ultra", "Standard SSD"
-        ],
-    },
-    "operations": {
-        "weight": 1.0,
-        "keywords": [
-            "Azure Monitor", "Log Analytics", "alerts", "workbooks", "patch", "change management",
-            "incident", "problem", "request", "ITIL", "configuration drift"
-        ],
-    },
-    "cost": {
-        "weight": 1.0,
-        "keywords": [
-            "reservation", "Reserved Instances", "Savings Plan", "spot",
-            "tagging", "chargeback", "showback", "budget", "cost anomaly"
-        ],
-    },
-    "networking": {
-        "weight": 1.0,
-        "keywords": [
-            "ExpressRoute", "ER", "VPN", "BGP", "MTU", "NSG", "ASG", "UDR", "Private Link",
-            "DNS", "DHCP", "load balancer", "hub and spoke", "landing zone network"
-        ],
-    },
-    "identity": {
-        "weight": 1.0,
-        "keywords": [
-            "Entra ID", "Azure AD", "Active Directory", "domain trust", "AADDS",
-            "Conditional Access", "PIM", "break-glass", "least privilege"
-        ],
-    },
-    "migration": {
-        "weight": 1.0,
-        "keywords": [
-            "HCX", "vMotion", "RAV", "Azure Migrate", "replication", "Mobility Group",
-            "cutover", "rollback", "pilot", "wave"
-        ],
-    },
-    "architecture": {
-        "weight": 1.0,
-        "keywords": [
-            "Landing Zone", "hub", "spoke", "policy", "RBAC", "naming",
-            "AVS", "Azure Local", "Azure Stack HCI", "Local Zone"
-        ],
-    },
-}
-
-# =========================
-# FAQ seeds (concise, cite trusted links)
-# =========================
-FAQ_SEEDS = [
     {
-        "q": "How do we migrate VMware workloads to Azure Local?",
         "a": (
-            "Typical paths are **Azure VMware Solution (AVS)** with **HCX** (bulk/RAV/vMotion) or "
-            "**Azure Migrate** for discovery, assessment, and server/db/web migration. "
-            "Establish a governed **Landing Zone** (hub/spoke, Policy, RBAC), plan ExpressRoute/VPN, "
-            "pilot a few VMs, then cut over in waves with rollback plans. "
-            "See AVS, Azure Migrate, and CAF for prescriptive guidance."
         ),
-        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"]
     },
     {
-        "q": "What downtime should we expect?",
         "a": (
-            "Depends on method and app architecture. **HCX vMotion** can provide minimal downtime; "
-            "**HCX RAV** and **bulk migration** usually require short cutover windows. "
-            "Always pilot, measure replication lag, and agree on a timeboxed backout."
         ),
-        "refs": ["VMware HCX Docs"]
     },
     {
-        "q": "How do we meet IRS Pub 1075 and NIST controls?",
         "a": (
-            "Map design controls to frameworks: enforce least privilege (RBAC/PIM/MFA), "
-            "encrypt at rest/in transit (Key Vault/HSM, TLS), centralize telemetry (Sentinel), "
-            "and document evidence (policies, runbooks, DR tests). Use CAF/WAF security pillars."
         ),
-        "refs": ["IRS Publication 1075 (FTI)", "NIST SP 800-53", "Azure Well-Architected Framework (WAF)"]
-    },
-    {
-        "q": "ExpressRoute or VPN?",
-        "a": (
-            "**ExpressRoute** is preferred for predictable performance and private connectivity; "
-            "VPN is fine for initial testing or lower-throughput needs. Many designs use both "
-            "for redundancy and phased cutover."
-        ),
-        "refs": ["Cloud Adoption Framework (CAF)"]
     },
 ]

 # =========================
-# Minimal Pure-Python TF-IDF
 # =========================
-STOPWORDS = set("""
-a an the and or but if then else for from to in on at by of with without into within over under not be is are was were will can should would could may might
-this that these those there here when where how what why who whom which as it its itself themselves ourselves yourself yourselves
-""".split())

-TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")

 def tokenize(text: str) -> List[str]:
-    return [w.lower() for w in TOKEN_RE.findall(text) if w and w.lower() not in STOPWORDS]

 class TinyTfidfIndex:
     def __init__(self):
         self.docs: List[List[str]] = []
-        self.doc_vectors: List[Dict[str, float]] = []
-        self.doc_norms: List[float] = []
         self.idf: Dict[str, float] = {}
-        self.N = 0
-        self.corpus_meta: List[Dict[str, str]] = []
-
-    def fit(self, texts: List[str], meta: List[Dict[str, str]]):
-        self.docs = [tokenize(t) for t in texts]
-        self.N = len(self.docs)
-        self.corpus_meta = meta

         # document frequency
-        df = Counter()
-        for doc in self.docs:
-            df.update(set(doc))
-        # idf
-        self.idf = {}
-        for term, dfi in df.items():
-            # add-1 smoothing to avoid div by zero, +1 offset
-            self.idf[term] = 1.0 + math.log((self.N + 1) / (dfi + 1))
-
-        # build doc vectors
-        self.doc_vectors = []
         self.doc_norms = []
-        for doc in self.docs:
-            tf = Counter(doc)
-            vec = {}
             for term, cnt in tf.items():
-                vec[term] = (cnt / max(1, len(doc))) * self.idf.get(term, 0.0)
-            norm = math.sqrt(sum(v * v for v in vec.values())) or 1e-12
-            self.doc_vectors.append(vec)
-            self.doc_norms.append(norm)
-
-    def query(self, text: str, k: int = 4) -> List[Tuple[int, float]]:
-        qtokens = tokenize(text)
-        if not qtokens or self.N == 0:
-            return []
-        tf = Counter(qtokens)
-        qvec = {}
         for term, cnt in tf.items():
-            qvec[term] = (cnt / max(1, len(qtokens))) * self.idf.get(term, 0.0)
-        qnorm = math.sqrt(sum(v * v for v in qvec.values())) or 1e-12
-
-        # cosine against each doc
-        scores = []
-        for i, dvec in enumerate(self.doc_vectors):
-            dot = 0.0
-            # iterate over smaller dict for speed
-            if len(qvec) < len(dvec):
-                for t, v in qvec.items():
-                    if t in dvec:
-                        dot += v * dvec[t]
-            else:
-                for t, v in dvec.items():
-                    if t in qvec:
-                        dot += v * qvec[t]
-            sim = dot / (qnorm * self.doc_norms[i])
-            scores.append((i, sim))
-        scores.sort(key=lambda x: x[1], reverse=True)
-        return scores[:k]

 # =========================
-# Utilities: text extraction & chunking
 # =========================
-def extract_text_from_pdf(fileobj: io.BytesIO) -> str:
-    if PdfReader is None:
         return ""
     try:
-        reader = PdfReader(fileobj)
-        parts = []
         for page in reader.pages:
-            txt = page.extract_text() or ""
-            parts.append(txt)
-        return "\n".join(parts)
     except Exception:
         return ""

-def extract_text_from_docx(fileobj: io.BytesIO) -> str:
-    if docx is None:
-        return ""
-    try:
-        document = docx.Document(fileobj)
-        return "\n".join([p.text for p in document.paragraphs])
-    except Exception:
         return ""
-
-def extract_text_from_txt(fileobj: io.BytesIO) -> str:
     try:
-        return fileobj.read().decode("utf-8", errors="ignore")
     except Exception:
         return ""

-def read_file_to_text(file: gr.File) -> Tuple[str, str]:
-    """
-    Returns (text, filename)
-    """
-    if file is None:
-        return "", ""
-    name = os.path.basename(file.name) if file.name else "uploaded"
-    with open(file.name, "rb") as f:
-        raw = f.read()
-    ext = (name.split(".")[-1] or "").lower()
-    bio = io.BytesIO(raw)
-    if ext in ["pdf"]:
-        txt = extract_text_from_pdf(bio)
-    elif ext in ["docx"]:
-        txt = extract_text_from_docx(bio)
-    elif ext in ["txt"]:
-        txt = extract_text_from_txt(bio)
-    else:
-        txt = ""
-    return txt, name
-
-def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
-    """
-    Simple sliding window chunker by characters; robust and fast.
-    """
-    text = re.sub(r"\s+", " ", text).strip()
-    chunks = []
-    i = 0
-    n = len(text)
-    while i < n:
-        j = min(i + max_len, n)
-        chunk = text[i:j]
-        if chunk:
-            chunks.append(chunk)
-        i = j - overlap
-        if i < 0:
-            i = 0
-        if i >= n:
-            break
-    return chunks

-# =========================
-# RAG Index (session-scoped)
-# =========================
-class RagState:
-    def __init__(self):
-        self.index = None   # TinyTfidfIndex
-        self.corpus = None  # list of dicts with text/meta

-def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
     """
-    Build a tiny TF-IDF index over all chunks from uploaded documents.
-    Returns: (index_obj, None, chunks_with_meta) to keep signature compatible.
     """
-    all_chunks = []
-    meta = []
-    if not files:
-        return None, None, None
-    for f in files:
-        txt, fname = read_file_to_text(f)
-        if not txt.strip():
-            continue
-        chunks = chunk_text(txt)
-        for c in chunks:
-            all_chunks.append(c)
-            meta.append({"file": os.path.basename(f.name), "snippet": c[:120] + ("..." if len(c) > 120 else "")})
-    if not all_chunks:
-        return None, None, None

-    idx = TinyTfidfIndex()
-    idx.fit(all_chunks, meta)
-    corpus = [{"text": t, **m} for t, m in zip(all_chunks, meta)]
-    return idx, None, corpus

-def retrieve_answer(
-    query: str,
-    index_obj: Any,
-    _matrix_unused: Any,
-    corpus: List[Dict[str, str]],
-    k: int = 4
-) -> Tuple[str, List[Dict[str, str]]]:
-    """
-    Return synthesized answer + top-k supporting chunks with filenames.
-    """
-    if not query or index_obj is None or not corpus:
-        return "", []
-    top = index_obj.query(query, k=k)
-    snippets = []
-    for i, sim in top:
-        item = corpus[i]
-        snippets.append({
-            "file": item["file"],
-            "relevance": float(sim),
-            "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
-        })
-    answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
-    for s in snippets:
-        answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
-    answer += "Tip: Ask a follow-up like “Summarize the cutover plan” or “List missing security controls.”"
-    return answer, snippets

 # =========================
-# Design / Runbook Auto-Review
 # =========================
-def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
-    text_low = text.lower()
-
-    pillar_scores = {}
-    gaps = []
-
-    for pillar, cfg in CHECKS.items():
-        hits = 0
-        kws = cfg["keywords"]
-        for kw in kws:
-            if kw.lower() in text_low:
-                hits += 1
-        coverage = hits / max(1, len(kws))
-        score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2)  # baseline 1.5, up to 5.0
-        pillar_scores[pillar] = score
-
-        if pillar == "networking":
-            if "expressroute".lower() not in text_low and "er " not in text_low:
-                gaps.append({
-                    "id": "NET-ER-001",
-                    "severity": "High",
-                    "desc": "ExpressRoute (ER) not referenced; consider ER for predictable private connectivity.",
-                    "fix": "Design dual ER circuits with diverse POPs; fall back to VPN during pilot."
-                })
-            if "dns" not in text_low:
-                gaps.append({
-                    "id": "NET-DNS-002",
-                    "severity": "Med",
-                    "desc": "DNS plan not mentioned; risk of name resolution drift post-cutover.",
-                    "fix": "Document forwarders/zones, conditional forwarding, and DNS cutover sequencing."
-                })
-            if "mtu" not in text_low and "hcx" in text_low:
-                gaps.append({
-                    "id": "NET-MTU-003",
-                    "severity": "Med",
-                    "desc": "HCX present but MTU tuning not referenced.",
-                    "fix": "Validate path MTU for HCX tunnels; align NSX/physical network settings."
-                })
-
-        if pillar == "identity":
-            if "pim" not in text_low:
-                gaps.append({
-                    "id": "ID-PIM-004",
-                    "severity": "Med",
-                    "desc": "No mention of Privileged Identity Management (PIM).",
-                    "fix": "Enable PIM for admin roles; require approvals/justification; enforce MFA."
-                })
-            if "break-glass" not in text_low:
-                gaps.append({
-                    "id": "ID-BG-005",
-                    "severity": "Low",
-                    "desc": "No break-glass account reference.",
-                    "fix": "Create monitored break-glass accounts with strong controls and regular review."
-                })
-
-        if pillar == "security":
-            if "key vault" not in text_low and "hsm" not in text_low:
-                gaps.append({
-                    "id": "SEC-KEY-006",
-                    "severity": "High",
-                    "desc": "Key management not described.",
-                    "fix": "Use Azure Key Vault (HSM-backed if needed); rotate secrets/keys; restrict access via RBAC."
-                })
-            if "sentinel" not in text_low:
-                gaps.append({
-                    "id": "SEC-SIEM-007",
-                    "severity": "Med",
-                    "desc": "SIEM not referenced.",
-                    "fix": "Onboard to Microsoft Sentinel; define data connectors and incident processes."
-                })
-            if "policy" not in text_low:
-                gaps.append({
-                    "id": "SEC-POL-008",
-                    "severity": "Med",
-                    "desc": "Azure Policy governance not mentioned.",
-                    "fix": "Attach ALZ policies/initiatives for guardrails (encryption, tags, allowed locations, SKUs)."
-                })
-
-        if pillar == "reliability":
-            if ("asr" not in text_low) and ("site recovery" not in text_low):
-                gaps.append({
-                    "id": "REL-ASR-009",
-                    "severity": "Med",
-                    "desc": "No DR replication tool referenced.",
-                    "fix": "Use Azure Site Recovery (ASR) or HCX DR for failover/failback; schedule DR drills."
-                })
-            if "backup" not in text_low and "recovery services vault" not in text_low:
-                gaps.append({
-                    "id": "REL-BKP-010",
-                    "severity": "High",
-                    "desc": "Backup strategy not captured.",
-                    "fix": "Configure Azure Backup with immutable storage and soft delete; test restores."
-                })
-            if ("rto" not in text_low) or ("rpo" not in text_low):
-                gaps.append({
-                    "id": "REL-RTORPO-011",
-                    "severity": "Med",
-                    "desc": "RTO/RPO targets not documented.",
-                    "fix": "Define business-aligned RTO/RPO and validate during pilot/cutover."
-                })
-
-        if pillar == "architecture":
-            if ("landing zone" not in text_low) and ("landing-zone" not in text_low):
-                gaps.append({
-                    "id": "ARC-ALZ-012",
-                    "severity": "High",
-                    "desc": "Azure Landing Zone baseline not referenced.",
-                    "fix": "Adopt ALZ (hub/spoke, Policy, RBAC, logging) before migration waves."
-                })
-
-        if pillar == "migration":
-            if ("rollback" not in text_low) and ("backout" not in text_low):
-                gaps.append({
-                    "id": "MIG-ROLL-013",
-                    "severity": "High",
-                    "desc": "Rollback/backout path not documented.",
-                    "fix": "Document clear backout steps and timebox for each wave; test in pilot."
-                })
-            if "pilot" not in text_low:
-                gaps.append({
-                    "id": "MIG-PILOT-014",
-                    "severity": "Med",
-                    "desc": "No pilot mentioned.",
-                    "fix": "Execute a pilot with representative workloads; capture metrics and lessons."
-                })
-
-        if pillar == "cost":
-            if "tag" not in text_low:
-                gaps.append({
-                    "id": "COST-TAG-015",
-                    "severity": "Med",
-                    "desc": "Tagging strategy absent (owner, env, app).",
-                    "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
-                })
-
-    if pillar_scores:
-        overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
-    else:
-        overall = 0.0

-    if overall < 3.5:
-        gaps.insert(0, {
-            "id": "SUMMARY",
-            "severity": "Info",
-            "desc": f"Overall score is {overall}. Focus first on High-severity gaps.",
-            "fix": "Prioritize ER/DNS/Backup/ALZ/PIM/Key Vault where missing; re-run the check after updates."
-        })

-    return {"overall": overall, **pillar_scores}, gaps

-def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
-    if not files:
-        return "Please upload at least one PDF/DOCX/TXT.", {}, []

-    text_full = []
-    file_list = []
-    for f in files:
-        txt, fname = read_file_to_text(f)
-        if txt.strip():
-            text_full.append(txt)
-            file_list.append(os.path.basename(f.name))
-    if not text_full:
-        return "Could not parse text from the provided files.", {}, []
-
-    combined = "\n\n".join(text_full)
-    scores, gaps = score_text_against_checks(combined)
-
-    md = f"### Design/Runbook Review\n"
-    md += f"**Files analyzed:** {', '.join(file_list)}\n\n"
-    md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
-    md += "**Per-Pillar Scores:**\n\n"
-    for k, v in scores.items():
-        if k == "overall":
             continue
-        md += f"- **{k.capitalize()}**: {v}\n"
-    md += "\n**Top Recommendations:**\n"
-    for g in gaps[:6]:
-        md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
-
-    result_json = {
-        "timestamp": int(time.time()),
-        "files": file_list,
-        "scores": scores,
-        "gaps": gaps
-    }
-    table_rows = [[g["id"], g["severity"], g["desc"], g["fix"]] for g in gaps]
-    return md, result_json, table_rows

-# =========================
-# Q&A Logic
-# =========================
-def list_refs(ref_names: List[str]) -> str:
-    links = []
-    for nm in ref_names:
-        hit = [x for x in TRUSTED_SOURCES if x[0] == nm]
-        if hit:
-            links.append(f"[{nm}]({hit[0][1]})")
-    return " | ".join(links)

-def answer_faq_or_approach(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,
@@ -612,32 +368,45 @@ def answer_faq_or_approach(
     if not q:
         return "Please enter a question."

-    # First try seeded FAQs (very light semantic: keyword match)
     for item in FAQ_SEEDS:
-        # simple heuristic: overlap of first few tokens
         seed_tokens = set(tokenize(item["q"])[:3])
         q_tokens = set(tokenize(q))
         if seed_tokens and seed_tokens.issubset(q_tokens):
-            return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"

-    # If requested, try RAG on uploaded docs
     if use_uploaded_docs and index_obj is not None and corpus:
-        rag_answer, _snips = retrieve_answer(q, index_obj, None, corpus, k=4)
-        if rag_answer.strip():
-            refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
-            return f"{rag_answer}\n\n**Trusted sources:** {refs}"
-
-    # Fallback generic approach with citations
-    generic = (
-        "**Suggested approach:**\n"
-        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging).\n"
-        "2) Establish **ExpressRoute/VPN** and DNS plans; validate MTU if using **HCX**.\n"
-        "3) Run **Azure Migrate** discovery/assessment; classify (rehost/refactor/modernize).\n"
-        "4) Pilot 2–3 VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
-        "5) Define **RTO/RPO**, backup, and **ASR**/DR drills; document rollback.\n"
-        "6) Onboard to **Defender/Sentinel**, enforce **Key Vault** and **PIM/MFA**.\n"
-        "7) Optimize cost (right-size, reservations) and tag everything.\n"
-    )
     refs = list_refs([
         "Azure VMware Solution (AVS)",
         "Azure Migrate",
@@ -645,98 +414,114 @@ def answer_faq_or_approach(
         "Azure Well-Architected Framework (WAF)",
         "VMware HCX Docs"
     ])
-    return f"{generic}\n**Trusted sources:** {refs}"

 # =========================
-# Gradio UI
 # =========================
-with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
-    gr.Markdown(
-        "# VMware On-Prem → Azure Local Migration Assistant\n"
-        "Ask questions, upload migration/design documents for review, and get recommendations.\n"
-        "_Sources: Microsoft Learn/Docs, VMware Docs, NIST, IRS Pub 1075 (linked below)._"
-    )

-    # Session state for RAG
-    st_index = gr.State(None)   # TinyTfidfIndex
-    st_matrix = gr.State(None)  # kept for signature compatibility
-    st_corpus = gr.State(None)
-
-    with gr.Tabs():
-        with gr.Tab("Ask Anything"):
-            with gr.Row():
-                question = gr.Textbox(
-                    label="Your question (FAQs, approach, troubleshooting)",
-                    placeholder="e.g., How do I plan a pilot with HCX RAV and ensure minimal downtime?"
-                )
-            use_docs = gr.Checkbox(label="Also search my uploaded documents (if any)", value=True)
-            ask_btn = gr.Button("Answer")
-            answer_box = gr.Markdown()
-
-        with gr.Tab("Upload & Review Design"):
-            gr.Markdown("Upload **PDF / DOCX / TXT** (multiple allowed). Then build the index and/or run a review.")
-            files = gr.File(file_count="multiple", file_types=[".pdf", ".docx", ".txt"], label="Upload documents")
-            with gr.Row():
-                build_btn = gr.Button("Build/Refresh Search Index")
-                review_btn = gr.Button("Run Design/Runbook Review")
-            index_info = gr.Markdown()
-            review_md = gr.Markdown()
-            review_json = gr.JSON()
-            gaps_table = gr.Dataframe(
-                headers=["Gap ID", "Severity", "Description", "Fix"],
-                datatype=["str", "str", "str", "str"],
-                interactive=False,
-                label="Gaps & Recommendations"
-            )

-        with gr.Tab("Trusted Sources & Ontology"):
-            gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
-            links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
-            gr.Markdown(links_md)
-
-            gr.Markdown("### Knowledge Taxonomy (Domains → Subdomains)")
-            onto_str = ""
-            for dom, subs in ONTOLOGY.items():
-                onto_str += f"- **{dom}**: {', '.join(subs)}\n"
-            gr.Markdown(onto_str)
-
-            gr.Markdown(
-                "### Notes\n"
-                "- This app does **not** call external APIs. Use the links above for official guidance.\n"
-                "- Design checks are heuristic; always validate against your Architecture Board and security teams."
-            )

-    # ====== Wiring ======
-    def on_build_index(files_list):
-        idx, _X, cor = build_index(files_list)
-        if idx is None:
-            return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
-                    None, None, None)
-        msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
-        return msg, idx, None, cor

-    build_btn.click(
-        on_build_index,
-        inputs=[files],
-        outputs=[index_info, st_index, st_matrix, st_corpus]
     )

-    def on_review(files_list):
-        md, js, table = review_uploaded_docs(files_list)
-        return md, js, table

-    review_btn.click(
-        on_review,
-        inputs=[files],
-        outputs=[review_md, review_json, gaps_table]
     )

     ask_btn.click(
-        answer_faq_or_approach,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
         outputs=[answer_box]
     )

-# Standard HF Spaces entrypoint
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))

 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
+- Works on Hugging Face Spaces (no external API calls, no sklearn).
+- Upload design/migration docs (PDF/DOCX/TXT/MD).
+- Ask questions; get DETAILED, structured answers with excerpts + trusted refs.

+Run locally:
+    pip install gradio PyPDF2 python-docx
+    python app.py
 """

 import os
 import io
 import re
 import math
 import time
 from typing import List, Tuple, Dict, Any
 from collections import Counter, defaultdict

 import gradio as gr
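+# Assumption: on Hugging Face Spaces the optional parsers below come from
+# requirements.txt (PyPDF2, python-docx); the guarded imports keep the app
+# running even when they are missing.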

+# Optional parsers (gracefully degrade if not installed on Spaces)
 try:
+    import PyPDF2  # lightweight; often available on Spaces
 except Exception:
+    PyPDF2 = None

 try:
+    import docx  # python-docx
 except Exception:
     docx = None


 # =========================
+# Trusted sources & FAQ seeds
 # =========================
+
+TRUSTED_SOURCES: List[Tuple[str, str]] = [
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
+    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
+    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html")
 ]

+FAQ_SEEDS: List[Dict[str, Any]] = [
     {
+        "q": "How do we migrate VMware workloads to Azure with minimal downtime?",
         "a": (
+            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
+            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
+            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
         ),
+        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"]
     },
     {
+        "q": "What is a recommended migration sequence?",
         "a": (
+            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
+            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
+            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
         ),
+        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
     },
     {
+        "q": "How do we plan DR and backups?",
         "a": (
+            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
+            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
         ),
+        "refs": ["Azure Well-Architected Framework (WAF)"]
     },
 ]

+
 # =========================
+# Utilities
 # =========================

+_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")  # keep URLs/paths/ids mostly intact

 def tokenize(text: str) -> List[str]:
+    if not text:
+        return []
+    return [t.lower() for t in _WORD_RE.findall(text)]
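+# Illustrative check: the character class above keeps URLs intact, e.g.
+# tokenize("See https://learn.microsoft.com/azure/migrate/") returns
+# ["see", "https://learn.microsoft.com/azure/migrate/"].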
+
+def list_refs(ref_names: List[str]) -> str:
+    links = []
+    for nm in ref_names:
+        hit = [x for x in TRUSTED_SOURCES if x[0] == nm]
+        if hit:
+            links.append(f"[{nm}]({hit[0][1]})")
+    return " | ".join(links) if links else ""
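+# For example, list_refs(["Azure Migrate"]) renders
+# "[Azure Migrate](https://learn.microsoft.com/azure/migrate/)".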
+
+
+# =========================
+# Tiny TF-IDF implementation (no sklearn)
+# =========================

 class TinyTfidfIndex:
     def __init__(self):
         self.docs: List[List[str]] = []
+        self.df: Counter = Counter()
         self.idf: Dict[str, float] = {}
+        self.doc_norms: List[float] = []
+        self.voc_size = 0

+    def add_documents(self, tokenized_docs: List[List[str]]):
+        self.docs = tokenized_docs[:]
         # document frequency
+        self.df = Counter()
+        for toks in self.docs:
+            self.df.update(set(toks))
+        N = max(1, len(self.docs))
+        self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
+        self.voc_size = len(self.idf)
+        # precompute norms
         self.doc_norms = []
+        for toks in self.docs:
+            tf = Counter(toks)
+            norm_sq = 0.0
             for term, cnt in tf.items():
+                w = (cnt / max(1, len(toks))) * self.idf.get(term, 0.0)
+                norm_sq += w * w
+            self.doc_norms.append(math.sqrt(norm_sq))
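+    # Worked example (illustrative): with N = 4 documents and a term present in
+    # df = 1 of them, idf = log((4 + 1) / (1 + 1)) + 1.0 ≈ 1.92, while a term in
+    # all four gets idf = log(5 / 5) + 1.0 = 1.0, so rarer terms weigh almost
+    # twice as much in the cosine scores.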
+
+    def _vec(self, toks: List[str]) -> Dict[str, float]:
+        tf = Counter(toks)
+        total = max(1, len(toks))
+        v = {}
         for term, cnt in tf.items():
+            idf = self.idf.get(term)
+            if idf is None:
+                continue
+            v[term] = (cnt / total) * idf
+        return v
+
+    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
+        if not self.docs:
+            return []
+        qv = self._vec(tokenize(text))
+        # cosine similarity
+        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
+        sims: List[Tuple[int, float]] = []
+        for i, toks in enumerate(self.docs):
+            dv = Counter(toks)  # term-frequency counter for this document
+            num = 0.0
+            for term in qv:
+                if term in dv:
+                    # TF-IDF weight for the matching document term
+                    w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
+                    num += qv[term] * w_d
+            denom = (self.doc_norms[i] or 1e-9) * q_norm
+            sims.append((i, num / denom))
+        sims.sort(key=lambda x: x[1], reverse=True)
+        return sims[:k]
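+# Minimal usage sketch (illustrative, not executed by the app):
+#   idx = TinyTfidfIndex()
+#   idx.add_documents([tokenize("hcx vmotion cutover"), tokenize("azure migrate discovery")])
+#   idx.query("plan the hcx cutover", k=1)  # -> [(0, <cosine similarity>)]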
+
+
+# =========================
+# Simple scoring rubric to tailor the detailed output
+# =========================
+
+CHECKS = [
+    {
+        "id": "landing_zone",
+        "desc": "Landing zone defined (hub/spoke, Policy, RBAC, logging).",
+        "fix": "Use CAF blueprints; enforce Policy for guardrails and RBAC.",
+        "keywords": ["landing", "hub", "spoke", "policy", "rbac", "log", "monitor"],
+        "pillar": "governance",
+    },
+    {
+        "id": "connectivity",
+        "desc": "Connectivity planned (ExpressRoute/VPN), DNS, MTU validated for HCX.",
+        "fix": "Verify ER/VPN, DNS resolution, and HCX MTU/mobility settings.",
+        "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx", "connectivity"],
+        "pillar": "networking",
+    },
+    {
+        "id": "migrate_tooling",
+        "desc": "Discovery/assessment and tooling chosen (Azure Migrate or HCX).",
+        "fix": "Run Azure Migrate discovery; select HCX or Azure Migrate per downtime.",
+        "keywords": ["azure", "migrate", "discovery", "assessment", "hcx", "replication"],
+        "pillar": "operations",
+    },
+    {
+        "id": "security",
+        "desc": "Security/identity configured (Key Vault, Defender, Sentinel, PIM/MFA).",
+        "fix": "Centralize secrets in Key Vault; enable Defender/Sentinel; enforce PIM/MFA.",
+        "keywords": ["key", "vault", "defender", "sentinel", "pim", "mfa", "entra", "aad", "identity"],
+        "pillar": "security",
+    },
+    {
+        "id": "dr_backup",
+        "desc": "Backups, DR, RTO/RPO defined; ASR drills planned.",
+        "fix": "Set RTO/RPO; immutability & soft-delete; test ASR failover/failback.",
+        "keywords": ["backup", "rto", "rpo", "dr", "asr", "failover", "restore"],
+        "pillar": "reliability",
+    },
+    {
+        "id": "cost",
+        "desc": "Cost optimization plan (right-sizing, reservations, tagging).",
+        "fix": "Use reservations/Savings Plans, rightsizing, and enforce tags.",
+        "keywords": ["cost", "reservation", "savings", "right", "tag"],
+        "pillar": "cost",
+    },
+]
+
+def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
+    toks = set(tokenize(text))
+    scores = defaultdict(float)
+    gaps = []  # checks that did not match
+    for chk in CHECKS:
+        matched = any(kw in toks for kw in chk["keywords"])
+        if matched:
+            scores["overall"] += 1.0
+            scores[chk["pillar"]] += 1.0
+        else:
+            gaps.append({
+                "id": chk["id"],
+                "desc": chk["desc"],
+                "fix": chk["fix"],
+                "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
+            })
+    # normalize roughly to 0-5 scale
+    max_possible = float(len(CHECKS))
+    scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
+    for k in list(scores.keys()):
+        if k != "overall":
+            scores[k] = round(scores[k], 2)
+    return scores, gaps
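+# Illustrative: score_text_against_checks("dns and backup are covered") matches
+# only the connectivity and dr_backup checks, so overall = round(5.0 * 2 / 6, 2)
+# = 1.67 and the remaining four checks come back as gaps.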
+

 # =========================
+# File parsing
 # =========================
+
+def read_pdf_bytes(b: bytes) -> str:
+    if not PyPDF2:
         return ""
     try:
+        reader = PyPDF2.PdfReader(io.BytesIO(b))
+        out = []
         for page in reader.pages:
+            try:
+                out.append(page.extract_text() or "")
+            except Exception:
+                pass
+        return "\n".join(out)
     except Exception:
         return ""

+def read_docx_bytes(b: bytes) -> str:
+    if not docx:
         return ""
     try:
+        f = io.BytesIO(b)
+        d = docx.Document(f)
+        return "\n".join(p.text for p in d.paragraphs)
     except Exception:
         return ""

+def read_text_bytes(b: bytes) -> str:
+    # best-effort decoding
+    for enc in ("utf-8", "utf-16", "latin-1"):
+        try:
+            return b.decode(enc)
+        except Exception:
+            continue
+    return ""
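+# Note: "latin-1" can decode any byte sequence, so the loop above effectively
+# never falls through to return "" for non-empty input; it trades decoding
+# fidelity for robustness.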

+def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
     """
+    Returns {"file": <name>, "text": <extracted_text>}
     """
+    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
+    data = file_obj.get("data")
+    if data is None:
+        # gradio sometimes provides a path instead
+        path = file_obj.get("path")
+        if path and os.path.exists(path):
+            with open(path, "rb") as fh:
+                data = fh.read()
+    if data is None:
+        return {"file": name, "text": ""}
+
+    low = name.lower()
+    text = ""
+    if low.endswith(".pdf"):
+        text = read_pdf_bytes(data)
+    elif low.endswith(".docx") or low.endswith(".doc"):
+        text = read_docx_bytes(data)
+    elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
+        text = read_text_bytes(data)
+    else:
+        # try plain text as fallback
+        text = read_text_bytes(data)

+    return {"file": os.path.basename(name), "text": text or ""}
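+# Illustrative: parse_file({"name": "design.txt", "data": b"DNS and backup plan"})
+# -> {"file": "design.txt", "text": "DNS and backup plan"}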

 # =========================
+# Detailed Q&A Composer
 # =========================

+def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
+    collected = [s.get("excerpt", "") for s in snippets]
+    combined = "\n\n".join(collected)
+    scores, gaps = score_text_against_checks(combined) if combined.strip() else ({"overall": 0.0}, [])

+    def _mk_gaps(glist, limit=8):
+        out = []
+        for g in glist[:limit]:
+            out.append(f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_")
+        return "\n".join(out) if out else "- No major issues detected in the sampled excerpts."

+    refs = list_refs([
+        "Azure VMware Solution (AVS)",
+        "Azure Migrate",
+        "Cloud Adoption Framework (CAF)",
+        "Azure Well-Architected Framework (WAF)",
+        "VMware HCX Docs"
+    ])

+    pillar_lines = []
+    for k_, v_ in scores.items():
+        if k_ == "overall":
             continue
+        pillar_lines.append(f"- **{k_.capitalize()}**: {v_}")
+    pillar_md = "\n".join(pillar_lines) if pillar_lines else "- (no signals)"
+
+    md = (
+        f"### Answer (detailed)\n"
+        f"**Your question:** {query}\n\n"
+        f"**TL;DR:** Here’s a concrete plan across landing zone, connectivity, migration method, security, DR, and cost. "
+        f"Address the highest-risk gaps first.\n\n"
+        f"#### Step-by-step plan\n"
+        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+        "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+        "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+        "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+        "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+        "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+        "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+        f"#### What your documents emphasize (auto-scored)\n"
+        f"**Overall score:** {scores.get('overall', 0)} / 5.0\n\n"
+        f"**Per-pillar signals:**\n{pillar_md}\n\n"
+        f"#### Gaps & quick fixes\n{_mk_gaps(gaps, limit=8)}\n\n"
+        f"#### Supporting excerpts\n"
+    )
+    for s in snippets:
+        md += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
+    md += f"**Trusted sources:** {refs}"
+    return md

+def answer_faq_or_approach_detailed(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,

     if not q:
         return "Please enter a question."

+    # 1) Seeded FAQs detailed plan
     for item in FAQ_SEEDS:
         seed_tokens = set(tokenize(item["q"])[:3])
         q_tokens = set(tokenize(q))
         if seed_tokens and seed_tokens.issubset(q_tokens):
+            refs = list_refs(item.get("refs", []))
+            base = (
+                f"### Answer (detailed)\n"
+                f"{item['a']}\n\n"
+                "#### Step-by-step plan\n"
+                "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+                "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+                "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+                "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+                "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+                "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+                "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+                f"**Trusted sources:** {refs}"
+            )
+            return base

+    # 2) Use uploaded docs (RAG) detailed synthesized answer
     if use_uploaded_docs and index_obj is not None and corpus:
+        top = index_obj.query(q, k=6)
+        snippets = []
+        for i, sim in top:
+            item = corpus[i]
+            excerpt = item["text"].strip()
+            if len(excerpt) > 700:
+                excerpt = excerpt[:700] + "..."
+            snippets.append({
+                "file": item["file"],
+                "relevance": float(sim),
+                "excerpt": excerpt
+            })
+        if snippets:
+            return _compose_detailed_from_snippets(q, snippets)
+
+    # 3) Fallback (no docs) → generic detailed plan with citations
     refs = list_refs([
         "Azure VMware Solution (AVS)",
         "Azure Migrate",

         "Azure Well-Architected Framework (WAF)",
         "VMware HCX Docs"
     ])
+    generic = (
+        "### Answer (detailed)\n"
+        "**TL;DR:** Use AVS/HCX or Azure Migrate depending on downtime needs; build landing zone and connectivity first, "
+        "then migrate in waves with rollback and DR drills.\n\n"
+        "#### Step-by-step plan\n"
+        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+        "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+        "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+        "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+        "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+        "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+        "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+        f"**Trusted sources:** {refs}"
+    )
+    return generic
+

 # =========================
+# Build index from uploaded files
 # =========================

+def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
+    """
+    Returns: (index_obj, matrix_placeholder, corpus, status_message)
+    """
+    if not files:
+        return None, None, [], "No files uploaded yet."

+    corpus: List[Dict[str, str]] = []
+    for f in files:
+        rec = parse_file(f)
+        if rec["text"]:
+            corpus.append(rec)

+    if not corpus:
+        return None, None, [], "Uploaded files could not be parsed (no text extracted)."

+    tokenized = [tokenize(c["text"]) for c in corpus]
+    idx = TinyTfidfIndex()
+    idx.add_documents(tokenized)
+
+    status = f"Indexed {len(corpus)} document(s). Vocabulary size ≈ {idx.voc_size}."
+    return idx, None, corpus, status
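+# End-to-end sketch (illustrative): build_index([{"name": "a.txt", "data": b"hcx plan"}])
+# returns (TinyTfidfIndex, None, [{"file": "a.txt", "text": "hcx plan"}],
+# "Indexed 1 document(s). Vocabulary size ≈ 2.")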
+
+
+# =========================
+# Gradio UI
+# =========================
+
+with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
+    gr.Markdown(
+        "## VMware On-Prem → Azure Local Migration Assistant\n"
+        "- Upload your **design/migration documents** (PDF, DOCX, TXT, MD)\n"
+        "- Ask questions. Toggle **Use uploaded docs** for RAG-based answers\n"
+        "- Answers are **detailed** by default, with structured steps and trusted references\n"
     )

+    with gr.Row():
+        with gr.Column(scale=2):
+            file_in = gr.Files(
+                label="Upload documents (PDF/DOCX/TXT/MD)",
+                file_count="multiple",
+                type="filepath"  # we will open paths ourselves
+            )
+            index_status = gr.Markdown("No index yet.")
+
+            # Hidden/State to hold in-memory data
+            st_index = gr.State()
+            st_matrix = gr.State()  # placeholder for API compatibility
+            st_corpus = gr.State()
+
+            build_btn = gr.Button("Build Index", variant="primary")
+        with gr.Column(scale=3):
+            question = gr.Textbox(label="Ask a question", placeholder="e.g., How do I minimize downtime for our VMware migration?")
+            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
+            ask_btn = gr.Button("Ask", variant="primary")
+            answer_box = gr.Markdown("")
+
+    # Convert gr.Files (paths) into the dict format our parser expects
+    def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
+        out = []
+        for p in paths or []:
+            try:
+                with open(p, "rb") as fh:
+                    data = fh.read()
+                out.append({"name": os.path.basename(p), "data": data, "path": p})
+            except Exception:
+                pass
+        return out
+
+    def _build(files_paths: List[str]):
+        files = _collect_files(files_paths)
+        idx, mat, corpus, status = build_index(files)
+        return status, idx, mat, corpus

+    build_btn.click(
+        _build,
+        inputs=[file_in],
+        outputs=[index_status, st_index, st_matrix, st_corpus]
     )

     ask_btn.click(
+        answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
         outputs=[answer_box]
     )

 if __name__ == "__main__":
+    # Spaces ignores share=True anyway, so only request a share link when running locally.
+    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
+    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)
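+    # Example (local run): PORT=8080 python app.py serves on 0.0.0.0:8080, since
+    # the port is read from the PORT environment variable above.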