ajayinsac commited on
Commit
0b055a7
·
verified ·
1 Parent(s): 22f921e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +674 -0
app.py ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ VMware On-Prem → Azure Local Migration Assistant (Gradio)
4
+
5
+ Features
6
+ - FAQ / approach Q&A with trusted-source citations (links)
7
+ - Upload & index PDF/DOCX/TXT (session-local)
8
+ - Lightweight RAG (TF-IDF over chunks)
9
+ - Design/Runbook auto-review with rubric (0–5) + gaps + fixes
10
+ - All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
11
+
12
+ Author: you
13
+ """
14
+
15
+ import os
16
+ import io
17
+ import re
18
+ import json
19
+ import time
20
+ from typing import List, Tuple, Dict, Any
21
+
22
+ import gradio as gr
23
+ from sklearn.feature_extraction.text import TfidfVectorizer
24
+ from sklearn.metrics.pairwise import cosine_similarity
25
+
26
+ # -------- Optional, small footprint parsers --------
27
+ # PDF
28
+ try:
29
+ from pypdf import PdfReader
30
+ except Exception:
31
+ PdfReader = None
32
+
33
+ # DOCX
34
+ try:
35
+ import docx
36
+ except Exception:
37
+ docx = None
38
+
39
+
40
# =========================
# Trusted Sources (Allowlist)
# =========================
# (display name, URL) pairs. Rendered as a link list on the
# "Trusted Sources & Ontology" tab, and looked up by exact display name in
# list_refs() to build the citation links appended to answers.
TRUSTED_SOURCES = [
    # Microsoft Learn / Docs
    ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
    ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
    ("Azure Stack HCI / Azure Local", "https://learn.microsoft.com/azure-stack/"),
    ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/well-architected/"),
    # VMware
    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/"),
    ("VMware vSphere Docs", "https://docs.vmware.com/en/VMware-vSphere/index.html"),
    # Security & Compliance
    ("NIST SP 800-53", "https://csrc.nist.gov/publications/sp800-53"),
    ("FedRAMP Baselines", "https://www.fedramp.gov/"),
    ("IRS Publication 1075 (FTI)", "https://www.irs.gov/pub/irs-pdf/p1075.pdf"),
]
58
+
59
# =========================
# Ontology (Domains/Subdomains)
# =========================
# Knowledge taxonomy (domain -> subdomain labels) rendered on the
# "Trusted Sources & Ontology" tab. Purely informational: it is not consulted
# by the scoring heuristics or the retrieval index.
ONTOLOGY = {
    "Assessment": ["Inventory", "Dependencies", "Performance", "Criticality", "Readiness"],
    "Architecture": ["Landing Zone", "Azure Local Footprint", "AVS", "Environments"],
    "Networking": ["ExpressRoute", "VPN", "IP Plan", "DNS", "Load Balancing", "Private Link", "HCX Network"],
    "Identity": ["Entra ID", "AD DS", "PIM", "MFA", "RBAC", "Break-Glass"],
    "Migration": ["HCX", "Azure Migrate", "Cutover", "Rollback", "Data Sync"],
    "Data": ["Storage", "Backup", "Snapshots", "Immutability", "Residency"],
    "Security": ["Defender", "Sentinel", "Policy", "Purview", "Key Vault"],
    "DR": ["ASR", "Failover", "RTO/RPO", "Runbooks", "Tests"],
    "Ops": ["Monitor", "Log Analytics", "Patching", "Change Mgmt", "ITIL"],
    "Cost": ["Right-Sizing", "Reservations", "Tagging", "Budgets"],
    "Program": ["RAID", "Comms", "Training", "RACI", "Gates"],
    "Troubleshooting": ["HCX Failures", "DNS Drift", "Identity Tokens", "Latency"],
}
76
+
77
# =========================
# Heuristic Design Checks (keywords → rubric mapping)
# =========================
# Pillar name -> config consumed by score_text_against_checks():
#   "keywords": phrases counted case-insensitively toward the pillar's
#               coverage; more hits -> higher 0–5 rubric score.
#   "weight":   per-pillar weight. NOTE(review): not currently read by
#               score_text_against_checks — every pillar contributes equally
#               to the overall average.
#   "controls": compliance control IDs (security pillar only); informational,
#               not used in scoring.
CHECKS = {
    "security": {
        "weight": 1.0,
        "keywords": [
            "Defender for Cloud", "Microsoft Defender", "Sentinel", "Key Vault", "encryption",
            "TLS", "KMS", "HSM", "Just-In-Time", "JIT", "PIM", "MFA", "Conditional Access",
            "Azure Policy", "Purview", "classification", "DLP", "RBAC", "least privilege"
        ],
        "controls": ["NIST-AC-2", "NIST-SC-13", "IRS1075 §9.3"]
    },
    "reliability": {
        "weight": 1.0,
        "keywords": [
            "Availability Zone", "zonal", "ASR", "Site Recovery", "backup", "failover",
            "failback", "DR drill", "runbook", "immutable", "soft delete", "RTO", "RPO"
        ],
    },
    "performance": {
        "weight": 1.0,
        "keywords": [
            "right-size", "IOPS", "latency", "throughput", "benchmark", "autoscale",
            "SKU", "Managed Disks", "Premium SSD", "Ultra", "Standard SSD"
        ],
    },
    "operations": {
        "weight": 1.0,
        "keywords": [
            "Azure Monitor", "Log Analytics", "alerts", "workbooks", "patch", "change management",
            "incident", "problem", "request", "ITIL", "configuration drift"
        ],
    },
    "cost": {
        "weight": 1.0,
        "keywords": [
            "reservation", "Reserved Instances", "Savings Plan", "spot",
            "tagging", "chargeback", "showback", "budget", "cost anomaly"
        ],
    },
    "networking": {
        "weight": 1.0,
        "keywords": [
            "ExpressRoute", "ER", "VPN", "BGP", "MTU", "NSG", "ASG", "UDR", "Private Link",
            "DNS", "DHCP", "load balancer", "hub and spoke", "landing zone network"
        ],
    },
    "identity": {
        "weight": 1.0,
        "keywords": [
            "Entra ID", "Azure AD", "Active Directory", "domain trust", "AADDS",
            "Conditional Access", "PIM", "break-glass", "least privilege"
        ],
    },
    "migration": {
        "weight": 1.0,
        "keywords": [
            "HCX", "vMotion", "RAV", "Azure Migrate", "replication", "Mobility Group",
            "cutover", "rollback", "pilot", "wave"
        ],
    },
    "architecture": {
        "weight": 1.0,
        "keywords": [
            "Landing Zone", "hub", "spoke", "policy", "RBAC", "naming",
            "AVS", "Azure Local", "Azure Stack HCI", "Local Zone"
        ],
    },
}
147
+
148
# =========================
# FAQ seeds (concise, cite trusted links)
# =========================
# Canned Q&A entries. answer_faq_or_approach() treats a seed as matching when
# the first three words of its "q" all occur in the user's question.
# "refs" are display names resolved against TRUSTED_SOURCES by list_refs().
FAQ_SEEDS = [
    {
        "q": "How do we migrate VMware workloads to Azure Local?",
        "a": (
            "Typical paths are **Azure VMware Solution (AVS)** with **HCX** (bulk/RAV/vMotion) or "
            "**Azure Migrate** for discovery, assessment, and server/db/web migration. "
            "Establish a governed **Landing Zone** (hub/spoke, Policy, RBAC), plan ExpressRoute/VPN, "
            "pilot a few VMs, then cut over in waves with rollback plans. "
            "See AVS, Azure Migrate, and CAF for prescriptive guidance."
        ),
        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"]
    },
    {
        "q": "What downtime should we expect?",
        "a": (
            "Depends on method and app architecture. **HCX vMotion** can provide minimal downtime; "
            "**HCX RAV** and **bulk migration** usually require short cutover windows. "
            "Always pilot, measure replication lag, and agree on a timeboxed backout."
        ),
        "refs": ["VMware HCX Docs"]
    },
    {
        "q": "How do we meet IRS Pub 1075 and NIST controls?",
        "a": (
            "Map design controls to frameworks: enforce least privilege (RBAC/PIM/MFA), "
            "encrypt at rest/in transit (Key Vault/HSM, TLS), centralize telemetry (Sentinel), "
            "and document evidence (policies, runbooks, DR tests). Use CAF/WAF security pillars."
        ),
        "refs": ["IRS Publication 1075 (FTI)", "NIST SP 800-53", "Azure Well-Architected Framework (WAF)"]
    },
    {
        "q": "ExpressRoute or VPN?",
        "a": (
            "**ExpressRoute** is preferred for predictable performance and private connectivity; "
            "VPN is fine for initial testing or lower-throughput needs. Many designs use both "
            "for redundancy and phased cutover."
        ),
        "refs": ["Cloud Adoption Framework (CAF)"]
    },
]
191
+
192
+ # =========================
193
+ # Utilities: text extraction & chunking
194
+ # =========================
195
def extract_text_from_pdf(fileobj: io.BytesIO) -> str:
    """Extract plain text from a PDF byte stream.

    Returns "" when pypdf is not installed (PdfReader is None) or when the
    stream cannot be parsed — parsing problems are deliberately swallowed so a
    single bad upload never breaks indexing.
    """
    if PdfReader is None:
        return ""
    try:
        # Join per-page text; pages with no extractable text yield "".
        return "\n".join(
            (page.extract_text() or "") for page in PdfReader(fileobj).pages
        )
    except Exception:
        return ""
207
+
208
def extract_text_from_docx(fileobj: io.BytesIO) -> str:
    """Extract paragraph text from a DOCX byte stream.

    Returns "" when python-docx is not installed (docx is None) or when the
    stream cannot be parsed — best-effort, never raises.
    """
    if docx is None:
        return ""
    try:
        parsed = docx.Document(fileobj)
        return "\n".join(paragraph.text for paragraph in parsed.paragraphs)
    except Exception:
        return ""
216
+
217
def extract_text_from_txt(fileobj: io.BytesIO) -> str:
    """Decode a text byte stream as UTF-8, dropping undecodable bytes.

    Returns "" if the stream cannot be read at all — best-effort, never raises.
    """
    try:
        raw = fileobj.read()
        return raw.decode("utf-8", errors="ignore")
    except Exception:
        return ""
222
+
223
def read_file_to_text(file: gr.File) -> Tuple[str, str]:
    """
    Read one uploaded file and extract its plain text.

    Accepts either a Gradio file wrapper exposing a ``.name`` path attribute or
    a plain filesystem path string (newer Gradio versions pass paths directly).
    Returns ``(text, basename)``; ``text`` is "" for missing/unreadable files
    or unsupported extensions. Supported extensions: .pdf, .docx, .txt.

    Fixes vs. original: uses os.path.splitext instead of ``name.split(".")[-1]``
    (which returned the whole filename when there was no dot), tolerates a
    falsy/absent path instead of crashing in ``open()``, and handles plain
    string uploads.
    """
    if file is None:
        return "", ""
    path = file if isinstance(file, str) else getattr(file, "name", None)
    if not path:
        # Mirrors the original's "uploaded" placeholder name for nameless files.
        return "", "uploaded"
    name = os.path.basename(path)
    try:
        with open(path, "rb") as f:
            raw = f.read()
    except OSError:
        return "", name
    # splitext keeps the final extension only; "" when the name has no dot.
    ext = os.path.splitext(name)[1].lower().lstrip(".")
    bio = io.BytesIO(raw)
    if ext == "pdf":
        return extract_text_from_pdf(bio), name
    if ext == "docx":
        return extract_text_from_docx(bio), name
    if ext == "txt":
        return extract_text_from_txt(bio), name
    return "", name
243
+
244
def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
    """
    Split *text* into overlapping character windows.

    Whitespace runs are collapsed to single spaces first. Each chunk is at most
    *max_len* characters, and consecutive chunks share *overlap* characters so
    content cut at a boundary still appears whole in some chunk.

    Returns [] for empty/whitespace-only input.

    Fix vs. original: the original stepped ``i = j - overlap`` even after the
    window reached the end of the text; since ``i`` then stayed below
    ``len(text)`` forever, the tail chunk was appended in an infinite loop for
    any non-empty input. We now stop as soon as a window reaches the end.
    """
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    if overlap >= max_len:
        # Guarantee forward progress even with pathological arguments.
        overlap = max_len - 1 if max_len > 1 else 0
    chunks: List[str] = []
    n = len(text)
    i = 0
    while i < n:
        j = min(i + max_len, n)
        chunks.append(text[i:j])
        if j >= n:
            # Tail reached; stepping back by `overlap` would never terminate.
            break
        i = j - overlap
    return chunks
263
+
264
+ # =========================
265
+ # RAG Index (session-scoped)
266
+ # =========================
267
def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
    """
    Build a TF-IDF index over chunks of all uploaded documents.

    Returns ``(vectorizer, matrix, corpus)`` where *corpus* is a list of dicts
    with keys "text" (full chunk), "file" (source basename), and "snippet"
    (120-char preview). Returns ``(None, None, None)`` when no files are given
    or no text could be extracted.

    Fix vs. original: reuses the basename already returned by
    read_file_to_text instead of re-deriving it from ``f.name`` (duplicated
    logic that also assumed a file-object upload).
    """
    if not files:
        return None, None, None
    corpus: List[Dict[str, str]] = []
    for f in files:
        txt, fname = read_file_to_text(f)
        if not txt.strip():
            continue  # unparseable file: skip rather than abort the index
        for chunk in chunk_text(txt):
            corpus.append({
                "text": chunk,
                "file": fname,
                "snippet": chunk[:120] + ("..." if len(chunk) > 120 else ""),
            })
    if not corpus:
        return None, None, None

    vectorizer = TfidfVectorizer(stop_words="english", max_features=25000)
    X = vectorizer.fit_transform([entry["text"] for entry in corpus])
    return vectorizer, X, corpus
290
+
291
def retrieve_answer(
    query: str,
    vectorizer: Any,
    matrix: Any,
    corpus: List[Dict[str, str]],
    k: int = 4
) -> Tuple[str, List[Dict[str, str]]]:
    """
    Rank corpus chunks against *query* and format the top-*k* as an answer.

    Returns ``(markdown_answer, snippets)`` where each snippet carries the
    source file, cosine-similarity relevance, and a 500-char excerpt.
    Returns ``("", [])`` when the query is blank or no index has been built.
    """
    if not query or vectorizer is None or matrix is None or not corpus:
        return "", []
    scores = cosine_similarity(vectorizer.transform([query]), matrix).ravel()
    ranked = scores.argsort()[::-1][:k]
    snippets = [
        {
            "file": corpus[idx]["file"],
            "relevance": float(scores[idx]),
            "excerpt": corpus[idx]["text"][:500]
            + ("..." if len(corpus[idx]["text"]) > 500 else ""),
        }
        for idx in ranked
    ]
    # Simple synthesis: bullet list of the top excerpts + a follow-up hint.
    parts = ["Here are the most relevant excerpts from your uploaded documents:\n"]
    for snip in snippets:
        parts.append(f"- **{snip['file']}** (relevance {snip['relevance']:.2f}): {snip['excerpt']}\n")
    parts.append("Tip: Ask a follow-up like “Summarize the cutover plan” or “List missing security controls.”")
    return "\n".join(parts), snippets
320
+
321
+ # =========================
322
+ # Design / Runbook Auto-Review
323
+ # =========================
324
def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
    """
    Score *text* against each pillar in CHECKS and collect gap findings.

    Scoring: a pillar's score is ``5 * (0.3 + 0.7 * keyword_coverage)`` capped
    at 5.0 — a 1.5 floor rising with the fraction of that pillar's keywords
    present (case-insensitive substring match). The returned dict maps each
    pillar to its score plus an "overall" average; *gaps* is a list of
    ``{id, severity, desc, fix}`` findings, prefixed with a SUMMARY entry when
    the overall score is below 3.5.

    Fix vs. original: the ExpressRoute absence check used the raw substring
    ``"er "``, which matches ordinary words ("other ", "server ") and silently
    suppressed NET-ER-001; it now requires the standalone word "ER". A
    redundant ``.lower()`` on a literal was also removed.
    """
    text_low = text.lower()

    def absent(*phrases: str) -> bool:
        # True when none of the phrases appear anywhere in the document.
        return all(p not in text_low for p in phrases)

    # Gap heuristics per pillar: (trigger predicate, finding). Evaluated while
    # iterating CHECKS so findings keep the original pillar ordering.
    gap_rules: Dict[str, List[Tuple[Any, Dict[str, str]]]] = {
        "networking": [
            # \ber\b: the abbreviation "ER" as a whole word, not "...er ".
            (lambda: absent("expressroute") and not re.search(r"\ber\b", text_low),
             {"id": "NET-ER-001", "severity": "High",
              "desc": "ExpressRoute (ER) not referenced; consider ER for predictable private connectivity.",
              "fix": "Design dual ER circuits with diverse POPs; fall back to VPN during pilot."}),
            (lambda: absent("dns"),
             {"id": "NET-DNS-002", "severity": "Med",
              "desc": "DNS plan not mentioned; risk of name resolution drift post-cutover.",
              "fix": "Document forwarders/zones, conditional forwarding, and DNS cutover sequencing."}),
            (lambda: absent("mtu") and "hcx" in text_low,
             {"id": "NET-MTU-003", "severity": "Med",
              "desc": "HCX present but MTU tuning not referenced.",
              "fix": "Validate path MTU for HCX tunnels; align NSX/physical network settings."}),
        ],
        "identity": [
            (lambda: absent("pim"),
             {"id": "ID-PIM-004", "severity": "Med",
              "desc": "No mention of Privileged Identity Management (PIM).",
              "fix": "Enable PIM for admin roles; require approvals/justification; enforce MFA."}),
            (lambda: absent("break-glass"),
             {"id": "ID-BG-005", "severity": "Low",
              "desc": "No break-glass account reference.",
              "fix": "Create monitored break-glass accounts with strong controls and regular review."}),
        ],
        "security": [
            (lambda: absent("key vault", "hsm"),
             {"id": "SEC-KEY-006", "severity": "High",
              "desc": "Key management not described.",
              "fix": "Use Azure Key Vault (HSM-backed if needed); rotate secrets/keys; restrict access via RBAC."}),
            (lambda: absent("sentinel"),
             {"id": "SEC-SIEM-007", "severity": "Med",
              "desc": "SIEM not referenced.",
              "fix": "Onboard to Microsoft Sentinel; define data connectors and incident processes."}),
            (lambda: absent("policy"),
             {"id": "SEC-POL-008", "severity": "Med",
              "desc": "Azure Policy governance not mentioned.",
              "fix": "Attach ALZ policies/initiatives for guardrails (encryption, tags, allowed locations, SKUs)."}),
        ],
        "reliability": [
            (lambda: absent("asr", "site recovery"),
             {"id": "REL-ASR-009", "severity": "Med",
              "desc": "No DR replication tool referenced.",
              "fix": "Use Azure Site Recovery (ASR) or HCX DR for failover/failback; schedule DR drills."}),
            (lambda: absent("backup", "recovery services vault"),
             {"id": "REL-BKP-010", "severity": "High",
              "desc": "Backup strategy not captured.",
              "fix": "Configure Azure Backup with immutable storage and soft delete; test restores."}),
            (lambda: ("rto" not in text_low) or ("rpo" not in text_low),
             {"id": "REL-RTORPO-011", "severity": "Med",
              "desc": "RTO/RPO targets not documented.",
              "fix": "Define business-aligned RTO/RPO and validate during pilot/cutover."}),
        ],
        "architecture": [
            (lambda: absent("landing zone", "landing-zone"),
             {"id": "ARC-ALZ-012", "severity": "High",
              "desc": "Azure Landing Zone baseline not referenced.",
              "fix": "Adopt ALZ (hub/spoke, Policy, RBAC, logging) before migration waves."}),
        ],
        "migration": [
            (lambda: absent("rollback", "backout"),
             {"id": "MIG-ROLL-013", "severity": "High",
              "desc": "Rollback/backout path not documented.",
              "fix": "Document clear backout steps and timebox for each wave; test in pilot."}),
            (lambda: absent("pilot"),
             {"id": "MIG-PILOT-014", "severity": "Med",
              "desc": "No pilot mentioned.",
              "fix": "Execute a pilot with representative workloads; capture metrics and lessons."}),
        ],
        "cost": [
            (lambda: absent("tag"),
             {"id": "COST-TAG-015", "severity": "Med",
              "desc": "Tagging strategy absent (owner, env, app).",
              "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."}),
        ],
    }

    pillar_scores: Dict[str, float] = {}
    gaps: List[Dict[str, str]] = []

    for pillar, cfg in CHECKS.items():
        kws = cfg["keywords"]
        hits = sum(1 for kw in kws if kw.lower() in text_low)
        coverage = hits / max(1, len(kws))
        # Baseline 1.5, rising to 5.0 at full keyword coverage.
        pillar_scores[pillar] = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2)
        for triggered, finding in gap_rules.get(pillar, []):
            if triggered():
                gaps.append(dict(finding))

    # Overall score = unweighted average of pillar scores.
    if pillar_scores:
        overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
    else:
        overall = 0.0

    # Insert an overall summary as the first "gap" entry if overall < 3.5.
    if overall < 3.5:
        gaps.insert(0, {
            "id": "SUMMARY",
            "severity": "Info",
            "desc": f"Overall score is {overall}. Focus first on High-severity gaps.",
            "fix": "Prioritize ER/DNS/Backup/ALZ/PIM/Key Vault where missing; re-run the check after updates."
        })

    return {"overall": overall, **pillar_scores}, gaps
480
+
481
def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
    """
    Run the heuristic design review across all uploaded documents.

    Returns:
        - a Markdown summary (scores + top recommendations),
        - a JSON-serializable result dict (timestamp, files, scores, gaps),
        - table rows ``[id, severity, description, fix]`` for the gaps grid.
    """
    if not files:
        return "Please upload at least one PDF/DOCX/TXT.", {}, []

    texts: List[str] = []
    names: List[str] = []
    for f in files:
        txt, _fname = read_file_to_text(f)
        if txt.strip():
            texts.append(txt)
            names.append(os.path.basename(f.name))
    if not texts:
        return "Could not parse text from the provided files.", {}, []

    scores, gaps = score_text_against_checks("\n\n".join(texts))

    # Assemble the Markdown report from parts, then join once.
    parts = ["### Design/Runbook Review\n"]
    parts.append(f"**Files analyzed:** {', '.join(names)}\n\n")
    parts.append(f"**Overall Score:** {scores['overall']} / 5.0\n\n")
    parts.append("**Per-Pillar Scores:**\n\n")
    for pillar, value in scores.items():
        if pillar != "overall":
            parts.append(f"- **{pillar.capitalize()}**: {value}\n")
    parts.append("\n**Top Recommendations:**\n")
    for gap in gaps[:6]:
        parts.append(f"- ({gap['severity']}) **{gap['id']}** — {gap['desc']} → _{gap['fix']}_\n")
    md = "".join(parts)

    result_json = {
        "timestamp": int(time.time()),
        "files": names,
        "scores": scores,
        "gaps": gaps
    }
    table_rows = [[gap["id"], gap["severity"], gap["desc"], gap["fix"]] for gap in gaps]
    return md, result_json, table_rows
525
+
526
+ # =========================
527
+ # Q&A Logic
528
+ # =========================
529
def list_refs(ref_names: List[str]) -> str:
    """Render the named TRUSTED_SOURCES entries as " | "-separated Markdown links.

    Names without a matching allow-list entry are silently skipped.
    """
    # First occurrence wins, matching the original's hit[0] behavior.
    lookup: Dict[str, str] = {}
    for src_name, url in TRUSTED_SOURCES:
        lookup.setdefault(src_name, url)
    return " | ".join(
        f"[{nm}]({lookup[nm]})" for nm in ref_names if nm in lookup
    )
536
+
537
def answer_faq_or_approach(
    question: str,
    use_uploaded_docs: bool,
    vectorizer: Any,
    matrix: Any,
    corpus: List[Dict[str, str]]
) -> str:
    """
    Answer a user question, trying three strategies in order:

    1. Seeded FAQ match — the first three words of a seed question must all
       occur (case-insensitively) in the user's question.
    2. RAG over the uploaded-document index, when enabled and built.
    3. A generic step-by-step migration approach with citations.
    """
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    q_low = q.lower()
    # Strategy 1: seeded FAQs (very light semantic: lead-word containment).
    for seed in FAQ_SEEDS:
        lead_words = re.findall(r"\w+", seed["q"])[:3]
        if all(word.lower() in q_low for word in lead_words):
            return f"{seed['a']}\n\n**Trusted sources:** {list_refs(seed['refs'])}"

    # Strategy 2: retrieval over the session index, if the user opted in.
    index_ready = vectorizer is not None and matrix is not None and bool(corpus)
    if use_uploaded_docs and index_ready:
        rag_answer, _snips = retrieve_answer(q, vectorizer, matrix, corpus, k=4)
        if rag_answer.strip():
            # Always append trusted sources list for user orientation.
            refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
            return f"{rag_answer}\n\n**Trusted sources:** {refs}"

    # Strategy 3: fallback generic approach with citations.
    generic = (
        "**Suggested approach:**\n"
        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging).\n"
        "2) Establish **ExpressRoute/VPN** and DNS plans; validate MTU if using **HCX**.\n"
        "3) Run **Azure Migrate** discovery/assessment; classify (rehost/refactor/modernize).\n"
        "4) Pilot 2–3 VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
        "5) Define **RTO/RPO**, backup, and **ASR**/DR drills; document rollback.\n"
        "6) Onboard to **Defender/Sentinel**, enforce **Key Vault** and **PIM/MFA**.\n"
        "7) Optimize cost (right-size, reservations) and tag everything.\n"
    )
    refs = list_refs([
        "Azure VMware Solution (AVS)",
        "Azure Migrate",
        "Cloud Adoption Framework (CAF)",
        "Azure Well-Architected Framework (WAF)",
        "VMware HCX Docs"
    ])
    return f"{generic}\n**Trusted sources:** {refs}"
580
+
581
+ # =========================
582
+ # Gradio UI
583
+ # =========================
584
# =========================
# Gradio UI
# =========================
# Three tabs: free-form Q&A, document upload/review, and a static reference tab.
# The TF-IDF index lives in per-session gr.State so uploads never leak between users.
with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
    gr.Markdown(
        "# VMware On-Prem → Azure Local Migration Assistant\n"
        "Ask questions, upload migration/design documents for review, and get recommendations.\n"
        "_Sources: Microsoft Learn/Docs, VMware Docs, NIST, IRS Pub 1075 (linked below)._"
    )

    # Session state for RAG: populated by on_build_index, consumed by the Q&A handler.
    st_vectorizer = gr.State(None)
    st_matrix = gr.State(None)
    st_corpus = gr.State(None)

    with gr.Tabs():
        with gr.Tab("Ask Anything"):
            with gr.Row():
                question = gr.Textbox(
                    label="Your question (FAQs, approach, troubleshooting)",
                    placeholder="e.g., How do I plan a pilot with HCX RAV and ensure minimal downtime?"
                )
                use_docs = gr.Checkbox(label="Also search my uploaded documents (if any)", value=True)
            ask_btn = gr.Button("Answer")
            answer_box = gr.Markdown()

        with gr.Tab("Upload & Review Design"):
            gr.Markdown("Upload **PDF / DOCX / TXT** (multiple allowed). Then build the index and/or run a review.")
            files = gr.File(file_count="multiple", file_types=[".pdf", ".docx", ".txt"], label="Upload documents")
            with gr.Row():
                build_btn = gr.Button("Build/Refresh Search Index")
                review_btn = gr.Button("Run Design/Runbook Review")
            index_info = gr.Markdown()
            review_md = gr.Markdown()
            review_json = gr.JSON()
            gaps_table = gr.Dataframe(
                headers=["Gap ID", "Severity", "Description", "Fix"],
                datatype=["str", "str", "str", "str"],
                interactive=False,
                label="Gaps & Recommendations"
            )

        with gr.Tab("Trusted Sources & Ontology"):
            gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
            # Render links
            links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
            gr.Markdown(links_md)

            gr.Markdown("### Knowledge Taxonomy (Domains → Subdomains)")
            onto_str = ""
            for dom, subs in ONTOLOGY.items():
                onto_str += f"- **{dom}**: {', '.join(subs)}\n"
            gr.Markdown(onto_str)

            gr.Markdown(
                "### Notes\n"
                "- This app does **not** call external APIs. Use the links above for deep-dives into official guidance.\n"
                "- Design checks are heuristic; always validate against your Architecture Board and security teams."
            )

    # ====== Wiring ======
    def on_build_index(files_list):
        # Build the TF-IDF index and stash it in session state; on failure,
        # only the status message is updated and the state slots are cleared.
        vec, X, cor = build_index(files_list)
        if vec is None:
            return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
                    None, None, None)
        msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
        return msg, vec, X, cor

    build_btn.click(
        on_build_index,
        inputs=[files],
        outputs=[index_info, st_vectorizer, st_matrix, st_corpus]
    )

    def on_review(files_list):
        # Thin pass-through so the click handler matches the 3 output widgets.
        md, js, table = review_uploaded_docs(files_list)
        return md, js, table

    review_btn.click(
        on_review,
        inputs=[files],
        outputs=[review_md, review_json, gaps_table]
    )

    # Q&A reads the session-state index directly; no wrapper needed.
    ask_btn.click(
        answer_faq_or_approach,
        inputs=[question, use_docs, st_vectorizer, st_matrix, st_corpus],
        outputs=[answer_box]
    )
671
+
672
# Standard HF Spaces entrypoint
if __name__ == "__main__":
    # Bind to all interfaces so the Spaces proxy can reach the app; the port
    # comes from the PORT env var when set (7860 otherwise).
    port = int(os.getenv("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)