Update app.py

app.py — CHANGED
@@ -1,22 +1,20 @@
 #!/usr/bin/env python3
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
 
-
-
-
- Features
- - FAQ / approach Q&A with trusted-source citations (links)
- - Upload & index PDF/DOCX/TXT (session-local)
- - Lightweight RAG (pure-Python TF-IDF over chunks)
- - Design/Runbook auto-review with rubric (0–5) + gaps + fixes
- - All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
 """
 
 import os
 import io
 import re
- import json
 import math
 import time
 from typing import List, Tuple, Dict, Any
@@ -24,584 +22,342 @@ from collections import Counter, defaultdict
 
 import gradio as gr
 
- #
- # PDF
 try:
- [...]
 except Exception:
- [...]
 
- # DOCX
 try:
-     import docx
 except Exception:
     docx = None
 
 
 # =========================
- # Trusted
 # =========================
- [...]
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
-     ("Azure Stack HCI / Azure Local", "https://learn.microsoft.com/azure-stack/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
-     ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/
-
-     ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/"),
-     ("VMware vSphere Docs", "https://docs.vmware.com/en/VMware-vSphere/index.html"),
-     # Security & Compliance
-     ("NIST SP 800-53", "https://csrc.nist.gov/publications/sp800-53"),
-     ("FedRAMP Baselines", "https://www.fedramp.gov/"),
-     ("IRS Publication 1075 (FTI)", "https://www.irs.gov/pub/irs-pdf/p1075.pdf"),
 ]
 
-
- # Ontology (Domains/Subdomains)
- # =========================
- ONTOLOGY = {
-     "Assessment": ["Inventory", "Dependencies", "Performance", "Criticality", "Readiness"],
-     "Architecture": ["Landing Zone", "Azure Local Footprint", "AVS", "Environments"],
-     "Networking": ["ExpressRoute", "VPN", "IP Plan", "DNS", "Load Balancing", "Private Link", "HCX Network"],
-     "Identity": ["Entra ID", "AD DS", "PIM", "MFA", "RBAC", "Break-Glass"],
-     "Migration": ["HCX", "Azure Migrate", "Cutover", "Rollback", "Data Sync"],
-     "Data": ["Storage", "Backup", "Snapshots", "Immutability", "Residency"],
-     "Security": ["Defender", "Sentinel", "Policy", "Purview", "Key Vault"],
-     "DR": ["ASR", "Failover", "RTO/RPO", "Runbooks", "Tests"],
-     "Ops": ["Monitor", "Log Analytics", "Patching", "Change Mgmt", "ITIL"],
-     "Cost": ["Right-Sizing", "Reservations", "Tagging", "Budgets"],
-     "Program": ["RAID", "Comms", "Training", "RACI", "Gates"],
-     "Troubleshooting": ["HCX Failures", "DNS Drift", "Identity Tokens", "Latency"],
- }
-
- # =========================
- # Heuristic Design Checks (keywords → rubric mapping)
- # =========================
- CHECKS = {
-     "security": {
-         "weight": 1.0,
-         "keywords": [
-             "Defender for Cloud", "Microsoft Defender", "Sentinel", "Key Vault", "encryption",
-             "TLS", "KMS", "HSM", "Just-In-Time", "JIT", "PIM", "MFA", "Conditional Access",
-             "Azure Policy", "Purview", "classification", "DLP", "RBAC", "least privilege"
-         ],
-         "controls": ["NIST-AC-2", "NIST-SC-13", "IRS1075 §9.3"]
-     },
-     "reliability": {
-         "weight": 1.0,
-         "keywords": [
-             "Availability Zone", "zonal", "ASR", "Site Recovery", "backup", "failover",
-             "failback", "DR drill", "runbook", "immutable", "soft delete", "RTO", "RPO"
-         ],
-     },
-     "performance": {
-         "weight": 1.0,
-         "keywords": [
-             "right-size", "IOPS", "latency", "throughput", "benchmark", "autoscale",
-             "SKU", "Managed Disks", "Premium SSD", "Ultra", "Standard SSD"
-         ],
-     },
-     "operations": {
-         "weight": 1.0,
-         "keywords": [
-             "Azure Monitor", "Log Analytics", "alerts", "workbooks", "patch", "change management",
-             "incident", "problem", "request", "ITIL", "configuration drift"
-         ],
-     },
-     "cost": {
-         "weight": 1.0,
-         "keywords": [
-             "reservation", "Reserved Instances", "Savings Plan", "spot",
-             "tagging", "chargeback", "showback", "budget", "cost anomaly"
-         ],
-     },
-     "networking": {
-         "weight": 1.0,
-         "keywords": [
-             "ExpressRoute", "ER", "VPN", "BGP", "MTU", "NSG", "ASG", "UDR", "Private Link",
-             "DNS", "DHCP", "load balancer", "hub and spoke", "landing zone network"
-         ],
-     },
-     "identity": {
-         "weight": 1.0,
-         "keywords": [
-             "Entra ID", "Azure AD", "Active Directory", "domain trust", "AADDS",
-             "Conditional Access", "PIM", "break-glass", "least privilege"
-         ],
-     },
-     "migration": {
-         "weight": 1.0,
-         "keywords": [
-             "HCX", "vMotion", "RAV", "Azure Migrate", "replication", "Mobility Group",
-             "cutover", "rollback", "pilot", "wave"
-         ],
-     },
-     "architecture": {
-         "weight": 1.0,
-         "keywords": [
-             "Landing Zone", "hub", "spoke", "policy", "RBAC", "naming",
-             "AVS", "Azure Local", "Azure Stack HCI", "Local Zone"
-         ],
-     },
- }
-
- # =========================
- # FAQ seeds (concise, cite trusted links)
- # =========================
- FAQ_SEEDS = [
     {
-         "q": "How do we migrate VMware workloads to Azure
         "a": (
- [...]
-             "pilot a few VMs, then cut over in waves with rollback plans. "
-             "See AVS, Azure Migrate, and CAF for prescriptive guidance."
         ),
-         "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "
     },
     {
-         "q": "What
         "a": (
- [...]
         ),
-         "refs": ["
     },
     {
-         "q": "How do we
         "a": (
- [...]
-             "and document evidence (policies, runbooks, DR tests). Use CAF/WAF security pillars."
         ),
-         "refs": ["
-     },
-     {
-         "q": "ExpressRoute or VPN?",
-         "a": (
-             "**ExpressRoute** is preferred for predictable performance and private connectivity; "
-             "VPN is fine for initial testing or lower-throughput needs. Many designs use both "
-             "for redundancy and phased cutover."
-         ),
-         "refs": ["Cloud Adoption Framework (CAF)"]
     },
 ]
 
 # =========================
- #
 # =========================
- STOPWORDS = set("""
- a an the and or but if then else for from to in on at by of with without into within over under not be is are was were will can should would could may might
- this that these those there here when where how what why who whom which as it its itself themselves ourselves yourself yourselves
- """.split())
 
-
 
 def tokenize(text: str) -> List[str]:
- [...]
 
 class TinyTfidfIndex:
     def __init__(self):
         self.docs: List[List[str]] = []
-         self.
-         self.doc_norms: List[float] = []
         self.idf: Dict[str, float] = {}
-         self.
-         self.
-
-     def fit(self, texts: List[str], meta: List[Dict[str, str]]):
-         self.docs = [tokenize(t) for t in texts]
-         self.N = len(self.docs)
-         self.corpus_meta = meta
 
         # document frequency
-         df = Counter()
-         for
-             df.update(set(
-
-         self.idf = {}
- [...]
-             self.idf[term] = 1.0 + math.log((self.N + 1) / (dfi + 1))
-
-         # build doc vectors
-         self.doc_vectors = []
         self.doc_norms = []
-         for
-             tf = Counter(
-
             for term, cnt in tf.items():
- [...]
-             self.
- [...]
-             return []
-         tf = Counter(qtokens)
-         qvec = {}
         for term, cnt in tf.items():
- [...]
 
 # =========================
- #
 # =========================
- [...]
         return ""
     try:
-         reader = PdfReader(
-
         for page in reader.pages:
- [...]
     except Exception:
         return ""
 
- def
-     if docx
-         return ""
-     try:
-         document = docx.Document(fileobj)
-         return "\n".join([p.text for p in document.paragraphs])
-     except Exception:
         return ""
-
- def extract_text_from_txt(fileobj: io.BytesIO) -> str:
     try:
- [...]
     except Exception:
         return ""
 
- def
- [...]
-         raw = f.read()
-     ext = (name.split(".")[-1] or "").lower()
-     bio = io.BytesIO(raw)
-     if ext in ["pdf"]:
-         txt = extract_text_from_pdf(bio)
-     elif ext in ["docx"]:
-         txt = extract_text_from_docx(bio)
-     elif ext in ["txt"]:
-         txt = extract_text_from_txt(bio)
-     else:
-         txt = ""
-     return txt, name
-
- def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
-     """
-     Simple sliding window chunker by characters; robust and fast.
-     """
-     text = re.sub(r"\s+", " ", text).strip()
-     chunks = []
-     i = 0
-     n = len(text)
-     while i < n:
-         j = min(i + max_len, n)
-         chunk = text[i:j]
-         if chunk:
-             chunks.append(chunk)
-         i = j - overlap
-         if i < 0:
-             i = 0
-         if i >= n:
-             break
-     return chunks
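An aside on the removed `chunk_text`: as written, the loop cannot terminate once `j` reaches the end of a text longer than `max_len`, because `i = j - overlap` keeps landing short of `n` and the same tail chunk is appended again. A terminating sketch of the same sliding-window idea (illustrative only, not part of this commit):

    def chunk_text_fixed(text: str, max_len: int = 900, overlap: int = 120) -> list:
        # Same character-window chunking, but stop once the window hits the end.
        text = " ".join(text.split())
        chunks, i, n = [], 0, len(text)
        while i < n:
            j = min(i + max_len, n)
            chunks.append(text[i:j])
            if j >= n:           # reached the end: stop instead of re-windowing the tail
                break
            i = j - overlap      # slide back so boundary-straddling sentences stay searchable
        return chunks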
 
- # =========================
- # RAG Index (session-scoped)
- # =========================
- class RagState:
-     def __init__(self):
-         self.index = None   # TinyTfidfIndex
-         self.corpus = None  # list of dicts with text/meta
 
- def
     """
-
-     Returns: (index_obj, None, chunks_with_meta) to keep signature compatible.
     """
-
-     if
- [...]
 
-
-     idx.fit(all_chunks, meta)
-     corpus = [{"text": t, **m} for t, m in zip(all_chunks, meta)]
-     return idx, None, corpus
 
- def retrieve_answer(
-     query: str,
-     index_obj: Any,
-     _matrix_unused: Any,
-     corpus: List[Dict[str, str]],
-     k: int = 4
- ) -> Tuple[str, List[Dict[str, str]]]:
-     """
-     Return synthesized answer + top-k supporting chunks with filenames.
-     """
-     if not query or index_obj is None or not corpus:
-         return "", []
-     top = index_obj.query(query, k=k)
-     snippets = []
-     for i, sim in top:
-         item = corpus[i]
-         snippets.append({
-             "file": item["file"],
-             "relevance": float(sim),
-             "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
-         })
-     answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
-     for s in snippets:
-         answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
-     answer += "Tip: Ask a follow-up like “Summarize the cutover plan” or “List missing security controls.”"
-     return answer, snippets
 
 # =========================
- #
 # =========================
- def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
-     text_low = text.lower()
-
-     pillar_scores = {}
-     gaps = []
-
-     for pillar, cfg in CHECKS.items():
-         hits = 0
-         kws = cfg["keywords"]
-         for kw in kws:
-             if kw.lower() in text_low:
-                 hits += 1
-         coverage = hits / max(1, len(kws))
-         score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2)  # baseline 1.5, up to 5.0
-         pillar_scores[pillar] = score
-
-         if pillar == "networking":
-             if "expressroute".lower() not in text_low and "er " not in text_low:
-                 gaps.append({
-                     "id": "NET-ER-001",
-                     "severity": "High",
-                     "desc": "ExpressRoute (ER) not referenced; consider ER for predictable private connectivity.",
-                     "fix": "Design dual ER circuits with diverse POPs; fall back to VPN during pilot."
-                 })
-             if "dns" not in text_low:
-                 gaps.append({
-                     "id": "NET-DNS-002",
-                     "severity": "Med",
-                     "desc": "DNS plan not mentioned; risk of name resolution drift post-cutover.",
-                     "fix": "Document forwarders/zones, conditional forwarding, and DNS cutover sequencing."
-                 })
-             if "mtu" not in text_low and "hcx" in text_low:
-                 gaps.append({
-                     "id": "NET-MTU-003",
-                     "severity": "Med",
-                     "desc": "HCX present but MTU tuning not referenced.",
-                     "fix": "Validate path MTU for HCX tunnels; align NSX/physical network settings."
-                 })
-
-         if pillar == "identity":
-             if "pim" not in text_low:
-                 gaps.append({
-                     "id": "ID-PIM-004",
-                     "severity": "Med",
-                     "desc": "No mention of Privileged Identity Management (PIM).",
-                     "fix": "Enable PIM for admin roles; require approvals/justification; enforce MFA."
-                 })
-             if "break-glass" not in text_low:
-                 gaps.append({
-                     "id": "ID-BG-005",
-                     "severity": "Low",
-                     "desc": "No break-glass account reference.",
-                     "fix": "Create monitored break-glass accounts with strong controls and regular review."
-                 })
-
-         if pillar == "security":
-             if "key vault" not in text_low and "hsm" not in text_low:
-                 gaps.append({
-                     "id": "SEC-KEY-006",
-                     "severity": "High",
-                     "desc": "Key management not described.",
-                     "fix": "Use Azure Key Vault (HSM-backed if needed); rotate secrets/keys; restrict access via RBAC."
-                 })
-             if "sentinel" not in text_low:
-                 gaps.append({
-                     "id": "SEC-SIEM-007",
-                     "severity": "Med",
-                     "desc": "SIEM not referenced.",
-                     "fix": "Onboard to Microsoft Sentinel; define data connectors and incident processes."
-                 })
-             if "policy" not in text_low:
-                 gaps.append({
-                     "id": "SEC-POL-008",
-                     "severity": "Med",
-                     "desc": "Azure Policy governance not mentioned.",
-                     "fix": "Attach ALZ policies/initiatives for guardrails (encryption, tags, allowed locations, SKUs)."
-                 })
-
-         if pillar == "reliability":
-             if ("asr" not in text_low) and ("site recovery" not in text_low):
-                 gaps.append({
-                     "id": "REL-ASR-009",
-                     "severity": "Med",
-                     "desc": "No DR replication tool referenced.",
-                     "fix": "Use Azure Site Recovery (ASR) or HCX DR for failover/failback; schedule DR drills."
-                 })
-             if "backup" not in text_low and "recovery services vault" not in text_low:
-                 gaps.append({
-                     "id": "REL-BKP-010",
-                     "severity": "High",
-                     "desc": "Backup strategy not captured.",
-                     "fix": "Configure Azure Backup with immutable storage and soft delete; test restores."
-                 })
-             if ("rto" not in text_low) or ("rpo" not in text_low):
-                 gaps.append({
-                     "id": "REL-RTORPO-011",
-                     "severity": "Med",
-                     "desc": "RTO/RPO targets not documented.",
-                     "fix": "Define business-aligned RTO/RPO and validate during pilot/cutover."
-                 })
-
-         if pillar == "architecture":
-             if ("landing zone" not in text_low) and ("landing-zone" not in text_low):
-                 gaps.append({
-                     "id": "ARC-ALZ-012",
-                     "severity": "High",
-                     "desc": "Azure Landing Zone baseline not referenced.",
-                     "fix": "Adopt ALZ (hub/spoke, Policy, RBAC, logging) before migration waves."
-                 })
-
-         if pillar == "migration":
-             if ("rollback" not in text_low) and ("backout" not in text_low):
-                 gaps.append({
-                     "id": "MIG-ROLL-013",
-                     "severity": "High",
-                     "desc": "Rollback/backout path not documented.",
-                     "fix": "Document clear backout steps and timebox for each wave; test in pilot."
-                 })
-             if "pilot" not in text_low:
-                 gaps.append({
-                     "id": "MIG-PILOT-014",
-                     "severity": "Med",
-                     "desc": "No pilot mentioned.",
-                     "fix": "Execute a pilot with representative workloads; capture metrics and lessons."
-                 })
-
-         if pillar == "cost":
-             if "tag" not in text_low:
-                 gaps.append({
-                     "id": "COST-TAG-015",
-                     "severity": "Med",
-                     "desc": "Tagging strategy absent (owner, env, app).",
-                     "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
-                 })
-
-     if pillar_scores:
-         overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
-     else:
-         overall = 0.0
 
- [...]
-         "desc": f"Overall score is {overall}. Focus first on High-severity gaps.",
-         "fix": "Prioritize ER/DNS/Backup/ALZ/PIM/Key Vault where missing; re-run the check after updates."
-     })
 
- [...]
 
- [...]
-         txt, fname = read_file_to_text(f)
-         if txt.strip():
-             text_full.append(txt)
-             file_list.append(os.path.basename(f.name))
-     if not text_full:
-         return "Could not parse text from the provided files.", {}, []
-
-     combined = "\n\n".join(text_full)
-     scores, gaps = score_text_against_checks(combined)
-
-     md = f"### Design/Runbook Review\n"
-     md += f"**Files analyzed:** {', '.join(file_list)}\n\n"
-     md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
-     md += "**Per-Pillar Scores:**\n\n"
-     for k, v in scores.items():
-         if k == "overall":
             continue
- [...]
- # =========================
- # Q&A Logic
- # =========================
- def list_refs(ref_names: List[str]) -> str:
-     links = []
-     for nm in ref_names:
-         hit = [x for x in TRUSTED_SOURCES if x[0] == nm]
-         if hit:
-             links.append(f"[{nm}]({hit[0][1]})")
-     return " | ".join(links)
 
- def
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,
@@ -612,32 +368,45 @@ def answer_faq_or_approach(
     if not q:
         return "Please enter a question."
 
-     #
     for item in FAQ_SEEDS:
-         # simple heuristic: overlap of first few tokens
         seed_tokens = set(tokenize(item["q"])[:3])
         q_tokens = set(tokenize(q))
         if seed_tokens and seed_tokens.issubset(q_tokens):
- [...]
 
-     #
     if use_uploaded_docs and index_obj is not None and corpus:
- [...]
-     )
     refs = list_refs([
         "Azure VMware Solution (AVS)",
         "Azure Migrate",
@@ -645,98 +414,114 @@
         "Azure Well-Architected Framework (WAF)",
         "VMware HCX Docs"
     ])
- [...]
 
 # =========================
- #
 # =========================
- with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
-     gr.Markdown(
-         "# VMware On-Prem → Azure Local Migration Assistant\n"
-         "Ask questions, upload migration/design documents for review, and get recommendations.\n"
-         "_Sources: Microsoft Learn/Docs, VMware Docs, NIST, IRS Pub 1075 (linked below)._"
-     )
 
- [...]
-     with gr.Tab("Ask Anything"):
-         with gr.Row():
-             question = gr.Textbox(
-                 label="Your question (FAQs, approach, troubleshooting)",
-                 placeholder="e.g., How do I plan a pilot with HCX RAV and ensure minimal downtime?"
-             )
-             use_docs = gr.Checkbox(label="Also search my uploaded documents (if any)", value=True)
-         ask_btn = gr.Button("Answer")
-         answer_box = gr.Markdown()
-
-     with gr.Tab("Upload & Review Design"):
-         gr.Markdown("Upload **PDF / DOCX / TXT** (multiple allowed). Then build the index and/or run a review.")
-         files = gr.File(file_count="multiple", file_types=[".pdf", ".docx", ".txt"], label="Upload documents")
-         with gr.Row():
-             build_btn = gr.Button("Build/Refresh Search Index")
-             review_btn = gr.Button("Run Design/Runbook Review")
-         index_info = gr.Markdown()
-         review_md = gr.Markdown()
-         review_json = gr.JSON()
-         gaps_table = gr.Dataframe(
-             headers=["Gap ID", "Severity", "Description", "Fix"],
-             datatype=["str", "str", "str", "str"],
-             interactive=False,
-             label="Gaps & Recommendations"
-         )
 
- [...]
-     gr.Markdown("### Knowledge Taxonomy (Domains → Subdomains)")
-     onto_str = ""
-     for dom, subs in ONTOLOGY.items():
-         onto_str += f"- **{dom}**: {', '.join(subs)}\n"
-     gr.Markdown(onto_str)
-
-     gr.Markdown(
-         "### Notes\n"
-         "- This app does **not** call external APIs. Use the links above for official guidance.\n"
-         "- Design checks are heuristic; always validate against your Architecture Board and security teams."
-     )
 
- [...]
-         idx, _X, cor = build_index(files_list)
-         if idx is None:
-             return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
-                     None, None, None)
-         msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
-         return msg, idx, None, cor
 
- [...]
     )
 
- [...]
-         inputs=[
-         outputs=[
     )
 
     ask_btn.click(
-
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
         outputs=[answer_box]
     )
 
-     # Standard HF Spaces entrypoint
 if __name__ == "__main__":
- [...]
app.py after the change:

 #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
+ - Works on Hugging Face Spaces (no external API calls, no sklearn).
+ - Upload design/migration docs (PDF/DOCX/TXT/MD).
+ - Ask questions; get DETAILED, structured answers with excerpts + trusted refs.
 
+ Run locally:
+     pip install gradio PyPDF2 python-docx
+     python app.py
 """
 
 import os
 import io
 import re
 import math
 import time
 from typing import List, Tuple, Dict, Any
 from collections import Counter, defaultdict
 
 import gradio as gr
 
+ # Optional parsers (gracefully degrade if not installed on Spaces)
 try:
+     import PyPDF2  # lightweight; often available on Spaces
 except Exception:
+     PyPDF2 = None
 
 try:
+     import docx  # python-docx
 except Exception:
     docx = None
 
 
 # =========================
+ # Trusted sources & FAQ seeds
 # =========================
+
+ TRUSTED_SOURCES: List[Tuple[str, str]] = [
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
+     ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
+     ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html")
 ]
 
+ FAQ_SEEDS: List[Dict[str, Any]] = [
     {
+         "q": "How do we migrate VMware workloads to Azure with minimal downtime?",
         "a": (
+             "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
+             "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
+             "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
         ),
+         "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"]
     },
     {
+         "q": "What is a recommended migration sequence?",
         "a": (
+             "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
+             "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
+             "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
         ),
+         "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
     },
     {
+         "q": "How do we plan DR and backups?",
         "a": (
+             "Define RTO/RPO per app. Use immutable backups and soft-delete. "
+             "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
         ),
+         "refs": ["Azure Well-Architected Framework (WAF)"]
     },
 ]
 
+
 # =========================
+ # Utilities
 # =========================
 
+ _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")  # keep URLs/paths/ids mostly intact
 
 def tokenize(text: str) -> List[str]:
+     if not text:
+         return []
+     return [t.lower() for t in _WORD_RE.findall(text)]
+
+ def list_refs(ref_names: List[str]) -> str:
+     links = []
+     for nm in ref_names:
+         hit = [x for x in TRUSTED_SOURCES if x[0] == nm]
+         if hit:
+             links.append(f"[{nm}]({hit[0][1]})")
+     return " | ".join(links) if links else ""
+
+
+ # =========================
+ # Tiny TF-IDF implementation (no sklearn)
+ # =========================
 
 class TinyTfidfIndex:
     def __init__(self):
         self.docs: List[List[str]] = []
+         self.df: Counter = Counter()
         self.idf: Dict[str, float] = {}
+         self.doc_norms: List[float] = []
+         self.voc_size = 0
 
+     def add_documents(self, tokenized_docs: List[List[str]]):
+         self.docs = tokenized_docs[:]
         # document frequency
+         self.df = Counter()
+         for toks in self.docs:
+             self.df.update(set(toks))
+         N = max(1, len(self.docs))
+         self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
+         self.voc_size = len(self.idf)
+         # precompute norms
         self.doc_norms = []
+         for toks in self.docs:
+             tf = Counter(toks)
+             norm_sq = 0.0
             for term, cnt in tf.items():
+                 w = (cnt / max(1, len(toks))) * self.idf.get(term, 0.0)
+                 norm_sq += w * w
+             self.doc_norms.append(math.sqrt(norm_sq))
+
+     def _vec(self, toks: List[str]) -> Dict[str, float]:
+         tf = Counter(toks)
+         total = max(1, len(toks))
+         v = {}
         for term, cnt in tf.items():
+             idf = self.idf.get(term)
+             if idf is None:
+                 continue
+             v[term] = (cnt / total) * idf
+         return v
+
+     def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
+         if not self.docs:
+             return []
+         qv = self._vec(tokenize(text))
+         # cosine similarity
+         q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
+         sims: List[Tuple[int, float]] = []
+         for i, toks in enumerate(self.docs):
+             dv = Counter(toks)  # use tf counter to loop terms
+             num = 0.0
+             for term in qv:
+                 if term in dv:
+                     # weight for doc term
+                     w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
+                     num += qv[term] * w_d
+             denom = (self.doc_norms[i] or 1e-9) * q_norm
+             sims.append((i, num / denom))
+         sims.sort(key=lambda x: x[1], reverse=True)
+         return sims[:k]
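A minimal usage sketch of the index above (illustrative, assuming `tokenize` and `TinyTfidfIndex` exactly as defined):

    docs = ["expressroute and dns cutover plan", "backup rto rpo and asr drills"]
    idx = TinyTfidfIndex()
    idx.add_documents([tokenize(d) for d in docs])
    idx.query("dns cutover", k=1)   # -> [(0, <cosine similarity>)]: doc 0 ranks first

Because of the smoothed IDF, `math.log((N + 1) / (df + 1)) + 1.0`, a term that appears in every document still carries weight 1.0, so short queries never zero out entirely.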
+
+
+ # =========================
+ # Simple scoring rubric to tailor the detailed output
+ # =========================
+
+ CHECKS = [
+     {
+         "id": "landing_zone",
+         "desc": "Landing zone defined (hub/spoke, Policy, RBAC, logging).",
+         "fix": "Use CAF blueprints; enforce Policy for guardrails and RBAC.",
+         "keywords": ["landing", "hub", "spoke", "policy", "rbac", "log", "monitor"],
+         "pillar": "governance",
+     },
+     {
+         "id": "connectivity",
+         "desc": "Connectivity planned (ExpressRoute/VPN), DNS, MTU validated for HCX.",
+         "fix": "Verify ER/VPN, DNS resolution, and HCX MTU/mobility settings.",
+         "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx", "connectivity"],
+         "pillar": "networking",
+     },
+     {
+         "id": "migrate_tooling",
+         "desc": "Discovery/assessment and tooling chosen (Azure Migrate or HCX).",
+         "fix": "Run Azure Migrate discovery; select HCX or Azure Migrate per downtime.",
+         "keywords": ["azure", "migrate", "discovery", "assessment", "hcx", "replication"],
+         "pillar": "operations",
+     },
+     {
+         "id": "security",
+         "desc": "Security/identity configured (Key Vault, Defender, Sentinel, PIM/MFA).",
+         "fix": "Centralize secrets in Key Vault; enable Defender/Sentinel; enforce PIM/MFA.",
+         "keywords": ["key", "vault", "defender", "sentinel", "pim", "mfa", "entra", "aad", "identity"],
+         "pillar": "security",
+     },
+     {
+         "id": "dr_backup",
+         "desc": "Backups, DR, RTO/RPO defined; ASR drills planned.",
+         "fix": "Set RTO/RPO; immutability & soft-delete; test ASR failover/failback.",
+         "keywords": ["backup", "rto", "rpo", "dr", "asr", "failover", "restore"],
+         "pillar": "reliability",
+     },
+     {
+         "id": "cost",
+         "desc": "Cost optimization plan (right-sizing, reservations, tagging).",
+         "fix": "Use reservations/Savings Plans, rightsizing, and enforce tags.",
+         "keywords": ["cost", "reservation", "savings", "right", "tag"],
+         "pillar": "cost",
+     },
+ ]
+
+ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
+     toks = set(tokenize(text))
+     scores = defaultdict(float)
+     hits = []
+     for chk in CHECKS:
+         matched = any(kw in toks for kw in chk["keywords"])
+         if matched:
+             scores["overall"] += 1.0
+             scores[chk["pillar"]] += 1.0
+         else:
+             hits.append({
+                 "id": chk["id"],
+                 "desc": chk["desc"],
+                 "fix": chk["fix"],
+                 "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
+             })
+     # normalize roughly to 0-5 scale
+     max_possible = float(len(CHECKS))
+     scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
+     for k in list(scores.keys()):
+         if k != "overall":
+             scores[k] = round(scores[k], 2)
+     return scores, hits
+
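A rough worked example (hypothetical input, assuming the six checks above): for the text "hub and spoke dns plan, key vault enabled, backup with asr drills", four checks match (landing_zone via "hub", connectivity via "dns", security via "vault", dr_backup via "backup"), so `scores["overall"] == round(5.0 * 4 / 6, 2) == 3.33`, and the returned gap list contains the unmet `migrate_tooling` and `cost` checks, both tagged "medium". Note that matching is exact set membership over tokens, so "backups" would not trip the "backup" keyword.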
+
 
 # =========================
+ # File parsing
 # =========================
+
+ def read_pdf_bytes(b: bytes) -> str:
+     if not PyPDF2:
         return ""
     try:
+         reader = PyPDF2.PdfReader(io.BytesIO(b))
+         out = []
         for page in reader.pages:
+             try:
+                 out.append(page.extract_text() or "")
+             except Exception:
+                 pass
+         return "\n".join(out)
     except Exception:
         return ""
 
+ def read_docx_bytes(b: bytes) -> str:
+     if not docx:
         return ""
     try:
+         f = io.BytesIO(b)
+         d = docx.Document(f)
+         return "\n".join(p.text for p in d.paragraphs)
     except Exception:
         return ""
 
+ def read_text_bytes(b: bytes) -> str:
+     # best-effort decoding
+     for enc in ("utf-8", "utf-16", "latin-1"):
+         try:
+             return b.decode(enc)
+         except Exception:
+             continue
+     return ""
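One property of this decode chain worth knowing: `latin-1` maps every byte value to a character, so the last fallback always succeeds and the final `return ""` is never reached in practice. For example, `read_text_bytes("café".encode("utf-8"))` returns "café" on the first attempt, while arbitrary binary bytes come back as mojibake rather than an empty string.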
 
 
+ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
     """
+     Returns {"file": <name>, "text": <extracted_text>}
     """
+     name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
+     data = file_obj.get("data")
+     if data is None:
+         # gradio sometimes provides a path instead
+         path = file_obj.get("path")
+         if path and os.path.exists(path):
+             with open(path, "rb") as fh:
+                 data = fh.read()
+     if data is None:
+         return {"file": name, "text": ""}
+
+     low = name.lower()
+     text = ""
+     if low.endswith(".pdf"):
+         text = read_pdf_bytes(data)
+     elif low.endswith(".docx") or low.endswith(".doc"):
+         text = read_docx_bytes(data)
+     elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
+         text = read_text_bytes(data)
+     else:
+         # try plain text as fallback
+         text = read_text_bytes(data)
 
+     return {"file": os.path.basename(name), "text": text or ""}
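A quick illustration of the contract (hypothetical values): `parse_file({"name": "runbook.txt", "data": b"Cutover at 02:00 UTC"})` returns `{"file": "runbook.txt", "text": "Cutover at 02:00 UTC"}`. Unrecognized extensions fall through to the plain-text reader, and an unparseable upload yields `"text": ""` rather than raising.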
 
 
 # =========================
+ # Detailed Q&A Composer
 # =========================
 
+ def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
+     collected = [s.get("excerpt", "") for s in snippets]
+     combined = "\n\n".join(collected)
+     scores, gaps = score_text_against_checks(combined) if combined.strip() else ({"overall": 0.0}, [])
 
+     def _mk_gaps(glist, limit=8):
+         out = []
+         for g in glist[:limit]:
+             out.append(f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_")
+         return "\n".join(out) if out else "- No major issues detected in the sampled excerpts."
 
+     refs = list_refs([
+         "Azure VMware Solution (AVS)",
+         "Azure Migrate",
+         "Cloud Adoption Framework (CAF)",
+         "Azure Well-Architected Framework (WAF)",
+         "VMware HCX Docs"
+     ])
 
+     pillar_lines = []
+     for k_, v_ in scores.items():
+         if k_ == "overall":
             continue
+         pillar_lines.append(f"- **{k_.capitalize()}**: {v_}")
+     pillar_md = "\n".join(pillar_lines) if pillar_lines else "- (no signals)"
+
+     md = (
+         f"### Answer (detailed)\n"
+         f"**Your question:** {query}\n\n"
+         f"**TL;DR:** Here’s a concrete plan across landing zone, connectivity, migration method, security, DR, and cost. "
+         f"Address the highest-risk gaps first.\n\n"
+         f"#### Step-by-step plan\n"
+         "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+         "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+         "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+         "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+         "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+         "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+         "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+         f"#### What your documents emphasize (auto-scored)\n"
+         f"**Overall score:** {scores.get('overall', 0)} / 5.0\n\n"
+         f"**Per-pillar signals:**\n{pillar_md}\n\n"
+         f"#### Gaps & quick fixes\n{_mk_gaps(gaps, limit=8)}\n\n"
+         f"#### Supporting excerpts\n"
+     )
+     for s in snippets:
+         md += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
+     md += f"**Trusted sources:** {refs}"
+     return md
 
+ def answer_faq_or_approach_detailed(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,
 [...]
     if not q:
         return "Please enter a question."
 
+     # 1) Seeded FAQs → detailed plan
     for item in FAQ_SEEDS:
         seed_tokens = set(tokenize(item["q"])[:3])
         q_tokens = set(tokenize(q))
         if seed_tokens and seed_tokens.issubset(q_tokens):
+             refs = list_refs(item.get("refs", []))
+             base = (
+                 f"### Answer (detailed)\n"
+                 f"{item['a']}\n\n"
+                 "#### Step-by-step plan\n"
+                 "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+                 "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+                 "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+                 "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+                 "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+                 "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+                 "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+                 f"**Trusted sources:** {refs}"
+             )
+             return base
 
+     # 2) Use uploaded docs (RAG) → detailed synthesized answer
     if use_uploaded_docs and index_obj is not None and corpus:
+         top = index_obj.query(q, k=6)
+         snippets = []
+         for i, sim in top:
+             item = corpus[i]
+             excerpt = item["text"].strip()
+             if len(excerpt) > 700:
+                 excerpt = excerpt[:700] + "..."
+             snippets.append({
+                 "file": item["file"],
+                 "relevance": float(sim),
+                 "excerpt": excerpt
+             })
+         if snippets:
+             return _compose_detailed_from_snippets(q, snippets)
+
+     # 3) Fallback (no docs) → generic detailed plan with citations
     refs = list_refs([
         "Azure VMware Solution (AVS)",
         "Azure Migrate",
         "Cloud Adoption Framework (CAF)",
         "Azure Well-Architected Framework (WAF)",
         "VMware HCX Docs"
     ])
+     generic = (
+         "### Answer (detailed)\n"
+         "**TL;DR:** Use AVS/HCX or Azure Migrate depending on downtime needs; build landing zone and connectivity first, "
+         "then migrate in waves with rollback and DR drills.\n\n"
+         "#### Step-by-step plan\n"
+         "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+         "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+         "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+         "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+         "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+         "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+         "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+         f"**Trusted sources:** {refs}"
+     )
+     return generic
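The seed match above is deliberately loose: only the first three tokens of the stored question need to appear in the user's question. For example, the first seed tokenizes to `{"how", "do", "we"}`, so a question like "How do we handle DNS?" triggers that seed's canned plan before the RAG path is consulted; a coarse but dependency-free FAQ heuristic.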
+
 
 # =========================
+ # Build index from uploaded files
 # =========================
 
+ def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
+     """
+     Returns: (index_obj, matrix_placeholder, corpus, status_message)
+     """
+     if not files:
+         return None, None, [], "No files uploaded yet."
 
+     corpus: List[Dict[str, str]] = []
+     for f in files:
+         rec = parse_file(f)
+         if rec["text"]:
+             corpus.append(rec)
 
+     if not corpus:
+         return None, None, [], "Uploaded files could not be parsed (no text extracted)."
 
+     tokenized = [tokenize(c["text"]) for c in corpus]
+     idx = TinyTfidfIndex()
+     idx.add_documents(tokenized)
+
+     status = f"Indexed {len(corpus)} document(s). Vocabulary size ≈ {idx.voc_size}."
+     return idx, None, corpus, status
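An end-to-end sketch (illustrative, using the dict shape `_collect_files` produces below):

    files = [{"name": "plan.txt", "data": b"expressroute dns hcx backup rto rpo", "path": None}]
    idx, _mat, corpus, status = build_index(files)
    # status -> "Indexed 1 document(s). Vocabulary size ≈ 6."
    idx.query("dns", k=1)           # -> [(0, <similarity>)]

Unlike the removed version, which chunked documents before indexing, this index is document-granular: one vector per file.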
+
+
+ # =========================
+ # Gradio UI
+ # =========================
+
+ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
+     gr.Markdown(
+         "## VMware On-Prem → Azure Local Migration Assistant\n"
+         "- Upload your **design/migration documents** (PDF, DOCX, TXT, MD)\n"
+         "- Ask questions. Toggle **Use uploaded docs** for RAG-based answers\n"
+         "- Answers are **detailed** by default, with structured steps and trusted references\n"
     )
 
+     with gr.Row():
+         with gr.Column(scale=2):
+             file_in = gr.Files(
+                 label="Upload documents (PDF/DOCX/TXT/MD)",
+                 file_count="multiple",
+                 type="filepath"  # we will open paths ourselves
+             )
+             index_status = gr.Markdown("No index yet.")
+
+             # Hidden/State to hold in-memory data
+             st_index = gr.State()
+             st_matrix = gr.State()  # placeholder for API compatibility
+             st_corpus = gr.State()
+
+             build_btn = gr.Button("Build Index", variant="primary")
+         with gr.Column(scale=3):
+             question = gr.Textbox(label="Ask a question", placeholder="e.g., How do I minimize downtime for our VMware migration?")
+             use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
+             ask_btn = gr.Button("Ask", variant="primary")
+             answer_box = gr.Markdown("")
+
+     # Convert gr.Files (paths) into the dict format our parser expects
+     def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
+         out = []
+         for p in paths or []:
+             try:
+                 with open(p, "rb") as fh:
+                     data = fh.read()
+                 out.append({"name": os.path.basename(p), "data": data, "path": p})
+             except Exception:
+                 pass
+         return out
+
+     def _build(files_paths: List[str]):
+         files = _collect_files(files_paths)
+         idx, mat, corpus, status = build_index(files)
+         return status, idx, mat, corpus
 
+     build_btn.click(
+         _build,
+         inputs=[file_in],
+         outputs=[index_status, st_index, st_matrix, st_corpus]
     )
 
     ask_btn.click(
+         answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
         outputs=[answer_box]
     )
 
 if __name__ == "__main__":
+     # On Spaces, share=True is ignored safely; locally it will open a public link if allowed.
+     IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
+     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)