#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VMware On-Prem → Azure Local Migration Assistant (Gradio)
- Upload design/migration docs (PDF/DOCX/TXT/MD).
- Ask questions; get reliable, detailed, and relevant answers.
- Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details.
- No external APIs. No scikit-learn.
Run locally:
pip install gradio PyPDF2 python-docx
python app.py
"""
import os
import io
import re
import math
from typing import List, Tuple, Dict, Any
from collections import Counter
import gradio as gr
# -------------------------
# Optional parsers (graceful fallback)
# -------------------------
# Optional third-party parsers: the app degrades gracefully when either is
# missing (the corresponding read_* helper returns "" instead of raising).
try:
    import PyPDF2  # PDF text extraction
except Exception:
    PyPDF2 = None  # sentinel: read_pdf_bytes() short-circuits to ""
try:
    import docx  # python-docx
except Exception:
    docx = None  # sentinel: read_docx_bytes() short-circuits to ""
# =========================
# Trusted sources & FAQ seeds
# =========================
# (display name, URL) pairs. Names are the lookup keys used by list_refs()
# and the values returned by topic_refs(); keep them unique and in sync.
TRUSTED_SOURCES: List[Tuple[str, str]] = [
    # Core guidance
    ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
    # Networking / SDN (used when question is about SDN)
    ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
    ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
    ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
    ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
    # Migration
    ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
    ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
    # DR
    ("Azure Site Recovery (ASR)", "https://learn.microsoft.com/azure/site-recovery/"),
    # Security
    ("Microsoft Defender for Cloud", "https://learn.microsoft.com/azure/defender-for-cloud/"),
    # Cost
    ("Azure Cost Management", "https://learn.microsoft.com/azure/cost-management-billing/"),
]
# Canned migration FAQ answers. Each seed: "q" is a keyword probe whose tokens
# are overlap-matched against the user's question (see
# answer_faq_or_approach_detailed), "a" is the canned answer text, and "refs"
# lists TRUSTED_SOURCES display names to link.
FAQ_SEEDS: List[Dict[str, Any]] = [
    {
        "q": "migrate vmware workloads minimal downtime",
        "a": (
            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
        ),
        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
    },
    {
        "q": "recommended migration sequence",
        "a": (
            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
        ),
        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
    },
    {
        "q": "dr and backups planning",
        "a": (
            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
        ),
        "refs": ["Azure Site Recovery (ASR)"],
    },
]
# =========================
# Utilities
# =========================
_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")


def tokenize(text: str) -> List[str]:
    """Return lowercase word-ish tokens (letters/digits plus _ . : / -) from *text*.

    A falsy *text* (None, "") yields an empty list.
    """
    source = text or ""
    return [match.group(0).lower() for match in _WORD_RE.finditer(source)]
def list_refs(ref_names: List[str]) -> str:
    """Render *ref_names* as " | "-separated Markdown links.

    Each name is resolved against TRUSTED_SOURCES; names with no match are
    skipped. Returns "" when nothing resolves (``str.join`` of an empty
    list), preserving the original behavior.
    """
    # Build the lookup once: O(1) per name instead of a linear scan of
    # TRUSTED_SOURCES for every requested reference.
    url_by_name = dict(TRUSTED_SOURCES)
    links = [f"[{nm}]({url_by_name[nm]})" for nm in ref_names if nm in url_by_name]
    return " | ".join(links)
# =========================
# Intent & topic detection
# =========================
# Lead-in / keyword patterns used to classify the question's intent.
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
_HOW_RE = re.compile(r"^\s*(how\s+do|how\s+to|how\s+does|how\s+can)\b", re.I)
_CMP_RE = re.compile(r"\b(vs\.?|versus|compare|difference|differ)\b", re.I)
_PLAN_RE = re.compile(r"\b(plan|approach|steps|roadmap|sequence|strategy)\b", re.I)


def detect_intent(q: str) -> str:
    """Classify *q* as 'define' | 'compare' | 'plan' | 'how' | 'general'.

    Patterns are checked in priority order; the first match wins.
    """
    ordered_checks = (
        ("define", _DEF_RE),
        ("compare", _CMP_RE),
        ("plan", _PLAN_RE),
        ("how", _HOW_RE),
    )
    for label, pattern in ordered_checks:
        if pattern.search(q):
            return label
    return "general"
def detect_topic(q: str) -> str:
    """Map *q* onto a coarse topic bucket.

    Returns one of 'sdn' | 'migration' | 'dr' | 'security' | 'cost' |
    'general', decided by keyword overlap with the tokenized question;
    buckets are tried in priority order.
    """
    toks = set(tokenize(q))
    buckets = (
        ("sdn", {"sdn", "software-defined", "softwaredefined"}),
        ("migration", {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"}),
        ("dr", {"dr", "disaster", "asr", "rto", "rpo", "failover"}),
        ("security", {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"}),
        ("cost", {"cost", "reservation", "savings", "rightsizing", "tagging"}),
    )
    for name, keywords in buckets:
        if keywords & toks:
            return name
    return "general"
def topic_refs(topic: str) -> List[str]:
    """Trusted-source display names relevant to *topic*.

    Unrecognized topics fall back to the CAF/WAF core-guidance pair.
    """
    per_topic = {
        "sdn": ["Azure Virtual Network", "Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"],
        "migration": ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs", "Cloud Adoption Framework (CAF)"],
        "dr": ["Azure Site Recovery (ASR)", "Azure Well-Architected Framework (WAF)"],
        "security": ["Microsoft Defender for Cloud", "Azure Well-Architected Framework (WAF)"],
        "cost": ["Azure Cost Management", "Azure Well-Architected Framework (WAF)"],
    }
    default = ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
    return per_topic.get(topic, default)
# =========================
# Tiny TF-IDF Index
# =========================
class TinyTfidfIndex:
    """Minimal TF-IDF index with cosine-similarity search (stdlib only).

    Documents are supplied pre-tokenized via add_documents(); query() runs
    the module-level tokenize() on the query text.
    """

    def __init__(self):
        self.docs: List[List[str]] = []              # tokenized documents
        self.df: Counter = Counter()                 # document frequency per term
        self.idf: Dict[str, float] = {}              # smoothed IDF per term
        self.doc_vecs: List[Dict[str, float]] = []   # precomputed TF-IDF vector per doc
        self.doc_norms: List[float] = []             # Euclidean norm of each doc vector
        self.voc_size = 0

    def add_documents(self, tokenized_docs: List[List[str]]):
        """(Re)build the index over *tokenized_docs*, replacing prior state."""
        self.docs = tokenized_docs[:]
        self.df = Counter()
        for toks in self.docs:
            self.df.update(set(toks))  # each term counted once per document
        N = max(1, len(self.docs))
        # "+1" smoothing keeps IDF positive and defined even for terms that
        # appear in every document.
        self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
        self.voc_size = len(self.idf)
        # Precompute each document's TF-IDF vector and its norm ONCE here.
        # The original recomputed every document's weights on every query.
        self.doc_vecs = []
        self.doc_norms = []
        for toks in self.docs:
            vec = self._vec(toks)
            self.doc_vecs.append(vec)
            self.doc_norms.append(math.sqrt(sum(w * w for w in vec.values())))

    def _vec(self, toks: List[str]) -> Dict[str, float]:
        """TF-IDF weight per term for *toks*; terms unknown to the index are dropped."""
        tf = Counter(toks)
        total = max(1, len(toks))
        v: Dict[str, float] = {}
        for term, cnt in tf.items():
            idf = self.idf.get(term)
            if idf is None:
                continue
            v[term] = (cnt / total) * idf
        return v

    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
        """Return the top-*k* (doc_index, cosine_similarity) pairs for *text*.

        Returns [] when the index is empty. Norms are floored at 1e-9 to
        avoid division by zero for all-unknown queries or empty documents.
        """
        if not self.docs:
            return []
        qv = self._vec(tokenize(text))
        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
        sims: List[Tuple[int, float]] = []
        for i, dvec in enumerate(self.doc_vecs):
            num = sum(w_q * dvec.get(term, 0.0) for term, w_q in qv.items())
            denom = (self.doc_norms[i] or 1e-9) * q_norm
            sims.append((i, num / denom))
        sims.sort(key=lambda x: x[1], reverse=True)
        return sims[:k]
# =========================
# File Parsing
# =========================
def read_pdf_bytes(b: bytes) -> str:
    """Extract text from PDF bytes.

    Returns "" when PyPDF2 is unavailable or parsing fails (best-effort).
    """
    if not PyPDF2:
        return ""
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(b))
        page_texts = (page.extract_text() or "" for page in reader.pages)
        return "\n".join(page_texts)
    except Exception:
        return ""
def read_docx_bytes(b: bytes) -> str:
    """Extract paragraph text from DOCX bytes.

    Returns "" when python-docx is unavailable or parsing fails.
    """
    if not docx:
        return ""
    try:
        document = docx.Document(io.BytesIO(b))
        paragraphs = [p.text for p in document.paragraphs]
        return "\n".join(paragraphs)
    except Exception:
        return ""
def read_text_bytes(b: bytes) -> str:
    """Best-effort decode of raw bytes: UTF-8, then UTF-16, then Latin-1.

    Latin-1 accepts every byte sequence, so the final "" is effectively a
    safety net for unexpected decode errors.
    """
    candidate_encodings = ("utf-8", "utf-16", "latin-1")
    for encoding in candidate_encodings:
        try:
            decoded = b.decode(encoding)
        except Exception:
            continue
        return decoded
    return ""
def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
    """Turn an uploaded-file record into {'file': name, 'text': extracted text}.

    *file_obj* may carry raw bytes under 'data' or a filesystem 'path';
    the filename extension selects the parser (PDF / DOCX / text fallback).
    """
    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
    data = file_obj.get("data")
    if data is None:
        # Fall back to reading from disk when only a path was supplied.
        path = file_obj.get("path")
        if path and os.path.exists(path):
            with open(path, "rb") as fh:
                data = fh.read()
    if data is None:
        return {"file": name, "text": ""}
    lowered = name.lower()
    if lowered.endswith(".pdf"):
        text = read_pdf_bytes(data)
    elif lowered.endswith((".docx", ".doc")):
        # NOTE(review): legacy .doc is not a DOCX container; python-docx will
        # fail on it and yield "" — best-effort behavior, confirm acceptable.
        text = read_docx_bytes(data)
    else:
        text = read_text_bytes(data)
    return {"file": os.path.basename(name), "text": text or ""}
# =========================
# Strong definition composer (for “what is …”)
# =========================
_DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", re.I)
def _extract_subject_from_question(q: str) -> str:
s = _DEF_RE_LEAD.sub("", q).strip()
s = re.sub(r"[?.!]+$", "", s).strip()
s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
return s if s else "the topic"
def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """
    Returns: (definition, capabilities[], how[], best_practices[], use_cases[], refs_list)
    Provides a specific definition for SDN; otherwise a generic but detailed scaffold using the subject.

    All answer text is hard-coded template content; only *subject* is
    interpolated. refs_list contains TRUSTED_SOURCES display names.
    """
    # SDN-specific, as per your example (paraphrased, not reused verbatim for all topics)
    # Triggered either by the detected topic or by "sdn" appearing anywhere in the subject.
    if topic == "sdn" or "sdn" in subject.lower():
        definition = (
            f"{subject} is Microsoft's implementation of software-defined networking: "
            "a model that shifts network control into software so you can centrally design, automate, "
            "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
            "By separating the control plane from underlying hardware, it enables programmability and "
            "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
            "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
        )
        capabilities = [
            "Programmatic creation of VNets, subnets, routing, and address spaces.",
            "Micro-segmentation and policy enforcement for east–west traffic.",
            "Software load balancing and gateway services for app connectivity.",
            "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
        ]
        how = [
            "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
            "Agents/controllers translate intent into concrete configuration on each host.",
            "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
        ]
        best = [
            "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
            "Apply least-privilege and RBAC; review segmentation policies regularly.",
            "Integrate with logging/monitoring; alert on drift and policy violations.",
        ]
        uses = [
            "Rapidly provisioning isolated app environments and tiers.",
            "Zero-trust segmentation between workloads and environments.",
            "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
        ]
        refs_list = topic_refs("sdn")
        return definition, capabilities, how, best, uses, refs_list
    # Generic detailed definition for other subjects
    sub = subject.strip()
    definition = (
        f"{sub} is a service/technology that centralizes control through software and policy so teams can "
        f"create, operate, and secure resources consistently across environments."
    )
    capabilities = [
        "Automation and policy-driven configuration to reduce manual effort and errors.",
        "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
        "Observability hooks (logs/metrics) for reliability and performance tuning.",
    ]
    how = [
        "A control plane captures intent (configuration/policies) and applies it to managed resources.",
        "Providers/agents on the platform translate intent into changes at runtime.",
        "Feedback loops via telemetry inform continuous improvement.",
    ]
    best = [
        "Adopt Infrastructure-as-Code and peer reviews for change control.",
        "Define tagging, RBAC roles, and policy baselines early.",
        "Pilot in a non-prod environment before broad rollout.",
    ]
    uses = [
        "Faster, repeatable environment provisioning.",
        "Improved security posture through standardized controls.",
        "Hybrid scenarios requiring consistent management across sites.",
    ]
    # Re-detect the topic from the subject itself so generic answers still
    # link topic-appropriate sources.
    refs_list = topic_refs(detect_topic(sub))
    return definition, capabilities, how, best, uses, refs_list
def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """Render the full definition answer for *subject* as a Markdown string."""
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    refs = list_refs(refs_list)

    def bullets(items):
        # One "- item" line per entry.
        return [f"- {item}" for item in items]

    lines = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
        "",
        "**Key capabilities:**",
        *bullets(capabilities),
        "",
        "**How it works:**",
        *bullets(how),
        "",
        "**Best practices:**",
        *bullets(best),
        "",
        "**Common use cases:**",
        *bullets(uses),
        "",
        f"**Trusted sources:** {refs}",
    ]
    return "\n".join(lines)
# =========================
# RAG: build a detailed answer from uploaded docs
# =========================
def _extract_points(text: str, max_points: int = 6) -> List[str]:
parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
pts = []
for p in parts:
p = p.strip()
if 40 <= len(p) <= 280 and p not in pts:
pts.append(p)
if len(pts) >= max_points:
break
return pts
def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """Synthesize a detailed Markdown answer from document *snippets*.

    Joins the snippets, lifts salient sentences into an executive summary,
    appends a topic-specific checklist, and closes with trusted-source links.
    """
    combined = " ".join(snippets)
    points = _extract_points(combined, max_points=6)
    refs = list_refs(topic_refs(topic))
    md = ["### Answer (detailed)", f"**Your question:** {query}", ""]
    if points:
        md += ["**Executive summary:**"] + [f"- {p}" for p in points]
    else:
        # Fallback bullet when no sentence in the snippets met the
        # length/dedup criteria of _extract_points().
        md += ["**Executive summary:**", "- Here are key considerations synthesized from your documents."]
    # Add a short topic-aware checklist
    # NOTE(review): detect_topic() only yields keys present below, so the
    # .get() default looks unreachable — confirm whether other callers can
    # pass custom topic strings.
    checklist = {
        "sdn": [
            "Define VNets/subnets and segmentation policy.",
            "Automate with IaC (Bicep/Terraform) and GitOps.",
            "Harden east–west traffic with micro-segmentation.",
            "Plan ingress/egress with LBs and gateways."
        ],
        "migration": [
            "Establish landing zone (Policy, RBAC, logging).",
            "Connect networks (ER/VPN), validate DNS/MTU.",
            "Discover/assess with Azure Migrate; pilot a few VMs.",
            "Choose HCX or Azure Migrate for cutover; migrate in waves."
        ],
        "dr": [
            "Define RTO/RPO; choose replication targets.",
            "Run planned/unplanned failover drills.",
            "Ensure immutable backups and soft-delete."
        ],
        "security": [
            "Enable RBAC/PIM/MFA and Key Vault.",
            "Turn on Defender for Cloud; set policies and alerts.",
            "Collect logs; restrict lateral movement."
        ],
        "cost": [
            "Right-size; use Reservations/Savings Plans.",
            "Tag resources; set budgets/alerts.",
            "Automate non-prod shutdowns."
        ],
        "general": [
            "Clarify objectives and constraints.",
            "Pilot changes; define rollback and verification."
        ]
    }.get(topic, ["Clarify objectives and constraints.", "Pilot changes; define rollback and verification."])
    md += ["", "**Recommended steps:**"] + [f"- {s}" for s in checklist]
    md += ["", f"**Trusted sources:** {refs}"]
    return "\n".join(md)
# =========================
# Main Answer Function
# =========================
def answer_faq_or_approach_detailed(question: str, use_uploaded_docs: bool, index_obj: Any, _matrix_unused: Any, corpus: List[Dict[str,str]]) -> str:
    """Answer *question* as Markdown, trying strategies in priority order:
    definition composer, migration FAQ seeds, RAG over uploaded docs, then
    a topic-aware fallback.

    Parameters mirror the Gradio "Ask" wiring: *index_obj* is the
    TinyTfidfIndex session state, *_matrix_unused* is a placeholder state
    slot, and *corpus* is the list of parsed {"file", "text"} dicts.
    """
    q = (question or "").strip()
    if not q:
        return "Please enter a question."
    intent = detect_intent(q)
    topic = detect_topic(q)
    # A) Definitions: build a strong, subject-specific definition (e.g., "What is Azure SDN?")
    if intent == "define":
        subject = _extract_subject_from_question(q)
        return _compose_definition_markdown(q, subject, topic)
    # B) Migration FAQs (only if the question is migration-like to avoid hijacking)
    q_tokens = set(tokenize(q))
    if {"migrate", "migration", "hcx", "avs"} & q_tokens:
        for item in FAQ_SEEDS:
            seed_tokens = set(tokenize(item["q"]))
            # Fire when at least half the seed's keywords appear in the question.
            if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
                return (
                    "### Answer (detailed)\n"
                    f"{item['a']}\n\n"
                    f"**Trusted sources:** {list_refs(item.get('refs', []))}"
                )
    # C) RAG over uploaded docs → detailed synthesized answer
    if use_uploaded_docs and index_obj is not None and corpus:
        top = index_obj.query(q, k=6)
        snippets = []
        # NOTE(review): results are taken regardless of similarity score, so
        # near-zero matches can contribute snippets — consider a threshold.
        for i, sim in top:
            item = corpus[i]
            excerpt = (item["text"] or "").strip()
            if len(excerpt) > 700:
                excerpt = excerpt[:700] + "..."
            if excerpt:
                snippets.append(excerpt)
        if snippets:
            return _compose_rag_answer(q, snippets, topic)
    # D) Topic-aware fallback (short but relevant)
    subject = _extract_subject_from_question(q) if intent in {"how", "plan", "compare"} else q
    return _compose_definition_markdown(q, subject, topic)
# =========================
# Index Builder
# =========================
def build_index(files: List[Dict[str, Any]]):
    """Parse *files*, build a TinyTfidfIndex, and return UI-ready state.

    Returns a 4-tuple in the order the Build-Index click handler's outputs
    expect — (status_markdown, index, unused_matrix, corpus):
    the original returned (index, None, corpus, status), which put the index
    object into the status Markdown and None into the index state, so RAG
    never received a usable index.
    """
    if not files:
        return "No files uploaded yet.", None, None, []
    # Parse each file exactly once (previously parse_file() ran twice per
    # file: once in the filter condition and once for the kept value).
    parsed = [parse_file(f) for f in files]
    corpus = [c for c in parsed if c["text"]]
    if not corpus:
        return "No text extracted.", None, None, []
    tokenized = [tokenize(c["text"]) for c in corpus]
    idx = TinyTfidfIndex()
    idx.add_documents(tokenized)
    return f"Indexed {len(corpus)} docs, vocab {idx.voc_size}.", idx, None, corpus
# =========================
# Gradio UI
# =========================
# Gradio UI: left column uploads + indexing, right column Q&A.
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
    gr.Markdown(
        "## VMware On-Prem → Azure Local Migration Assistant\n"
        "- Upload documents (PDF/DOCX/TXT/MD)\n"
        "- Click **Build Index**\n"
        "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
    )
    with gr.Row():
        with gr.Column(scale=2):
            file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
            index_status = gr.Markdown("No index yet.")
            # Per-session state: TF-IDF index, (unused) matrix slot, parsed corpus.
            st_index = gr.State(); st_matrix = gr.State(); st_corpus = gr.State()
            build_btn = gr.Button("Build Index", variant="primary")
        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a question",
                placeholder="e.g., What is Azure SDN? • How do I minimize downtime for our AVS migration?"
            )
            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
            ask_btn = gr.Button("Ask", variant="primary")
            answer_box = gr.Markdown("")
    def _collect_files(paths: List[str]):
        """Read each uploaded filepath into a {'name', 'data', 'path'} record."""
        out = []
        for p in paths or []:
            try:
                with open(p, "rb") as fh:
                    data = fh.read()
                out.append({"name": os.path.basename(p), "data": data, "path": p})
            except Exception:
                # Unreadable paths are skipped silently (best-effort uploads).
                pass
        return out
    def _build(files_paths: List[str]):
        """Build-Index click handler: wraps build_index() over the uploaded paths."""
        files = _collect_files(files_paths)
        return build_index(files)
    # NOTE(review): verify build_index's return-value order matches this
    # outputs list — (status, index, matrix, corpus).
    build_btn.click(_build, inputs=[file_in], outputs=[index_status, st_index, st_matrix, st_corpus])
    ask_btn.click(
        answer_faq_or_approach_detailed,
        inputs=[question, use_docs, st_index, st_matrix, st_corpus],
        outputs=[answer_box]
    )
if __name__ == "__main__":
    # Presumably SPACE_ID / HF_SPACE_ID mark a Hugging Face Spaces host,
    # where a public share link is unnecessary — TODO confirm.
    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
    # Bind all interfaces; port from $PORT (default 7860, Gradio's default).
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)