#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VMware On-Prem → Azure Local Migration Assistant (Gradio)
- Upload design/migration docs (PDF/DOCX/TXT/MD).
- Ask questions; get reliable, detailed, and relevant answers.
- Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details.
- No external APIs. No scikit-learn.
Run locally:
pip install gradio PyPDF2 python-docx
python app.py
"""
import os
import io
import re
import math
from typing import List, Tuple, Dict, Any
from collections import Counter
import gradio as gr
# -------------------------
# Optional parsers (graceful fallback)
# -------------------------
# Optional third-party parsers: the app degrades gracefully when either is
# missing (the corresponding read_* helper returns "" instead of raising).
try:
    import PyPDF2  # PDF text extraction
except Exception:
    PyPDF2 = None  # sentinel: read_pdf_bytes() short-circuits to ""
try:
    import docx  # python-docx
except Exception:
    docx = None  # sentinel: read_docx_bytes() short-circuits to ""
# =========================
# Trusted sources & FAQ seeds
# =========================
# (display name, URL) pairs. Names are the lookup keys used by list_refs()
# and the values returned by topic_refs(); keep them unique and in sync.
TRUSTED_SOURCES: List[Tuple[str, str]] = [
    # Core guidance
    ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
    # Networking / SDN (used when question is about SDN)
    ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
    ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
    ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
    ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
    # Migration
    ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
    ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
    # DR
    ("Azure Site Recovery (ASR)", "https://learn.microsoft.com/azure/site-recovery/"),
    # Security
    ("Microsoft Defender for Cloud", "https://learn.microsoft.com/azure/defender-for-cloud/"),
    # Cost
    ("Azure Cost Management", "https://learn.microsoft.com/azure/cost-management-billing/"),
]
# Canned migration FAQ answers. Each seed: "q" is a keyword probe whose tokens
# are overlap-matched against the user's question (see
# answer_faq_or_approach_detailed), "a" is the canned answer text, and "refs"
# lists TRUSTED_SOURCES display names to link.
FAQ_SEEDS: List[Dict[str, Any]] = [
    {
        "q": "migrate vmware workloads minimal downtime",
        "a": (
            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
        ),
        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
    },
    {
        "q": "recommended migration sequence",
        "a": (
            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
        ),
        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
    },
    {
        "q": "dr and backups planning",
        "a": (
            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
        ),
        "refs": ["Azure Site Recovery (ASR)"],
    },
]
# =========================
# Utilities
# =========================
_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")


def tokenize(text: str) -> List[str]:
    """Return lowercase word-ish tokens (letters/digits plus _ . : / -) from *text*.

    A falsy *text* (None, "") yields an empty list.
    """
    source = text or ""
    return [match.group(0).lower() for match in _WORD_RE.finditer(source)]
def list_refs(ref_names: List[str]) -> str:
    """Render *ref_names* as " | "-separated Markdown links.

    Each name is resolved against TRUSTED_SOURCES; names with no match are
    skipped. Returns "" when nothing resolves (``str.join`` of an empty
    list), preserving the original behavior.
    """
    # Build the lookup once: O(1) per name instead of a linear scan of
    # TRUSTED_SOURCES for every requested reference.
    url_by_name = dict(TRUSTED_SOURCES)
    links = [f"[{nm}]({url_by_name[nm]})" for nm in ref_names if nm in url_by_name]
    return " | ".join(links)
# =========================
# Intent & topic detection
# =========================
# Lead-in / keyword patterns used to classify the question's intent.
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
_HOW_RE = re.compile(r"^\s*(how\s+do|how\s+to|how\s+does|how\s+can)\b", re.I)
_CMP_RE = re.compile(r"\b(vs\.?|versus|compare|difference|differ)\b", re.I)
_PLAN_RE = re.compile(r"\b(plan|approach|steps|roadmap|sequence|strategy)\b", re.I)


def detect_intent(q: str) -> str:
    """Classify *q* as 'define' | 'compare' | 'plan' | 'how' | 'general'.

    Patterns are checked in priority order; the first match wins.
    """
    ordered_checks = (
        ("define", _DEF_RE),
        ("compare", _CMP_RE),
        ("plan", _PLAN_RE),
        ("how", _HOW_RE),
    )
    for label, pattern in ordered_checks:
        if pattern.search(q):
            return label
    return "general"
def detect_topic(q: str) -> str:
    """Map *q* onto a coarse topic bucket.

    Returns one of 'sdn' | 'migration' | 'dr' | 'security' | 'cost' |
    'general', decided by keyword overlap with the tokenized question;
    buckets are tried in priority order.
    """
    toks = set(tokenize(q))
    buckets = (
        ("sdn", {"sdn", "software-defined", "softwaredefined"}),
        ("migration", {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"}),
        ("dr", {"dr", "disaster", "asr", "rto", "rpo", "failover"}),
        ("security", {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"}),
        ("cost", {"cost", "reservation", "savings", "rightsizing", "tagging"}),
    )
    for name, keywords in buckets:
        if keywords & toks:
            return name
    return "general"
def topic_refs(topic: str) -> List[str]:
    """Trusted-source display names relevant to *topic*.

    Unrecognized topics fall back to the CAF/WAF core-guidance pair.
    """
    per_topic = {
        "sdn": ["Azure Virtual Network", "Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"],
        "migration": ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs", "Cloud Adoption Framework (CAF)"],
        "dr": ["Azure Site Recovery (ASR)", "Azure Well-Architected Framework (WAF)"],
        "security": ["Microsoft Defender for Cloud", "Azure Well-Architected Framework (WAF)"],
        "cost": ["Azure Cost Management", "Azure Well-Architected Framework (WAF)"],
    }
    default = ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
    return per_topic.get(topic, default)
# =========================
# Tiny TF-IDF Index
# =========================
class TinyTfidfIndex:
    """Minimal TF-IDF index with cosine-similarity search (stdlib only).

    Documents are supplied pre-tokenized via add_documents(); query() runs
    the module-level tokenize() on the query text.
    """

    def __init__(self):
        self.docs: List[List[str]] = []              # tokenized documents
        self.df: Counter = Counter()                 # document frequency per term
        self.idf: Dict[str, float] = {}              # smoothed IDF per term
        self.doc_vecs: List[Dict[str, float]] = []   # precomputed TF-IDF vector per doc
        self.doc_norms: List[float] = []             # Euclidean norm of each doc vector
        self.voc_size = 0

    def add_documents(self, tokenized_docs: List[List[str]]):
        """(Re)build the index over *tokenized_docs*, replacing prior state."""
        self.docs = tokenized_docs[:]
        self.df = Counter()
        for toks in self.docs:
            self.df.update(set(toks))  # each term counted once per document
        N = max(1, len(self.docs))
        # "+1" smoothing keeps IDF positive and defined even for terms that
        # appear in every document.
        self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
        self.voc_size = len(self.idf)
        # Precompute each document's TF-IDF vector and its norm ONCE here.
        # The original recomputed every document's weights on every query.
        self.doc_vecs = []
        self.doc_norms = []
        for toks in self.docs:
            vec = self._vec(toks)
            self.doc_vecs.append(vec)
            self.doc_norms.append(math.sqrt(sum(w * w for w in vec.values())))

    def _vec(self, toks: List[str]) -> Dict[str, float]:
        """TF-IDF weight per term for *toks*; terms unknown to the index are dropped."""
        tf = Counter(toks)
        total = max(1, len(toks))
        v: Dict[str, float] = {}
        for term, cnt in tf.items():
            idf = self.idf.get(term)
            if idf is None:
                continue
            v[term] = (cnt / total) * idf
        return v

    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
        """Return the top-*k* (doc_index, cosine_similarity) pairs for *text*.

        Returns [] when the index is empty. Norms are floored at 1e-9 to
        avoid division by zero for all-unknown queries or empty documents.
        """
        if not self.docs:
            return []
        qv = self._vec(tokenize(text))
        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
        sims: List[Tuple[int, float]] = []
        for i, dvec in enumerate(self.doc_vecs):
            num = sum(w_q * dvec.get(term, 0.0) for term, w_q in qv.items())
            denom = (self.doc_norms[i] or 1e-9) * q_norm
            sims.append((i, num / denom))
        sims.sort(key=lambda x: x[1], reverse=True)
        return sims[:k]
# =========================
# File Parsing
# =========================
def read_pdf_bytes(b: bytes) -> str:
    """Extract text from PDF bytes.

    Returns "" when PyPDF2 is unavailable or parsing fails (best-effort).
    """
    if not PyPDF2:
        return ""
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(b))
        page_texts = (page.extract_text() or "" for page in reader.pages)
        return "\n".join(page_texts)
    except Exception:
        return ""
def read_docx_bytes(b: bytes) -> str:
    """Extract paragraph text from DOCX bytes.

    Returns "" when python-docx is unavailable or parsing fails.
    """
    if not docx:
        return ""
    try:
        document = docx.Document(io.BytesIO(b))
        paragraphs = [p.text for p in document.paragraphs]
        return "\n".join(paragraphs)
    except Exception:
        return ""
def read_text_bytes(b: bytes) -> str:
    """Best-effort decode of raw bytes: UTF-8, then UTF-16, then Latin-1.

    Latin-1 accepts every byte sequence, so the final "" is effectively a
    safety net for unexpected decode errors.
    """
    candidate_encodings = ("utf-8", "utf-16", "latin-1")
    for encoding in candidate_encodings:
        try:
            decoded = b.decode(encoding)
        except Exception:
            continue
        return decoded
    return ""
def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
    """Turn an uploaded-file record into {'file': name, 'text': extracted text}.

    *file_obj* may carry raw bytes under 'data' or a filesystem 'path';
    the filename extension selects the parser (PDF / DOCX / text fallback).
    """
    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
    data = file_obj.get("data")
    if data is None:
        # Fall back to reading from disk when only a path was supplied.
        path = file_obj.get("path")
        if path and os.path.exists(path):
            with open(path, "rb") as fh:
                data = fh.read()
    if data is None:
        return {"file": name, "text": ""}
    lowered = name.lower()
    if lowered.endswith(".pdf"):
        text = read_pdf_bytes(data)
    elif lowered.endswith((".docx", ".doc")):
        # NOTE(review): legacy .doc is not a DOCX container; python-docx will
        # fail on it and yield "" — best-effort behavior, confirm acceptable.
        text = read_docx_bytes(data)
    else:
        text = read_text_bytes(data)
    return {"file": os.path.basename(name), "text": text or ""}
# =========================
# Strong definition composer (for “what is …”)
# =========================
_DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", re.I)
def _extract_subject_from_question(q: str) -> str:
s = _DEF_RE_LEAD.sub("", q).strip()
s = re.sub(r"[?.!]+$", "", s).strip()
s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
return s if s else "the topic"
def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """
    Returns: (definition, capabilities[], how[], best_practices[], use_cases[], refs_list)
    Provides a specific definition for SDN; otherwise a generic but detailed scaffold using the subject.

    All answer text is hard-coded template content; only *subject* is
    interpolated. refs_list contains TRUSTED_SOURCES display names.
    """
    # SDN-specific, as per your example (paraphrased, not reused verbatim for all topics)
    # Triggered either by the detected topic or by "sdn" appearing anywhere in the subject.
    if topic == "sdn" or "sdn" in subject.lower():
        definition = (
            f"{subject} is Microsoft's implementation of software-defined networking: "
            "a model that shifts network control into software so you can centrally design, automate, "
            "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
            "By separating the control plane from underlying hardware, it enables programmability and "
            "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
            "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
        )
        capabilities = [
            "Programmatic creation of VNets, subnets, routing, and address spaces.",
            "Micro-segmentation and policy enforcement for east–west traffic.",
            "Software load balancing and gateway services for app connectivity.",
            "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
        ]
        how = [
            "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
            "Agents/controllers translate intent into concrete configuration on each host.",
            "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
        ]
        best = [
            "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
            "Apply least-privilege and RBAC; review segmentation policies regularly.",
            "Integrate with logging/monitoring; alert on drift and policy violations.",
        ]
        uses = [
            "Rapidly provisioning isolated app environments and tiers.",
            "Zero-trust segmentation between workloads and environments.",
            "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
        ]
        refs_list = topic_refs("sdn")
        return definition, capabilities, how, best, uses, refs_list
    # Generic detailed definition for other subjects
    sub = subject.strip()
    definition = (
        f"{sub} is a service/technology that centralizes control through software and policy so teams can "
        f"create, operate, and secure resources consistently across environments."
    )
    capabilities = [
        "Automation and policy-driven configuration to reduce manual effort and errors.",
        "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
        "Observability hooks (logs/metrics) for reliability and performance tuning.",
    ]
    how = [
        "A control plane captures intent (configuration/policies) and applies it to managed resources.",
        "Providers/agents on the platform translate intent into changes at runtime.",
        "Feedback loops via telemetry inform continuous improvement.",
    ]
    best = [
        "Adopt Infrastructure-as-Code and peer reviews for change control.",
        "Define tagging, RBAC roles, and policy baselines early.",
        "Pilot in a non-prod environment before broad rollout.",
    ]
    uses = [
        "Faster, repeatable environment provisioning.",
        "Improved security posture through standardized controls.",
        "Hybrid scenarios requiring consistent management across sites.",
    ]
    # Re-detect the topic from the subject itself so generic answers still
    # link topic-appropriate sources.
    refs_list = topic_refs(detect_topic(sub))
    return definition, capabilities, how, best, uses, refs_list
def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """Render the full definition answer for *subject* as a Markdown string."""
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    refs = list_refs(refs_list)

    def bullets(items):
        # One "- item" line per entry.
        return [f"- {item}" for item in items]

    lines = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
        "",
        "**Key capabilities:**",
        *bullets(capabilities),
        "",
        "**How it works:**",
        *bullets(how),
        "",
        "**Best practices:**",
        *bullets(best),
        "",
        "**Common use cases:**",
        *bullets(uses),
        "",
        f"**Trusted sources:** {refs}",
    ]
    return "\n".join(lines)
# =========================
# RAG: build a detailed answer from uploaded docs
# =========================
def _extract_points(text: str, max_points: int = 6) -> List[str]:
parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
pts = []
for p in parts:
p = p.strip()
if 40 <= len(p) <= 280 and p not in pts:
pts.append(p)
if len(pts) >= max_points:
break
return pts
def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """Synthesize a detailed Markdown answer from document *snippets*.

    Joins the snippets, lifts salient sentences into an executive summary,
    appends a topic-specific checklist, and closes with trusted-source links.
    """
    combined = " ".join(snippets)
    points = _extract_points(combined, max_points=6)
    refs = list_refs(topic_refs(topic))
    md = ["### Answer (detailed)", f"**Your question:** {query}", ""]
    if points:
        md += ["**Executive summary:**"] + [f"- {p}" for p in points]
    else:
        # Fallback bullet when no sentence in the snippets met the
        # length/dedup criteria of _extract_points().
        md += ["**Executive summary:**", "- Here are key considerations synthesized from your documents."]
    # Add a short topic-aware checklist
    # NOTE(review): detect_topic() only yields keys present below, so the
    # .get() default looks unreachable — confirm whether other callers can
    # pass custom topic strings.
    checklist = {
        "sdn": [
            "Define VNets/subnets and segmentation policy.",
            "Automate with IaC (Bicep/Terraform) and GitOps.",
            "Harden east–west traffic with micro-segmentation.",
            "Plan ingress/egress with LBs and gateways."
        ],
        "migration": [
            "Establish landing zone (Policy, RBAC, logging).",
            "Connect networks (ER/VPN), validate DNS/MTU.",
            "Discover/assess with Azure Migrate; pilot a few VMs.",
            "Choose HCX or Azure Migrate for cutover; migrate in waves."
        ],
        "dr": [
            "Define RTO/RPO; choose replication targets.",
            "Run planned/unplanned failover drills.",
            "Ensure immutable backups and soft-delete."
        ],
        "security": [
            "Enable RBAC/PIM/MFA and Key Vault.",
            "Turn on Defender for Cloud; set policies and alerts.",
            "Collect logs; restrict lateral movement."
        ],
        "cost": [
            "Right-size; use Reservations/Savings Plans.",
            "Tag resources; set budgets/alerts.",
            "Automate non-prod shutdowns."
        ],
        "general": [
            "Clarify objectives and constraints.",
            "Pilot changes; define rollback and verification."
        ]
    }.get(topic, ["Clarify objectives and constraints.", "Pilot changes; define rollback and verification."])
    md += ["", "**Recommended steps:**"] + [f"- {s}" for s in checklist]
    md += ["", f"**Trusted sources:** {refs}"]
    return "\n".join(md)
# =========================
# Main Answer Function
# =========================
def answer_faq_or_approach_detailed(question: str, use_uploaded_docs: bool, index_obj: Any, _matrix_unused: Any, corpus: List[Dict[str,str]]) -> str:
    """Answer *question* as Markdown, trying strategies in priority order:
    definition composer, migration FAQ seeds, RAG over uploaded docs, then
    a topic-aware fallback.

    Parameters mirror the Gradio "Ask" wiring: *index_obj* is the
    TinyTfidfIndex session state, *_matrix_unused* is a placeholder state
    slot, and *corpus* is the list of parsed {"file", "text"} dicts.
    """
    q = (question or "").strip()
    if not q:
        return "Please enter a question."
    intent = detect_intent(q)
    topic = detect_topic(q)
    # A) Definitions: build a strong, subject-specific definition (e.g., "What is Azure SDN?")
    if intent == "define":
        subject = _extract_subject_from_question(q)
        return _compose_definition_markdown(q, subject, topic)
    # B) Migration FAQs (only if the question is migration-like to avoid hijacking)
    q_tokens = set(tokenize(q))
    if {"migrate", "migration", "hcx", "avs"} & q_tokens:
        for item in FAQ_SEEDS:
            seed_tokens = set(tokenize(item["q"]))
            # Fire when at least half the seed's keywords appear in the question.
            if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
                return (
                    "### Answer (detailed)\n"
                    f"{item['a']}\n\n"
                    f"**Trusted sources:** {list_refs(item.get('refs', []))}"
                )
    # C) RAG over uploaded docs → detailed synthesized answer
    if use_uploaded_docs and index_obj is not None and corpus:
        top = index_obj.query(q, k=6)
        snippets = []
        # NOTE(review): results are taken regardless of similarity score, so
        # near-zero matches can contribute snippets — consider a threshold.
        for i, sim in top:
            item = corpus[i]
            excerpt = (item["text"] or "").strip()
            if len(excerpt) > 700:
                excerpt = excerpt[:700] + "..."
            if excerpt:
                snippets.append(excerpt)
        if snippets:
            return _compose_rag_answer(q, snippets, topic)
    # D) Topic-aware fallback (short but relevant)
    subject = _extract_subject_from_question(q) if intent in {"how", "plan", "compare"} else q
    return _compose_definition_markdown(q, subject, topic)
# =========================
# Index Builder
# =========================
def build_index(files: List[Dict[str, Any]]):
    """Parse *files*, build a TinyTfidfIndex, and return UI-ready state.

    Returns a 4-tuple in the order the Build-Index click handler's outputs
    expect — (status_markdown, index, unused_matrix, corpus):
    the original returned (index, None, corpus, status), which put the index
    object into the status Markdown and None into the index state, so RAG
    never received a usable index.
    """
    if not files:
        return "No files uploaded yet.", None, None, []
    # Parse each file exactly once (previously parse_file() ran twice per
    # file: once in the filter condition and once for the kept value).
    parsed = [parse_file(f) for f in files]
    corpus = [c for c in parsed if c["text"]]
    if not corpus:
        return "No text extracted.", None, None, []
    tokenized = [tokenize(c["text"]) for c in corpus]
    idx = TinyTfidfIndex()
    idx.add_documents(tokenized)
    return f"Indexed {len(corpus)} docs, vocab {idx.voc_size}.", idx, None, corpus
# =========================
# Gradio UI
# =========================
# Gradio UI: left column uploads + indexing, right column Q&A.
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
    gr.Markdown(
        "## VMware On-Prem → Azure Local Migration Assistant\n"
        "- Upload documents (PDF/DOCX/TXT/MD)\n"
        "- Click **Build Index**\n"
        "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
    )
    with gr.Row():
        with gr.Column(scale=2):
            file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
            index_status = gr.Markdown("No index yet.")
            # Per-session state: TF-IDF index, (unused) matrix slot, parsed corpus.
            st_index = gr.State(); st_matrix = gr.State(); st_corpus = gr.State()
            build_btn = gr.Button("Build Index", variant="primary")
        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a question",
                placeholder="e.g., What is Azure SDN? • How do I minimize downtime for our AVS migration?"
            )
            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
            ask_btn = gr.Button("Ask", variant="primary")
            answer_box = gr.Markdown("")
    def _collect_files(paths: List[str]):
        """Read each uploaded filepath into a {'name', 'data', 'path'} record."""
        out = []
        for p in paths or []:
            try:
                with open(p, "rb") as fh:
                    data = fh.read()
                out.append({"name": os.path.basename(p), "data": data, "path": p})
            except Exception:
                # Unreadable paths are skipped silently (best-effort uploads).
                pass
        return out
    def _build(files_paths: List[str]):
        """Build-Index click handler: wraps build_index() over the uploaded paths."""
        files = _collect_files(files_paths)
        return build_index(files)
    # NOTE(review): verify build_index's return-value order matches this
    # outputs list — (status, index, matrix, corpus).
    build_btn.click(_build, inputs=[file_in], outputs=[index_status, st_index, st_matrix, st_corpus])
    ask_btn.click(
        answer_faq_or_approach_detailed,
        inputs=[question, use_docs, st_index, st_matrix, st_corpus],
        outputs=[answer_box]
    )
if __name__ == "__main__":
    # Presumably SPACE_ID / HF_SPACE_ID mark a Hugging Face Spaces host,
    # where a public share link is unnecessary — TODO confirm.
    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
    # Bind all interfaces; port from $PORT (default 7860, Gradio's default).
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)