# Hosted on Hugging Face Spaces (page-status banner removed from the source).
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| VMware On-Prem → Azure Local Migration Assistant (Gradio) | |
| - Upload design/migration docs (PDF/DOCX/TXT/MD). | |
| - Ask questions; get reliable, detailed, and relevant answers. | |
| - Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details. | |
| - No external APIs. No scikit-learn. | |
| Run locally: | |
| pip install gradio PyPDF2 python-docx | |
| python app.py | |
| """ | |
| import os | |
| import io | |
| import re | |
| import math | |
| from typing import List, Tuple, Dict, Any | |
| from collections import Counter | |
| import gradio as gr | |
| # ------------------------- | |
| # Optional parsers (graceful fallback) | |
| # ------------------------- | |
| try: | |
| import PyPDF2 | |
| except Exception: | |
| PyPDF2 = None | |
| try: | |
| import docx # python-docx | |
| except Exception: | |
| docx = None | |
| # ========================= | |
| # Trusted sources & FAQ seeds | |
| # ========================= | |
# Curated (display name, URL) pairs for documentation links.
# `list_refs` resolves names from this table, and `topic_refs` returns
# subsets of these names, so display names act as stable lookup keys.
TRUSTED_SOURCES: List[Tuple[str, str]] = [
    # Core guidance
    ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
    # Networking / SDN (used when question is about SDN)
    ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
    ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
    ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
    ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
    # Migration
    ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
    ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
    # DR
    ("Azure Site Recovery (ASR)", "https://learn.microsoft.com/azure/site-recovery/"),
    # Security
    ("Microsoft Defender for Cloud", "https://learn.microsoft.com/azure/defender-for-cloud/"),
    # Cost
    ("Azure Cost Management", "https://learn.microsoft.com/azure/cost-management-billing/"),
]
# Canned migration FAQs. Each entry carries:
#   "q"    - keyword phrase matched (by token overlap) against the user question
#   "a"    - the prepared answer text
#   "refs" - names of TRUSTED_SOURCES entries to cite alongside the answer
FAQ_SEEDS: List[Dict[str, Any]] = [
    {
        "q": "migrate vmware workloads minimal downtime",
        "a": (
            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
        ),
        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
    },
    {
        "q": "recommended migration sequence",
        "a": (
            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
        ),
        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
    },
    {
        "q": "dr and backups planning",
        "a": (
            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
        ),
        "refs": ["Azure Site Recovery (ASR)"],
    },
]
| # ========================= | |
| # Utilities | |
| # ========================= | |
# Word-like runs; the class keeps chars common in URLs and identifiers (. : / - _).
_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")


def tokenize(text: str) -> List[str]:
    """Return lower-cased word tokens of *text* (empty list for None/empty)."""
    return [token.lower() for token in _WORD_RE.findall(text or "")]
def list_refs(ref_names: List[str]) -> str:
    """Render the named TRUSTED_SOURCES entries as ' | '-separated markdown links.

    Names without a matching TRUSTED_SOURCES entry are silently skipped;
    an empty string is returned when nothing matches.
    """
    rendered = []
    for name in ref_names:
        url = next((u for title, u in TRUSTED_SOURCES if title == name), None)
        if url is not None:
            rendered.append(f"[{name}]({url})")
    return " | ".join(rendered) if rendered else ""
| # ========================= | |
| # Intent & topic detection | |
| # ========================= | |
# Leading-phrase / keyword patterns used to classify a question's intent.
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
_HOW_RE = re.compile(r"^\s*(how\s+do|how\s+to|how\s+does|how\s+can)\b", re.I)
_CMP_RE = re.compile(r"\b(vs\.?|versus|compare|difference|differ)\b", re.I)
_PLAN_RE = re.compile(r"\b(plan|approach|steps|roadmap|sequence|strategy)\b", re.I)


def detect_intent(q: str) -> str:
    """Classify *q* as define/compare/plan/how (first match wins) or 'general'."""
    for label, pattern in (
        ("define", _DEF_RE),
        ("compare", _CMP_RE),
        ("plan", _PLAN_RE),
        ("how", _HOW_RE),
    ):
        if pattern.search(q):
            return label
    return "general"
def detect_topic(q: str) -> str:
    """Map *q* to a coarse topic bucket by keyword overlap ('general' if none).

    Buckets are checked in a fixed priority order, so a question mentioning
    both SDN and migration keywords resolves to 'sdn'.
    """
    words = set(tokenize(q))
    keyword_buckets = (
        ("sdn", {"sdn", "software-defined", "softwaredefined"}),
        ("migration", {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"}),
        ("dr", {"dr", "disaster", "asr", "rto", "rpo", "failover"}),
        ("security", {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"}),
        ("cost", {"cost", "reservation", "savings", "rightsizing", "tagging"}),
    )
    for topic, keywords in keyword_buckets:
        if keywords & words:
            return topic
    return "general"
def topic_refs(topic: str) -> List[str]:
    """Return TRUSTED_SOURCES display names most relevant to *topic*.

    Unknown topics (including 'general') fall back to the CAF/WAF pair.
    """
    by_topic = {
        "sdn": ["Azure Virtual Network", "Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"],
        "migration": ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs", "Cloud Adoption Framework (CAF)"],
        "dr": ["Azure Site Recovery (ASR)", "Azure Well-Architected Framework (WAF)"],
        "security": ["Microsoft Defender for Cloud", "Azure Well-Architected Framework (WAF)"],
        "cost": ["Azure Cost Management", "Azure Well-Architected Framework (WAF)"],
    }
    default = ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
    return by_topic.get(topic, default)
| # ========================= | |
| # Tiny TF-IDF Index | |
| # ========================= | |
class TinyTfidfIndex:
    """Minimal TF-IDF index with cosine-similarity search (stdlib only).

    Documents are supplied pre-tokenized. ``add_documents`` computes smoothed
    IDF weights and caches one tf-idf vector per document, so ``query`` no
    longer re-derives every document's weights on each call (the original
    implementation duplicated the weighting logic inside ``query`` and
    recomputed it per query — O(total corpus tokens) every time).
    """

    def __init__(self):
        self.docs: List[List[str]] = []        # tokenized documents, as given
        self.df: Counter = Counter()           # document frequency per term
        self.idf: Dict[str, float] = {}        # smoothed inverse document frequency
        self.doc_norms: List[float] = []       # Euclidean norm of each doc vector
        self.voc_size = 0                      # vocabulary size after indexing
        self._doc_vecs: List[Dict[str, float]] = []  # cached per-doc tf-idf vectors

    def add_documents(self, tokenized_docs: List[List[str]]):
        """(Re)build the index from *tokenized_docs*, replacing prior state."""
        self.docs = tokenized_docs[:]
        self.df = Counter()
        for toks in self.docs:
            # set() so each doc counts a term at most once toward df.
            self.df.update(set(toks))
        n_docs = max(1, len(self.docs))
        # "+1" smoothing keeps IDF finite and strictly positive for seen terms.
        self.idf = {term: math.log((n_docs + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
        self.voc_size = len(self.idf)
        # Precompute each document's tf-idf vector and its norm exactly once.
        self._doc_vecs = [self._vec(toks) for toks in self.docs]
        self.doc_norms = [math.sqrt(sum(w * w for w in vec.values())) for vec in self._doc_vecs]

    def _vec(self, toks: List[str]) -> Dict[str, float]:
        """Length-normalized tf-idf vector for *toks*; unknown terms dropped."""
        tf = Counter(toks)
        total = max(1, len(toks))
        v = {}
        for term, cnt in tf.items():
            idf = self.idf.get(term)
            if idf is None:
                continue
            v[term] = (cnt / total) * idf
        return v

    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
        """Return the top-*k* (doc_index, cosine_similarity) pairs for *text*.

        Empty index returns []. Norms are floored at 1e-9 to avoid division
        by zero for degenerate (all-unknown-token) vectors.
        """
        if not self.docs:
            return []
        qv = self._vec(tokenize(text))
        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
        sims: List[Tuple[int, float]] = []
        for i, dv in enumerate(self._doc_vecs):
            num = sum(qv[term] * dv[term] for term in qv if term in dv)
            denom = (self.doc_norms[i] or 1e-9) * q_norm
            sims.append((i, num / denom))
        sims.sort(key=lambda x: x[1], reverse=True)
        return sims[:k]
| # ========================= | |
| # File Parsing | |
| # ========================= | |
def read_pdf_bytes(b: bytes) -> str:
    """Extract text from PDF bytes; '' if PyPDF2 is unavailable or parsing fails."""
    if not PyPDF2:
        return ""
    try:
        pages = PyPDF2.PdfReader(io.BytesIO(b)).pages
        # extract_text() may return None for image-only pages; substitute "".
        return "\n".join(page.extract_text() or "" for page in pages)
    except Exception:
        # Corrupt/encrypted PDFs degrade to "no text" rather than crashing.
        return ""
def read_docx_bytes(b: bytes) -> str:
    """Extract paragraph text from DOCX bytes; '' if python-docx is unavailable or parsing fails."""
    if not docx:
        return ""
    try:
        document = docx.Document(io.BytesIO(b))
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception:
        # Malformed files degrade to "no text" rather than crashing.
        return ""
def read_text_bytes(b: bytes) -> str:
    """Decode *b* trying utf-8, then utf-16, then latin-1.

    latin-1 maps every byte, so the loop effectively always returns; the
    trailing '' is a defensive fallback.
    """
    for encoding in ("utf-8", "utf-16", "latin-1"):
        try:
            return b.decode(encoding)
        except Exception:
            continue
    return ""
def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
    """Read one uploaded-file dict into ``{"file": basename, "text": extracted}``.

    Accepts in-memory bytes under 'data', or falls back to reading the
    filesystem 'path'. The parser is chosen from the filename extension
    (PDF, DOCX/DOC, otherwise plain text); missing data yields empty text.
    """
    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
    data = file_obj.get("data")
    if data is None:
        path = file_obj.get("path")
        if path and os.path.exists(path):
            with open(path, "rb") as fh:
                data = fh.read()
    if data is None:
        return {"file": name, "text": ""}
    lowered = name.lower()
    if lowered.endswith(".pdf"):
        reader = read_pdf_bytes
    elif lowered.endswith((".docx", ".doc")):
        reader = read_docx_bytes
    else:
        reader = read_text_bytes
    return {"file": os.path.basename(name), "text": reader(data) or ""}
| # ========================= | |
| # Strong definition composer (for “what is …”) | |
| # ========================= | |
| _DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", re.I) | |
| def _extract_subject_from_question(q: str) -> str: | |
| s = _DEF_RE_LEAD.sub("", q).strip() | |
| s = re.sub(r"[?.!]+$", "", s).strip() | |
| s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I) | |
| return s if s else "the topic" | |
def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """Build the content pieces for a definition-style answer.

    Returns a 6-tuple:
        (definition, capabilities[], how[], best_practices[], use_cases[], refs_list)

    The SDN topic gets a hand-written, specific definition; every other
    subject receives a generic but detailed scaffold with the subject name
    interpolated. ``refs_list`` contains TRUSTED_SOURCES display names.
    """
    # SDN-specific, as per your example (paraphrased, not reused verbatim for all topics)
    if topic == "sdn" or "sdn" in subject.lower():
        definition = (
            f"{subject} is Microsoft's implementation of software-defined networking: "
            "a model that shifts network control into software so you can centrally design, automate, "
            "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
            "By separating the control plane from underlying hardware, it enables programmability and "
            "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
            "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
        )
        capabilities = [
            "Programmatic creation of VNets, subnets, routing, and address spaces.",
            "Micro-segmentation and policy enforcement for east–west traffic.",
            "Software load balancing and gateway services for app connectivity.",
            "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
        ]
        how = [
            "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
            "Agents/controllers translate intent into concrete configuration on each host.",
            "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
        ]
        best = [
            "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
            "Apply least-privilege and RBAC; review segmentation policies regularly.",
            "Integrate with logging/monitoring; alert on drift and policy violations.",
        ]
        uses = [
            "Rapidly provisioning isolated app environments and tiers.",
            "Zero-trust segmentation between workloads and environments.",
            "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
        ]
        refs_list = topic_refs("sdn")
        return definition, capabilities, how, best, uses, refs_list
    # Generic detailed definition for other subjects
    sub = subject.strip()
    definition = (
        f"{sub} is a service/technology that centralizes control through software and policy so teams can "
        f"create, operate, and secure resources consistently across environments."
    )
    capabilities = [
        "Automation and policy-driven configuration to reduce manual effort and errors.",
        "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
        "Observability hooks (logs/metrics) for reliability and performance tuning.",
    ]
    how = [
        "A control plane captures intent (configuration/policies) and applies it to managed resources.",
        "Providers/agents on the platform translate intent into changes at runtime.",
        "Feedback loops via telemetry inform continuous improvement.",
    ]
    best = [
        "Adopt Infrastructure-as-Code and peer reviews for change control.",
        "Define tagging, RBAC roles, and policy baselines early.",
        "Pilot in a non-prod environment before broad rollout.",
    ]
    uses = [
        "Faster, repeatable environment provisioning.",
        "Improved security posture through standardized controls.",
        "Hybrid scenarios requiring consistent management across sites.",
    ]
    # References are re-derived from the subject text, not the caller's topic,
    # so a subject like "HCX cutover" still links migration sources.
    refs_list = topic_refs(detect_topic(sub))
    return definition, capabilities, how, best, uses, refs_list
def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """Assemble the full markdown answer for a definition-style question.

    Sections: definition, key capabilities, how it works, best practices,
    common use cases, then trusted-source links.
    """
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    refs = list_refs(refs_list)
    lines = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
    ]
    for title, items in (
        ("Key capabilities", capabilities),
        ("How it works", how),
        ("Best practices", best),
        ("Common use cases", uses),
    ):
        lines.append("")
        lines.append(f"**{title}:**")
        lines.extend(f"- {item}" for item in items)
    lines += ["", f"**Trusted sources:** {refs}"]
    return "\n".join(lines)
| # ========================= | |
| # RAG: build a detailed answer from uploaded docs | |
| # ========================= | |
| def _extract_points(text: str, max_points: int = 6) -> List[str]: | |
| parts = re.split(r"(?<=[.!?])\s+", (text or "").strip()) | |
| pts = [] | |
| for p in parts: | |
| p = p.strip() | |
| if 40 <= len(p) <= 280 and p not in pts: | |
| pts.append(p) | |
| if len(pts) >= max_points: | |
| break | |
| return pts | |
def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """Synthesize a markdown answer from document *snippets*.

    Builds an executive summary from up to six salient sentences, appends a
    topic-specific checklist, and closes with trusted-source links.
    """
    combined = " ".join(snippets)
    points = _extract_points(combined, max_points=6)
    refs = list_refs(topic_refs(topic))
    md = ["### Answer (detailed)", f"**Your question:** {query}", ""]
    if points:
        md += ["**Executive summary:**"] + [f"- {p}" for p in points]
    else:
        # No usable sentences extracted; fall back to a generic lead-in bullet.
        md += ["**Executive summary:**", "- Here are key considerations synthesized from your documents."]
    # Add a short topic-aware checklist
    checklist = {
        "sdn": [
            "Define VNets/subnets and segmentation policy.",
            "Automate with IaC (Bicep/Terraform) and GitOps.",
            "Harden east–west traffic with micro-segmentation.",
            "Plan ingress/egress with LBs and gateways."
        ],
        "migration": [
            "Establish landing zone (Policy, RBAC, logging).",
            "Connect networks (ER/VPN), validate DNS/MTU.",
            "Discover/assess with Azure Migrate; pilot a few VMs.",
            "Choose HCX or Azure Migrate for cutover; migrate in waves."
        ],
        "dr": [
            "Define RTO/RPO; choose replication targets.",
            "Run planned/unplanned failover drills.",
            "Ensure immutable backups and soft-delete."
        ],
        "security": [
            "Enable RBAC/PIM/MFA and Key Vault.",
            "Turn on Defender for Cloud; set policies and alerts.",
            "Collect logs; restrict lateral movement."
        ],
        "cost": [
            "Right-size; use Reservations/Savings Plans.",
            "Tag resources; set budgets/alerts.",
            "Automate non-prod shutdowns."
        ],
        "general": [
            "Clarify objectives and constraints.",
            "Pilot changes; define rollback and verification."
        ]
    }.get(topic, ["Clarify objectives and constraints.", "Pilot changes; define rollback and verification."])
    md += ["", "**Recommended steps:**"] + [f"- {s}" for s in checklist]
    md += ["", f"**Trusted sources:** {refs}"]
    return "\n".join(md)
| # ========================= | |
| # Main Answer Function | |
| # ========================= | |
def answer_faq_or_approach_detailed(question: str, use_uploaded_docs: bool, index_obj: Any, _matrix_unused: Any, corpus: List[Dict[str,str]]) -> str:
    """Route a question to the best answering strategy and return markdown.

    Strategies, tried in priority order:
      A) subject-specific definition for "what is ..." questions;
      B) curated FAQ seeds, only for migration-flavored questions;
      C) RAG synthesis over the uploaded-document TF-IDF index;
      D) topic-aware definition scaffold as a final fallback.

    ``_matrix_unused`` is kept only for signature compatibility with the
    Gradio click wiring (it receives the unused st_matrix state).
    """
    q = (question or "").strip()
    if not q:
        return "Please enter a question."
    intent = detect_intent(q)
    topic = detect_topic(q)
    # A) Definitions: build a strong, subject-specific definition (e.g., "What is Azure SDN?")
    if intent == "define":
        subject = _extract_subject_from_question(q)
        return _compose_definition_markdown(q, subject, topic)
    # B) Migration FAQs (only if the question is migration-like to avoid hijacking)
    q_tokens = set(tokenize(q))
    if {"migrate", "migration", "hcx", "avs"} & q_tokens:
        for item in FAQ_SEEDS:
            seed_tokens = set(tokenize(item["q"]))
            # Fire only when at least half of a seed's keywords appear in the question.
            if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
                return (
                    "### Answer (detailed)\n"
                    f"{item['a']}\n\n"
                    f"**Trusted sources:** {list_refs(item.get('refs', []))}"
                )
    # C) RAG over uploaded docs → detailed synthesized answer
    if use_uploaded_docs and index_obj is not None and corpus:
        top = index_obj.query(q, k=6)
        snippets = []
        for i, sim in top:  # sim (similarity score) is intentionally unused here
            item = corpus[i]
            excerpt = (item["text"] or "").strip()
            if len(excerpt) > 700:
                # Cap each excerpt so the synthesized summary stays focused.
                excerpt = excerpt[:700] + "..."
            if excerpt:
                snippets.append(excerpt)
        if snippets:
            return _compose_rag_answer(q, snippets, topic)
    # D) Topic-aware fallback (short but relevant)
    subject = _extract_subject_from_question(q) if intent in {"how", "plan", "compare"} else q
    return _compose_definition_markdown(q, subject, topic)
| # ========================= | |
| # Index Builder | |
| # ========================= | |
def build_index(files: List[Dict[str, Any]]):
    """Parse uploaded files and build the TF-IDF index.

    Returns ``(index, matrix_placeholder, corpus, status_message)``. The
    second element is always None and is kept only so existing callers that
    unpack a 4-tuple keep working.

    BUG FIX: the previous version evaluated ``parse_file(f)`` twice per file
    (once in the filter, once to keep the result), parsing every PDF/DOCX
    twice; each file is now parsed exactly once.
    """
    if not files:
        return None, None, [], "No files uploaded yet."
    parsed = [parse_file(f) for f in files]
    corpus = [doc for doc in parsed if doc["text"]]
    if not corpus:
        return None, None, [], "No text extracted."
    tokenized = [tokenize(doc["text"]) for doc in corpus]
    idx = TinyTfidfIndex()
    idx.add_documents(tokenized)
    return idx, None, corpus, f"Indexed {len(corpus)} docs, vocab {idx.voc_size}."
| # ========================= | |
| # Gradio UI | |
| # ========================= | |
# Gradio UI wiring. BUG FIX in _build below: build_index returns
# (idx, matrix, corpus, status) but the click outputs are
# [index_status, st_index, st_matrix, st_corpus]; the old code returned the
# tuple unreordered, so the status Markdown displayed the index object,
# st_index was set to None, and st_corpus received the status string —
# silently disabling RAG answers.
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
    gr.Markdown(
        "## VMware On-Prem → Azure Local Migration Assistant\n"
        "- Upload documents (PDF/DOCX/TXT/MD)\n"
        "- Click **Build Index**\n"
        "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
    )
    with gr.Row():
        with gr.Column(scale=2):
            file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
            index_status = gr.Markdown("No index yet.")
            # Session state: index object, unused matrix placeholder, parsed corpus.
            st_index = gr.State()
            st_matrix = gr.State()
            st_corpus = gr.State()
            build_btn = gr.Button("Build Index", variant="primary")
        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a question",
                placeholder="e.g., What is Azure SDN? • How do I minimize downtime for our AVS migration?"
            )
            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
            ask_btn = gr.Button("Ask", variant="primary")
            answer_box = gr.Markdown("")

    def _collect_files(paths: List[str]):
        """Load each selected filepath into the dict shape parse_file expects."""
        out = []
        for p in paths or []:
            try:
                with open(p, "rb") as fh:
                    data = fh.read()
                out.append({"name": os.path.basename(p), "data": data, "path": p})
            except Exception:
                # Skip unreadable paths; the remaining files still get indexed.
                pass
        return out

    def _build(files_paths: List[str]):
        """Build the index and return values in the click-outputs order."""
        idx, matrix, corpus, status = build_index(_collect_files(files_paths))
        # Reorder to match outputs=[index_status, st_index, st_matrix, st_corpus].
        return status, idx, matrix, corpus

    build_btn.click(_build, inputs=[file_in], outputs=[index_status, st_index, st_matrix, st_corpus])
    ask_btn.click(
        answer_faq_or_approach_detailed,
        inputs=[question, use_docs, st_index, st_matrix, st_corpus],
        outputs=[answer_box]
    )
if __name__ == "__main__":
    # Heuristic: these env vars are set when running inside a Hugging Face Space.
    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
    # Bind all interfaces; honor a PORT override; request a public share link
    # only when running locally (Spaces provides its own public URL).
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)