Spaces:

ajayinsac
/

VMware2AzureLocal

Sleeping

App Files Files Community

ajayinsac committed on Sep 11, 2025

Commit

bdd12dc

verified ·

1 Parent(s): c76f040

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -184

app.py CHANGED Viewed

@@ -3,10 +3,12 @@
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
-- Works on Hugging Face Spaces (no external API calls, no sklearn).
 - Upload design/migration docs (PDF/DOCX/TXT/MD).
-- Ask questions; get reliable, detailed answers with excerpts + trusted refs.
 Run locally:
   pip install gradio PyPDF2 python-docx
   python app.py
@@ -16,14 +18,16 @@ import os
 import io
 import re
 import math
-from typing import List, Tuple, Dict, Any
 from collections import Counter, defaultdict
 import gradio as gr
-# Optional parsers (gracefully degrade if not installed on Spaces)
 try:
-    import PyPDF2  # lightweight; often available on Spaces
 except Exception:
     PyPDF2 = None
@@ -38,6 +42,9 @@ except Exception:
 # =========================
 TRUSTED_SOURCES: List[Tuple[str, str]] = [
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
@@ -82,9 +89,7 @@ FAQ_SEEDS: List[Dict[str, Any]] = [
 _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
 def tokenize(text: str) -> List[str]:
-    if not text:
-        return []
-    return [t.lower() for t in _WORD_RE.findall(text)]
 def list_refs(ref_names: List[str]) -> str:
     links = []
@@ -96,7 +101,7 @@ def list_refs(ref_names: List[str]) -> str:
 # =========================
-# Tiny TF-IDF implementation (no sklearn)
 # =========================
 class TinyTfidfIndex:
@@ -109,14 +114,12 @@ class TinyTfidfIndex:
     def add_documents(self, tokenized_docs: List[List[str]]):
         self.docs = tokenized_docs[:]
-        # document frequency
         self.df = Counter()
         for toks in self.docs:
             self.df.update(set(toks))
         N = max(1, len(self.docs))
         self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
         self.voc_size = len(self.idf)
-        # precompute norms
         self.doc_norms = []
         for toks in self.docs:
             tf = Counter(toks)
@@ -137,7 +140,7 @@ class TinyTfidfIndex:
             v[term] = (cnt / total) * idf
         return v
-    def query(self, text: str, k: int = 6) -> List[Tuple[int, float]]:
         if not self.docs:
             return []
         qv = self._vec(tokenize(text))
@@ -157,52 +160,16 @@ class TinyTfidfIndex:
 # =========================
-# Scoring rubric to tailor the detailed output
 # =========================
 CHECKS = [
-    {
-        "id": "landing_zone",
-        "desc": "Landing zone defined (hub/spoke, Policy, RBAC, logging).",
-        "fix": "Use CAF blueprints; enforce Policy for guardrails and RBAC.",
-        "keywords": ["landing", "hub", "spoke", "policy", "rbac", "log", "monitor"],
-        "pillar": "governance",
-    },
-    {
-        "id": "connectivity",
-        "desc": "Connectivity planned (ExpressRoute/VPN), DNS, MTU validated for HCX.",
-        "fix": "Verify ER/VPN, DNS resolution, and HCX MTU/mobility settings.",
-        "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx", "connectivity"],
-        "pillar": "networking",
-    },
-    {
-        "id": "migrate_tooling",
-        "desc": "Discovery/assessment and tooling chosen (Azure Migrate or HCX).",
-        "fix": "Run Azure Migrate discovery; select HCX or Azure Migrate per downtime.",
-        "keywords": ["azure", "migrate", "discovery", "assessment", "hcx", "replication"],
-        "pillar": "operations",
-    },
-    {
-        "id": "security",
-        "desc": "Security/identity configured (Key Vault, Defender, Sentinel, PIM/MFA).",
-        "fix": "Centralize secrets in Key Vault; enable Defender/Sentinel; enforce PIM/MFA.",
-        "keywords": ["key", "vault", "defender", "sentinel", "pim", "mfa", "entra", "aad", "identity"],
-        "pillar": "security",
-    },
-    {
-        "id": "dr_backup",
-        "desc": "Backups, DR, RTO/RPO defined; ASR drills planned.",
-        "fix": "Set RTO/RPO; immutability & soft-delete; test ASR failover/failback.",
-        "keywords": ["backup", "rto", "rpo", "dr", "asr", "failover", "restore"],
-        "pillar": "reliability",
-    },
-    {
-        "id": "cost",
-        "desc": "Cost optimization plan (right-sizing, reservations, tagging).",
-        "fix": "Use reservations/Savings Plans, rightsizing, and enforce tags.",
-        "keywords": ["cost", "reservation", "savings", "right", "tag"],
-        "pillar": "cost",
-    },
 ]
 def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
@@ -230,7 +197,119 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
 # =========================
-# File parsing
 # =========================
 def read_pdf_bytes(b: bytes) -> str:
@@ -238,13 +317,7 @@ def read_pdf_bytes(b: bytes) -> str:
         return ""
     try:
         reader = PyPDF2.PdfReader(io.BytesIO(b))
-        out = []
-        for page in reader.pages:
-            try:
-                out.append(page.extract_text() or "")
-            except Exception:
-                pass
-        return "\n".join(out)
     except Exception:
         return ""
@@ -267,7 +340,6 @@ def read_text_bytes(b: bytes) -> str:
     return ""
 def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
-    """Returns {"file": <name>, "text": <extracted_text>}"""
     name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
     data = file_obj.get("data")
     if data is None:
@@ -277,72 +349,38 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
                 data = fh.read()
     if data is None:
         return {"file": name, "text": ""}
     low = name.lower()
     if low.endswith(".pdf"):
         text = read_pdf_bytes(data)
     elif low.endswith((".docx", ".doc")):
         text = read_docx_bytes(data)
-    elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
-        text = read_text_bytes(data)
     else:
         text = read_text_bytes(data)
     return {"file": os.path.basename(name), "text": text or ""}
 # =========================
-# Detailed Answer Composer
 # =========================
 def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
-    collected = [s.get("excerpt", "") for s in snippets]
-    combined = "\n\n".join(collected)
-    scores, gaps = score_text_against_checks(combined) if combined.strip() else ({"overall": 0.0}, [])
-    def _mk_gaps(glist, limit=8):
-        out = []
-        for g in glist[:limit]:
-            out.append(f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_")
-        return "\n".join(out) if out else "- No major issues detected in the sampled excerpts."
-    refs = list_refs([
-        "Azure VMware Solution (AVS)",
-        "Azure Migrate",
-        "Cloud Adoption Framework (CAF)",
-        "Azure Well-Architected Framework (WAF)",
-        "VMware HCX Docs",
-    ])
-    pillar_lines = []
-    for k_, v_ in scores.items():
-        if k_ == "overall":
-            continue
-        pillar_lines.append(f"- **{k_.capitalize()}**: {v_}")
-    pillar_md = "\n".join(pillar_lines) if pillar_lines else "- (no signals)"
-    md = (
         f"### Answer (detailed)\n"
         f"**Your question:** {query}\n\n"
-        f"**TL;DR:** Here’s a concrete plan across landing zone, connectivity, migration method, security, DR, and cost. "
-        f"Address the highest-risk gaps first.\n\n"
-        f"#### Step-by-step plan\n"
-        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
-        "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
-        "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
-        "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
-        "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
-        "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
-        "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
-        f"#### What your documents emphasize (auto-scored)\n"
-        f"**Overall score:** {scores.get('overall', 0)} / 5.0\n\n"
-        f"**Per-pillar signals:**\n{pillar_md}\n\n"
-        f"#### Gaps & quick fixes\n{_mk_gaps(gaps, limit=8)}\n\n"
-        f"#### Supporting excerpts\n"
     )
     for s in snippets:
-        md += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
-    md += f"**Trusted sources:** {refs}"
-    return md
 # =========================
@@ -354,33 +392,27 @@ def answer_faq_or_approach_detailed(
     use_uploaded_docs: bool,
     index_obj: Any,
     _matrix_unused: Any,
-    corpus: List[Dict[str, str]],
 ) -> str:
     q = (question or "").strip()
     if not q:
         return "Please enter a question."
-    # 1) Seeded FAQs → detailed plan (looser match to trigger more often)
     q_tokens = set(tokenize(q))
     for item in FAQ_SEEDS:
         seed_tokens = set(tokenize(item["q"]))
-        overlap = len(seed_tokens & q_tokens)
-        if overlap >= max(1, len(seed_tokens) // 2):  # >=50% overlap
-            refs = list_refs(item.get("refs", []))
-            base = (
-                f"### Answer (detailed)\n"
                 f"{item['a']}\n\n"
-                "#### Step-by-step plan\n"
-                "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
-                "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
-                "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
-                "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
-                "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
-                "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
-                "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
-                f"**Trusted sources:** {refs}"
             )
-            return base
     # 2) Use uploaded docs (RAG) → detailed synthesized answer
     if use_uploaded_docs and index_obj is not None and corpus:
@@ -394,60 +426,43 @@ def answer_faq_or_approach_detailed(
             snippets.append({
                 "file": item["file"],
                 "relevance": float(sim),
-                "excerpt": excerpt,
             })
         if snippets:
             return _compose_detailed_from_snippets(q, snippets)
-    # 3) Fallback (no docs) → generic detailed plan with citations
-    refs = list_refs([
-        "Azure VMware Solution (AVS)",
-        "Azure Migrate",
-        "Cloud Adoption Framework (CAF)",
-        "Azure Well-Architected Framework (WAF)",
-        "VMware HCX Docs",
-    ])
-    generic = (
         "### Answer (detailed)\n"
-        "**TL;DR:** Use AVS/HCX or Azure Migrate depending on downtime needs; build landing zone and connectivity first, "
-        "then migrate in waves with rollback and DR drills.\n\n"
-        "#### Step-by-step plan\n"
-        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
-        "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
-        "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
-        "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
-        "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
-        "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
-        "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
         f"**Trusted sources:** {refs}"
     )
-    return generic
 # =========================
-# Build index from uploaded files
 # =========================
-def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
-    """Returns: (index_obj, matrix_placeholder, corpus, status_message)"""
     if not files:
         return None, None, [], "No files uploaded yet."
     corpus: List[Dict[str, str]] = []
     for f in files:
         rec = parse_file(f)
         if rec["text"]:
             corpus.append(rec)
     if not corpus:
-        return None, None, [], "Uploaded files could not be parsed (no text extracted)."
     tokenized = [tokenize(c["text"]) for c in corpus]
     idx = TinyTfidfIndex()
     idx.add_documents(tokenized)
-    status = f"Indexed {len(corpus)} document(s). Vocabulary size ≈ {idx.voc_size}."
-    return idx, None, corpus, status
 # =========================
@@ -457,38 +472,26 @@ def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, s
 with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
     gr.Markdown(
         "## VMware On-Prem → Azure Local Migration Assistant\n"
-        "- Upload your **design/migration documents** (PDF, DOCX, TXT, MD)\n"
-        "- Ask questions. Toggle **Use uploaded docs** for RAG-based answers\n"
-        "- Answers are **detailed** by default, with structured steps and trusted references\n"
     )
     with gr.Row():
         with gr.Column(scale=2):
-            file_in = gr.Files(
-                label="Upload documents (PDF/DOCX/TXT/MD)",
-                file_count="multiple",
-                type="filepath"  # we will open paths ourselves
-            )
             index_status = gr.Markdown("No index yet.")
-            # Hidden/State to hold in-memory data
             st_index = gr.State()
-            st_matrix = gr.State()   # placeholder for API compatibility
             st_corpus = gr.State()
             build_btn = gr.Button("Build Index", variant="primary")
         with gr.Column(scale=3):
-            question = gr.Textbox(
-                label="Ask a question",
-                placeholder="e.g., How do I minimize downtime for our VMware migration?",
-                lines=3
-            )
             use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
             ask_btn = gr.Button("Ask", variant="primary")
             answer_box = gr.Markdown("")
-    # Convert gr.Files (paths) into dicts our parser expects
-    def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
         out = []
         for p in paths or []:
             try:
@@ -501,19 +504,18 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
     def _build(files_paths: List[str]):
         files = _collect_files(files_paths)
-        idx, mat, corpus, status = build_index(files)
-        return status, idx, mat, corpus
     build_btn.click(
         _build,
         inputs=[file_in],
-        outputs=[index_status, st_index, st_matrix, st_corpus],
     )
     ask_btn.click(
         answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
-        outputs=[answer_box],
     )
 if __name__ == "__main__":

 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
+- No external API calls. No scikit-learn.
 - Upload design/migration docs (PDF/DOCX/TXT/MD).
+- Ask questions; get RELIABLE, DETAILED answers:
+  • Concept KB (for definitions like “What is Azure Arc-enabled SDN?”)
+  • RAG on uploaded docs (excerpts + gaps/fixes)
+  • Seeded FAQs (migration flows)
 Run locally:
   pip install gradio PyPDF2 python-docx
   python app.py
 import io
 import re
 import math
+from typing import List, Tuple, Dict, Any, Optional
 from collections import Counter, defaultdict
 import gradio as gr
+# -------------------------
+# Optional parsers (graceful fallback)
+# -------------------------
 try:
+    import PyPDF2  # often present on Spaces
 except Exception:
     PyPDF2 = None
 # =========================
 TRUSTED_SOURCES: List[Tuple[str, str]] = [
+    ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
+    ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
+    ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
 _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
 def tokenize(text: str) -> List[str]:
+    return [t.lower() for t in _WORD_RE.findall(text or "")]
 def list_refs(ref_names: List[str]) -> str:
     links = []
 # =========================
+# Tiny TF-IDF Index (no sklearn)
 # =========================
 class TinyTfidfIndex:
     def add_documents(self, tokenized_docs: List[List[str]]):
         self.docs = tokenized_docs[:]
         self.df = Counter()
         for toks in self.docs:
             self.df.update(set(toks))
         N = max(1, len(self.docs))
         self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
         self.voc_size = len(self.idf)
         self.doc_norms = []
         for toks in self.docs:
             tf = Counter(toks)
             v[term] = (cnt / total) * idf
         return v
+    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
         if not self.docs:
             return []
         qv = self._vec(tokenize(text))
 # =========================
+# Rubric for RAG-tailoring
 # =========================
+# Rubric table: one dict per check. Fields:
+#   id       - short identifier of the check
+#   desc     - what the check expects to find
+#   fix      - suggested remediation text
+#   keywords - lowercase terms associated with the check
+#   pillar   - category name the check belongs to
+# NOTE(review): presumably consumed by score_text_against_checks (its body is not visible here) - confirm.
 CHECKS = [
+    {"id": "landing_zone",  "desc": "Landing zone defined.",              "fix": "Use CAF blueprints.",                                     "keywords": ["landing", "hub", "spoke", "policy", "rbac"], "pillar": "governance"},
+    {"id": "connectivity",  "desc": "Connectivity planned.",              "fix": "Verify ER/VPN, DNS, MTU.",                                "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx"],   "pillar": "networking"},
+    {"id": "migrate_tooling","desc": "Tooling chosen.",                   "fix": "Run Azure Migrate discovery.",                            "keywords": ["migrate", "discovery", "assessment", "hcx"],  "pillar": "operations"},
+    {"id": "security",      "desc": "Security configured.",               "fix": "Enable Key Vault, Defender, Sentinel, MFA.",             "keywords": ["vault", "defender", "sentinel", "mfa", "identity"], "pillar": "security"},
+    {"id": "dr_backup",     "desc": "Backups/DR defined.",                "fix": "Set RTO/RPO; test ASR.",                                 "keywords": ["backup", "rto", "rpo", "dr", "asr"],          "pillar": "reliability"},
+    {"id": "cost",          "desc": "Cost optimization.",                 "fix": "Use reservations, rightsizing, tags.",                   "keywords": ["cost", "reservation", "savings", "tag"],      "pillar": "cost"},
 ]
 def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
 # =========================
+# Built-in Concept KB (for definitional questions)
+# =========================
+class Concept:
+    def __init__(self, name: str, aliases: List[str], builder):
+        self.name = name
+        self.aliases = [tokenize(a) for a in aliases]
+        self.builder = builder  # function(query:str)->str
+def _kb_ans_azure_sdn(_: str) -> str:
+    """Return the fixed Markdown answer describing Azure SDN.
+    The single argument is ignored; the text is the same for every query.
+    The answer ends with a "Trusted sources" line built by ``list_refs``.
+    """
+    refs = list_refs(["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"])
+    return (
+        "### Azure SDN — What it is and why it matters\n"
+        "**Definition:** Azure SDN is Microsoft's software-defined networking stack that centralizes network control in software, "
+        "decoupling policy and management from physical hardware. It lets you programmatically create and secure virtual networks, "
+        "subnets, microsegmentation (ACL/NSG-like policies), load balancers and gateways across Azure and Azure Local (Azure Stack HCI) environments.\n\n"
+        "**Key capabilities**\n"
+        "- Central, policy-driven control plane for virtual networking resources.\n"
+        "- Automation & GitOps-friendly configuration for repeatable environments.\n"
+        "- Microsegmentation and traffic filtering for east–west security.\n"
+        "- Software load balancing and gateway services for app connectivity.\n"
+        "- Consistent constructs across cloud and on-prem (with Azure Local).\n\n"
+        "**How it works (high level)**\n"
+        "- A software control plane programs host virtual switches and network functions.\n"
+        "- Network intent (VNets, subnets, policies) is applied consistently across hosts.\n"
+        "- Integrates with Azure identity/management for RBAC and governance.\n\n"
+        "**Common use cases**\n"
+        "- Rapidly provisioning isolated app environments.\n"
+        "- Enforcing zero-trust style segmentation between tiers.\n"
+        "- Hybrid apps spanning Azure and Azure Local.\n\n"
+        f"**Trusted sources:** {refs}"
+    )
+def _kb_ans_arc_enabled_sdn(_: str) -> str:
+    """Return the fixed Markdown answer describing Azure Arc-enabled SDN.
+    The single argument is ignored; the text is the same for every query.
+    The answer ends with a "Trusted sources" line built by ``list_refs``.
+    """
+    refs = list_refs(["Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"])
+    return (
+        "### Azure Arc-enabled SDN — Definition & details\n"
+        "**Definition:** Azure Arc-enabled SDN brings Azure's software-defined networking to on-premises Azure Local (Azure Stack HCI) clusters, "
+        "managed through Azure Arc. It decouples network control from hardware so you can centrally define, automate, and secure "
+        "virtual networks, subnets, and policies in your datacenter using Azure-consistent tools.\n\n"
+        "**Why it matters**\n"
+        "- Gives you Azure-like VNet constructs and policy management on-prem.\n"
+        "- Enables consistent security and segmentation across hybrid estates.\n"
+        "- Supports rapid, software-driven changes without touching physical fabric.\n\n"
+        "**Key capabilities**\n"
+        "- Create/modify on-prem VNets, subnets, and routing policies from Azure.\n"
+        "- Apply microsegmentation rules (policy/ACL-style) for east–west security.\n"
+        "- Software load balancing and gateway services for north–south/east–west flows.\n"
+        "- Integration with Azure RBAC, tagging, and governance for change control.\n\n"
+        "**Core components (conceptual)**\n"
+        "- **Arc resource bridge & agents** — connect your HCI cluster to Azure control.\n"
+        "- **SDN controller & host agents** — program the Hyper-V vSwitch and network functions.\n"
+        "- **Azure portal/CLI/GitOps** — define intent (VNets, subnets, policies) and deploy.\n\n"
+        "**Prerequisites (typical)**\n"
+        "- Azure Local (Azure Stack HCI) cluster connected to Azure Arc.\n"
+        "- Arc resource bridge onboarded; network requirements met.\n"
+        "- Appropriate RBAC roles to manage networking resources.\n\n"
+        "**Use cases**\n"
+        "- Host Azure-consistent app networks on-prem for data locality/regulatory needs.\n"
+        "- Hybrid deployments with identical network constructs across Azure and HCI.\n"
+        "- Rapid rollout of segmented networks for dev/test/prod without hardware changes.\n\n"
+        "**Notes & limitations (high level)**\n"
+        "- Physical underlay still matters (IP design, routing, bandwidth, HA).\n"
+        "- Feature parity with public Azure services may vary; validate per release.\n\n"
+        f"**Trusted sources:** {refs}"
+    )
+# Built-in concepts, in lookup order. Each entry pairs a set of alias phrases
+# with the builder function that produces the answer text for that concept.
+KB_CONCEPTS: List[Concept] = [
+    Concept(
+        name="azure sdn",
+        aliases=[
+            "azure sdn",
+            "software defined networking azure",
+            "sdn in azure",
+            "azure local sdn",
+            "azure stack hci sdn",
+        ],
+        builder=_kb_ans_azure_sdn,
+    ),
+    Concept(
+        name="azure arc enabled sdn",
+        aliases=[
+            "azure arc enabled sdn",
+            "azure arc-enabled sdn",
+            "arc enabled sdn",
+            "arc-enabled sdn",
+            "arc sdn",
+            "azure local arc sdn",
+            "azure stack hci arc sdn",
+        ],
+        builder=_kb_ans_arc_enabled_sdn,
+    ),
+]
+def lookup_concept(query: str) -> Optional[Concept]:
+    q_tokens = set(tokenize(query))
+    best: Optional[Concept] = None
+    best_score = 0.0
+    for c in KB_CONCEPTS:
+        for alias_tokens in c.aliases:
+            if not alias_tokens:
+                continue
+            overlap = len(q_tokens & set(alias_tokens))
+            score = overlap / float(len(set(alias_tokens)))
+            if score > best_score:
+                best_score = score
+                best = c
+    # threshold: intentional but tolerant
+    return best if best_score >= 0.5 else None
+# =========================
+# File Parsing
 # =========================
 def read_pdf_bytes(b: bytes) -> str:
         return ""
     try:
         reader = PyPDF2.PdfReader(io.BytesIO(b))
+        return "\n".join([page.extract_text() or "" for page in reader.pages])
     except Exception:
         return ""
     return ""
 def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
     name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
     data = file_obj.get("data")
     if data is None:
                 data = fh.read()
     if data is None:
         return {"file": name, "text": ""}
     low = name.lower()
     if low.endswith(".pdf"):
         text = read_pdf_bytes(data)
     elif low.endswith((".docx", ".doc")):
         text = read_docx_bytes(data)
     else:
         text = read_text_bytes(data)
     return {"file": os.path.basename(name), "text": text or ""}
 # =========================
+# Detailed Answer Composer (for RAG path)
 # =========================
 def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
+    combined = "\n\n".join([s.get("excerpt", "") for s in snippets])
+    scores, gaps = score_text_against_checks(combined)
+    def _mk_gaps(glist):
+        return "\n".join([f"- ({g['severity']}) {g['id']}: {g['fix']}" for g in glist]) or "- No major issues detected."
+    refs = list_refs([s[0] for s in TRUSTED_SOURCES])
+    details = (
         f"### Answer (detailed)\n"
         f"**Your question:** {query}\n\n"
+        f"**Summary:** Migration planning must cover landing zone, connectivity, tooling, security, DR, and cost.\n\n"
+        f"#### Scores\nOverall: {scores.get('overall', 0)}/5.0\n\n"
+        f"#### Gaps & Fixes\n{_mk_gaps(gaps)}\n\n"
+        f"#### Supporting Excerpts\n"
     )
     for s in snippets:
+        details += f"- {s['file']} (rel {s['relevance']:.2f}): {s['excerpt']}\n"
+    details += f"\n**Trusted sources:** {refs}"
+    return details
 # =========================
     use_uploaded_docs: bool,
     index_obj: Any,
     _matrix_unused: Any,
+    corpus: List[Dict[str, str]]
 ) -> str:
     q = (question or "").strip()
     if not q:
         return "Please enter a question."
+    # 0) Concept KB for definitional questions (e.g., "What is Azure Arc-enabled SDN?")
+    concept = lookup_concept(q)
+    if concept is not None:
+        return concept.builder(q)
+    # 1) Seeded FAQs → detailed plan when relevant (>=50% overlap with seed)
     q_tokens = set(tokenize(q))
     for item in FAQ_SEEDS:
         seed_tokens = set(tokenize(item["q"]))
+        if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
+            return (
+                "### Answer (detailed)\n"
                 f"{item['a']}\n\n"
+                f"**Trusted sources:** {list_refs(item.get('refs', []))}"
             )
     # 2) Use uploaded docs (RAG) → detailed synthesized answer
     if use_uploaded_docs and index_obj is not None and corpus:
             snippets.append({
                 "file": item["file"],
                 "relevance": float(sim),
+                "excerpt": excerpt
             })
         if snippets:
             return _compose_detailed_from_snippets(q, snippets)
+    # 3) Fallback (no docs) → generic, but structured overview (not migration-only)
+    refs = list_refs(["Azure Arc (overview)", "Azure Stack HCI (Azure Local)", "Azure SDN concepts (HCI)"])
+    return (
         "### Answer (detailed)\n"
+        "I couldn't match a specific concept or supporting excerpts, so here's a structured overview you can refine:\n\n"
+        "**Definition:** Describe what the service/feature is, what problems it solves, and where it runs (Azure / Azure Local).\n\n"
+        "**Key capabilities:** automation, policy-driven control, security segmentation, connectivity services.\n\n"
+        "**How it works:** control plane programs host/network functions; policies applied consistently; integrates with RBAC/governance.\n\n"
+        "**Prerequisites:** identity/RBAC, connectivity to Azure (for Arc), supported host/cluster versions.\n\n"
+        "**Use cases:** hybrid deployments, zero-trust segmentation, rapid environment provisioning.\n\n"
         f"**Trusted sources:** {refs}"
     )
 # =========================
+# Build Index
 # =========================
+def build_index(files: List[Dict[str, Any]]):
     if not files:
         return None, None, [], "No files uploaded yet."
     corpus: List[Dict[str, str]] = []
     for f in files:
         rec = parse_file(f)
         if rec["text"]:
             corpus.append(rec)
     if not corpus:
+        return None, None, [], "No text extracted."
     tokenized = [tokenize(c["text"]) for c in corpus]
     idx = TinyTfidfIndex()
     idx.add_documents(tokenized)
+    return idx, None, corpus, f"Indexed {len(corpus)} docs, vocab {idx.voc_size}."
 # =========================
 with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
     gr.Markdown(
         "## VMware On-Prem → Azure Local Migration Assistant\n"
+        "Upload documents and ask questions. Detailed answers will be provided."
     )
     with gr.Row():
         with gr.Column(scale=2):
+            file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
             index_status = gr.Markdown("No index yet.")
             st_index = gr.State()
+            st_matrix = gr.State()
             st_corpus = gr.State()
             build_btn = gr.Button("Build Index", variant="primary")
         with gr.Column(scale=3):
+            question = gr.Textbox(label="Ask a question", placeholder="e.g., What is Azure Arc-enabled SDN, and why would I use it?")
             use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
             ask_btn = gr.Button("Ask", variant="primary")
             answer_box = gr.Markdown("")
+    # Convert gr.Files (paths) to expected dicts
+    def _collect_files(paths: List[str]):
         out = []
         for p in paths or []:
             try:
     def _build(files_paths: List[str]):
         files = _collect_files(files_paths)
+        return build_index(files)
     build_btn.click(
         _build,
         inputs=[file_in],
+        outputs=[index_status, st_index, st_matrix, st_corpus]
     )
     ask_btn.click(
         answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
+        outputs=[answer_box]
     )
 if __name__ == "__main__":