ajayinsac committed
Commit f2aed85 · verified · 1 Parent(s): 32e167d

Update app.py

Files changed (1): app.py +396 -611
app.py CHANGED
@@ -1,22 +1,20 @@
 #!/usr/bin/env python3
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)

-Update: Removed scikit-learn dependency. Includes a minimal pure-Python TF-IDF
-and cosine similarity so it runs on Hugging Face Spaces without sklearn.
-
-Features
-- FAQ / approach Q&A with trusted-source citations (links)
-- Upload & index PDF/DOCX/TXT (session-local)
-- Lightweight RAG (pure-Python TF-IDF over chunks)
-- Design/Runbook auto-review with rubric (0–5) + gaps + fixes
-- All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
 """

 import os
 import io
 import re
-import json
 import math
 import time
 from typing import List, Tuple, Dict, Any
@@ -24,584 +22,342 @@ from collections import Counter, defaultdict

 import gradio as gr

-# -------- Optional, small footprint parsers --------
-# PDF
 try:
-    from pypdf import PdfReader
 except Exception:
-    PdfReader = None

-# DOCX
 try:
-    import docx
 except Exception:
     docx = None


 # =========================
-# Trusted Sources (Allowlist)
 # =========================
-TRUSTED_SOURCES = [
-    # Microsoft Learn / Docs
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
-    ("Azure Stack HCI / Azure Local", "https://learn.microsoft.com/azure-stack/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
-    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/well-architected/"),
-    # VMware
-    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/"),
-    ("VMware vSphere Docs", "https://docs.vmware.com/en/VMware-vSphere/index.html"),
-    # Security & Compliance
-    ("NIST SP 800-53", "https://csrc.nist.gov/publications/sp800-53"),
-    ("FedRAMP Baselines", "https://www.fedramp.gov/"),
-    ("IRS Publication 1075 (FTI)", "https://www.irs.gov/pub/irs-pdf/p1075.pdf"),
 ]

-# =========================
-# Ontology (Domains/Subdomains)
-# =========================
-ONTOLOGY = {
-    "Assessment": ["Inventory", "Dependencies", "Performance", "Criticality", "Readiness"],
-    "Architecture": ["Landing Zone", "Azure Local Footprint", "AVS", "Environments"],
-    "Networking": ["ExpressRoute", "VPN", "IP Plan", "DNS", "Load Balancing", "Private Link", "HCX Network"],
-    "Identity": ["Entra ID", "AD DS", "PIM", "MFA", "RBAC", "Break-Glass"],
-    "Migration": ["HCX", "Azure Migrate", "Cutover", "Rollback", "Data Sync"],
-    "Data": ["Storage", "Backup", "Snapshots", "Immutability", "Residency"],
-    "Security": ["Defender", "Sentinel", "Policy", "Purview", "Key Vault"],
-    "DR": ["ASR", "Failover", "RTO/RPO", "Runbooks", "Tests"],
-    "Ops": ["Monitor", "Log Analytics", "Patching", "Change Mgmt", "ITIL"],
-    "Cost": ["Right-Sizing", "Reservations", "Tagging", "Budgets"],
-    "Program": ["RAID", "Comms", "Training", "RACI", "Gates"],
-    "Troubleshooting": ["HCX Failures", "DNS Drift", "Identity Tokens", "Latency"],
-}
-
-# =========================
-# Heuristic Design Checks (keywords → rubric mapping)
-# =========================
-CHECKS = {
-    "security": {
-        "weight": 1.0,
-        "keywords": [
-            "Defender for Cloud", "Microsoft Defender", "Sentinel", "Key Vault", "encryption",
-            "TLS", "KMS", "HSM", "Just-In-Time", "JIT", "PIM", "MFA", "Conditional Access",
-            "Azure Policy", "Purview", "classification", "DLP", "RBAC", "least privilege"
-        ],
-        "controls": ["NIST-AC-2", "NIST-SC-13", "IRS1075 §9.3"]
-    },
-    "reliability": {
-        "weight": 1.0,
-        "keywords": [
-            "Availability Zone", "zonal", "ASR", "Site Recovery", "backup", "failover",
-            "failback", "DR drill", "runbook", "immutable", "soft delete", "RTO", "RPO"
-        ],
-    },
-    "performance": {
-        "weight": 1.0,
-        "keywords": [
-            "right-size", "IOPS", "latency", "throughput", "benchmark", "autoscale",
-            "SKU", "Managed Disks", "Premium SSD", "Ultra", "Standard SSD"
-        ],
-    },
-    "operations": {
-        "weight": 1.0,
-        "keywords": [
-            "Azure Monitor", "Log Analytics", "alerts", "workbooks", "patch", "change management",
-            "incident", "problem", "request", "ITIL", "configuration drift"
-        ],
-    },
-    "cost": {
-        "weight": 1.0,
-        "keywords": [
-            "reservation", "Reserved Instances", "Savings Plan", "spot",
-            "tagging", "chargeback", "showback", "budget", "cost anomaly"
-        ],
-    },
-    "networking": {
-        "weight": 1.0,
-        "keywords": [
-            "ExpressRoute", "ER", "VPN", "BGP", "MTU", "NSG", "ASG", "UDR", "Private Link",
-            "DNS", "DHCP", "load balancer", "hub and spoke", "landing zone network"
-        ],
-    },
-    "identity": {
-        "weight": 1.0,
-        "keywords": [
-            "Entra ID", "Azure AD", "Active Directory", "domain trust", "AADDS",
-            "Conditional Access", "PIM", "break-glass", "least privilege"
-        ],
-    },
-    "migration": {
-        "weight": 1.0,
-        "keywords": [
-            "HCX", "vMotion", "RAV", "Azure Migrate", "replication", "Mobility Group",
-            "cutover", "rollback", "pilot", "wave"
-        ],
-    },
-    "architecture": {
-        "weight": 1.0,
-        "keywords": [
-            "Landing Zone", "hub", "spoke", "policy", "RBAC", "naming",
-            "AVS", "Azure Local", "Azure Stack HCI", "Local Zone"
-        ],
-    },
-}
-
-# =========================
-# FAQ seeds (concise, cite trusted links)
-# =========================
-FAQ_SEEDS = [
     {
-        "q": "How do we migrate VMware workloads to Azure Local?",
         "a": (
-            "Typical paths are **Azure VMware Solution (AVS)** with **HCX** (bulk/RAV/vMotion) or "
-            "**Azure Migrate** for discovery, assessment, and server/db/web migration. "
-            "Establish a governed **Landing Zone** (hub/spoke, Policy, RBAC), plan ExpressRoute/VPN, "
-            "pilot a few VMs, then cut over in waves with rollback plans. "
-            "See AVS, Azure Migrate, and CAF for prescriptive guidance."
         ),
-        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"]
     },
     {
-        "q": "What downtime should we expect?",
         "a": (
-            "Depends on method and app architecture. **HCX vMotion** can provide minimal downtime; "
-            "**HCX RAV** and **bulk migration** usually require short cutover windows. "
-            "Always pilot, measure replication lag, and agree on a timeboxed backout."
         ),
-        "refs": ["VMware HCX Docs"]
     },
     {
-        "q": "How do we meet IRS Pub 1075 and NIST controls?",
         "a": (
-            "Map design controls to frameworks: enforce least privilege (RBAC/PIM/MFA), "
-            "encrypt at rest/in transit (Key Vault/HSM, TLS), centralize telemetry (Sentinel), "
-            "and document evidence (policies, runbooks, DR tests). Use CAF/WAF security pillars."
         ),
-        "refs": ["IRS Publication 1075 (FTI)", "NIST SP 800-53", "Azure Well-Architected Framework (WAF)"]
-    },
-    {
-        "q": "ExpressRoute or VPN?",
-        "a": (
-            "**ExpressRoute** is preferred for predictable performance and private connectivity; "
-            "VPN is fine for initial testing or lower-throughput needs. Many designs use both "
-            "for redundancy and phased cutover."
-        ),
-        "refs": ["Cloud Adoption Framework (CAF)"]
     },
 ]

 # =========================
-# Minimal Pure-Python TF-IDF
 # =========================
-STOPWORDS = set("""
-a an the and or but if then else for from to in on at by of with without into within over under not be is are was were will can should would could may might
-this that these those there here when where how what why who whom which as it its itself themselves ourselves yourself yourselves
-""".split())

-TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")

 def tokenize(text: str) -> List[str]:
-    return [w.lower() for w in TOKEN_RE.findall(text) if w and w.lower() not in STOPWORDS]

 class TinyTfidfIndex:
     def __init__(self):
         self.docs: List[List[str]] = []
-        self.doc_vectors: List[Dict[str, float]] = []
-        self.doc_norms: List[float] = []
         self.idf: Dict[str, float] = {}
-        self.N = 0
-        self.corpus_meta: List[Dict[str, str]] = []
-
-    def fit(self, texts: List[str], meta: List[Dict[str, str]]):
-        self.docs = [tokenize(t) for t in texts]
-        self.N = len(self.docs)
-        self.corpus_meta = meta

         # document frequency
-        df = Counter()
-        for doc in self.docs:
-            df.update(set(doc))
-        # idf
-        self.idf = {}
-        for term, dfi in df.items():
-            # add-1 smoothing to avoid div by zero, +1 offset
-            self.idf[term] = 1.0 + math.log((self.N + 1) / (dfi + 1))
-
-        # build doc vectors
-        self.doc_vectors = []
         self.doc_norms = []
-        for doc in self.docs:
-            tf = Counter(doc)
-            vec = {}
             for term, cnt in tf.items():
-                vec[term] = (cnt / max(1, len(doc))) * self.idf.get(term, 0.0)
-            norm = math.sqrt(sum(v * v for v in vec.values())) or 1e-12
-            self.doc_vectors.append(vec)
-            self.doc_norms.append(norm)
-
-    def query(self, text: str, k: int = 4) -> List[Tuple[int, float]]:
-        qtokens = tokenize(text)
-        if not qtokens or self.N == 0:
-            return []
-        tf = Counter(qtokens)
-        qvec = {}
         for term, cnt in tf.items():
-            qvec[term] = (cnt / max(1, len(qtokens))) * self.idf.get(term, 0.0)
-        qnorm = math.sqrt(sum(v * v for v in qvec.values())) or 1e-12
-
-        # cosine against each doc
-        scores = []
-        for i, dvec in enumerate(self.doc_vectors):
-            dot = 0.0
-            # iterate over smaller dict for speed
-            if len(qvec) < len(dvec):
-                for t, v in qvec.items():
-                    if t in dvec:
-                        dot += v * dvec[t]
-            else:
-                for t, v in dvec.items():
-                    if t in qvec:
-                        dot += v * qvec[t]
-            sim = dot / (qnorm * self.doc_norms[i])
-            scores.append((i, sim))
-        scores.sort(key=lambda x: x[1], reverse=True)
-        return scores[:k]

 # =========================
-# Utilities: text extraction & chunking
 # =========================
-def extract_text_from_pdf(fileobj: io.BytesIO) -> str:
-    if PdfReader is None:
         return ""
     try:
-        reader = PdfReader(fileobj)
-        parts = []
         for page in reader.pages:
-            txt = page.extract_text() or ""
-            parts.append(txt)
-        return "\n".join(parts)
     except Exception:
         return ""

-def extract_text_from_docx(fileobj: io.BytesIO) -> str:
-    if docx is None:
-        return ""
-    try:
-        document = docx.Document(fileobj)
-        return "\n".join([p.text for p in document.paragraphs])
-    except Exception:
         return ""
-
-def extract_text_from_txt(fileobj: io.BytesIO) -> str:
     try:
-        return fileobj.read().decode("utf-8", errors="ignore")
     except Exception:
         return ""

-def read_file_to_text(file: gr.File) -> Tuple[str, str]:
-    """
-    Returns (text, filename)
-    """
-    if file is None:
-        return "", ""
-    name = os.path.basename(file.name) if file.name else "uploaded"
-    with open(file.name, "rb") as f:
-        raw = f.read()
-    ext = (name.split(".")[-1] or "").lower()
-    bio = io.BytesIO(raw)
-    if ext in ["pdf"]:
-        txt = extract_text_from_pdf(bio)
-    elif ext in ["docx"]:
-        txt = extract_text_from_docx(bio)
-    elif ext in ["txt"]:
-        txt = extract_text_from_txt(bio)
-    else:
-        txt = ""
-    return txt, name
-
-def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
-    """
-    Simple sliding window chunker by characters; robust and fast.
-    """
-    text = re.sub(r"\s+", " ", text).strip()
-    chunks = []
-    i = 0
-    n = len(text)
-    while i < n:
-        j = min(i + max_len, n)
-        chunk = text[i:j]
-        if chunk:
-            chunks.append(chunk)
-        i = j - overlap
-        if i < 0:
-            i = 0
-        if i >= n:
-            break
-    return chunks

-# =========================
-# RAG Index (session-scoped)
-# =========================
-class RagState:
-    def __init__(self):
-        self.index = None   # TinyTfidfIndex
-        self.corpus = None  # list of dicts with text/meta

-def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
     """
-    Build a tiny TF-IDF index over all chunks from uploaded documents.
-    Returns: (index_obj, None, chunks_with_meta) to keep signature compatible.
     """
-    all_chunks = []
-    meta = []
-    if not files:
-        return None, None, None
-    for f in files:
-        txt, fname = read_file_to_text(f)
-        if not txt.strip():
-            continue
-        chunks = chunk_text(txt)
-        for c in chunks:
-            all_chunks.append(c)
-            meta.append({"file": os.path.basename(f.name), "snippet": c[:120] + ("..." if len(c) > 120 else "")})
-    if not all_chunks:
-        return None, None, None

-    idx = TinyTfidfIndex()
-    idx.fit(all_chunks, meta)
-    corpus = [{"text": t, **m} for t, m in zip(all_chunks, meta)]
-    return idx, None, corpus

-def retrieve_answer(
-    query: str,
-    index_obj: Any,
-    _matrix_unused: Any,
-    corpus: List[Dict[str, str]],
-    k: int = 4
-) -> Tuple[str, List[Dict[str, str]]]:
-    """
-    Return synthesized answer + top-k supporting chunks with filenames.
-    """
-    if not query or index_obj is None or not corpus:
-        return "", []
-    top = index_obj.query(query, k=k)
-    snippets = []
-    for i, sim in top:
-        item = corpus[i]
-        snippets.append({
-            "file": item["file"],
-            "relevance": float(sim),
-            "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
-        })
-    answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
-    for s in snippets:
-        answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
-    answer += "Tip: Ask a follow-up like “Summarize the cutover plan” or “List missing security controls.”"
-    return answer, snippets

 # =========================
-# Design / Runbook Auto-Review
 # =========================
-def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
-    text_low = text.lower()
-
-    pillar_scores = {}
-    gaps = []
-
-    for pillar, cfg in CHECKS.items():
-        hits = 0
-        kws = cfg["keywords"]
-        for kw in kws:
-            if kw.lower() in text_low:
-                hits += 1
-        coverage = hits / max(1, len(kws))
-        score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2)  # baseline 1.5, up to 5.0
-        pillar_scores[pillar] = score
-
-        if pillar == "networking":
-            if "expressroute".lower() not in text_low and "er " not in text_low:
-                gaps.append({
-                    "id": "NET-ER-001",
-                    "severity": "High",
-                    "desc": "ExpressRoute (ER) not referenced; consider ER for predictable private connectivity.",
-                    "fix": "Design dual ER circuits with diverse POPs; fall back to VPN during pilot."
-                })
-            if "dns" not in text_low:
-                gaps.append({
-                    "id": "NET-DNS-002",
-                    "severity": "Med",
-                    "desc": "DNS plan not mentioned; risk of name resolution drift post-cutover.",
-                    "fix": "Document forwarders/zones, conditional forwarding, and DNS cutover sequencing."
-                })
-            if "mtu" not in text_low and "hcx" in text_low:
-                gaps.append({
-                    "id": "NET-MTU-003",
-                    "severity": "Med",
-                    "desc": "HCX present but MTU tuning not referenced.",
-                    "fix": "Validate path MTU for HCX tunnels; align NSX/physical network settings."
-                })
-
-        if pillar == "identity":
-            if "pim" not in text_low:
-                gaps.append({
-                    "id": "ID-PIM-004",
-                    "severity": "Med",
-                    "desc": "No mention of Privileged Identity Management (PIM).",
-                    "fix": "Enable PIM for admin roles; require approvals/justification; enforce MFA."
-                })
-            if "break-glass" not in text_low:
-                gaps.append({
-                    "id": "ID-BG-005",
-                    "severity": "Low",
-                    "desc": "No break-glass account reference.",
-                    "fix": "Create monitored break-glass accounts with strong controls and regular review."
-                })
-
-        if pillar == "security":
-            if "key vault" not in text_low and "hsm" not in text_low:
-                gaps.append({
-                    "id": "SEC-KEY-006",
-                    "severity": "High",
-                    "desc": "Key management not described.",
-                    "fix": "Use Azure Key Vault (HSM-backed if needed); rotate secrets/keys; restrict access via RBAC."
-                })
-            if "sentinel" not in text_low:
-                gaps.append({
-                    "id": "SEC-SIEM-007",
-                    "severity": "Med",
-                    "desc": "SIEM not referenced.",
-                    "fix": "Onboard to Microsoft Sentinel; define data connectors and incident processes."
-                })
-            if "policy" not in text_low:
-                gaps.append({
-                    "id": "SEC-POL-008",
-                    "severity": "Med",
-                    "desc": "Azure Policy governance not mentioned.",
-                    "fix": "Attach ALZ policies/initiatives for guardrails (encryption, tags, allowed locations, SKUs)."
-                })
-
-        if pillar == "reliability":
-            if ("asr" not in text_low) and ("site recovery" not in text_low):
-                gaps.append({
-                    "id": "REL-ASR-009",
-                    "severity": "Med",
-                    "desc": "No DR replication tool referenced.",
-                    "fix": "Use Azure Site Recovery (ASR) or HCX DR for failover/failback; schedule DR drills."
-                })
-            if "backup" not in text_low and "recovery services vault" not in text_low:
-                gaps.append({
-                    "id": "REL-BKP-010",
-                    "severity": "High",
-                    "desc": "Backup strategy not captured.",
-                    "fix": "Configure Azure Backup with immutable storage and soft delete; test restores."
-                })
-            if ("rto" not in text_low) or ("rpo" not in text_low):
-                gaps.append({
-                    "id": "REL-RTORPO-011",
-                    "severity": "Med",
-                    "desc": "RTO/RPO targets not documented.",
-                    "fix": "Define business-aligned RTO/RPO and validate during pilot/cutover."
-                })
-
-        if pillar == "architecture":
-            if ("landing zone" not in text_low) and ("landing-zone" not in text_low):
-                gaps.append({
-                    "id": "ARC-ALZ-012",
-                    "severity": "High",
-                    "desc": "Azure Landing Zone baseline not referenced.",
-                    "fix": "Adopt ALZ (hub/spoke, Policy, RBAC, logging) before migration waves."
-                })
-
-        if pillar == "migration":
-            if ("rollback" not in text_low) and ("backout" not in text_low):
-                gaps.append({
-                    "id": "MIG-ROLL-013",
-                    "severity": "High",
-                    "desc": "Rollback/backout path not documented.",
-                    "fix": "Document clear backout steps and timebox for each wave; test in pilot."
-                })
-            if "pilot" not in text_low:
-                gaps.append({
-                    "id": "MIG-PILOT-014",
-                    "severity": "Med",
-                    "desc": "No pilot mentioned.",
-                    "fix": "Execute a pilot with representative workloads; capture metrics and lessons."
-                })
-
-        if pillar == "cost":
-            if "tag" not in text_low:
-                gaps.append({
-                    "id": "COST-TAG-015",
-                    "severity": "Med",
-                    "desc": "Tagging strategy absent (owner, env, app).",
-                    "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
-                })
-
-    if pillar_scores:
-        overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
-    else:
-        overall = 0.0

-    if overall < 3.5:
-        gaps.insert(0, {
-            "id": "SUMMARY",
-            "severity": "Info",
-            "desc": f"Overall score is {overall}. Focus first on High-severity gaps.",
-            "fix": "Prioritize ER/DNS/Backup/ALZ/PIM/Key Vault where missing; re-run the check after updates."
-        })

-    return {"overall": overall, **pillar_scores}, gaps

-def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
-    if not files:
-        return "Please upload at least one PDF/DOCX/TXT.", {}, []

-    text_full = []
-    file_list = []
-    for f in files:
-        txt, fname = read_file_to_text(f)
-        if txt.strip():
-            text_full.append(txt)
-            file_list.append(os.path.basename(f.name))
-    if not text_full:
-        return "Could not parse text from the provided files.", {}, []
-
-    combined = "\n\n".join(text_full)
-    scores, gaps = score_text_against_checks(combined)
-
-    md = f"### Design/Runbook Review\n"
-    md += f"**Files analyzed:** {', '.join(file_list)}\n\n"
-    md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
-    md += "**Per-Pillar Scores:**\n\n"
-    for k, v in scores.items():
-        if k == "overall":
             continue
-        md += f"- **{k.capitalize()}**: {v}\n"
-    md += "\n**Top Recommendations:**\n"
-    for g in gaps[:6]:
-        md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
-
-    result_json = {
-        "timestamp": int(time.time()),
-        "files": file_list,
-        "scores": scores,
-        "gaps": gaps
-    }
-    table_rows = [[g["id"], g["severity"], g["desc"], g["fix"]] for g in gaps]
-    return md, result_json, table_rows

-# =========================
-# Q&A Logic
-# =========================
-def list_refs(ref_names: List[str]) -> str:
-    links = []
-    for nm in ref_names:
-        hit = [x for x in TRUSTED_SOURCES if x[0] == nm]
-        if hit:
-            links.append(f"[{nm}]({hit[0][1]})")
-    return " | ".join(links)

-def answer_faq_or_approach(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,
@@ -612,32 +368,45 @@ def answer_faq_or_approach(
     if not q:
         return "Please enter a question."

-    # First try seeded FAQs (very light semantic: keyword match)
     for item in FAQ_SEEDS:
-        # simple heuristic: overlap of first few tokens
         seed_tokens = set(tokenize(item["q"])[:3])
         q_tokens = set(tokenize(q))
         if seed_tokens and seed_tokens.issubset(q_tokens):
-            return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"

-    # If requested, try RAG on uploaded docs
     if use_uploaded_docs and index_obj is not None and corpus:
-        rag_answer, _snips = retrieve_answer(q, index_obj, None, corpus, k=4)
-        if rag_answer.strip():
-            refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
-            return f"{rag_answer}\n\n**Trusted sources:** {refs}"
-
-    # Fallback generic approach with citations
-    generic = (
-        "**Suggested approach:**\n"
-        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging).\n"
-        "2) Establish **ExpressRoute/VPN** and DNS plans; validate MTU if using **HCX**.\n"
-        "3) Run **Azure Migrate** discovery/assessment; classify (rehost/refactor/modernize).\n"
-        "4) Pilot 2–3 VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
-        "5) Define **RTO/RPO**, backup, and **ASR**/DR drills; document rollback.\n"
-        "6) Onboard to **Defender/Sentinel**, enforce **Key Vault** and **PIM/MFA**.\n"
-        "7) Optimize cost (right-size, reservations) and tag everything.\n"
-    )
     refs = list_refs([
         "Azure VMware Solution (AVS)",
         "Azure Migrate",
@@ -645,98 +414,114 @@ def answer_faq_or_approach(
         "Azure Well-Architected Framework (WAF)",
         "VMware HCX Docs"
     ])
-    return f"{generic}\n**Trusted sources:** {refs}"

 # =========================
-# Gradio UI
 # =========================
-with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
-    gr.Markdown(
-        "# VMware On-Prem → Azure Local Migration Assistant\n"
-        "Ask questions, upload migration/design documents for review, and get recommendations.\n"
-        "_Sources: Microsoft Learn/Docs, VMware Docs, NIST, IRS Pub 1075 (linked below)._"
-    )

-    # Session state for RAG
-    st_index = gr.State(None)   # TinyTfidfIndex
-    st_matrix = gr.State(None)  # kept for signature compatibility
-    st_corpus = gr.State(None)
-
-    with gr.Tabs():
-        with gr.Tab("Ask Anything"):
-            with gr.Row():
-                question = gr.Textbox(
-                    label="Your question (FAQs, approach, troubleshooting)",
-                    placeholder="e.g., How do I plan a pilot with HCX RAV and ensure minimal downtime?"
-                )
-            use_docs = gr.Checkbox(label="Also search my uploaded documents (if any)", value=True)
-            ask_btn = gr.Button("Answer")
-            answer_box = gr.Markdown()
-
-        with gr.Tab("Upload & Review Design"):
-            gr.Markdown("Upload **PDF / DOCX / TXT** (multiple allowed). Then build the index and/or run a review.")
-            files = gr.File(file_count="multiple", file_types=[".pdf", ".docx", ".txt"], label="Upload documents")
-            with gr.Row():
-                build_btn = gr.Button("Build/Refresh Search Index")
-                review_btn = gr.Button("Run Design/Runbook Review")
-            index_info = gr.Markdown()
-            review_md = gr.Markdown()
-            review_json = gr.JSON()
-            gaps_table = gr.Dataframe(
-                headers=["Gap ID", "Severity", "Description", "Fix"],
-                datatype=["str", "str", "str", "str"],
-                interactive=False,
-                label="Gaps & Recommendations"
-            )

-        with gr.Tab("Trusted Sources & Ontology"):
-            gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
-            links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
-            gr.Markdown(links_md)
-
-            gr.Markdown("### Knowledge Taxonomy (Domains → Subdomains)")
-            onto_str = ""
-            for dom, subs in ONTOLOGY.items():
-                onto_str += f"- **{dom}**: {', '.join(subs)}\n"
-            gr.Markdown(onto_str)
-
-            gr.Markdown(
-                "### Notes\n"
-                "- This app does **not** call external APIs. Use the links above for official guidance.\n"
-                "- Design checks are heuristic; always validate against your Architecture Board and security teams."
-            )

-    # ====== Wiring ======
-    def on_build_index(files_list):
-        idx, _X, cor = build_index(files_list)
-        if idx is None:
-            return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
-                    None, None, None)
-        msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
-        return msg, idx, None, cor

-    build_btn.click(
-        on_build_index,
-        inputs=[files],
-        outputs=[index_info, st_index, st_matrix, st_corpus]
     )

-    def on_review(files_list):
-        md, js, table = review_uploaded_docs(files_list)
-        return md, js, table

-    review_btn.click(
-        on_review,
-        inputs=[files],
-        outputs=[review_md, review_json, gaps_table]
     )

     ask_btn.click(
-        answer_faq_or_approach,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
         outputs=[answer_box]
     )

-# Standard HF Spaces entrypoint
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))

 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 """
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
+- Works on Hugging Face Spaces (no external API calls, no sklearn).
+- Upload design/migration docs (PDF/DOCX/TXT/MD).
+- Ask questions; get DETAILED, structured answers with excerpts + trusted refs.

+Run locally:
+    pip install gradio PyPDF2 python-docx
+    python app.py
 """

 import os
 import io
 import re
 import math
 import time
 from typing import List, Tuple, Dict, Any
 from collections import Counter, defaultdict

 import gradio as gr
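+# Assumption: on Hugging Face Spaces the optional parsers below come from
+# requirements.txt (PyPDF2, python-docx); the guarded imports keep the app
+# running even when they are missing.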

+# Optional parsers (gracefully degrade if not installed on Spaces)
 try:
+    import PyPDF2  # lightweight; often available on Spaces
 except Exception:
+    PyPDF2 = None

 try:
+    import docx  # python-docx
 except Exception:
     docx = None


 # =========================
+# Trusted sources & FAQ seeds
 # =========================
+
+TRUSTED_SOURCES: List[Tuple[str, str]] = [
     ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
+    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
+    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html")
 ]

+FAQ_SEEDS: List[Dict[str, Any]] = [
     {
+        "q": "How do we migrate VMware workloads to Azure with minimal downtime?",
         "a": (
+            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
+            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
+            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
         ),
+        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"]
     },
     {
+        "q": "What is a recommended migration sequence?",
         "a": (
+            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
+            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
+            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
         ),
+        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
     },
     {
+        "q": "How do we plan DR and backups?",
         "a": (
+            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
+            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
         ),
+        "refs": ["Azure Well-Architected Framework (WAF)"]
     },
 ]

+
 # =========================
+# Utilities
 # =========================

+_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")  # keep URLs/paths/ids mostly intact

 def tokenize(text: str) -> List[str]:
+    if not text:
+        return []
+    return [t.lower() for t in _WORD_RE.findall(text)]
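+# Illustrative check: the character class above keeps URLs intact, e.g.
+# tokenize("See https://learn.microsoft.com/azure/migrate/") returns
+# ["see", "https://learn.microsoft.com/azure/migrate/"].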
+
+def list_refs(ref_names: List[str]) -> str:
+    links = []
+    for nm in ref_names:
+        hit = [x for x in TRUSTED_SOURCES if x[0] == nm]
+        if hit:
+            links.append(f"[{nm}]({hit[0][1]})")
+    return " | ".join(links) if links else ""
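+# For example, list_refs(["Azure Migrate"]) renders
+# "[Azure Migrate](https://learn.microsoft.com/azure/migrate/)".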
+
+
+# =========================
+# Tiny TF-IDF implementation (no sklearn)
+# =========================

 class TinyTfidfIndex:
     def __init__(self):
         self.docs: List[List[str]] = []
+        self.df: Counter = Counter()
         self.idf: Dict[str, float] = {}
+        self.doc_norms: List[float] = []
+        self.voc_size = 0

+    def add_documents(self, tokenized_docs: List[List[str]]):
+        self.docs = tokenized_docs[:]
         # document frequency
+        self.df = Counter()
+        for toks in self.docs:
+            self.df.update(set(toks))
+        N = max(1, len(self.docs))
+        self.idf = {term: math.log((N + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
+        self.voc_size = len(self.idf)
+        # precompute norms
         self.doc_norms = []
+        for toks in self.docs:
+            tf = Counter(toks)
+            norm_sq = 0.0
             for term, cnt in tf.items():
+                w = (cnt / max(1, len(toks))) * self.idf.get(term, 0.0)
+                norm_sq += w * w
+            self.doc_norms.append(math.sqrt(norm_sq))
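+    # Worked example (illustrative): with N = 4 documents and a term present in
+    # df = 1 of them, idf = log((4 + 1) / (1 + 1)) + 1.0 ≈ 1.92, while a term in
+    # all four gets idf = log(5 / 5) + 1.0 = 1.0, so rarer terms weigh almost
+    # twice as much in the cosine scores.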
+
+    def _vec(self, toks: List[str]) -> Dict[str, float]:
+        tf = Counter(toks)
+        total = max(1, len(toks))
+        v = {}
         for term, cnt in tf.items():
+            idf = self.idf.get(term)
+            if idf is None:
+                continue
+            v[term] = (cnt / total) * idf
+        return v
+
+    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
+        if not self.docs:
+            return []
+        qv = self._vec(tokenize(text))
+        # cosine similarity
+        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
+        sims: List[Tuple[int, float]] = []
+        for i, toks in enumerate(self.docs):
+            dv = Counter(toks)  # term-frequency counter for this document
+            num = 0.0
+            for term in qv:
+                if term in dv:
+                    # TF-IDF weight for the matching document term
+                    w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
+                    num += qv[term] * w_d
+            denom = (self.doc_norms[i] or 1e-9) * q_norm
+            sims.append((i, num / denom))
+        sims.sort(key=lambda x: x[1], reverse=True)
+        return sims[:k]
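+# Minimal usage sketch (illustrative, not executed by the app):
+#   idx = TinyTfidfIndex()
+#   idx.add_documents([tokenize("hcx vmotion cutover"), tokenize("azure migrate discovery")])
+#   idx.query("plan the hcx cutover", k=1)  # -> [(0, <cosine similarity>)]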
+
+
+# =========================
+# Simple scoring rubric to tailor the detailed output
+# =========================
+
+CHECKS = [
+    {
+        "id": "landing_zone",
+        "desc": "Landing zone defined (hub/spoke, Policy, RBAC, logging).",
+        "fix": "Use CAF blueprints; enforce Policy for guardrails and RBAC.",
+        "keywords": ["landing", "hub", "spoke", "policy", "rbac", "log", "monitor"],
+        "pillar": "governance",
+    },
+    {
+        "id": "connectivity",
+        "desc": "Connectivity planned (ExpressRoute/VPN), DNS, MTU validated for HCX.",
+        "fix": "Verify ER/VPN, DNS resolution, and HCX MTU/mobility settings.",
+        "keywords": ["expressroute", "vpn", "dns", "mtu", "hcx", "connectivity"],
+        "pillar": "networking",
+    },
+    {
+        "id": "migrate_tooling",
+        "desc": "Discovery/assessment and tooling chosen (Azure Migrate or HCX).",
+        "fix": "Run Azure Migrate discovery; select HCX or Azure Migrate per downtime.",
+        "keywords": ["azure", "migrate", "discovery", "assessment", "hcx", "replication"],
+        "pillar": "operations",
+    },
+    {
+        "id": "security",
+        "desc": "Security/identity configured (Key Vault, Defender, Sentinel, PIM/MFA).",
+        "fix": "Centralize secrets in Key Vault; enable Defender/Sentinel; enforce PIM/MFA.",
+        "keywords": ["key", "vault", "defender", "sentinel", "pim", "mfa", "entra", "aad", "identity"],
+        "pillar": "security",
+    },
+    {
+        "id": "dr_backup",
+        "desc": "Backups, DR, RTO/RPO defined; ASR drills planned.",
+        "fix": "Set RTO/RPO; immutability & soft-delete; test ASR failover/failback.",
+        "keywords": ["backup", "rto", "rpo", "dr", "asr", "failover", "restore"],
+        "pillar": "reliability",
+    },
+    {
+        "id": "cost",
+        "desc": "Cost optimization plan (right-sizing, reservations, tagging).",
+        "fix": "Use reservations/Savings Plans, rightsizing, and enforce tags.",
+        "keywords": ["cost", "reservation", "savings", "right", "tag"],
+        "pillar": "cost",
+    },
+]
+
+def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
+    toks = set(tokenize(text))
+    scores = defaultdict(float)
+    gaps = []  # checks that did not match
+    for chk in CHECKS:
+        matched = any(kw in toks for kw in chk["keywords"])
+        if matched:
+            scores["overall"] += 1.0
+            scores[chk["pillar"]] += 1.0
+        else:
+            gaps.append({
+                "id": chk["id"],
+                "desc": chk["desc"],
+                "fix": chk["fix"],
+                "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
+            })
+    # normalize roughly to 0-5 scale
+    max_possible = float(len(CHECKS))
+    scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
+    for k in list(scores.keys()):
+        if k != "overall":
+            scores[k] = round(scores[k], 2)
+    return scores, gaps
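+# Illustrative: score_text_against_checks("dns and backup are covered") matches
+# only the connectivity and dr_backup checks, so overall = round(5.0 * 2 / 6, 2)
+# = 1.67 and the remaining four checks come back as gaps.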
+

 # =========================
+# File parsing
 # =========================
+
+def read_pdf_bytes(b: bytes) -> str:
+    if not PyPDF2:
         return ""
     try:
+        reader = PyPDF2.PdfReader(io.BytesIO(b))
+        out = []
         for page in reader.pages:
+            try:
+                out.append(page.extract_text() or "")
+            except Exception:
+                pass
+        return "\n".join(out)
     except Exception:
         return ""

+def read_docx_bytes(b: bytes) -> str:
+    if not docx:
         return ""
     try:
+        f = io.BytesIO(b)
+        d = docx.Document(f)
+        return "\n".join(p.text for p in d.paragraphs)
     except Exception:
         return ""

+def read_text_bytes(b: bytes) -> str:
+    # best-effort decoding
+    for enc in ("utf-8", "utf-16", "latin-1"):
+        try:
+            return b.decode(enc)
+        except Exception:
+            continue
+    return ""
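+# Note: "latin-1" can decode any byte sequence, so the loop above effectively
+# never falls through to return "" for non-empty input; it trades decoding
+# fidelity for robustness.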

+def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
     """
+    Returns {"file": <name>, "text": <extracted_text>}
     """
+    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
+    data = file_obj.get("data")
+    if data is None:
+        # gradio sometimes provides a path instead
+        path = file_obj.get("path")
+        if path and os.path.exists(path):
+            with open(path, "rb") as fh:
+                data = fh.read()
+    if data is None:
+        return {"file": name, "text": ""}
+
+    low = name.lower()
+    text = ""
+    if low.endswith(".pdf"):
+        text = read_pdf_bytes(data)
+    elif low.endswith(".docx") or low.endswith(".doc"):
+        text = read_docx_bytes(data)
+    elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
+        text = read_text_bytes(data)
+    else:
+        # try plain text as fallback
+        text = read_text_bytes(data)

+    return {"file": os.path.basename(name), "text": text or ""}
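+# Illustrative: parse_file({"name": "design.txt", "data": b"DNS and backup plan"})
+# -> {"file": "design.txt", "text": "DNS and backup plan"}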

 # =========================
+# Detailed Q&A Composer
 # =========================

+def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
+    collected = [s.get("excerpt", "") for s in snippets]
+    combined = "\n\n".join(collected)
+    scores, gaps = score_text_against_checks(combined) if combined.strip() else ({"overall": 0.0}, [])

+    def _mk_gaps(glist, limit=8):
+        out = []
+        for g in glist[:limit]:
+            out.append(f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_")
+        return "\n".join(out) if out else "- No major issues detected in the sampled excerpts."

+    refs = list_refs([
+        "Azure VMware Solution (AVS)",
+        "Azure Migrate",
+        "Cloud Adoption Framework (CAF)",
+        "Azure Well-Architected Framework (WAF)",
+        "VMware HCX Docs"
+    ])

+    pillar_lines = []
+    for k_, v_ in scores.items():
+        if k_ == "overall":
             continue
+        pillar_lines.append(f"- **{k_.capitalize()}**: {v_}")
+    pillar_md = "\n".join(pillar_lines) if pillar_lines else "- (no signals)"
+
+    md = (
+        f"### Answer (detailed)\n"
+        f"**Your question:** {query}\n\n"
+        f"**TL;DR:** Here’s a concrete plan across landing zone, connectivity, migration method, security, DR, and cost. "
+        f"Address the highest-risk gaps first.\n\n"
+        f"#### Step-by-step plan\n"
+        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+        "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+        "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+        "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+        "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+        "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+        "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+        f"#### What your documents emphasize (auto-scored)\n"
+        f"**Overall score:** {scores.get('overall', 0)} / 5.0\n\n"
+        f"**Per-pillar signals:**\n{pillar_md}\n\n"
+        f"#### Gaps & quick fixes\n{_mk_gaps(gaps, limit=8)}\n\n"
+        f"#### Supporting excerpts\n"
+    )
+    for s in snippets:
+        md += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
+    md += f"**Trusted sources:** {refs}"
+    return md

+def answer_faq_or_approach_detailed(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,

     if not q:
         return "Please enter a question."

+    # 1) Seeded FAQs detailed plan
     for item in FAQ_SEEDS:
         seed_tokens = set(tokenize(item["q"])[:3])
         q_tokens = set(tokenize(q))
         if seed_tokens and seed_tokens.issubset(q_tokens):
+            refs = list_refs(item.get("refs", []))
+            base = (
+                f"### Answer (detailed)\n"
+                f"{item['a']}\n\n"
+                "#### Step-by-step plan\n"
+                "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+                "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+                "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+                "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+                "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+                "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+                "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+                f"**Trusted sources:** {refs}"
+            )
+            return base

+    # 2) Use uploaded docs (RAG) detailed synthesized answer
     if use_uploaded_docs and index_obj is not None and corpus:
+        top = index_obj.query(q, k=6)
+        snippets = []
+        for i, sim in top:
+            item = corpus[i]
+            excerpt = item["text"].strip()
+            if len(excerpt) > 700:
+                excerpt = excerpt[:700] + "..."
+            snippets.append({
+                "file": item["file"],
+                "relevance": float(sim),
+                "excerpt": excerpt
+            })
+        if snippets:
+            return _compose_detailed_from_snippets(q, snippets)
+
+    # 3) Fallback (no docs) → generic detailed plan with citations
     refs = list_refs([
         "Azure VMware Solution (AVS)",
         "Azure Migrate",

         "Azure Well-Architected Framework (WAF)",
         "VMware HCX Docs"
     ])
+    generic = (
+        "### Answer (detailed)\n"
+        "**TL;DR:** Use AVS/HCX or Azure Migrate depending on downtime needs; build landing zone and connectivity first, "
+        "then migrate in waves with rollback and DR drills.\n\n"
+        "#### Step-by-step plan\n"
+        "1) Confirm **Landing Zone** (hub/spoke, Policy, RBAC, logging/monitoring).\n"
+        "2) Establish **ExpressRoute/VPN** and DNS; validate MTU if using **HCX**.\n"
+        "3) Run **Azure Migrate** discovery/assessment; classify rehost/refactor/modernize.\n"
+        "4) Pilot 2–3 representative VMs; choose **HCX (bulk/RAV/vMotion)** or **Azure Migrate** for cutover.\n"
+        "5) Define **RTO/RPO**, backups (immutable/soft-delete), and **ASR** drills; document rollback.\n"
+        "6) Enforce **Key Vault**, **Defender/Sentinel**, **PIM/MFA**, and **Azure Policy** guardrails.\n"
+        "7) Right-size, use reservations/Savings Plans; tag for showback/chargeback.\n\n"
+        f"**Trusted sources:** {refs}"
+    )
+    return generic
+

 # =========================
+# Build index from uploaded files
 # =========================

+def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
+    """
+    Returns: (index_obj, matrix_placeholder, corpus, status_message)
+    """
+    if not files:
+        return None, None, [], "No files uploaded yet."

+    corpus: List[Dict[str, str]] = []
+    for f in files:
+        rec = parse_file(f)
+        if rec["text"]:
+            corpus.append(rec)

+    if not corpus:
+        return None, None, [], "Uploaded files could not be parsed (no text extracted)."

+    tokenized = [tokenize(c["text"]) for c in corpus]
+    idx = TinyTfidfIndex()
+    idx.add_documents(tokenized)
+
+    status = f"Indexed {len(corpus)} document(s). Vocabulary size ≈ {idx.voc_size}."
+    return idx, None, corpus, status
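+# End-to-end sketch (illustrative): build_index([{"name": "a.txt", "data": b"hcx plan"}])
+# returns (TinyTfidfIndex, None, [{"file": "a.txt", "text": "hcx plan"}],
+# "Indexed 1 document(s). Vocabulary size ≈ 2.")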
+
+
+# =========================
+# Gradio UI
+# =========================
+
+with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
+    gr.Markdown(
+        "## VMware On-Prem → Azure Local Migration Assistant\n"
+        "- Upload your **design/migration documents** (PDF, DOCX, TXT, MD)\n"
+        "- Ask questions. Toggle **Use uploaded docs** for RAG-based answers\n"
+        "- Answers are **detailed** by default, with structured steps and trusted references\n"
     )

+    with gr.Row():
+        with gr.Column(scale=2):
+            file_in = gr.Files(
+                label="Upload documents (PDF/DOCX/TXT/MD)",
+                file_count="multiple",
+                type="filepath"  # we will open paths ourselves
+            )
+            index_status = gr.Markdown("No index yet.")
+
+            # Hidden/State to hold in-memory data
+            st_index = gr.State()
+            st_matrix = gr.State()  # placeholder for API compatibility
+            st_corpus = gr.State()
+
+            build_btn = gr.Button("Build Index", variant="primary")
+        with gr.Column(scale=3):
+            question = gr.Textbox(label="Ask a question", placeholder="e.g., How do I minimize downtime for our VMware migration?")
+            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
+            ask_btn = gr.Button("Ask", variant="primary")
+            answer_box = gr.Markdown("")
+
+    # Convert gr.Files (paths) into the dict format our parser expects
+    def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
+        out = []
+        for p in paths or []:
+            try:
+                with open(p, "rb") as fh:
+                    data = fh.read()
+                out.append({"name": os.path.basename(p), "data": data, "path": p})
+            except Exception:
+                pass
+        return out
+
+    def _build(files_paths: List[str]):
+        files = _collect_files(files_paths)
+        idx, mat, corpus, status = build_index(files)
+        return status, idx, mat, corpus

+    build_btn.click(
+        _build,
+        inputs=[file_in],
+        outputs=[index_status, st_index, st_matrix, st_corpus]
     )

     ask_btn.click(
+        answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
         outputs=[answer_box]
     )

 if __name__ == "__main__":
+    # Spaces ignores share=True anyway, so only request a share link when running locally.
+    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
+    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)
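+    # Example (local run): PORT=8080 python app.py serves on 0.0.0.0:8080, since
+    # the port is read from the PORT environment variable above.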