Spaces:

ajayinsac
/

VMware2AzureLocal

Sleeping

App Files Community

ajayinsac committed on Sep 11, 2025

Commit

c76f040

verified ·

1 Parent(s): f2aed85

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -47

app.py CHANGED Viewed

@@ -5,7 +5,7 @@
 VMware On-Prem → Azure Local Migration Assistant (Gradio)
 - Works on Hugging Face Spaces (no external API calls, no sklearn).
 - Upload design/migration docs (PDF/DOCX/TXT/MD).
-- Ask questions; get DETAILED, structured answers with excerpts + trusted refs.
 Run locally:
   pip install gradio PyPDF2 python-docx
@@ -16,7 +16,6 @@ import os
 import io
 import re
 import math
-import time
 from typing import List, Tuple, Dict, Any
 from collections import Counter, defaultdict
@@ -43,35 +42,35 @@ TRUSTED_SOURCES: List[Tuple[str, str]] = [
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
     ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
-    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html")
 ]
 FAQ_SEEDS: List[Dict[str, Any]] = [
     {
-        "q": "How do we migrate VMware workloads to Azure with minimal downtime?",
         "a": (
             "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
             "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
             "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
         ),
-        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"]
     },
     {
-        "q": "What is a recommended migration sequence?",
         "a": (
             "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
             "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
             "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
         ),
-        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
     },
     {
-        "q": "How do we plan DR and backups?",
         "a": (
             "Define RTO/RPO per app. Use immutable backups and soft-delete. "
             "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
         ),
-        "refs": ["Azure Well-Architected Framework (WAF)"]
     },
 ]
@@ -80,7 +79,7 @@ FAQ_SEEDS: List[Dict[str, Any]] = [
 # Utilities
 # =========================
-_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")  # keep URLs/paths/ids mostly intact
 def tokenize(text: str) -> List[str]:
     if not text:
@@ -138,19 +137,17 @@ class TinyTfidfIndex:
             v[term] = (cnt / total) * idf
         return v
-    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
         if not self.docs:
             return []
         qv = self._vec(tokenize(text))
-        # cosine similarity
         q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
         sims: List[Tuple[int, float]] = []
         for i, toks in enumerate(self.docs):
-            dv = Counter(toks)  # use tf counter to loop terms
             num = 0.0
             for term in qv:
                 if term in dv:
-                    # weight for doc term
                     w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
                     num += qv[term] * w_d
             denom = (self.doc_norms[i] or 1e-9) * q_norm
@@ -160,7 +157,7 @@ class TinyTfidfIndex:
 # =========================
-# Simple scoring rubric to tailor the detailed output
 # =========================
 CHECKS = [
@@ -211,26 +208,25 @@ CHECKS = [
 def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
     toks = set(tokenize(text))
     scores = defaultdict(float)
-    hits = []
     for chk in CHECKS:
         matched = any(kw in toks for kw in chk["keywords"])
         if matched:
             scores["overall"] += 1.0
             scores[chk["pillar"]] += 1.0
         else:
-            hits.append({
                 "id": chk["id"],
                 "desc": chk["desc"],
                 "fix": chk["fix"],
                 "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
             })
-    # normalize roughly to 0-5 scale
     max_possible = float(len(CHECKS))
     scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
     for k in list(scores.keys()):
         if k != "overall":
             scores[k] = round(scores[k], 2)
-    return scores, hits
 # =========================
@@ -263,7 +259,6 @@ def read_docx_bytes(b: bytes) -> str:
         return ""
 def read_text_bytes(b: bytes) -> str:
-    # best-effort decoding
     for enc in ("utf-8", "utf-16", "latin-1"):
         try:
             return b.decode(enc)
@@ -271,15 +266,11 @@ def read_text_bytes(b: bytes) -> str:
             continue
     return ""
 def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
-    """
-    Returns {"file": <name>, "text": <extracted_text>}
-    """
     name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
     data = file_obj.get("data")
     if data is None:
-        # gradio sometimes provides a path instead
         path = file_obj.get("path")
         if path and os.path.exists(path):
             with open(path, "rb") as fh:
@@ -288,22 +279,19 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
         return {"file": name, "text": ""}
     low = name.lower()
-    text = ""
     if low.endswith(".pdf"):
         text = read_pdf_bytes(data)
-    elif low.endswith(".docx") or low.endswith(".doc"):
         text = read_docx_bytes(data)
     elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
         text = read_text_bytes(data)
     else:
-        # try plain text as fallback
         text = read_text_bytes(data)
     return {"file": os.path.basename(name), "text": text or ""}
 # =========================
-# Detailed Q&A Composer
 # =========================
 def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
@@ -322,7 +310,7 @@ def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]])
         "Azure Migrate",
         "Cloud Adoption Framework (CAF)",
         "Azure Well-Architected Framework (WAF)",
-        "VMware HCX Docs"
     ])
     pillar_lines = []
@@ -357,22 +345,27 @@ def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]])
     return md
 def answer_faq_or_approach_detailed(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,
     _matrix_unused: Any,
-    corpus: List[Dict[str, str]]
 ) -> str:
     q = (question or "").strip()
     if not q:
         return "Please enter a question."
-    # 1) Seeded FAQs → detailed plan
     for item in FAQ_SEEDS:
-        seed_tokens = set(tokenize(item["q"])[:3])
-        q_tokens = set(tokenize(q))
-        if seed_tokens and seed_tokens.issubset(q_tokens):
             refs = list_refs(item.get("refs", []))
             base = (
                 f"### Answer (detailed)\n"
@@ -395,13 +388,13 @@ def answer_faq_or_approach_detailed(
         snippets = []
         for i, sim in top:
             item = corpus[i]
-            excerpt = item["text"].strip()
             if len(excerpt) > 700:
                 excerpt = excerpt[:700] + "..."
             snippets.append({
                 "file": item["file"],
                 "relevance": float(sim),
-                "excerpt": excerpt
             })
         if snippets:
             return _compose_detailed_from_snippets(q, snippets)
@@ -412,7 +405,7 @@ def answer_faq_or_approach_detailed(
         "Azure Migrate",
         "Cloud Adoption Framework (CAF)",
         "Azure Well-Architected Framework (WAF)",
-        "VMware HCX Docs"
     ])
     generic = (
         "### Answer (detailed)\n"
@@ -436,9 +429,7 @@ def answer_faq_or_approach_detailed(
 # =========================
 def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
-    """
-    Returns: (index_obj, matrix_placeholder, corpus, status_message)
-    """
     if not files:
         return None, None, [], "No files uploaded yet."
@@ -487,12 +478,16 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
             build_btn = gr.Button("Build Index", variant="primary")
         with gr.Column(scale=3):
-            question = gr.Textbox(label="Ask a question", placeholder="e.g., How do I minimize downtime for our VMware migration?")
             use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
             ask_btn = gr.Button("Ask", variant="primary")
             answer_box = gr.Markdown("")
-    # Convert gr.Files (paths) into the dict format our parser expects
     def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
         out = []
         for p in paths or []:
@@ -512,16 +507,15 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
     build_btn.click(
         _build,
         inputs=[file_in],
-        outputs=[index_status, st_index, st_matrix, st_corpus]
     )
     ask_btn.click(
         answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
-        outputs=[answer_box]
     )
 if __name__ == "__main__":
-    # On Spaces, share=True is ignored safely; locally it will open a public link if allowed.
     IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)

 VMware On-Prem → Azure Local Migration Assistant (Gradio)
 - Works on Hugging Face Spaces (no external API calls, no sklearn).
 - Upload design/migration docs (PDF/DOCX/TXT/MD).
+- Ask questions; get reliable, detailed answers with excerpts + trusted refs.
 Run locally:
   pip install gradio PyPDF2 python-docx
 import io
 import re
 import math
 from typing import List, Tuple, Dict, Any
 from collections import Counter, defaultdict
     ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
     ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
     ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
+    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
 ]
 FAQ_SEEDS: List[Dict[str, Any]] = [
     {
+        "q": "migrate vmware workloads minimal downtime",
         "a": (
             "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
             "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
             "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
         ),
+        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
     },
     {
+        "q": "recommended migration sequence",
         "a": (
             "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
             "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
             "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
         ),
+        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
     },
     {
+        "q": "dr and backups planning",
         "a": (
             "Define RTO/RPO per app. Use immutable backups and soft-delete. "
             "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
         ),
+        "refs": ["Azure Well-Architected Framework (WAF)"],
     },
 ]
 # Utilities
 # =========================
+_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
 def tokenize(text: str) -> List[str]:
     if not text:
             v[term] = (cnt / total) * idf
         return v
+    def query(self, text: str, k: int = 6) -> List[Tuple[int, float]]:
         if not self.docs:
             return []
         qv = self._vec(tokenize(text))
         q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
         sims: List[Tuple[int, float]] = []
         for i, toks in enumerate(self.docs):
+            dv = Counter(toks)
             num = 0.0
             for term in qv:
                 if term in dv:
                     w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
                     num += qv[term] * w_d
             denom = (self.doc_norms[i] or 1e-9) * q_norm
 # =========================
+# Scoring rubric to tailor the detailed output
 # =========================
 CHECKS = [
 def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
     toks = set(tokenize(text))
     scores = defaultdict(float)
+    gaps = []
     for chk in CHECKS:
         matched = any(kw in toks for kw in chk["keywords"])
         if matched:
             scores["overall"] += 1.0
             scores[chk["pillar"]] += 1.0
         else:
+            gaps.append({
                 "id": chk["id"],
                 "desc": chk["desc"],
                 "fix": chk["fix"],
                 "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
             })
     max_possible = float(len(CHECKS))
     scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
     for k in list(scores.keys()):
         if k != "overall":
             scores[k] = round(scores[k], 2)
+    return scores, gaps
 # =========================
         return ""
 def read_text_bytes(b: bytes) -> str:
     for enc in ("utf-8", "utf-16", "latin-1"):
         try:
             return b.decode(enc)
             continue
     return ""
 def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
+    """Returns {"file": <name>, "text": <extracted_text>}"""
     name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
     data = file_obj.get("data")
     if data is None:
         path = file_obj.get("path")
         if path and os.path.exists(path):
             with open(path, "rb") as fh:
         return {"file": name, "text": ""}
     low = name.lower()
     if low.endswith(".pdf"):
         text = read_pdf_bytes(data)
+    elif low.endswith((".docx", ".doc")):
         text = read_docx_bytes(data)
     elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
         text = read_text_bytes(data)
     else:
         text = read_text_bytes(data)
     return {"file": os.path.basename(name), "text": text or ""}
 # =========================
+# Detailed Answer Composer
 # =========================
 def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
         "Azure Migrate",
         "Cloud Adoption Framework (CAF)",
         "Azure Well-Architected Framework (WAF)",
+        "VMware HCX Docs",
     ])
     pillar_lines = []
     return md
+# =========================
+# Main Answer Function
+# =========================
 def answer_faq_or_approach_detailed(
     question: str,
     use_uploaded_docs: bool,
     index_obj: Any,
     _matrix_unused: Any,
+    corpus: List[Dict[str, str]],
 ) -> str:
     q = (question or "").strip()
     if not q:
         return "Please enter a question."
+    # 1) Seeded FAQs → detailed plan (looser match to trigger more often)
+    q_tokens = set(tokenize(q))
     for item in FAQ_SEEDS:
+        seed_tokens = set(tokenize(item["q"]))
+        overlap = len(seed_tokens & q_tokens)
+        if overlap >= max(1, len(seed_tokens) // 2):  # at least half the seed tokens, rounded down (minimum 1)
             refs = list_refs(item.get("refs", []))
             base = (
                 f"### Answer (detailed)\n"
         snippets = []
         for i, sim in top:
             item = corpus[i]
+            excerpt = (item["text"] or "").strip()
             if len(excerpt) > 700:
                 excerpt = excerpt[:700] + "..."
             snippets.append({
                 "file": item["file"],
                 "relevance": float(sim),
+                "excerpt": excerpt,
             })
         if snippets:
             return _compose_detailed_from_snippets(q, snippets)
         "Azure Migrate",
         "Cloud Adoption Framework (CAF)",
         "Azure Well-Architected Framework (WAF)",
+        "VMware HCX Docs",
     ])
     generic = (
         "### Answer (detailed)\n"
 # =========================
 def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
+    """Returns: (index_obj, matrix_placeholder, corpus, status_message)"""
     if not files:
         return None, None, [], "No files uploaded yet."
             build_btn = gr.Button("Build Index", variant="primary")
         with gr.Column(scale=3):
+            question = gr.Textbox(
+                label="Ask a question",
+                placeholder="e.g., How do I minimize downtime for our VMware migration?",
+                lines=3
+            )
             use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
             ask_btn = gr.Button("Ask", variant="primary")
             answer_box = gr.Markdown("")
+    # Convert gr.Files (paths) into dicts our parser expects
     def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
         out = []
         for p in paths or []:
     build_btn.click(
         _build,
         inputs=[file_in],
+        outputs=[index_status, st_index, st_matrix, st_corpus],
     )
     ask_btn.click(
         answer_faq_or_approach_detailed,
         inputs=[question, use_docs, st_index, st_matrix, st_corpus],
+        outputs=[answer_box],
     )
 if __name__ == "__main__":
     IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)