ajayinsac committed on
Commit
c76f040
·
verified ·
1 Parent(s): f2aed85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -47
app.py CHANGED
@@ -5,7 +5,7 @@
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
  - Works on Hugging Face Spaces (no external API calls, no sklearn).
7
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
8
- - Ask questions; get DETAILED, structured answers with excerpts + trusted refs.
9
 
10
  Run locally:
11
  pip install gradio PyPDF2 python-docx
@@ -16,7 +16,6 @@ import os
16
  import io
17
  import re
18
  import math
19
- import time
20
  from typing import List, Tuple, Dict, Any
21
  from collections import Counter, defaultdict
22
 
@@ -43,35 +42,35 @@ TRUSTED_SOURCES: List[Tuple[str, str]] = [
43
  ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
44
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
45
  ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
46
- ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html")
47
  ]
48
 
49
  FAQ_SEEDS: List[Dict[str, Any]] = [
50
  {
51
- "q": "How do we migrate VMware workloads to Azure with minimal downtime?",
52
  "a": (
53
  "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
54
  "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
55
  "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
56
  ),
57
- "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"]
58
  },
59
  {
60
- "q": "What is a recommended migration sequence?",
61
  "a": (
62
  "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
63
  "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
64
  "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
65
  ),
66
- "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
67
  },
68
  {
69
- "q": "How do we plan DR and backups?",
70
  "a": (
71
  "Define RTO/RPO per app. Use immutable backups and soft-delete. "
72
  "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
73
  ),
74
- "refs": ["Azure Well-Architected Framework (WAF)"]
75
  },
76
  ]
77
 
@@ -80,7 +79,7 @@ FAQ_SEEDS: List[Dict[str, Any]] = [
80
  # Utilities
81
  # =========================
82
 
83
- _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+") # keep URLs/paths/ids mostly intact
84
 
85
  def tokenize(text: str) -> List[str]:
86
  if not text:
@@ -138,19 +137,17 @@ class TinyTfidfIndex:
138
  v[term] = (cnt / total) * idf
139
  return v
140
 
141
- def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
142
  if not self.docs:
143
  return []
144
  qv = self._vec(tokenize(text))
145
- # cosine similarity
146
  q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
147
  sims: List[Tuple[int, float]] = []
148
  for i, toks in enumerate(self.docs):
149
- dv = Counter(toks) # use tf counter to loop terms
150
  num = 0.0
151
  for term in qv:
152
  if term in dv:
153
- # weight for doc term
154
  w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
155
  num += qv[term] * w_d
156
  denom = (self.doc_norms[i] or 1e-9) * q_norm
@@ -160,7 +157,7 @@ class TinyTfidfIndex:
160
 
161
 
162
  # =========================
163
- # Simple scoring rubric to tailor the detailed output
164
  # =========================
165
 
166
  CHECKS = [
@@ -211,26 +208,25 @@ CHECKS = [
211
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
212
  toks = set(tokenize(text))
213
  scores = defaultdict(float)
214
- hits = []
215
  for chk in CHECKS:
216
  matched = any(kw in toks for kw in chk["keywords"])
217
  if matched:
218
  scores["overall"] += 1.0
219
  scores[chk["pillar"]] += 1.0
220
  else:
221
- hits.append({
222
  "id": chk["id"],
223
  "desc": chk["desc"],
224
  "fix": chk["fix"],
225
  "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
226
  })
227
- # normalize roughly to 0-5 scale
228
  max_possible = float(len(CHECKS))
229
  scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
230
  for k in list(scores.keys()):
231
  if k != "overall":
232
  scores[k] = round(scores[k], 2)
233
- return scores, hits
234
 
235
 
236
  # =========================
@@ -263,7 +259,6 @@ def read_docx_bytes(b: bytes) -> str:
263
  return ""
264
 
265
  def read_text_bytes(b: bytes) -> str:
266
- # best-effort decoding
267
  for enc in ("utf-8", "utf-16", "latin-1"):
268
  try:
269
  return b.decode(enc)
@@ -271,15 +266,11 @@ def read_text_bytes(b: bytes) -> str:
271
  continue
272
  return ""
273
 
274
-
275
  def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
276
- """
277
- Returns {"file": <name>, "text": <extracted_text>}
278
- """
279
  name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
280
  data = file_obj.get("data")
281
  if data is None:
282
- # gradio sometimes provides a path instead
283
  path = file_obj.get("path")
284
  if path and os.path.exists(path):
285
  with open(path, "rb") as fh:
@@ -288,22 +279,19 @@ def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
288
  return {"file": name, "text": ""}
289
 
290
  low = name.lower()
291
- text = ""
292
  if low.endswith(".pdf"):
293
  text = read_pdf_bytes(data)
294
- elif low.endswith(".docx") or low.endswith(".doc"):
295
  text = read_docx_bytes(data)
296
  elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
297
  text = read_text_bytes(data)
298
  else:
299
- # try plain text as fallback
300
  text = read_text_bytes(data)
301
-
302
  return {"file": os.path.basename(name), "text": text or ""}
303
 
304
 
305
  # =========================
306
- # Detailed Q&A Composer
307
  # =========================
308
 
309
  def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
@@ -322,7 +310,7 @@ def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]])
322
  "Azure Migrate",
323
  "Cloud Adoption Framework (CAF)",
324
  "Azure Well-Architected Framework (WAF)",
325
- "VMware HCX Docs"
326
  ])
327
 
328
  pillar_lines = []
@@ -357,22 +345,27 @@ def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]])
357
  return md
358
 
359
 
 
 
 
 
360
  def answer_faq_or_approach_detailed(
361
  question: str,
362
  use_uploaded_docs: bool,
363
  index_obj: Any,
364
  _matrix_unused: Any,
365
- corpus: List[Dict[str, str]]
366
  ) -> str:
367
  q = (question or "").strip()
368
  if not q:
369
  return "Please enter a question."
370
 
371
- # 1) Seeded FAQs → detailed plan
 
372
  for item in FAQ_SEEDS:
373
- seed_tokens = set(tokenize(item["q"])[:3])
374
- q_tokens = set(tokenize(q))
375
- if seed_tokens and seed_tokens.issubset(q_tokens):
376
  refs = list_refs(item.get("refs", []))
377
  base = (
378
  f"### Answer (detailed)\n"
@@ -395,13 +388,13 @@ def answer_faq_or_approach_detailed(
395
  snippets = []
396
  for i, sim in top:
397
  item = corpus[i]
398
- excerpt = item["text"].strip()
399
  if len(excerpt) > 700:
400
  excerpt = excerpt[:700] + "..."
401
  snippets.append({
402
  "file": item["file"],
403
  "relevance": float(sim),
404
- "excerpt": excerpt
405
  })
406
  if snippets:
407
  return _compose_detailed_from_snippets(q, snippets)
@@ -412,7 +405,7 @@ def answer_faq_or_approach_detailed(
412
  "Azure Migrate",
413
  "Cloud Adoption Framework (CAF)",
414
  "Azure Well-Architected Framework (WAF)",
415
- "VMware HCX Docs"
416
  ])
417
  generic = (
418
  "### Answer (detailed)\n"
@@ -436,9 +429,7 @@ def answer_faq_or_approach_detailed(
436
  # =========================
437
 
438
  def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
439
- """
440
- Returns: (index_obj, matrix_placeholder, corpus, status_message)
441
- """
442
  if not files:
443
  return None, None, [], "No files uploaded yet."
444
 
@@ -487,12 +478,16 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
487
 
488
  build_btn = gr.Button("Build Index", variant="primary")
489
  with gr.Column(scale=3):
490
- question = gr.Textbox(label="Ask a question", placeholder="e.g., How do I minimize downtime for our VMware migration?")
 
 
 
 
491
  use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
492
  ask_btn = gr.Button("Ask", variant="primary")
493
  answer_box = gr.Markdown("")
494
 
495
- # Convert gr.Files (paths) into the dict format our parser expects
496
  def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
497
  out = []
498
  for p in paths or []:
@@ -512,16 +507,15 @@ with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) a
512
  build_btn.click(
513
  _build,
514
  inputs=[file_in],
515
- outputs=[index_status, st_index, st_matrix, st_corpus]
516
  )
517
 
518
  ask_btn.click(
519
  answer_faq_or_approach_detailed,
520
  inputs=[question, use_docs, st_index, st_matrix, st_corpus],
521
- outputs=[answer_box]
522
  )
523
 
524
  if __name__ == "__main__":
525
- # On Spaces, share=True is ignored safely; locally it will open a public link if allowed.
526
  IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
527
  demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)
 
5
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
6
  - Works on Hugging Face Spaces (no external API calls, no sklearn).
7
  - Upload design/migration docs (PDF/DOCX/TXT/MD).
8
+ - Ask questions; get reliable, detailed answers with excerpts + trusted refs.
9
 
10
  Run locally:
11
  pip install gradio PyPDF2 python-docx
 
16
  import io
17
  import re
18
  import math
 
19
  from typing import List, Tuple, Dict, Any
20
  from collections import Counter, defaultdict
21
 
 
42
  ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
43
  ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
44
  ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
45
+ ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
46
  ]
47
 
48
  FAQ_SEEDS: List[Dict[str, Any]] = [
49
  {
50
+ "q": "migrate vmware workloads minimal downtime",
51
  "a": (
52
  "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
53
  "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
54
  "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
55
  ),
56
+ "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
57
  },
58
  {
59
+ "q": "recommended migration sequence",
60
  "a": (
61
  "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
62
  "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
63
  "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
64
  ),
65
+ "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
66
  },
67
  {
68
+ "q": "dr and backups planning",
69
  "a": (
70
  "Define RTO/RPO per app. Use immutable backups and soft-delete. "
71
  "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
72
  ),
73
+ "refs": ["Azure Well-Architected Framework (WAF)"],
74
  },
75
  ]
76
 
 
79
  # Utilities
80
  # =========================
81
 
82
+ _WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")
83
 
84
  def tokenize(text: str) -> List[str]:
85
  if not text:
 
137
  v[term] = (cnt / total) * idf
138
  return v
139
 
140
+ def query(self, text: str, k: int = 6) -> List[Tuple[int, float]]:
141
  if not self.docs:
142
  return []
143
  qv = self._vec(tokenize(text))
 
144
  q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
145
  sims: List[Tuple[int, float]] = []
146
  for i, toks in enumerate(self.docs):
147
+ dv = Counter(toks)
148
  num = 0.0
149
  for term in qv:
150
  if term in dv:
 
151
  w_d = (dv[term] / max(1, len(toks))) * self.idf.get(term, 0.0)
152
  num += qv[term] * w_d
153
  denom = (self.doc_norms[i] or 1e-9) * q_norm
 
157
 
158
 
159
  # =========================
160
+ # Scoring rubric to tailor the detailed output
161
  # =========================
162
 
163
  CHECKS = [
 
208
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
209
  toks = set(tokenize(text))
210
  scores = defaultdict(float)
211
+ gaps = []
212
  for chk in CHECKS:
213
  matched = any(kw in toks for kw in chk["keywords"])
214
  if matched:
215
  scores["overall"] += 1.0
216
  scores[chk["pillar"]] += 1.0
217
  else:
218
+ gaps.append({
219
  "id": chk["id"],
220
  "desc": chk["desc"],
221
  "fix": chk["fix"],
222
  "severity": "high" if chk["pillar"] in ("security", "reliability") else "medium",
223
  })
 
224
  max_possible = float(len(CHECKS))
225
  scores["overall"] = round(5.0 * (scores["overall"] / max_possible), 2)
226
  for k in list(scores.keys()):
227
  if k != "overall":
228
  scores[k] = round(scores[k], 2)
229
+ return scores, gaps
230
 
231
 
232
  # =========================
 
259
  return ""
260
 
261
  def read_text_bytes(b: bytes) -> str:
 
262
  for enc in ("utf-8", "utf-16", "latin-1"):
263
  try:
264
  return b.decode(enc)
 
266
  continue
267
  return ""
268
 
 
269
  def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
270
+ """Returns {"file": <name>, "text": <extracted_text>}"""
 
 
271
  name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
272
  data = file_obj.get("data")
273
  if data is None:
 
274
  path = file_obj.get("path")
275
  if path and os.path.exists(path):
276
  with open(path, "rb") as fh:
 
279
  return {"file": name, "text": ""}
280
 
281
  low = name.lower()
 
282
  if low.endswith(".pdf"):
283
  text = read_pdf_bytes(data)
284
+ elif low.endswith((".docx", ".doc")):
285
  text = read_docx_bytes(data)
286
  elif low.endswith((".md", ".txt", ".log", ".cfg", ".ini")):
287
  text = read_text_bytes(data)
288
  else:
 
289
  text = read_text_bytes(data)
 
290
  return {"file": os.path.basename(name), "text": text or ""}
291
 
292
 
293
  # =========================
294
+ # Detailed Answer Composer
295
  # =========================
296
 
297
  def _compose_detailed_from_snippets(query: str, snippets: List[Dict[str, str]]) -> str:
 
310
  "Azure Migrate",
311
  "Cloud Adoption Framework (CAF)",
312
  "Azure Well-Architected Framework (WAF)",
313
+ "VMware HCX Docs",
314
  ])
315
 
316
  pillar_lines = []
 
345
  return md
346
 
347
 
348
+ # =========================
349
+ # Main Answer Function
350
+ # =========================
351
+
352
  def answer_faq_or_approach_detailed(
353
  question: str,
354
  use_uploaded_docs: bool,
355
  index_obj: Any,
356
  _matrix_unused: Any,
357
+ corpus: List[Dict[str, str]],
358
  ) -> str:
359
  q = (question or "").strip()
360
  if not q:
361
  return "Please enter a question."
362
 
363
+ # 1) Seeded FAQs → detailed plan (looser match to trigger more often)
364
+ q_tokens = set(tokenize(q))
365
  for item in FAQ_SEEDS:
366
+ seed_tokens = set(tokenize(item["q"]))
367
+ overlap = len(seed_tokens & q_tokens)
368
+ if overlap >= max(1, len(seed_tokens) // 2): # threshold = half the seed-token count, rounded down (minimum 1 shared token)
369
  refs = list_refs(item.get("refs", []))
370
  base = (
371
  f"### Answer (detailed)\n"
 
388
  snippets = []
389
  for i, sim in top:
390
  item = corpus[i]
391
+ excerpt = (item["text"] or "").strip()
392
  if len(excerpt) > 700:
393
  excerpt = excerpt[:700] + "..."
394
  snippets.append({
395
  "file": item["file"],
396
  "relevance": float(sim),
397
+ "excerpt": excerpt,
398
  })
399
  if snippets:
400
  return _compose_detailed_from_snippets(q, snippets)
 
405
  "Azure Migrate",
406
  "Cloud Adoption Framework (CAF)",
407
  "Azure Well-Architected Framework (WAF)",
408
+ "VMware HCX Docs",
409
  ])
410
  generic = (
411
  "### Answer (detailed)\n"
 
429
  # =========================
430
 
431
  def build_index(files: List[Dict[str, Any]]) -> Tuple[Any, Any, List[Dict[str, str]], str]:
432
+ """Returns: (index_obj, matrix_placeholder, corpus, status_message)"""
 
 
433
  if not files:
434
  return None, None, [], "No files uploaded yet."
435
 
 
478
 
479
  build_btn = gr.Button("Build Index", variant="primary")
480
  with gr.Column(scale=3):
481
+ question = gr.Textbox(
482
+ label="Ask a question",
483
+ placeholder="e.g., How do I minimize downtime for our VMware migration?",
484
+ lines=3
485
+ )
486
  use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
487
  ask_btn = gr.Button("Ask", variant="primary")
488
  answer_box = gr.Markdown("")
489
 
490
+ # Convert gr.Files (paths) into dicts our parser expects
491
  def _collect_files(paths: List[str]) -> List[Dict[str, Any]]:
492
  out = []
493
  for p in paths or []:
 
507
  build_btn.click(
508
  _build,
509
  inputs=[file_in],
510
+ outputs=[index_status, st_index, st_matrix, st_corpus],
511
  )
512
 
513
  ask_btn.click(
514
  answer_faq_or_approach_detailed,
515
  inputs=[question, use_docs, st_index, st_matrix, st_corpus],
516
+ outputs=[answer_box],
517
  )
518
 
519
  if __name__ == "__main__":
 
520
  IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
521
  demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), share=not IN_SPACES)