ajayinsac committed on
Commit
32e167d
·
verified ·
1 Parent(s): 0b055a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -49
app.py CHANGED
@@ -2,26 +2,27 @@
2
  """
3
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
4
 
 
 
 
5
  Features
6
  - FAQ / approach Q&A with trusted-source citations (links)
7
  - Upload & index PDF/DOCX/TXT (session-local)
8
- - Lightweight RAG (TF-IDF over chunks)
9
  - Design/Runbook auto-review with rubric (0–5) + gaps + fixes
10
  - All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
11
-
12
- Author: you
13
  """
14
 
15
  import os
16
  import io
17
  import re
18
  import json
 
19
  import time
20
  from typing import List, Tuple, Dict, Any
 
21
 
22
  import gradio as gr
23
- from sklearn.feature_extraction.text import TfidfVectorizer
24
- from sklearn.metrics.pairwise import cosine_similarity
25
 
26
  # -------- Optional, small footprint parsers --------
27
  # PDF
@@ -189,6 +190,83 @@ FAQ_SEEDS = [
189
  },
190
  ]
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  # =========================
193
  # Utilities: text extraction & chunking
194
  # =========================
@@ -264,10 +342,15 @@ def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
264
  # =========================
265
  # RAG Index (session-scoped)
266
  # =========================
 
 
 
 
 
267
  def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
268
  """
269
- Build a TF-IDF vectorizer over all chunks from uploaded documents.
270
- Returns: (vectorizer, matrix, chunks_with_meta)
271
  """
272
  all_chunks = []
273
  meta = []
@@ -284,34 +367,32 @@ def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
284
  if not all_chunks:
285
  return None, None, None
286
 
287
- vectorizer = TfidfVectorizer(stop_words="english", max_features=25000)
288
- X = vectorizer.fit_transform(all_chunks)
289
- return vectorizer, X, [{"text": t, **m} for t, m in zip(all_chunks, meta)]
 
290
 
291
  def retrieve_answer(
292
  query: str,
293
- vectorizer: Any,
294
- matrix: Any,
295
  corpus: List[Dict[str, str]],
296
  k: int = 4
297
  ) -> Tuple[str, List[Dict[str, str]]]:
298
  """
299
  Return synthesized answer + top-k supporting chunks with filenames.
300
  """
301
- if not query or vectorizer is None or matrix is None or not corpus:
302
  return "", []
303
- qv = vectorizer.transform([query])
304
- sims = cosine_similarity(qv, matrix).ravel()
305
- top_idx = sims.argsort()[::-1][:k]
306
  snippets = []
307
- for i in top_idx:
308
  item = corpus[i]
309
  snippets.append({
310
  "file": item["file"],
311
- "relevance": float(sims[i]),
312
  "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
313
  })
314
- # Simple synthesis: bullet list of the top excerpts + a short summary hint.
315
  answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
316
  for s in snippets:
317
  answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
@@ -322,10 +403,6 @@ def retrieve_answer(
322
  # Design / Runbook Auto-Review
323
  # =========================
324
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
325
- """
326
- Returns per-pillar scores (0..5) and a list of gaps with fixes.
327
- Very simple keyword coverage approach + gap heuristics.
328
- """
329
  text_low = text.lower()
330
 
331
  pillar_scores = {}
@@ -341,7 +418,6 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
341
  score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2) # baseline 1.5, up to 5.0
342
  pillar_scores[pillar] = score
343
 
344
- # naive gap examples:
345
  if pillar == "networking":
346
  if "expressroute".lower() not in text_low and "er " not in text_low:
347
  gaps.append({
@@ -461,13 +537,11 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
461
  "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
462
  })
463
 
464
- # Overall score = average of pillars
465
  if pillar_scores:
466
  overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
467
  else:
468
  overall = 0.0
469
 
470
- # Insert an overall summary as the first "gap" entry if overall < 3.5
471
  if overall < 3.5:
472
  gaps.insert(0, {
473
  "id": "SUMMARY",
@@ -479,12 +553,6 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
479
  return {"overall": overall, **pillar_scores}, gaps
480
 
481
  def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
482
- """
483
- Aggregate text from uploaded docs, run heuristic review, and return:
484
- - markdown summary
485
- - json result
486
- - table rows for Gaps (id, severity, description, fix)
487
- """
488
  if not files:
489
  return "Please upload at least one PDF/DOCX/TXT.", {}, []
490
 
@@ -506,14 +574,13 @@ def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], Lis
506
  md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
507
  md += "**Per-Pillar Scores:**\n\n"
508
  for k, v in scores.items():
509
- if k == "overall":
510
  continue
511
  md += f"- **{k.capitalize()}**: {v}\n"
512
  md += "\n**Top Recommendations:**\n"
513
  for g in gaps[:6]:
514
  md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
515
 
516
- # JSON + table
517
  result_json = {
518
  "timestamp": int(time.time()),
519
  "files": file_list,
@@ -537,8 +604,8 @@ def list_refs(ref_names: List[str]) -> str:
537
  def answer_faq_or_approach(
538
  question: str,
539
  use_uploaded_docs: bool,
540
- vectorizer: Any,
541
- matrix: Any,
542
  corpus: List[Dict[str, str]]
543
  ) -> str:
544
  q = (question or "").strip()
@@ -547,14 +614,16 @@ def answer_faq_or_approach(
547
 
548
  # First try seeded FAQs (very light semantic: keyword match)
549
  for item in FAQ_SEEDS:
550
- if all(w.lower() in q.lower() for w in re.findall(r"\w+", item["q"])[:3]):
 
 
 
551
  return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"
552
 
553
  # If requested, try RAG on uploaded docs
554
- if use_uploaded_docs and vectorizer is not None and matrix is not None and corpus:
555
- rag_answer, _snips = retrieve_answer(q, vectorizer, matrix, corpus, k=4)
556
  if rag_answer.strip():
557
- # Always append trusted sources list for user orientation
558
  refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
559
  return f"{rag_answer}\n\n**Trusted sources:** {refs}"
560
 
@@ -589,8 +658,8 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
589
  )
590
 
591
  # Session state for RAG
592
- st_vectorizer = gr.State(None)
593
- st_matrix = gr.State(None)
594
  st_corpus = gr.State(None)
595
 
596
  with gr.Tabs():
@@ -622,7 +691,6 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
622
 
623
  with gr.Tab("Trusted Sources & Ontology"):
624
  gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
625
- # Render links
626
  links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
627
  gr.Markdown(links_md)
628
 
@@ -634,23 +702,23 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
634
 
635
  gr.Markdown(
636
  "### Notes\n"
637
- "- This app does **not** call external APIs. Use the links above for deep-dives into official guidance.\n"
638
  "- Design checks are heuristic; always validate against your Architecture Board and security teams."
639
  )
640
 
641
  # ====== Wiring ======
642
  def on_build_index(files_list):
643
- vec, X, cor = build_index(files_list)
644
- if vec is None:
645
  return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
646
  None, None, None)
647
  msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
648
- return msg, vec, X, cor
649
 
650
  build_btn.click(
651
  on_build_index,
652
  inputs=[files],
653
- outputs=[index_info, st_vectorizer, st_matrix, st_corpus]
654
  )
655
 
656
  def on_review(files_list):
@@ -665,7 +733,7 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
665
 
666
  ask_btn.click(
667
  answer_faq_or_approach,
668
- inputs=[question, use_docs, st_vectorizer, st_matrix, st_corpus],
669
  outputs=[answer_box]
670
  )
671
 
 
2
  """
3
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
4
 
5
+ Update: Removed scikit-learn dependency. Includes a minimal pure-Python TF-IDF
6
+ and cosine similarity so it runs on Hugging Face Spaces without sklearn.
7
+
8
  Features
9
  - FAQ / approach Q&A with trusted-source citations (links)
10
  - Upload & index PDF/DOCX/TXT (session-local)
11
+ - Lightweight RAG (pure-Python TF-IDF over chunks)
12
  - Design/Runbook auto-review with rubric (0–5) + gaps + fixes
13
  - All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
 
 
14
  """
15
 
16
  import os
17
  import io
18
  import re
19
  import json
20
+ import math
21
  import time
22
  from typing import List, Tuple, Dict, Any
23
+ from collections import Counter, defaultdict
24
 
25
  import gradio as gr
 
 
26
 
27
  # -------- Optional, small footprint parsers --------
28
  # PDF
 
190
  },
191
  ]
192
 
193
+ # =========================
194
+ # Minimal Pure-Python TF-IDF
195
+ # =========================
196
# Minimal English stop-word list kept deliberately small; tuned for the
# infra/migration vocabulary this app indexes, not general-purpose NLP.
STOPWORDS = set("""
a an the and or but if then else for from to in on at by of with without into within over under not be is are was were will can should would could may might
this that these those there here when where how what why who whom which as it its itself themselves ourselves yourself yourselves
""".split())

# Word characters only; underscores kept so identifiers like "azure_local" survive.
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")

def tokenize(text: str) -> List[str]:
    """Split *text* into lowercased word tokens with stop-words removed.

    Args:
        text: Arbitrary input string (may be empty).

    Returns:
        List of lowercase tokens, in order of appearance, excluding STOPWORDS.
    """
    # Lowercase each token exactly once; the regex never produces an empty
    # match, so no truthiness check on the token is needed.
    return [t for t in map(str.lower, TOKEN_RE.findall(text)) if t not in STOPWORDS]
205
+
206
class TinyTfidfIndex:
    """Tiny in-memory TF-IDF index with cosine-similarity search.

    Pure-Python replacement for sklearn's TfidfVectorizer + cosine_similarity,
    sized for a session-local corpus of document chunks.
    """

    def __init__(self):
        self.docs: List[List[str]] = []                 # tokenized documents
        self.doc_vectors: List[Dict[str, float]] = []   # sparse tf-idf vectors
        self.doc_norms: List[float] = []                # L2 norm per doc vector
        self.idf: Dict[str, float] = {}                 # smoothed inverse doc freq
        self.N = 0                                      # number of documents
        self.corpus_meta: List[Dict[str, str]] = []     # per-doc metadata

    def _vectorize(self, tokens: List[str]) -> Dict[str, float]:
        """Build a sparse tf-idf vector for one token list (unknown terms get 0)."""
        counts = Counter(tokens)
        length = max(1, len(tokens))
        return {term: (n / length) * self.idf.get(term, 0.0) for term, n in counts.items()}

    @staticmethod
    def _norm(vec: Dict[str, float]) -> float:
        """L2 norm of a sparse vector; floored at 1e-12 so division is safe."""
        return math.sqrt(sum(v * v for v in vec.values())) or 1e-12

    def fit(self, texts: List[str], meta: List[Dict[str, str]]):
        """Index *texts*; *meta* is stored alongside, one entry per text."""
        self.docs = [tokenize(t) for t in texts]
        self.N = len(self.docs)
        self.corpus_meta = meta

        # Document frequency: how many docs contain each distinct term.
        doc_freq = Counter()
        for tokens in self.docs:
            doc_freq.update(set(tokens))

        # Smoothed idf: add-1 on both counts avoids division by zero,
        # plus a constant offset so every seen term keeps positive weight.
        self.idf = {
            term: 1.0 + math.log((self.N + 1) / (df + 1))
            for term, df in doc_freq.items()
        }

        self.doc_vectors = []
        self.doc_norms = []
        for tokens in self.docs:
            vec = self._vectorize(tokens)
            self.doc_vectors.append(vec)
            self.doc_norms.append(self._norm(vec))

    def query(self, text: str, k: int = 4) -> List[Tuple[int, float]]:
        """Return the top-*k* (doc_index, cosine_similarity) pairs for *text*."""
        query_tokens = tokenize(text)
        if not query_tokens or self.N == 0:
            return []

        qvec = self._vectorize(query_tokens)
        qnorm = self._norm(qvec)

        ranked: List[Tuple[int, float]] = []
        for idx, dvec in enumerate(self.doc_vectors):
            # Walk the smaller sparse vector for the dot product.
            small, big = (qvec, dvec) if len(qvec) < len(dvec) else (dvec, qvec)
            dot = sum(weight * big[term] for term, weight in small.items() if term in big)
            ranked.append((idx, dot / (qnorm * self.doc_norms[idx])))

        # Stable descending sort preserves document order among ties.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:k]
269
+
270
  # =========================
271
  # Utilities: text extraction & chunking
272
  # =========================
 
342
  # =========================
343
  # RAG Index (session-scoped)
344
  # =========================
345
class RagState:
    """Holder for session-scoped retrieval state.

    NOTE(review): the visible Gradio wiring stores the index/corpus in
    separate gr.State slots instead of using this class — confirm whether
    it is still needed.
    """
    def __init__(self):
        self.index: Any = None   # TinyTfidfIndex built by build_index(), or None
        self.corpus: Any = None  # list of {"text": ..., **meta} chunk dicts, or None
+
350
  def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
351
  """
352
+ Build a tiny TF-IDF index over all chunks from uploaded documents.
353
+ Returns: (index_obj, None, chunks_with_meta) to keep signature compatible.
354
  """
355
  all_chunks = []
356
  meta = []
 
367
  if not all_chunks:
368
  return None, None, None
369
 
370
+ idx = TinyTfidfIndex()
371
+ idx.fit(all_chunks, meta)
372
+ corpus = [{"text": t, **m} for t, m in zip(all_chunks, meta)]
373
+ return idx, None, corpus
374
 
375
  def retrieve_answer(
376
  query: str,
377
+ index_obj: Any,
378
+ _matrix_unused: Any,
379
  corpus: List[Dict[str, str]],
380
  k: int = 4
381
  ) -> Tuple[str, List[Dict[str, str]]]:
382
  """
383
  Return synthesized answer + top-k supporting chunks with filenames.
384
  """
385
+ if not query or index_obj is None or not corpus:
386
  return "", []
387
+ top = index_obj.query(query, k=k)
 
 
388
  snippets = []
389
+ for i, sim in top:
390
  item = corpus[i]
391
  snippets.append({
392
  "file": item["file"],
393
+ "relevance": float(sim),
394
  "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
395
  })
 
396
  answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
397
  for s in snippets:
398
  answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
 
403
  # Design / Runbook Auto-Review
404
  # =========================
405
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
 
 
 
 
406
  text_low = text.lower()
407
 
408
  pillar_scores = {}
 
418
  score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2) # baseline 1.5, up to 5.0
419
  pillar_scores[pillar] = score
420
 
 
421
  if pillar == "networking":
422
  if "expressroute".lower() not in text_low and "er " not in text_low:
423
  gaps.append({
 
537
  "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
538
  })
539
 
 
540
  if pillar_scores:
541
  overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
542
  else:
543
  overall = 0.0
544
 
 
545
  if overall < 3.5:
546
  gaps.insert(0, {
547
  "id": "SUMMARY",
 
553
  return {"overall": overall, **pillar_scores}, gaps
554
 
555
  def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
 
 
 
 
 
 
556
  if not files:
557
  return "Please upload at least one PDF/DOCX/TXT.", {}, []
558
 
 
574
  md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
575
  md += "**Per-Pillar Scores:**\n\n"
576
  for k, v in scores.items():
577
+ if k == "overall":
578
  continue
579
  md += f"- **{k.capitalize()}**: {v}\n"
580
  md += "\n**Top Recommendations:**\n"
581
  for g in gaps[:6]:
582
  md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
583
 
 
584
  result_json = {
585
  "timestamp": int(time.time()),
586
  "files": file_list,
 
604
  def answer_faq_or_approach(
605
  question: str,
606
  use_uploaded_docs: bool,
607
+ index_obj: Any,
608
+ _matrix_unused: Any,
609
  corpus: List[Dict[str, str]]
610
  ) -> str:
611
  q = (question or "").strip()
 
614
 
615
  # First try seeded FAQs (very light semantic: keyword match)
616
  for item in FAQ_SEEDS:
617
+ # simple heuristic: overlap of first few tokens
618
+ seed_tokens = set(tokenize(item["q"])[:3])
619
+ q_tokens = set(tokenize(q))
620
+ if seed_tokens and seed_tokens.issubset(q_tokens):
621
  return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"
622
 
623
  # If requested, try RAG on uploaded docs
624
+ if use_uploaded_docs and index_obj is not None and corpus:
625
+ rag_answer, _snips = retrieve_answer(q, index_obj, None, corpus, k=4)
626
  if rag_answer.strip():
 
627
  refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
628
  return f"{rag_answer}\n\n**Trusted sources:** {refs}"
629
 
 
658
  )
659
 
660
  # Session state for RAG
661
+ st_index = gr.State(None) # TinyTfidfIndex
662
+ st_matrix = gr.State(None) # kept for signature compatibility
663
  st_corpus = gr.State(None)
664
 
665
  with gr.Tabs():
 
691
 
692
  with gr.Tab("Trusted Sources & Ontology"):
693
  gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
 
694
  links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
695
  gr.Markdown(links_md)
696
 
 
702
 
703
  gr.Markdown(
704
  "### Notes\n"
705
+ "- This app does **not** call external APIs. Use the links above for official guidance.\n"
706
  "- Design checks are heuristic; always validate against your Architecture Board and security teams."
707
  )
708
 
709
  # ====== Wiring ======
710
def on_build_index(files_list):
    """Gradio callback: index uploaded files for RAG.

    Returns a 4-tuple matching the click() outputs: status text (or a
    gr.update), the TinyTfidfIndex (or None), a None matrix placeholder
    kept for signature compatibility, and the chunk corpus (or None).
    """
    index, _placeholder, chunks = build_index(files_list)
    if index is None:
        failure = gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT.")
        return failure, None, None, None
    status = (
        f"Indexed {len(chunks)} chunks from {len(files_list)} file(s). "
        "You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
    )
    return status, index, None, chunks
717
 
718
  build_btn.click(
719
  on_build_index,
720
  inputs=[files],
721
+ outputs=[index_info, st_index, st_matrix, st_corpus]
722
  )
723
 
724
  def on_review(files_list):
 
733
 
734
  ask_btn.click(
735
  answer_faq_or_approach,
736
+ inputs=[question, use_docs, st_index, st_matrix, st_corpus],
737
  outputs=[answer_box]
738
  )
739