ajayinsac committed on
Commit
32e167d
·
verified ·
1 Parent(s): 0b055a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -49
app.py CHANGED
@@ -2,26 +2,27 @@
2
  """
3
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
4
 
 
 
 
5
  Features
6
  - FAQ / approach Q&A with trusted-source citations (links)
7
  - Upload & index PDF/DOCX/TXT (session-local)
8
- - Lightweight RAG (TF-IDF over chunks)
9
  - Design/Runbook auto-review with rubric (0–5) + gaps + fixes
10
  - All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
11
-
12
- Author: you
13
  """
14
 
15
  import os
16
  import io
17
  import re
18
  import json
 
19
  import time
20
  from typing import List, Tuple, Dict, Any
 
21
 
22
  import gradio as gr
23
- from sklearn.feature_extraction.text import TfidfVectorizer
24
- from sklearn.metrics.pairwise import cosine_similarity
25
 
26
  # -------- Optional, small footprint parsers --------
27
  # PDF
@@ -189,6 +190,83 @@ FAQ_SEEDS = [
189
  },
190
  ]
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  # =========================
193
  # Utilities: text extraction & chunking
194
  # =========================
@@ -264,10 +342,15 @@ def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
264
  # =========================
265
  # RAG Index (session-scoped)
266
  # =========================
 
 
 
 
 
267
  def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
268
  """
269
- Build a TF-IDF vectorizer over all chunks from uploaded documents.
270
- Returns: (vectorizer, matrix, chunks_with_meta)
271
  """
272
  all_chunks = []
273
  meta = []
@@ -284,34 +367,32 @@ def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
284
  if not all_chunks:
285
  return None, None, None
286
 
287
- vectorizer = TfidfVectorizer(stop_words="english", max_features=25000)
288
- X = vectorizer.fit_transform(all_chunks)
289
- return vectorizer, X, [{"text": t, **m} for t, m in zip(all_chunks, meta)]
 
290
 
291
  def retrieve_answer(
292
  query: str,
293
- vectorizer: Any,
294
- matrix: Any,
295
  corpus: List[Dict[str, str]],
296
  k: int = 4
297
  ) -> Tuple[str, List[Dict[str, str]]]:
298
  """
299
  Return synthesized answer + top-k supporting chunks with filenames.
300
  """
301
- if not query or vectorizer is None or matrix is None or not corpus:
302
  return "", []
303
- qv = vectorizer.transform([query])
304
- sims = cosine_similarity(qv, matrix).ravel()
305
- top_idx = sims.argsort()[::-1][:k]
306
  snippets = []
307
- for i in top_idx:
308
  item = corpus[i]
309
  snippets.append({
310
  "file": item["file"],
311
- "relevance": float(sims[i]),
312
  "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
313
  })
314
- # Simple synthesis: bullet list of the top excerpts + a short summary hint.
315
  answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
316
  for s in snippets:
317
  answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
@@ -322,10 +403,6 @@ def retrieve_answer(
322
  # Design / Runbook Auto-Review
323
  # =========================
324
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
325
- """
326
- Returns per-pillar scores (0..5) and a list of gaps with fixes.
327
- Very simple keyword coverage approach + gap heuristics.
328
- """
329
  text_low = text.lower()
330
 
331
  pillar_scores = {}
@@ -341,7 +418,6 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
341
  score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2) # baseline 1.5, up to 5.0
342
  pillar_scores[pillar] = score
343
 
344
- # naive gap examples:
345
  if pillar == "networking":
346
  if "expressroute".lower() not in text_low and "er " not in text_low:
347
  gaps.append({
@@ -461,13 +537,11 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
461
  "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
462
  })
463
 
464
- # Overall score = average of pillars
465
  if pillar_scores:
466
  overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
467
  else:
468
  overall = 0.0
469
 
470
- # Insert an overall summary as the first "gap" entry if overall < 3.5
471
  if overall < 3.5:
472
  gaps.insert(0, {
473
  "id": "SUMMARY",
@@ -479,12 +553,6 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
479
  return {"overall": overall, **pillar_scores}, gaps
480
 
481
  def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
482
- """
483
- Aggregate text from uploaded docs, run heuristic review, and return:
484
- - markdown summary
485
- - json result
486
- - table rows for Gaps (id, severity, description, fix)
487
- """
488
  if not files:
489
  return "Please upload at least one PDF/DOCX/TXT.", {}, []
490
 
@@ -506,14 +574,13 @@ def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], Lis
506
  md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
507
  md += "**Per-Pillar Scores:**\n\n"
508
  for k, v in scores.items():
509
- if k == "overall":
510
  continue
511
  md += f"- **{k.capitalize()}**: {v}\n"
512
  md += "\n**Top Recommendations:**\n"
513
  for g in gaps[:6]:
514
  md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
515
 
516
- # JSON + table
517
  result_json = {
518
  "timestamp": int(time.time()),
519
  "files": file_list,
@@ -537,8 +604,8 @@ def list_refs(ref_names: List[str]) -> str:
537
  def answer_faq_or_approach(
538
  question: str,
539
  use_uploaded_docs: bool,
540
- vectorizer: Any,
541
- matrix: Any,
542
  corpus: List[Dict[str, str]]
543
  ) -> str:
544
  q = (question or "").strip()
@@ -547,14 +614,16 @@ def answer_faq_or_approach(
547
 
548
  # First try seeded FAQs (very light semantic: keyword match)
549
  for item in FAQ_SEEDS:
550
- if all(w.lower() in q.lower() for w in re.findall(r"\w+", item["q"])[:3]):
 
 
 
551
  return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"
552
 
553
  # If requested, try RAG on uploaded docs
554
- if use_uploaded_docs and vectorizer is not None and matrix is not None and corpus:
555
- rag_answer, _snips = retrieve_answer(q, vectorizer, matrix, corpus, k=4)
556
  if rag_answer.strip():
557
- # Always append trusted sources list for user orientation
558
  refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
559
  return f"{rag_answer}\n\n**Trusted sources:** {refs}"
560
 
@@ -589,8 +658,8 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
589
  )
590
 
591
  # Session state for RAG
592
- st_vectorizer = gr.State(None)
593
- st_matrix = gr.State(None)
594
  st_corpus = gr.State(None)
595
 
596
  with gr.Tabs():
@@ -622,7 +691,6 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
622
 
623
  with gr.Tab("Trusted Sources & Ontology"):
624
  gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
625
- # Render links
626
  links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
627
  gr.Markdown(links_md)
628
 
@@ -634,23 +702,23 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
634
 
635
  gr.Markdown(
636
  "### Notes\n"
637
- "- This app does **not** call external APIs. Use the links above for deep-dives into official guidance.\n"
638
  "- Design checks are heuristic; always validate against your Architecture Board and security teams."
639
  )
640
 
641
  # ====== Wiring ======
642
  def on_build_index(files_list):
643
- vec, X, cor = build_index(files_list)
644
- if vec is None:
645
  return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
646
  None, None, None)
647
  msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
648
- return msg, vec, X, cor
649
 
650
  build_btn.click(
651
  on_build_index,
652
  inputs=[files],
653
- outputs=[index_info, st_vectorizer, st_matrix, st_corpus]
654
  )
655
 
656
  def on_review(files_list):
@@ -665,7 +733,7 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
665
 
666
  ask_btn.click(
667
  answer_faq_or_approach,
668
- inputs=[question, use_docs, st_vectorizer, st_matrix, st_corpus],
669
  outputs=[answer_box]
670
  )
671
 
 
2
  """
3
  VMware On-Prem → Azure Local Migration Assistant (Gradio)
4
 
5
+ Update: Removed scikit-learn dependency. Includes a minimal pure-Python TF-IDF
6
+ and cosine similarity so it runs on Hugging Face Spaces without sklearn.
7
+
8
  Features
9
  - FAQ / approach Q&A with trusted-source citations (links)
10
  - Upload & index PDF/DOCX/TXT (session-local)
11
+ - Lightweight RAG (pure-Python TF-IDF over chunks)
12
  - Design/Runbook auto-review with rubric (0–5) + gaps + fixes
13
  - All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
 
 
14
  """
15
 
16
  import os
17
  import io
18
  import re
19
  import json
20
+ import math
21
  import time
22
  from typing import List, Tuple, Dict, Any
23
+ from collections import Counter, defaultdict
24
 
25
  import gradio as gr
 
 
26
 
27
  # -------- Optional, small footprint parsers --------
28
  # PDF
 
190
  },
191
  ]
192
 
193
+ # =========================
194
+ # Minimal Pure-Python TF-IDF
195
+ # =========================
196
# Minimal English stop-word list kept deliberately small; tuned for the
# infra/migration vocabulary this app indexes, not general-purpose NLP.
STOPWORDS = set("""
a an the and or but if then else for from to in on at by of with without into within over under not be is are was were will can should would could may might
this that these those there here when where how what why who whom which as it its itself themselves ourselves yourself yourselves
""".split())

# Word characters only; underscores kept so identifiers like "azure_local" survive.
TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")

def tokenize(text: str) -> List[str]:
    """Split *text* into lowercased word tokens with stop-words removed.

    Args:
        text: Arbitrary input string (may be empty).

    Returns:
        List of lowercase tokens, in order of appearance, excluding STOPWORDS.
    """
    # Lowercase each token exactly once; the regex never produces an empty
    # match, so no truthiness check on the token is needed.
    return [t for t in map(str.lower, TOKEN_RE.findall(text)) if t not in STOPWORDS]
205
+
206
class TinyTfidfIndex:
    """Tiny in-memory TF-IDF index with cosine-similarity search.

    Pure-Python replacement for sklearn's TfidfVectorizer + cosine_similarity,
    sized for a session-local corpus of document chunks.
    """

    def __init__(self):
        self.docs: List[List[str]] = []                 # tokenized documents
        self.doc_vectors: List[Dict[str, float]] = []   # sparse tf-idf vectors
        self.doc_norms: List[float] = []                # L2 norm per doc vector
        self.idf: Dict[str, float] = {}                 # smoothed inverse doc freq
        self.N = 0                                      # number of documents
        self.corpus_meta: List[Dict[str, str]] = []     # per-doc metadata

    def _vectorize(self, tokens: List[str]) -> Dict[str, float]:
        """Build a sparse tf-idf vector for one token list (unknown terms get 0)."""
        counts = Counter(tokens)
        length = max(1, len(tokens))
        return {term: (n / length) * self.idf.get(term, 0.0) for term, n in counts.items()}

    @staticmethod
    def _norm(vec: Dict[str, float]) -> float:
        """L2 norm of a sparse vector; floored at 1e-12 so division is safe."""
        return math.sqrt(sum(v * v for v in vec.values())) or 1e-12

    def fit(self, texts: List[str], meta: List[Dict[str, str]]):
        """Index *texts*; *meta* is stored alongside, one entry per text."""
        self.docs = [tokenize(t) for t in texts]
        self.N = len(self.docs)
        self.corpus_meta = meta

        # Document frequency: how many docs contain each distinct term.
        doc_freq = Counter()
        for tokens in self.docs:
            doc_freq.update(set(tokens))

        # Smoothed idf: add-1 on both counts avoids division by zero,
        # plus a constant offset so every seen term keeps positive weight.
        self.idf = {
            term: 1.0 + math.log((self.N + 1) / (df + 1))
            for term, df in doc_freq.items()
        }

        self.doc_vectors = []
        self.doc_norms = []
        for tokens in self.docs:
            vec = self._vectorize(tokens)
            self.doc_vectors.append(vec)
            self.doc_norms.append(self._norm(vec))

    def query(self, text: str, k: int = 4) -> List[Tuple[int, float]]:
        """Return the top-*k* (doc_index, cosine_similarity) pairs for *text*."""
        query_tokens = tokenize(text)
        if not query_tokens or self.N == 0:
            return []

        qvec = self._vectorize(query_tokens)
        qnorm = self._norm(qvec)

        ranked: List[Tuple[int, float]] = []
        for idx, dvec in enumerate(self.doc_vectors):
            # Walk the smaller sparse vector for the dot product.
            small, big = (qvec, dvec) if len(qvec) < len(dvec) else (dvec, qvec)
            dot = sum(weight * big[term] for term, weight in small.items() if term in big)
            ranked.append((idx, dot / (qnorm * self.doc_norms[idx])))

        # Stable descending sort preserves document order among ties.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:k]
269
+
270
  # =========================
271
  # Utilities: text extraction & chunking
272
  # =========================
 
342
  # =========================
343
  # RAG Index (session-scoped)
344
  # =========================
345
class RagState:
    """Holder for session-scoped retrieval state.

    NOTE(review): the visible Gradio wiring stores the index/corpus in
    separate gr.State slots instead of using this class — confirm whether
    it is still needed.
    """
    def __init__(self):
        self.index: Any = None   # TinyTfidfIndex built by build_index(), or None
        self.corpus: Any = None  # list of {"text": ..., **meta} chunk dicts, or None
+
350
  def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
351
  """
352
+ Build a tiny TF-IDF index over all chunks from uploaded documents.
353
+ Returns: (index_obj, None, chunks_with_meta) to keep signature compatible.
354
  """
355
  all_chunks = []
356
  meta = []
 
367
  if not all_chunks:
368
  return None, None, None
369
 
370
+ idx = TinyTfidfIndex()
371
+ idx.fit(all_chunks, meta)
372
+ corpus = [{"text": t, **m} for t, m in zip(all_chunks, meta)]
373
+ return idx, None, corpus
374
 
375
  def retrieve_answer(
376
  query: str,
377
+ index_obj: Any,
378
+ _matrix_unused: Any,
379
  corpus: List[Dict[str, str]],
380
  k: int = 4
381
  ) -> Tuple[str, List[Dict[str, str]]]:
382
  """
383
  Return synthesized answer + top-k supporting chunks with filenames.
384
  """
385
+ if not query or index_obj is None or not corpus:
386
  return "", []
387
+ top = index_obj.query(query, k=k)
 
 
388
  snippets = []
389
+ for i, sim in top:
390
  item = corpus[i]
391
  snippets.append({
392
  "file": item["file"],
393
+ "relevance": float(sim),
394
  "excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
395
  })
 
396
  answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
397
  for s in snippets:
398
  answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
 
403
  # Design / Runbook Auto-Review
404
  # =========================
405
  def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
 
 
 
 
406
  text_low = text.lower()
407
 
408
  pillar_scores = {}
 
418
  score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2) # baseline 1.5, up to 5.0
419
  pillar_scores[pillar] = score
420
 
 
421
  if pillar == "networking":
422
  if "expressroute".lower() not in text_low and "er " not in text_low:
423
  gaps.append({
 
537
  "fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
538
  })
539
 
 
540
  if pillar_scores:
541
  overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
542
  else:
543
  overall = 0.0
544
 
 
545
  if overall < 3.5:
546
  gaps.insert(0, {
547
  "id": "SUMMARY",
 
553
  return {"overall": overall, **pillar_scores}, gaps
554
 
555
  def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
 
 
 
 
 
 
556
  if not files:
557
  return "Please upload at least one PDF/DOCX/TXT.", {}, []
558
 
 
574
  md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
575
  md += "**Per-Pillar Scores:**\n\n"
576
  for k, v in scores.items():
577
+ if k == "overall":
578
  continue
579
  md += f"- **{k.capitalize()}**: {v}\n"
580
  md += "\n**Top Recommendations:**\n"
581
  for g in gaps[:6]:
582
  md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
583
 
 
584
  result_json = {
585
  "timestamp": int(time.time()),
586
  "files": file_list,
 
604
  def answer_faq_or_approach(
605
  question: str,
606
  use_uploaded_docs: bool,
607
+ index_obj: Any,
608
+ _matrix_unused: Any,
609
  corpus: List[Dict[str, str]]
610
  ) -> str:
611
  q = (question or "").strip()
 
614
 
615
  # First try seeded FAQs (very light semantic: keyword match)
616
  for item in FAQ_SEEDS:
617
+ # simple heuristic: overlap of first few tokens
618
+ seed_tokens = set(tokenize(item["q"])[:3])
619
+ q_tokens = set(tokenize(q))
620
+ if seed_tokens and seed_tokens.issubset(q_tokens):
621
  return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"
622
 
623
  # If requested, try RAG on uploaded docs
624
+ if use_uploaded_docs and index_obj is not None and corpus:
625
+ rag_answer, _snips = retrieve_answer(q, index_obj, None, corpus, k=4)
626
  if rag_answer.strip():
 
627
  refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
628
  return f"{rag_answer}\n\n**Trusted sources:** {refs}"
629
 
 
658
  )
659
 
660
  # Session state for RAG
661
+ st_index = gr.State(None) # TinyTfidfIndex
662
+ st_matrix = gr.State(None) # kept for signature compatibility
663
  st_corpus = gr.State(None)
664
 
665
  with gr.Tabs():
 
691
 
692
  with gr.Tab("Trusted Sources & Ontology"):
693
  gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
 
694
  links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
695
  gr.Markdown(links_md)
696
 
 
702
 
703
  gr.Markdown(
704
  "### Notes\n"
705
+ "- This app does **not** call external APIs. Use the links above for official guidance.\n"
706
  "- Design checks are heuristic; always validate against your Architecture Board and security teams."
707
  )
708
 
709
  # ====== Wiring ======
710
def on_build_index(files_list):
    """Gradio callback: index uploaded files for RAG.

    Returns a 4-tuple matching the click() outputs: status text (or a
    gr.update), the TinyTfidfIndex (or None), a None matrix placeholder
    kept for signature compatibility, and the chunk corpus (or None).
    """
    index, _placeholder, chunks = build_index(files_list)
    if index is None:
        failure = gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT.")
        return failure, None, None, None
    status = (
        f"Indexed {len(chunks)} chunks from {len(files_list)} file(s). "
        "You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
    )
    return status, index, None, chunks
717
 
718
  build_btn.click(
719
  on_build_index,
720
  inputs=[files],
721
+ outputs=[index_info, st_index, st_matrix, st_corpus]
722
  )
723
 
724
  def on_review(files_list):
 
733
 
734
  ask_btn.click(
735
  answer_faq_or_approach,
736
+ inputs=[question, use_docs, st_index, st_matrix, st_corpus],
737
  outputs=[answer_box]
738
  )
739