Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,26 +2,27 @@
|
|
| 2 |
"""
|
| 3 |
VMware On-Prem → Azure Local Migration Assistant (Gradio)
|
| 4 |
|
|
|
|
|
|
|
|
|
|
| 5 |
Features
|
| 6 |
- FAQ / approach Q&A with trusted-source citations (links)
|
| 7 |
- Upload & index PDF/DOCX/TXT (session-local)
|
| 8 |
-
- Lightweight RAG (TF-IDF over chunks)
|
| 9 |
- Design/Runbook auto-review with rubric (0–5) + gaps + fixes
|
| 10 |
- All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
|
| 11 |
-
|
| 12 |
-
Author: you
|
| 13 |
"""
|
| 14 |
|
| 15 |
import os
|
| 16 |
import io
|
| 17 |
import re
|
| 18 |
import json
|
|
|
|
| 19 |
import time
|
| 20 |
from typing import List, Tuple, Dict, Any
|
|
|
|
| 21 |
|
| 22 |
import gradio as gr
|
| 23 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 24 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 25 |
|
| 26 |
# -------- Optional, small footprint parsers --------
|
| 27 |
# PDF
|
|
@@ -189,6 +190,83 @@ FAQ_SEEDS = [
|
|
| 189 |
},
|
| 190 |
]
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
# =========================
|
| 193 |
# Utilities: text extraction & chunking
|
| 194 |
# =========================
|
|
@@ -264,10 +342,15 @@ def chunk_text(text: str, max_len: int = 900, overlap: int = 120) -> List[str]:
|
|
| 264 |
# =========================
|
| 265 |
# RAG Index (session-scoped)
|
| 266 |
# =========================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
|
| 268 |
"""
|
| 269 |
-
Build a TF-IDF
|
| 270 |
-
Returns: (
|
| 271 |
"""
|
| 272 |
all_chunks = []
|
| 273 |
meta = []
|
|
@@ -284,34 +367,32 @@ def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
|
|
| 284 |
if not all_chunks:
|
| 285 |
return None, None, None
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
|
|
|
| 290 |
|
| 291 |
def retrieve_answer(
|
| 292 |
query: str,
|
| 293 |
-
|
| 294 |
-
|
| 295 |
corpus: List[Dict[str, str]],
|
| 296 |
k: int = 4
|
| 297 |
) -> Tuple[str, List[Dict[str, str]]]:
|
| 298 |
"""
|
| 299 |
Return synthesized answer + top-k supporting chunks with filenames.
|
| 300 |
"""
|
| 301 |
-
if not query or
|
| 302 |
return "", []
|
| 303 |
-
|
| 304 |
-
sims = cosine_similarity(qv, matrix).ravel()
|
| 305 |
-
top_idx = sims.argsort()[::-1][:k]
|
| 306 |
snippets = []
|
| 307 |
-
for i in
|
| 308 |
item = corpus[i]
|
| 309 |
snippets.append({
|
| 310 |
"file": item["file"],
|
| 311 |
-
"relevance": float(
|
| 312 |
"excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
|
| 313 |
})
|
| 314 |
-
# Simple synthesis: bullet list of the top excerpts + a short summary hint.
|
| 315 |
answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
|
| 316 |
for s in snippets:
|
| 317 |
answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
|
|
@@ -322,10 +403,6 @@ def retrieve_answer(
|
|
| 322 |
# Design / Runbook Auto-Review
|
| 323 |
# =========================
|
| 324 |
def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
|
| 325 |
-
"""
|
| 326 |
-
Returns per-pillar scores (0..5) and a list of gaps with fixes.
|
| 327 |
-
Very simple keyword coverage approach + gap heuristics.
|
| 328 |
-
"""
|
| 329 |
text_low = text.lower()
|
| 330 |
|
| 331 |
pillar_scores = {}
|
|
@@ -341,7 +418,6 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
|
|
| 341 |
score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2) # baseline 1.5, up to 5.0
|
| 342 |
pillar_scores[pillar] = score
|
| 343 |
|
| 344 |
-
# naive gap examples:
|
| 345 |
if pillar == "networking":
|
| 346 |
if "expressroute".lower() not in text_low and "er " not in text_low:
|
| 347 |
gaps.append({
|
|
@@ -461,13 +537,11 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
|
|
| 461 |
"fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
|
| 462 |
})
|
| 463 |
|
| 464 |
-
# Overall score = average of pillars
|
| 465 |
if pillar_scores:
|
| 466 |
overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
|
| 467 |
else:
|
| 468 |
overall = 0.0
|
| 469 |
|
| 470 |
-
# Insert an overall summary as the first "gap" entry if overall < 3.5
|
| 471 |
if overall < 3.5:
|
| 472 |
gaps.insert(0, {
|
| 473 |
"id": "SUMMARY",
|
|
@@ -479,12 +553,6 @@ def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[st
|
|
| 479 |
return {"overall": overall, **pillar_scores}, gaps
|
| 480 |
|
| 481 |
def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
|
| 482 |
-
"""
|
| 483 |
-
Aggregate text from uploaded docs, run heuristic review, and return:
|
| 484 |
-
- markdown summary
|
| 485 |
-
- json result
|
| 486 |
-
- table rows for Gaps (id, severity, description, fix)
|
| 487 |
-
"""
|
| 488 |
if not files:
|
| 489 |
return "Please upload at least one PDF/DOCX/TXT.", {}, []
|
| 490 |
|
|
@@ -506,14 +574,13 @@ def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], Lis
|
|
| 506 |
md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
|
| 507 |
md += "**Per-Pillar Scores:**\n\n"
|
| 508 |
for k, v in scores.items():
|
| 509 |
-
if k == "overall":
|
| 510 |
continue
|
| 511 |
md += f"- **{k.capitalize()}**: {v}\n"
|
| 512 |
md += "\n**Top Recommendations:**\n"
|
| 513 |
for g in gaps[:6]:
|
| 514 |
md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
|
| 515 |
|
| 516 |
-
# JSON + table
|
| 517 |
result_json = {
|
| 518 |
"timestamp": int(time.time()),
|
| 519 |
"files": file_list,
|
|
@@ -537,8 +604,8 @@ def list_refs(ref_names: List[str]) -> str:
|
|
| 537 |
def answer_faq_or_approach(
|
| 538 |
question: str,
|
| 539 |
use_uploaded_docs: bool,
|
| 540 |
-
|
| 541 |
-
|
| 542 |
corpus: List[Dict[str, str]]
|
| 543 |
) -> str:
|
| 544 |
q = (question or "").strip()
|
|
@@ -547,14 +614,16 @@ def answer_faq_or_approach(
|
|
| 547 |
|
| 548 |
# First try seeded FAQs (very light semantic: keyword match)
|
| 549 |
for item in FAQ_SEEDS:
|
| 550 |
-
|
|
|
|
|
|
|
|
|
|
| 551 |
return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"
|
| 552 |
|
| 553 |
# If requested, try RAG on uploaded docs
|
| 554 |
-
if use_uploaded_docs and
|
| 555 |
-
rag_answer, _snips = retrieve_answer(q,
|
| 556 |
if rag_answer.strip():
|
| 557 |
-
# Always append trusted sources list for user orientation
|
| 558 |
refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
|
| 559 |
return f"{rag_answer}\n\n**Trusted sources:** {refs}"
|
| 560 |
|
|
@@ -589,8 +658,8 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
|
|
| 589 |
)
|
| 590 |
|
| 591 |
# Session state for RAG
|
| 592 |
-
|
| 593 |
-
st_matrix = gr.State(None)
|
| 594 |
st_corpus = gr.State(None)
|
| 595 |
|
| 596 |
with gr.Tabs():
|
|
@@ -622,7 +691,6 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
|
|
| 622 |
|
| 623 |
with gr.Tab("Trusted Sources & Ontology"):
|
| 624 |
gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
|
| 625 |
-
# Render links
|
| 626 |
links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
|
| 627 |
gr.Markdown(links_md)
|
| 628 |
|
|
@@ -634,23 +702,23 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
|
|
| 634 |
|
| 635 |
gr.Markdown(
|
| 636 |
"### Notes\n"
|
| 637 |
-
"- This app does **not** call external APIs. Use the links above for
|
| 638 |
"- Design checks are heuristic; always validate against your Architecture Board and security teams."
|
| 639 |
)
|
| 640 |
|
| 641 |
# ====== Wiring ======
|
| 642 |
def on_build_index(files_list):
|
| 643 |
-
|
| 644 |
-
if
|
| 645 |
return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
|
| 646 |
None, None, None)
|
| 647 |
msg = f"Indexed {len(cor)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
|
| 648 |
-
return msg,
|
| 649 |
|
| 650 |
build_btn.click(
|
| 651 |
on_build_index,
|
| 652 |
inputs=[files],
|
| 653 |
-
outputs=[index_info,
|
| 654 |
)
|
| 655 |
|
| 656 |
def on_review(files_list):
|
|
@@ -665,7 +733,7 @@ with gr.Blocks(title="VMware → Azure Local Migration Assistant") as demo:
|
|
| 665 |
|
| 666 |
ask_btn.click(
|
| 667 |
answer_faq_or_approach,
|
| 668 |
-
inputs=[question, use_docs,
|
| 669 |
outputs=[answer_box]
|
| 670 |
)
|
| 671 |
|
|
|
|
| 2 |
"""
|
| 3 |
VMware On-Prem → Azure Local Migration Assistant (Gradio)
|
| 4 |
|
| 5 |
+
Update: Removed scikit-learn dependency. Includes a minimal pure-Python TF-IDF
|
| 6 |
+
and cosine similarity so it runs on Hugging Face Spaces without sklearn.
|
| 7 |
+
|
| 8 |
Features
|
| 9 |
- FAQ / approach Q&A with trusted-source citations (links)
|
| 10 |
- Upload & index PDF/DOCX/TXT (session-local)
|
| 11 |
+
- Lightweight RAG (pure-Python TF-IDF over chunks)
|
| 12 |
- Design/Runbook auto-review with rubric (0–5) + gaps + fixes
|
| 13 |
- All Hugging Face Spaces friendly (no share=True, no GPU deps, no external APIs)
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
import os
|
| 17 |
import io
|
| 18 |
import re
|
| 19 |
import json
|
| 20 |
+
import math
|
| 21 |
import time
|
| 22 |
from typing import List, Tuple, Dict, Any
|
| 23 |
+
from collections import Counter, defaultdict
|
| 24 |
|
| 25 |
import gradio as gr
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# -------- Optional, small footprint parsers --------
|
| 28 |
# PDF
|
|
|
|
| 190 |
},
|
| 191 |
]
|
| 192 |
|
| 193 |
+
# =========================
# Minimal Pure-Python TF-IDF
# =========================
STOPWORDS = set("""
a an the and or but if then else for from to in on at by of with without into within over under not be is are was were will can should would could may might
this that these those there here when where how what why who whom which as it its itself themselves ourselves yourself yourselves
""".split())

TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")

def tokenize(text: str) -> List[str]:
    """Lowercased word/number tokens of *text* with stopwords removed."""
    lowered = (w.lower() for w in TOKEN_RE.findall(text))
    return [w for w in lowered if w not in STOPWORDS]

class TinyTfidfIndex:
    """
    Dependency-free TF-IDF index with cosine-similarity search.

    Documents are stored as sparse term->weight dicts; weights are
    (term frequency / doc length) * smoothed idf.  Replaces the former
    scikit-learn TfidfVectorizer/cosine_similarity pair.
    """

    def __init__(self):
        self.docs: List[List[str]] = []
        self.doc_vectors: List[Dict[str, float]] = []
        self.doc_norms: List[float] = []
        self.idf: Dict[str, float] = {}
        self.N = 0
        self.corpus_meta: List[Dict[str, str]] = []

    def _weighted(self, tokens: List[str]) -> Tuple[Dict[str, float], float]:
        """Sparse tf-idf vector for *tokens* plus its Euclidean norm (never 0)."""
        counts = Counter(tokens)
        vec = {}
        for term, cnt in counts.items():
            # length-normalized tf times corpus idf; unseen terms weigh 0
            vec[term] = (cnt / max(1, len(tokens))) * self.idf.get(term, 0.0)
        norm = math.sqrt(sum(w * w for w in vec.values())) or 1e-12
        return vec, norm

    def fit(self, texts: List[str], meta: List[Dict[str, str]]):
        """Tokenize *texts*, compute idf, and precompute per-doc vectors/norms."""
        self.docs = [tokenize(t) for t in texts]
        self.N = len(self.docs)
        self.corpus_meta = meta

        # Document frequency: in how many docs does each term occur?
        df = Counter()
        for doc in self.docs:
            df.update(set(doc))

        # Smoothed idf (+1 in numerator/denominator, +1 offset) so no term
        # divides by zero and every seen term keeps a positive weight.
        self.idf = {term: 1.0 + math.log((self.N + 1) / (dfi + 1))
                    for term, dfi in df.items()}

        self.doc_vectors = []
        self.doc_norms = []
        for doc in self.docs:
            vec, norm = self._weighted(doc)
            self.doc_vectors.append(vec)
            self.doc_norms.append(norm)

    def query(self, text: str, k: int = 4) -> List[Tuple[int, float]]:
        """Top-*k* (doc_index, cosine_similarity) pairs for *text*, best first."""
        qtokens = tokenize(text)
        if not qtokens or self.N == 0:
            return []
        qvec, qnorm = self._weighted(qtokens)

        scores = []
        for i, dvec in enumerate(self.doc_vectors):
            # Iterate the smaller sparse vector for the dot product.
            small, large = (qvec, dvec) if len(qvec) < len(dvec) else (dvec, qvec)
            dot = 0.0
            for term, w in small.items():
                if term in large:
                    dot += w * large[term]
            scores.append((i, dot / (qnorm * self.doc_norms[i])))
        scores.sort(key=lambda pair: pair[1], reverse=True)
        return scores[:k]
|
| 269 |
+
|
| 270 |
# =========================
|
| 271 |
# Utilities: text extraction & chunking
|
| 272 |
# =========================
|
|
|
|
| 342 |
# =========================
|
| 343 |
# RAG Index (session-scoped)
|
| 344 |
# =========================
|
| 345 |
+
class RagState:
    """Container for session-scoped retrieval state."""

    def __init__(self):
        # TinyTfidfIndex built from the user's uploads (None until indexed).
        self.index = None
        # List of chunk dicts with text/meta backing the index (None until indexed).
        self.corpus = None
|
| 349 |
+
|
| 350 |
def build_index(files: List[gr.File]) -> Tuple[Any, Any, Any]:
|
| 351 |
"""
|
| 352 |
+
Build a tiny TF-IDF index over all chunks from uploaded documents.
|
| 353 |
+
Returns: (index_obj, None, chunks_with_meta) to keep signature compatible.
|
| 354 |
"""
|
| 355 |
all_chunks = []
|
| 356 |
meta = []
|
|
|
|
| 367 |
if not all_chunks:
|
| 368 |
return None, None, None
|
| 369 |
|
| 370 |
+
idx = TinyTfidfIndex()
|
| 371 |
+
idx.fit(all_chunks, meta)
|
| 372 |
+
corpus = [{"text": t, **m} for t, m in zip(all_chunks, meta)]
|
| 373 |
+
return idx, None, corpus
|
| 374 |
|
| 375 |
def retrieve_answer(
|
| 376 |
query: str,
|
| 377 |
+
index_obj: Any,
|
| 378 |
+
_matrix_unused: Any,
|
| 379 |
corpus: List[Dict[str, str]],
|
| 380 |
k: int = 4
|
| 381 |
) -> Tuple[str, List[Dict[str, str]]]:
|
| 382 |
"""
|
| 383 |
Return synthesized answer + top-k supporting chunks with filenames.
|
| 384 |
"""
|
| 385 |
+
if not query or index_obj is None or not corpus:
|
| 386 |
return "", []
|
| 387 |
+
top = index_obj.query(query, k=k)
|
|
|
|
|
|
|
| 388 |
snippets = []
|
| 389 |
+
for i, sim in top:
|
| 390 |
item = corpus[i]
|
| 391 |
snippets.append({
|
| 392 |
"file": item["file"],
|
| 393 |
+
"relevance": float(sim),
|
| 394 |
"excerpt": item["text"][:500] + ("..." if len(item["text"]) > 500 else "")
|
| 395 |
})
|
|
|
|
| 396 |
answer = "Here are the most relevant excerpts from your uploaded documents:\n\n"
|
| 397 |
for s in snippets:
|
| 398 |
answer += f"- **{s['file']}** (relevance {s['relevance']:.2f}): {s['excerpt']}\n\n"
|
|
|
|
| 403 |
# Design / Runbook Auto-Review
|
| 404 |
# =========================
|
| 405 |
def score_text_against_checks(text: str) -> Tuple[Dict[str, float], List[Dict[str, str]]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
text_low = text.lower()
|
| 407 |
|
| 408 |
pillar_scores = {}
|
|
|
|
| 418 |
score = round(min(5.0, 5.0 * (0.3 + 0.7 * coverage)), 2) # baseline 1.5, up to 5.0
|
| 419 |
pillar_scores[pillar] = score
|
| 420 |
|
|
|
|
| 421 |
if pillar == "networking":
|
| 422 |
if "expressroute".lower() not in text_low and "er " not in text_low:
|
| 423 |
gaps.append({
|
|
|
|
| 537 |
"fix": "Enforce tags via Policy; enable showback/chargeback and budgets."
|
| 538 |
})
|
| 539 |
|
|
|
|
| 540 |
if pillar_scores:
|
| 541 |
overall = round(sum(pillar_scores.values()) / len(pillar_scores), 2)
|
| 542 |
else:
|
| 543 |
overall = 0.0
|
| 544 |
|
|
|
|
| 545 |
if overall < 3.5:
|
| 546 |
gaps.insert(0, {
|
| 547 |
"id": "SUMMARY",
|
|
|
|
| 553 |
return {"overall": overall, **pillar_scores}, gaps
|
| 554 |
|
| 555 |
def review_uploaded_docs(files: List[gr.File]) -> Tuple[str, Dict[str, Any], List[List[str]]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
if not files:
|
| 557 |
return "Please upload at least one PDF/DOCX/TXT.", {}, []
|
| 558 |
|
|
|
|
| 574 |
md += f"**Overall Score:** {scores['overall']} / 5.0\n\n"
|
| 575 |
md += "**Per-Pillar Scores:**\n\n"
|
| 576 |
for k, v in scores.items():
|
| 577 |
+
if k == "overall":
|
| 578 |
continue
|
| 579 |
md += f"- **{k.capitalize()}**: {v}\n"
|
| 580 |
md += "\n**Top Recommendations:**\n"
|
| 581 |
for g in gaps[:6]:
|
| 582 |
md += f"- ({g['severity']}) **{g['id']}** — {g['desc']} → _{g['fix']}_\n"
|
| 583 |
|
|
|
|
| 584 |
result_json = {
|
| 585 |
"timestamp": int(time.time()),
|
| 586 |
"files": file_list,
|
|
|
|
| 604 |
def answer_faq_or_approach(
|
| 605 |
question: str,
|
| 606 |
use_uploaded_docs: bool,
|
| 607 |
+
index_obj: Any,
|
| 608 |
+
_matrix_unused: Any,
|
| 609 |
corpus: List[Dict[str, str]]
|
| 610 |
) -> str:
|
| 611 |
q = (question or "").strip()
|
|
|
|
| 614 |
|
| 615 |
# First try seeded FAQs (very light semantic: keyword match)
|
| 616 |
for item in FAQ_SEEDS:
|
| 617 |
+
# simple heuristic: overlap of first few tokens
|
| 618 |
+
seed_tokens = set(tokenize(item["q"])[:3])
|
| 619 |
+
q_tokens = set(tokenize(q))
|
| 620 |
+
if seed_tokens and seed_tokens.issubset(q_tokens):
|
| 621 |
return f"{item['a']}\n\n**Trusted sources:** {list_refs(item['refs'])}"
|
| 622 |
|
| 623 |
# If requested, try RAG on uploaded docs
|
| 624 |
+
if use_uploaded_docs and index_obj is not None and corpus:
|
| 625 |
+
rag_answer, _snips = retrieve_answer(q, index_obj, None, corpus, k=4)
|
| 626 |
if rag_answer.strip():
|
|
|
|
| 627 |
refs = list_refs(["Azure VMware Solution (AVS)", "Azure Migrate", "Cloud Adoption Framework (CAF)"])
|
| 628 |
return f"{rag_answer}\n\n**Trusted sources:** {refs}"
|
| 629 |
|
|
|
|
| 658 |
)
|
| 659 |
|
| 660 |
# Session state for RAG
|
| 661 |
+
st_index = gr.State(None) # TinyTfidfIndex
|
| 662 |
+
st_matrix = gr.State(None) # kept for signature compatibility
|
| 663 |
st_corpus = gr.State(None)
|
| 664 |
|
| 665 |
with gr.Tabs():
|
|
|
|
| 691 |
|
| 692 |
with gr.Tab("Trusted Sources & Ontology"):
|
| 693 |
gr.Markdown("### Trusted / Authoritative Sources (Allow-list)")
|
|
|
|
| 694 |
links_md = "\n".join([f"- [{nm}]({url})" for nm, url in TRUSTED_SOURCES])
|
| 695 |
gr.Markdown(links_md)
|
| 696 |
|
|
|
|
| 702 |
|
| 703 |
gr.Markdown(
|
| 704 |
"### Notes\n"
|
| 705 |
+
"- This app does **not** call external APIs. Use the links above for official guidance.\n"
|
| 706 |
"- Design checks are heuristic; always validate against your Architecture Board and security teams."
|
| 707 |
)
|
| 708 |
|
| 709 |
# ====== Wiring ======
|
| 710 |
def on_build_index(files_list):
    """Build the session TF-IDF index from uploads and update the UI states."""
    index, _unused_matrix, chunks = build_index(files_list)
    if index is None:
        # Nothing usable was extracted; clear all session state.
        return (gr.update(value="No text could be extracted. Make sure files are PDF/DOCX/TXT."),
                None, None, None)
    msg = f"Indexed {len(chunks)} chunks from {len(files_list)} file(s). You can now toggle 'Also search my uploaded documents' in the Ask Anything tab."
    return msg, index, None, chunks
|
| 717 |
|
| 718 |
build_btn.click(
|
| 719 |
on_build_index,
|
| 720 |
inputs=[files],
|
| 721 |
+
outputs=[index_info, st_index, st_matrix, st_corpus]
|
| 722 |
)
|
| 723 |
|
| 724 |
def on_review(files_list):
|
|
|
|
| 733 |
|
| 734 |
ask_btn.click(
|
| 735 |
answer_faq_or_approach,
|
| 736 |
+
inputs=[question, use_docs, st_index, st_matrix, st_corpus],
|
| 737 |
outputs=[answer_box]
|
| 738 |
)
|
| 739 |
|