SarahXia0405 commited on
Commit
e60e9dd
·
verified ·
1 Parent(s): 0a5c6d4

Update api/rag_engine.py

Browse files
Files changed (1) hide show
  1. api/rag_engine.py +203 -1
api/rag_engine.py CHANGED
@@ -188,6 +188,38 @@ def _parse_pptx_to_text(path: str) -> List[Tuple[str, str]]:
188
  out.append((f"slide{idx}", _clean_text("\n".join(lines))))
189
  return out
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  # ----------------------------
193
  # Public API
@@ -211,9 +243,11 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
211
  sections = _parse_docx_to_text(path)
212
  elif ext == ".pptx":
213
  sections = _parse_pptx_to_text(path)
214
- elif ext in [".txt", ".md"]:
215
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
216
  sections = [("text", _clean_text(f.read()))]
 
 
217
  else:
218
  print(f"[rag_engine] unsupported file type: {ext}")
219
  return []
@@ -361,3 +395,171 @@ def retrieve_relevant_chunks(
361
 
362
  context = "\n\n---\n\n".join(truncated_texts)
363
  return context, used
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  out.append((f"slide{idx}", _clean_text("\n".join(lines))))
189
  return out
190
 
191
+ import json
192
+
193
def _parse_ipynb_to_text(path: str) -> List[Tuple[str, str]]:
    """Extract the readable content of a Jupyter notebook as one text section.

    Markdown cells are kept as-is, code cells are wrapped in a fenced
    ``python`` block (the code itself matters for lab material), and any
    other cell type is kept as plain text.  Returns ``[("ipynb", text)]``,
    or ``[]`` when the file is unreadable or effectively empty.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            notebook = json.load(fh)
    except Exception:
        # Unreadable / malformed notebook: treat it as "nothing to index".
        return []

    pieces: List[str] = []
    for cell in notebook.get("cells", []) or []:
        raw = cell.get("source", [])
        # nbformat stores source as a list of lines, but tolerate plain strings.
        text = "".join(raw) if isinstance(raw, list) else str(raw or "")
        text = text.strip()
        if not text:
            continue

        # Keep the code itself — it is important for lab notebooks.
        if cell.get("cell_type", "") == "code":
            text = "```python\n" + text + "\n```"
        pieces.append(text)

    merged = _clean_text("\n\n".join(pieces))
    return [("ipynb", merged)] if merged else []
223
 
224
  # ----------------------------
225
  # Public API
 
243
  sections = _parse_docx_to_text(path)
244
  elif ext == ".pptx":
245
  sections = _parse_pptx_to_text(path)
246
+ elif ext in [".txt", ".md", ".py"]:
247
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
248
  sections = [("text", _clean_text(f.read()))]
249
+ elif ext == ".ipynb":
250
+ sections = _parse_ipynb_to_text(path)
251
  else:
252
  print(f"[rag_engine] unsupported file type: {ext}")
253
  return []
 
395
 
396
  context = "\n\n---\n\n".join(truncated_texts)
397
  return context, used
398
+
399
+ # ============================
400
+ # Course-scoped Vector Index (Simple: chunks.json + embeddings.npy)
401
+ # ============================
402
+ import json
403
+ from typing import Any
404
+ import numpy as np
405
+
406
from api.config import client, EMBEDDING_MODEL  # client is defined in api/config.py
407
+
408
+ def _course_root(course_id: str) -> str:
409
+ return os.path.join("data", "courses", course_id)
410
+
411
def _course_raw_dir(course_id: str) -> str:
    """Directory where the course's uploaded source files are stored."""
    root = _course_root(course_id)
    return os.path.join(root, "raw")
413
+
414
def _course_index_dir(course_id: str) -> str:
    """Directory where the course's vector-index artifacts are stored."""
    root = _course_root(course_id)
    return os.path.join(root, "index")
416
+
417
def _course_chunks_path(course_id: str) -> str:
    """Path of the JSON file holding the course's chunk records."""
    index_dir = _course_index_dir(course_id)
    return os.path.join(index_dir, "chunks.json")
419
+
420
def _course_emb_path(course_id: str) -> str:
    """Path of the ``.npy`` file holding the course's embedding matrix."""
    index_dir = _course_index_dir(course_id)
    return os.path.join(index_dir, "embeddings.npy")
422
+
423
def ensure_course_dirs(course_id: str) -> None:
    """Create the on-disk layout (``raw/`` and ``index/``) for a course if missing."""
    for directory in (_course_raw_dir(course_id), _course_index_dir(course_id)):
        os.makedirs(directory, exist_ok=True)
426
+
427
def _embed_texts(texts: List[str]) -> np.ndarray:
    """Embed all texts in one batched API call.

    Returns a float32 matrix of shape ``(len(texts), dim)``.
    """
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)
    vectors = [item.embedding for item in response.data]
    return np.asarray(vectors, dtype=np.float32)
432
+
433
def load_course_index(course_id: str) -> Tuple[List[Dict[str, Any]], Optional[np.ndarray]]:
    """Load ``(chunks, embeddings)`` for a course.

    Returns ``([], None)`` when the index files are missing, unreadable,
    or inconsistent (number of chunk records != number of embedding rows).
    """
    ensure_course_dirs(course_id)
    chunks_file = _course_chunks_path(course_id)
    emb_file = _course_emb_path(course_id)

    if not os.path.exists(chunks_file) or not os.path.exists(emb_file):
        return [], None

    try:
        with open(chunks_file, "r", encoding="utf-8") as fh:
            records = json.load(fh)
        vectors = np.load(emb_file)
        # A row-count mismatch means the index is unusable as a whole.
        if len(records) != vectors.shape[0]:
            return [], None
        return records, vectors
    except Exception:
        # Corrupt files behave exactly like a missing index.
        return [], None
450
+
451
def save_course_index(course_id: str, chunks: List[Dict[str, Any]], embs: np.ndarray) -> None:
    """Persist the chunk records (pretty-printed JSON) and embedding matrix (``.npy``)."""
    ensure_course_dirs(course_id)
    chunks_file = _course_chunks_path(course_id)
    with open(chunks_file, "w", encoding="utf-8") as fh:
        json.dump(chunks, fh, ensure_ascii=False, indent=2)
    np.save(_course_emb_path(course_id), embs)
456
+
457
def add_file_to_course_index(course_id: str, file_path: str, doc_type: str) -> Dict[str, Any]:
    """Parse -> chunk -> embed -> append -> save.

    Parses ``file_path`` into chunks, embeds them, appends them to the
    course's on-disk index, and persists the result.

    Returns a summary dict ``{"added_chunks": int, "total_chunks": int}``
    (``{"added_chunks": 0, "total_chunks": 0}`` when nothing usable was parsed).
    """
    ensure_course_dirs(course_id)

    parsed = build_rag_chunks_from_file(file_path, doc_type) or []
    # BUG FIX: filter the chunk list itself, not just the embedded texts.
    # Previously, texts with empty "text" were skipped for embedding while
    # ALL parsed chunks were appended, desynchronizing chunk records from
    # embedding rows and making load_course_index() reject the whole index.
    new_chunks = [c for c in parsed if c.get("text")]
    texts = [c["text"] for c in new_chunks]
    if not texts:
        return {"added_chunks": 0, "total_chunks": 0}

    new_embs = _embed_texts(texts)

    chunks, embs = load_course_index(course_id)
    if embs is None:
        # No (or corrupt) existing index: start fresh with an empty matrix
        # whose width matches the new embeddings.
        chunks = []
        embs = np.zeros((0, new_embs.shape[1]), dtype=np.float32)

    chunks.extend(new_chunks)
    embs = np.vstack([embs, new_embs])

    save_course_index(course_id, chunks, embs)
    return {"added_chunks": len(new_chunks), "total_chunks": len(chunks)}
480
+
481
+ def _cosine_topk(query_vec: np.ndarray, mat: np.ndarray, k: int) -> List[int]:
482
+ q = query_vec / (np.linalg.norm(query_vec) + 1e-8)
483
+ m = mat / (np.linalg.norm(mat, axis=1, keepdims=True) + 1e-8)
484
+ sims = m @ q
485
+ k = max(1, min(int(k), sims.shape[0]))
486
+ idx = np.argpartition(-sims, kth=k-1)[:k]
487
+ idx = idx[np.argsort(-sims[idx])]
488
+ return idx.tolist()
489
+
490
def retrieve_relevant_chunks_vector(
    query: str,
    course_id: str,
    k: int = RAG_TOPK_LIMIT,
    chunk_token_limit: int = RAG_CHUNK_TOKEN_LIMIT,
    max_context_tokens: int = RAG_CONTEXT_TOKEN_LIMIT,
    model_for_tokenizer: str = "",
    allowed_source_files: Optional[List[str]] = None,
    allowed_doc_types: Optional[List[str]] = None,
) -> Tuple[str, List[Dict]]:
    """
    Vector retrieval scoped to course_id, with the same scoping semantics you already use.

    Pipeline: clean the query -> load the course's (chunks, embeddings)
    index -> restrict candidates by filename / doc_type allow-lists ->
    embed the query and rank candidates by cosine similarity -> assemble a
    token-capped context string.

    Returns ``(context, used_chunks)``: the retained chunk texts joined by
    "\\n\\n---\\n\\n", and the corresponding chunk dicts in rank order.
    Returns ``("", [])`` when the query is empty, the index is missing, or
    filtering leaves no candidates.
    """
    query = _clean_text(query)
    if not query:
        return "", []

    chunks, embs = load_course_index(course_id)
    if not chunks or embs is None or embs.shape[0] == 0:
        return "", []

    # Apply the allow-list scoping BEFORE similarity ranking, so top-k is
    # computed only over chunks the caller is permitted to see.
    keep = list(range(len(chunks)))

    if allowed_source_files:
        # Compare by basename so callers may pass full paths or bare names.
        allow_files = {_basename(str(x)).strip() for x in allowed_source_files if str(x).strip()}
        if allow_files:
            keep = [i for i in keep if _basename(str(chunks[i].get("source_file", ""))).strip() in allow_files]

    if allowed_doc_types:
        allow_dt = {str(x).strip() for x in allowed_doc_types if str(x).strip()}
        if allow_dt:
            keep = [i for i in keep if str(chunks[i].get("doc_type", "")).strip() in allow_dt]

    if not keep:
        return "", []

    cand_embs = embs[keep]

    # Embed the query and rank the scoped candidates; map the local indices
    # back to positions in the full chunk list.
    qv = _embed_texts([query])[0]
    top_local = _cosine_topk(qv, cand_embs, k=min(k, RAG_TOPK_LIMIT))
    top_global = [keep[i] for i in top_local]
    used = [chunks[i] for i in top_global]

    # Token caps: each chunk is truncated to chunk_token_limit, and the
    # running total is capped at max_context_tokens (same budget logic as
    # the non-vector retrieval path).
    used_out: List[Dict] = []
    texts_out: List[str] = []
    total_tokens = 0

    for c in used:
        raw = c.get("text") or ""
        if not raw:
            continue
        t = _truncate_to_tokens(raw, max_tokens=chunk_token_limit, model=model_for_tokenizer)
        t_tokens = _count_text_tokens(t, model=model_for_tokenizer)

        if total_tokens + t_tokens > max_context_tokens:
            # Spend whatever budget is left on a shorter cut of this chunk.
            remaining = max_context_tokens - total_tokens
            if remaining <= 0:
                break
            t = _truncate_to_tokens(t, max_tokens=remaining, model=model_for_tokenizer)
            t_tokens = _count_text_tokens(t, model=model_for_tokenizer)

        # NOTE(review): t_tokens is counted before this final _clean_text,
        # so the accumulated total can slightly overstate the emitted text —
        # confirm this conservative accounting is intended.
        t = _clean_text(t)
        if not t:
            continue

        texts_out.append(t)
        used_out.append(c)
        total_tokens += t_tokens

        if total_tokens >= max_context_tokens:
            break

    context = "\n\n---\n\n".join(texts_out)
    return context, used_out