Spaces:
Sleeping
Sleeping
Commit
·
549c270
0
Parent(s):
Deploy: Minimal FastAPI backend for CoVE Space
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +1 -0
- data/processed/beauty/index/.ipynb_checkpoints/defaults-checkpoint.json +9 -0
- data/processed/beauty/index/.ipynb_checkpoints/defaults_cove-checkpoint.json +0 -0
- data/processed/beauty/index/defaults.json +16 -0
- data/processed/beauty/index/defaults_cove.json +22 -0
- requirements.txt +13 -0
- space.yaml +5 -0
- src/__init__.py +2 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/agents/.ipynb_checkpoints/agent_types-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/base-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/chat_agent-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/data_agent-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/index_agent-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/model_agent-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/orchestrator-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/recommend_agent-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/report_agent-checkpoint.py +0 -0
- src/agents/.ipynb_checkpoints/run_agent-checkpoint.py +0 -0
- src/agents/__init__.py +1 -0
- src/agents/__pycache__/__init__.cpython-311.pyc +0 -0
- src/agents/__pycache__/chat_agent.cpython-311.pyc +0 -0
- src/agents/__pycache__/orchestrator.cpython-311.pyc +0 -0
- src/agents/__pycache__/report_agent.cpython-311.pyc +0 -0
- src/agents/agent_types.py +16 -0
- src/agents/base.py +16 -0
- src/agents/chat_agent.py +311 -0
- src/agents/data_agent.py +46 -0
- src/agents/index_agent.py +34 -0
- src/agents/model_agent.py +8 -0
- src/agents/orchestrator.py +44 -0
- src/agents/recommend_agent.py +37 -0
- src/agents/report_agent.py +319 -0
- src/agents/run_agent.py +28 -0
- src/cove/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- src/cove/.ipynb_checkpoints/fuse_index-checkpoint.py +0 -0
- src/cove/.ipynb_checkpoints/io-checkpoint.py +0 -0
- src/cove/__init__.py +0 -0
- src/cove/fuse_index.py +106 -0
- src/cove/io.py +29 -0
- src/data/.ipynb_checkpoints/init-checkpoint.py +0 -0
- src/data/.ipynb_checkpoints/loader-checkpoint.py +0 -0
- src/data/.ipynb_checkpoints/registry-checkpoint.py +0 -0
- src/data/__init__.py +2 -0
- src/data/__pycache__/__init__.cpython-311.pyc +0 -0
- src/data/__pycache__/loader.cpython-311.pyc +0 -0
- src/data/__pycache__/registry.cpython-311.pyc +0 -0
- src/data/loader.py +15 -0
- src/data/registry.py +73 -0
- src/models/.ipynb_checkpoints/fusion-checkpoint.py +0 -0
app.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from api.app_api import app
|
data/processed/beauty/index/.ipynb_checkpoints/defaults-checkpoint.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"weighted": {
|
| 3 |
+
"w_text": 1.0,
|
| 4 |
+
"w_image": 0.0,
|
| 5 |
+
"w_meta": 0.2,
|
| 6 |
+
"k": 10,
|
| 7 |
+
"faiss_name": "weighted_wt1.0_wi0.0_wm0.2"
|
| 8 |
+
}
|
| 9 |
+
}
|
data/processed/beauty/index/.ipynb_checkpoints/defaults_cove-checkpoint.json
ADDED
|
File without changes
|
data/processed/beauty/index/defaults.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"concat": {
|
| 3 |
+
"w_text": 1.0,
|
| 4 |
+
"w_image": 0.2,
|
| 5 |
+
"w_meta": 0.2,
|
| 6 |
+
"k": 10,
|
| 7 |
+
"faiss_name": "beauty_concat"
|
| 8 |
+
},
|
| 9 |
+
"weighted": {
|
| 10 |
+
"w_text": 1.0,
|
| 11 |
+
"w_image": 0.2,
|
| 12 |
+
"w_meta": 0.2,
|
| 13 |
+
"k": 10,
|
| 14 |
+
"faiss_name": "beauty_weighted"
|
| 15 |
+
}
|
| 16 |
+
}
|
data/processed/beauty/index/defaults_cove.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cove_faiss_only": {
|
| 3 |
+
"k": 10,
|
| 4 |
+
"faiss_name": "beauty_cove_faiss_only"
|
| 5 |
+
},
|
| 6 |
+
"cove_faiss_concat": {
|
| 7 |
+
"w_text": 0.2,
|
| 8 |
+
"w_image": 0.2,
|
| 9 |
+
"w_meta": 0.2,
|
| 10 |
+
"w_cove": 0.4,
|
| 11 |
+
"k": 10,
|
| 12 |
+
"faiss_name": "beauty_cove_faiss_concat"
|
| 13 |
+
},
|
| 14 |
+
"cove_faiss_weighted": {
|
| 15 |
+
"w_text": 0.2,
|
| 16 |
+
"w_image": 0.2,
|
| 17 |
+
"w_meta": 0.2,
|
| 18 |
+
"w_cove": 0.4,
|
| 19 |
+
"k": 10,
|
| 20 |
+
"faiss_name": "beauty_cove_faiss_weighted"
|
| 21 |
+
}
|
| 22 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.110.0
|
| 2 |
+
uvicorn==0.27.0.post1
|
| 3 |
+
pydantic==1.10.14
|
| 4 |
+
numpy==1.24.4
|
| 5 |
+
pandas==2.2.1
|
| 6 |
+
faiss-cpu==1.7.4
|
| 7 |
+
scikit-learn==1.4.0
|
| 8 |
+
tqdm==4.66.2
|
| 9 |
+
sentence-transformers==2.6.1
|
| 10 |
+
transformers==4.39.3
|
| 11 |
+
torch==2.1.2
|
| 12 |
+
protobuf==4.25.3
|
| 13 |
+
pyarrow==15.0.2
|
space.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# space.yaml
|
| 2 |
+
title: "CoVE API"
|
| 3 |
+
sdk: "docker"
|
| 4 |
+
app_file: "api/app_api.py"
|
| 5 |
+
python_version: "3.11"
|
src/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal package init to avoid import-time side effects
|
| 2 |
+
__all__ = []
|
src/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
src/agents/.ipynb_checkpoints/agent_types-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/base-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/chat_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/data_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/index_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/model_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/orchestrator-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/recommend_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/report_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/.ipynb_checkpoints/run_agent-checkpoint.py
ADDED
|
File without changes
|
src/agents/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# auto-created to mark package
|
src/agents/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (171 Bytes). View file
|
|
|
src/agents/__pycache__/chat_agent.cpython-311.pyc
ADDED
|
Binary file (17.1 kB). View file
|
|
|
src/agents/__pycache__/orchestrator.cpython-311.pyc
ADDED
|
Binary file (17.3 kB). View file
|
|
|
src/agents/__pycache__/report_agent.cpython-311.pyc
ADDED
|
Binary file (18.5 kB). View file
|
|
|
src/agents/agent_types.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
@dataclass
|
| 5 |
+
class Task:
|
| 6 |
+
intent: str # "prepare" | "index" | "eval" | "recommend" | "report"
|
| 7 |
+
dataset: str = "beauty"
|
| 8 |
+
user: Optional[str] = None
|
| 9 |
+
k: int = 10
|
| 10 |
+
fusion: str = "concat"
|
| 11 |
+
w_text: float = 1.0
|
| 12 |
+
w_image: float = 1.0
|
| 13 |
+
w_meta: float = 0.0
|
| 14 |
+
use_faiss: bool = True
|
| 15 |
+
faiss_name: Optional[str] = None
|
| 16 |
+
exclude_seen: bool = True
|
src/agents/base.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import Dict, Any
|
| 3 |
+
from .types import Task, StepResult
|
| 4 |
+
|
| 5 |
+
class BaseAgent(ABC):
|
| 6 |
+
name: str = "base"
|
| 7 |
+
|
| 8 |
+
@abstractmethod
|
| 9 |
+
def run(self, task: Task) -> StepResult:
|
| 10 |
+
...
|
| 11 |
+
|
| 12 |
+
def ok(self, detail: str = "", **artifacts) -> StepResult:
|
| 13 |
+
return StepResult(name=self.name, status="succeeded", detail=detail, artifacts=artifacts)
|
| 14 |
+
|
| 15 |
+
def fail(self, detail: str = "", **artifacts) -> StepResult:
|
| 16 |
+
return StepResult(name=self.name, status="failed", detail=detail, artifacts=artifacts)
|
src/agents/chat_agent.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/agents/chat_agent.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import ast
|
| 5 |
+
import math
|
| 6 |
+
import re
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Any, Dict, List, Optional
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from src.utils.paths import get_processed_path
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ----------------------------- simple config -----------------------------
|
| 16 |
+
@dataclass
|
| 17 |
+
class ChatAgentConfig:
|
| 18 |
+
# words to ignore when pulling a keyword from the prompt
|
| 19 |
+
stopwords: frozenset = frozenset(
|
| 20 |
+
{
|
| 21 |
+
"under", "below", "less", "than", "beneath",
|
| 22 |
+
"recommend", "something", "for", "me", "i", "need", "want",
|
| 23 |
+
"a", "an", "the", "please", "pls", "ok", "okay",
|
| 24 |
+
"price", "priced", "cost", "costing", "buy", "find", "search",
|
| 25 |
+
"show", "give", "with", "and", "or", "of", "to", "in", "on",
|
| 26 |
+
}
|
| 27 |
+
)
|
| 28 |
+
# price pattern: $12, 12, 12.5
|
| 29 |
+
price_re: re.Pattern = re.compile(r"\$?\s*([0-9]+(?:\.[0-9]+)?)", re.IGNORECASE)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ----------------------------- helpers -----------------------------------
|
| 33 |
+
def _safe_float(x) -> Optional[float]:
|
| 34 |
+
try:
|
| 35 |
+
if x is None:
|
| 36 |
+
return None
|
| 37 |
+
s = str(x).strip()
|
| 38 |
+
# Strip $ and commas if present (common in meta)
|
| 39 |
+
s = s.replace(",", "")
|
| 40 |
+
if s.startswith("$"):
|
| 41 |
+
s = s[1:]
|
| 42 |
+
v = float(s)
|
| 43 |
+
if not math.isfinite(v):
|
| 44 |
+
return None
|
| 45 |
+
return v
|
| 46 |
+
except Exception:
|
| 47 |
+
return None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _fmt_price(v: float) -> str:
|
| 51 |
+
try:
|
| 52 |
+
return f"${float(v):.2f}"
|
| 53 |
+
except Exception:
|
| 54 |
+
return f"${v}"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _normalize_categories(val) -> List[str]:
|
| 58 |
+
"""
|
| 59 |
+
Normalize 'categories' to list[str], handling:
|
| 60 |
+
- None
|
| 61 |
+
- list/tuple/set of str
|
| 62 |
+
- stringified lists like "['A','B']" OR ["['A','B']"]
|
| 63 |
+
- delimited strings "A > B, C; D"
|
| 64 |
+
"""
|
| 65 |
+
def _from_string(s: str):
|
| 66 |
+
s = s.strip()
|
| 67 |
+
# Try literal list/tuple: "['A','B']" / '["A","B"]' / "(A,B)"
|
| 68 |
+
if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
|
| 69 |
+
try:
|
| 70 |
+
parsed = ast.literal_eval(s)
|
| 71 |
+
if isinstance(parsed, (list, tuple, set)):
|
| 72 |
+
return [str(x).strip() for x in parsed if x is not None and str(x).strip()]
|
| 73 |
+
except Exception:
|
| 74 |
+
pass
|
| 75 |
+
# Delimited fallback
|
| 76 |
+
if re.search(r"[>|,/;]+", s):
|
| 77 |
+
return [p.strip() for p in re.split(r"[>|,/;]+", s) if p.strip()]
|
| 78 |
+
return [s] if s else []
|
| 79 |
+
|
| 80 |
+
if val is None:
|
| 81 |
+
return []
|
| 82 |
+
|
| 83 |
+
# Already a container?
|
| 84 |
+
if isinstance(val, (list, tuple, set)):
|
| 85 |
+
out = []
|
| 86 |
+
for x in val:
|
| 87 |
+
if x is None:
|
| 88 |
+
continue
|
| 89 |
+
if isinstance(x, (list, tuple, set)):
|
| 90 |
+
# flatten nested containers
|
| 91 |
+
for y in x:
|
| 92 |
+
if y is None:
|
| 93 |
+
continue
|
| 94 |
+
if isinstance(y, (list, tuple, set)):
|
| 95 |
+
out.extend([str(z).strip() for z in y if z is not None and str(z).strip()])
|
| 96 |
+
elif isinstance(y, str):
|
| 97 |
+
out.extend(_from_string(y))
|
| 98 |
+
else:
|
| 99 |
+
out.append(str(y).strip())
|
| 100 |
+
elif isinstance(x, str):
|
| 101 |
+
out.extend(_from_string(x))
|
| 102 |
+
else:
|
| 103 |
+
out.append(str(x).strip())
|
| 104 |
+
# dedupe + keep order
|
| 105 |
+
seen, dedup = set(), []
|
| 106 |
+
for c in out:
|
| 107 |
+
if c and c not in seen:
|
| 108 |
+
seen.add(c)
|
| 109 |
+
dedup.append(c)
|
| 110 |
+
return dedup
|
| 111 |
+
|
| 112 |
+
# Scalar string
|
| 113 |
+
return _from_string(str(val))
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ----------------------------- agent --------------------------------------
|
| 117 |
+
class ChatAgent:
|
| 118 |
+
def __init__(self, config: Optional[ChatAgentConfig] = None) -> None:
|
| 119 |
+
self.config = config or ChatAgentConfig()
|
| 120 |
+
|
| 121 |
+
# ---- parse last user text ----
|
| 122 |
+
def _parse_price_cap(self, text: str) -> Optional[float]:
|
| 123 |
+
m = self.config.price_re.search(text or "")
|
| 124 |
+
if not m:
|
| 125 |
+
return None
|
| 126 |
+
return _safe_float(m.group(1))
|
| 127 |
+
|
| 128 |
+
def _parse_keyword(self, text: str) -> Optional[str]:
|
| 129 |
+
t = (text or "").lower()
|
| 130 |
+
# remove price fragments
|
| 131 |
+
t = self.config.price_re.sub(" ", t)
|
| 132 |
+
# pick first token that isn't a stopword and has letters
|
| 133 |
+
for w in re.findall(r"[a-z][a-z0-9\-]+", t):
|
| 134 |
+
if w in self.config.stopwords:
|
| 135 |
+
continue
|
| 136 |
+
return w
|
| 137 |
+
return None
|
| 138 |
+
|
| 139 |
+
# ---- load catalog ----
|
| 140 |
+
def _items_df(self, dataset: str) -> pd.DataFrame:
|
| 141 |
+
"""
|
| 142 |
+
Load the product catalog from processed data.
|
| 143 |
+
Prefers items_with_meta.parquet (your structure), falls back to joined.parquet.
|
| 144 |
+
Returns a DataFrame; missing columns are filled with sensible defaults.
|
| 145 |
+
"""
|
| 146 |
+
proc = get_processed_path(dataset)
|
| 147 |
+
for fname in ["items_with_meta.parquet", "joined.parquet", "items_meta.parquet", "items.parquet"]:
|
| 148 |
+
fp = proc / fname
|
| 149 |
+
if fp.exists():
|
| 150 |
+
try:
|
| 151 |
+
df = pd.read_parquet(fp)
|
| 152 |
+
break
|
| 153 |
+
except Exception:
|
| 154 |
+
continue
|
| 155 |
+
else:
|
| 156 |
+
# nothing found
|
| 157 |
+
return pd.DataFrame(columns=["item_id", "title", "brand", "price", "categories", "image_url"])
|
| 158 |
+
|
| 159 |
+
# Make sure expected columns exist
|
| 160 |
+
for col in ["item_id", "title", "brand", "price", "categories", "image_url"]:
|
| 161 |
+
if col not in df.columns:
|
| 162 |
+
df[col] = None
|
| 163 |
+
|
| 164 |
+
# Some pipelines store images under imageURL/imageURLHighRes
|
| 165 |
+
if ("image_url" not in df.columns or df["image_url"].isna().all()):
|
| 166 |
+
for alt in ("imageURLHighRes", "imageURL"):
|
| 167 |
+
if alt in df.columns:
|
| 168 |
+
# pick first image if it's a list-like
|
| 169 |
+
def _first_img(v):
|
| 170 |
+
if isinstance(v, (list, tuple)) and v:
|
| 171 |
+
return v[0]
|
| 172 |
+
return v
|
| 173 |
+
df["image_url"] = df[alt].apply(_first_img)
|
| 174 |
+
break
|
| 175 |
+
|
| 176 |
+
return df
|
| 177 |
+
|
| 178 |
+
# --------- main entrypoint expected by API ---------
|
| 179 |
+
def reply(
|
| 180 |
+
self,
|
| 181 |
+
messages: List[Dict[str, str]],
|
| 182 |
+
dataset: Optional[str] = None,
|
| 183 |
+
user_id: Optional[str] = None, # unused in this simple baseline
|
| 184 |
+
k: int = 5,
|
| 185 |
+
) -> Dict[str, Any]:
|
| 186 |
+
"""
|
| 187 |
+
Baseline behavior:
|
| 188 |
+
- Parse last user message → (keyword, price cap)
|
| 189 |
+
- Filter catalog by price<=cap and keyword match in title/brand/categories
|
| 190 |
+
- Rank by lowest price (as a proxy score)
|
| 191 |
+
- Return top-k with normalized fields
|
| 192 |
+
"""
|
| 193 |
+
if not dataset:
|
| 194 |
+
dataset = "beauty"
|
| 195 |
+
|
| 196 |
+
# last user utterance
|
| 197 |
+
last_user = ""
|
| 198 |
+
for m in reversed(messages or []):
|
| 199 |
+
if (m.get("role") or "").lower() == "user":
|
| 200 |
+
last_user = m.get("content") or ""
|
| 201 |
+
break
|
| 202 |
+
|
| 203 |
+
cap = self._parse_price_cap(last_user)
|
| 204 |
+
kw = self._parse_keyword(last_user)
|
| 205 |
+
|
| 206 |
+
df = self._items_df(dataset)
|
| 207 |
+
|
| 208 |
+
# Column presence map for debugging
|
| 209 |
+
colmap = {
|
| 210 |
+
"item_id": "item_id" if "item_id" in df.columns else None,
|
| 211 |
+
"title": "title" if "title" in df.columns else None,
|
| 212 |
+
"brand": "brand" if "brand" in df.columns else None,
|
| 213 |
+
"price": "price" if "price" in df.columns else None,
|
| 214 |
+
"categories": "categories" if "categories" in df.columns else None,
|
| 215 |
+
"image_url": "image_url" if "image_url" in df.columns else None,
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
# ------- filtering -------
|
| 219 |
+
if len(df) == 0:
|
| 220 |
+
sub = df
|
| 221 |
+
else:
|
| 222 |
+
mask = pd.Series(True, index=df.index)
|
| 223 |
+
|
| 224 |
+
# price filter
|
| 225 |
+
if cap is not None and colmap["price"]:
|
| 226 |
+
price_num = df[colmap["price"]].apply(_safe_float)
|
| 227 |
+
mask &= pd.to_numeric(price_num, errors="coerce").le(cap)
|
| 228 |
+
|
| 229 |
+
# keyword filter (title OR brand OR categories)
|
| 230 |
+
if kw:
|
| 231 |
+
kw_l = kw.lower()
|
| 232 |
+
parts = []
|
| 233 |
+
if colmap["title"]:
|
| 234 |
+
parts.append(df[colmap["title"]].astype(str).str.lower().str.contains(kw_l, na=False))
|
| 235 |
+
if colmap["brand"]:
|
| 236 |
+
parts.append(df[colmap["brand"]].astype(str).str.lower().str.contains(kw_l, na=False))
|
| 237 |
+
if colmap["categories"]:
|
| 238 |
+
parts.append(df[colmap["categories"]].astype(str).str.lower().str.contains(kw_l, na=False))
|
| 239 |
+
if parts:
|
| 240 |
+
m_any = parts[0]
|
| 241 |
+
for p in parts[1:]:
|
| 242 |
+
m_any = m_any | p
|
| 243 |
+
mask &= m_any
|
| 244 |
+
|
| 245 |
+
sub = df[mask].copy()
|
| 246 |
+
|
| 247 |
+
# ------- scoring & sorting (cheaper → higher score) -------
|
| 248 |
+
if len(sub) > 0:
|
| 249 |
+
price_num = sub[colmap["price"]].apply(_safe_float) if colmap["price"] else 0.0
|
| 250 |
+
sub["score"] = pd.to_numeric(price_num, errors="coerce").apply(
|
| 251 |
+
lambda p: 1.0 / (p + 1e-6) if pd.notnull(p) and p > 0 else 0.0
|
| 252 |
+
)
|
| 253 |
+
sort_cols = ["score"]
|
| 254 |
+
ascending = [False]
|
| 255 |
+
if colmap["brand"]:
|
| 256 |
+
sort_cols.append(colmap["brand"])
|
| 257 |
+
ascending.append(True)
|
| 258 |
+
if colmap["title"]:
|
| 259 |
+
sort_cols.append(colmap["title"])
|
| 260 |
+
ascending.append(True)
|
| 261 |
+
sub = sub.sort_values(by=sort_cols, ascending=ascending).head(max(1, int(k)))
|
| 262 |
+
|
| 263 |
+
# ------- build recs -------
|
| 264 |
+
recs: List[Dict[str, Any]] = []
|
| 265 |
+
for _, r in sub.iterrows():
|
| 266 |
+
recs.append(
|
| 267 |
+
{
|
| 268 |
+
"item_id": r.get(colmap["item_id"]) if colmap["item_id"] else None,
|
| 269 |
+
"score": float(r.get("score") or 0.0),
|
| 270 |
+
"brand": (r.get(colmap["brand"]) if colmap["brand"] else None) or None,
|
| 271 |
+
"price": _safe_float(r.get(colmap["price"]) if colmap["price"] else None),
|
| 272 |
+
"categories": _normalize_categories(r.get(colmap["categories"]) if colmap["categories"] else None),
|
| 273 |
+
"image_url": (r.get(colmap["image_url"]) if colmap["image_url"] else None) or None,
|
| 274 |
+
}
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
# Fallback: if filter empty, return cheapest k overall
|
| 278 |
+
if not recs and len(df) > 0:
|
| 279 |
+
df2 = df.copy()
|
| 280 |
+
pnum = df2[colmap["price"]].apply(_safe_float) if colmap["price"] else None
|
| 281 |
+
df2["pnum"] = pd.to_numeric(pnum, errors="coerce")
|
| 282 |
+
df2 = df2.sort_values(by=["pnum"]).head(max(1, int(k)))
|
| 283 |
+
for _, r in df2.iterrows():
|
| 284 |
+
recs.append(
|
| 285 |
+
{
|
| 286 |
+
"item_id": r.get(colmap["item_id"]) if colmap["item_id"] else None,
|
| 287 |
+
"score": 0.0,
|
| 288 |
+
"brand": (r.get(colmap["brand"]) if colmap["brand"] else None) or None,
|
| 289 |
+
"price": _safe_float(r.get(colmap["price"]) if colmap["price"] else None),
|
| 290 |
+
"categories": _normalize_categories(r.get(colmap["categories"]) if colmap["categories"] else None),
|
| 291 |
+
"image_url": (r.get(colmap["image_url"]) if colmap["image_url"] else None) or None,
|
| 292 |
+
}
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
# reply sentence
|
| 296 |
+
reply_bits = []
|
| 297 |
+
if kw:
|
| 298 |
+
reply_bits.append(f"**{kw}**")
|
| 299 |
+
if cap is not None:
|
| 300 |
+
reply_bits.append(f"≤ {_fmt_price(cap)}")
|
| 301 |
+
reply_str = "I found items " + (" ".join(reply_bits) if reply_bits else "you might like") + f" on **{dataset}**."
|
| 302 |
+
|
| 303 |
+
# Helpful debug
|
| 304 |
+
debug = {
|
| 305 |
+
"parsed_keyword": kw,
|
| 306 |
+
"price_cap": cap,
|
| 307 |
+
"matched": len(recs),
|
| 308 |
+
"colmap": colmap,
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
return {"reply": reply_str, "recommendations": recs, "debug": debug}
|
src/agents/data_agent.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/agents/data_agent.py
|
| 2 |
+
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Literal
|
| 8 |
+
import urllib.request
|
| 9 |
+
|
| 10 |
+
class DataAgent:
|
| 11 |
+
"""
|
| 12 |
+
Runs data prep scripts for a dataset:
|
| 13 |
+
- Downloads raw files if not present
|
| 14 |
+
- join_meta.py
|
| 15 |
+
- build_text_emb.py
|
| 16 |
+
- build_image_emb.py
|
| 17 |
+
- build_meta_emb.py
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
def _run(self, argv):
|
| 21 |
+
print("→", " ".join(argv))
|
| 22 |
+
subprocess.check_call(argv)
|
| 23 |
+
|
| 24 |
+
def _download_raw_data(self, dataset: str):
|
| 25 |
+
if dataset != "beauty":
|
| 26 |
+
raise ValueError(f"Auto-download is only supported for 'beauty' dataset")
|
| 27 |
+
|
| 28 |
+
base_dir = Path("data/raw/beauty")
|
| 29 |
+
base_dir.mkdir(parents=True, exist_ok=True)
|
| 30 |
+
|
| 31 |
+
files = {
|
| 32 |
+
"reviews.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/reviews.json",
|
| 33 |
+
"meta.json": "https://huggingface.co/datasets/mickey1976/mayankc-amazon_beauty_subset/resolve/main/meta.json",
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
for fname, url in files.items():
|
| 37 |
+
out_path = base_dir / fname
|
| 38 |
+
if not out_path.exists():
|
| 39 |
+
print(f"⬇️ Downloading {fname}...")
|
| 40 |
+
urllib.request.urlretrieve(url, out_path)
|
| 41 |
+
print(f"✅ Saved to {out_path}")
|
| 42 |
+
else:
|
| 43 |
+
print(f"✔️ Already exists: {out_path}")
|
| 44 |
+
|
| 45 |
+
def prepare(self, dataset: Literal["beauty"] = "beauty"):
|
| 46 |
+
print(f"
|
src/agents/index_agent.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/agents/index_agent.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
@dataclass
|
| 8 |
+
class IndexConfig:
|
| 9 |
+
dataset: str
|
| 10 |
+
fusion: str = "concat" # "concat" | "weighted"
|
| 11 |
+
w_text: float = 1.0
|
| 12 |
+
w_image: float = 1.0
|
| 13 |
+
w_meta: float = 0.0
|
| 14 |
+
out_name: str = "" # e.g. "beauty_concat_best"
|
| 15 |
+
|
| 16 |
+
class IndexAgent:
|
| 17 |
+
def _run(self, argv: list[str]) -> None:
|
| 18 |
+
# Run the CLI step in the same interpreter/venv
|
| 19 |
+
subprocess.check_call(argv)
|
| 20 |
+
|
| 21 |
+
def build(self, cfg: IndexConfig) -> None:
|
| 22 |
+
args = [
|
| 23 |
+
sys.executable, "scripts/build_faiss.py",
|
| 24 |
+
"--dataset", cfg.dataset,
|
| 25 |
+
"--fusion", cfg.fusion,
|
| 26 |
+
"--w_text", str(cfg.w_text),
|
| 27 |
+
"--w_image", str(cfg.w_image),
|
| 28 |
+
"--w_meta", str(cfg.w_meta),
|
| 29 |
+
]
|
| 30 |
+
if cfg.out_name:
|
| 31 |
+
args += ["--out_name", cfg.out_name]
|
| 32 |
+
print("→", " ".join(args))
|
| 33 |
+
self._run(args)
|
| 34 |
+
print("✓ Index build complete.")
|
src/agents/model_agent.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess, sys
|
| 2 |
+
|
| 3 |
+
class ModelAgent:
|
| 4 |
+
"""Runs evaluation / sweeps for fusion strategies."""
|
| 5 |
+
def eval(self, dataset: str="beauty"):
|
| 6 |
+
print("→ eval fusion on", dataset)
|
| 7 |
+
subprocess.check_call([sys.executable, "scripts/eval_fusion.py", "--dataset", dataset])
|
| 8 |
+
print("✓ Evaluation complete.")
|
src/agents/orchestrator.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# src/agents/orchestrator.py
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import subprocess
|
| 6 |
+
|
| 7 |
+
def run_eval(dataset: str):
|
| 8 |
+
runs = [
|
| 9 |
+
# 1. No FAISS - Weighted Fusion
|
| 10 |
+
["scripts/eval_fusion.py", "--dataset", dataset, "--fusion", "weighted", "--use_defaults", "--k", "10", "--run_name", "weighted"],
|
| 11 |
+
|
| 12 |
+
# 2. No FAISS - Concat Fusion
|
| 13 |
+
["scripts/eval_fusion.py", "--dataset", dataset, "--fusion", "concat", "--use_defaults", "--k", "10", "--run_name", "concat"],
|
| 14 |
+
|
| 15 |
+
# 3. FAISS - Weighted Fusion
|
| 16 |
+
["scripts/eval_fusion.py", "--dataset", dataset, "--fusion", "weighted", "--use_defaults", "--use_faiss", "--k", "10", "--run_name", "cove_faiss_weighted"],
|
| 17 |
+
|
| 18 |
+
# 4. FAISS - Concat Fusion
|
| 19 |
+
["scripts/eval_fusion.py", "--dataset", dataset, "--fusion", "concat", "--use_defaults", "--use_faiss", "--k", "10", "--run_name", "cove_faiss_concat"],
|
| 20 |
+
|
| 21 |
+
# 5. CoVE FAISS Only + Logits
|
| 22 |
+
["scripts/eval_cove.py", "--dataset", dataset, "--mode", "cove_faiss_only", "--save_candidates"],
|
| 23 |
+
["scripts/eval_logits_cove.py", dataset],
|
| 24 |
+
|
| 25 |
+
# 6. CoVE FAISS Concat + Logits
|
| 26 |
+
["scripts/eval_cove.py", "--dataset", dataset, "--mode", "cove_faiss_concat", "--save_candidates"],
|
| 27 |
+
["scripts/eval_logits_cove.py", dataset],
|
| 28 |
+
|
| 29 |
+
# 7. Full CoVE Logits (pure model)
|
| 30 |
+
["scripts/eval_cove.py", "--dataset", dataset, "--mode", "cove_logits", "--full"],
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
for i, cmd in enumerate(runs, 1):
|
| 34 |
+
print(f"\n[🚀] Running {i}/{len(runs)}: {' '.join(cmd)}")
|
| 35 |
+
subprocess.run(["PYTHONPATH=./src"] + cmd, check=True, shell=False)
|
| 36 |
+
|
| 37 |
+
def main():
|
| 38 |
+
parser = argparse.ArgumentParser()
|
| 39 |
+
parser.add_argument("--dataset", required=True)
|
| 40 |
+
args = parser.parse_args()
|
| 41 |
+
run_eval(args.dataset)
|
| 42 |
+
|
| 43 |
+
if __name__ == "__main__":
|
| 44 |
+
main()
|
src/agents/recommend_agent.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import urllib.request
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
class RecommendAgent:
|
| 6 |
+
"""
|
| 7 |
+
Hits your local FastAPI recommender (port 8000).
|
| 8 |
+
"""
|
| 9 |
+
def __init__(self, api_base: str="http://127.0.0.1:8000"):
|
| 10 |
+
self.api_base = api_base.rstrip("/")
|
| 11 |
+
|
| 12 |
+
def recommend(self,
|
| 13 |
+
dataset: str,
|
| 14 |
+
user: str,
|
| 15 |
+
k: int = 10,
|
| 16 |
+
fusion: str = "concat",
|
| 17 |
+
w_text: float = 1.0,
|
| 18 |
+
w_image: float = 1.0,
|
| 19 |
+
w_meta: float = 0.0,
|
| 20 |
+
use_faiss: bool = True,
|
| 21 |
+
faiss_name: Optional[str] = None,
|
| 22 |
+
exclude_seen: bool = True):
|
| 23 |
+
payload = {
|
| 24 |
+
"dataset": dataset, "user_id": user, "k": k,
|
| 25 |
+
"fusion": fusion, "w_text": w_text, "w_image": w_image, "w_meta": w_meta,
|
| 26 |
+
"use_faiss": use_faiss, "exclude_seen": exclude_seen
|
| 27 |
+
}
|
| 28 |
+
if use_faiss and faiss_name:
|
| 29 |
+
payload["faiss_name"] = faiss_name
|
| 30 |
+
|
| 31 |
+
url = f"{self.api_base}/recommend"
|
| 32 |
+
req = urllib.request.Request(url, data=json.dumps(payload).encode("utf-8"),
|
| 33 |
+
headers={"Content-Type": "application/json"})
|
| 34 |
+
with urllib.request.urlopen(req) as resp:
|
| 35 |
+
body = resp.read()
|
| 36 |
+
data = json.loads(body)
|
| 37 |
+
return data
|
src/agents/report_agent.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# src/agents/report_agent.py
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
HERE = Path(__file__).resolve().parent
|
| 14 |
+
ROOT = HERE.parents[2] # repo root (/notebooks/MMR-Agentic-CoVE)
|
| 15 |
+
LOGS = ROOT / "logs"
|
| 16 |
+
PLOTS = LOGS / "plots"
|
| 17 |
+
REPORTS_ROOT = ROOT / "reports"
|
| 18 |
+
|
| 19 |
+
def _ensure_dir(p: Path):
|
| 20 |
+
p.mkdir(parents=True, exist_ok=True)
|
| 21 |
+
|
| 22 |
+
def _load_metrics(csv_fp: Path) -> pd.DataFrame:
|
| 23 |
+
if not csv_fp.exists():
|
| 24 |
+
raise FileNotFoundError(f"Missing metrics CSV: {csv_fp}")
|
| 25 |
+
df = pd.read_csv(csv_fp, engine="python", on_bad_lines="skip")
|
| 26 |
+
# normalize
|
| 27 |
+
for col in ["run_name", "dataset", "fusion"]:
|
| 28 |
+
if col not in df.columns:
|
| 29 |
+
df[col] = ""
|
| 30 |
+
df[col] = df[col].fillna("").astype(str)
|
| 31 |
+
for wcol in ["w_text", "w_image", "w_meta"]:
|
| 32 |
+
if wcol not in df.columns:
|
| 33 |
+
df[wcol] = float("nan")
|
| 34 |
+
# convenience flags
|
| 35 |
+
if "faiss" not in df.columns:
|
| 36 |
+
df["faiss"] = df["run_name"].str.contains("faiss", case=False, na=False).astype(bool)
|
| 37 |
+
return df
|
| 38 |
+
|
| 39 |
+
def _metric_cols(df: pd.DataFrame, k: int) -> Dict[str, Optional[str]]:
|
| 40 |
+
# prefer explicit @k
|
| 41 |
+
hit_col = f"hit@{k}" if f"hit@{k}" in df.columns else ("hit" if "hit" in df.columns else None)
|
| 42 |
+
ndcg_col = f"ndcg@{k}" if f"ndcg@{k}" in df.columns else ("ndcg" if "ndcg" in df.columns else None)
|
| 43 |
+
return {"hit": hit_col, "ndcg": ndcg_col}
|
| 44 |
+
|
| 45 |
+
def _top_n_table(
    df: pd.DataFrame, dataset: str, k: int, top_n: int = 5,
    prefer_faiss: bool = True
) -> tuple[pd.DataFrame, Dict[str, Any]]:
    """Rank runs for *dataset* at cutoff *k* and pick a recommendation.

    Sorts by ndcg (desc), then hit (desc); when *prefer_faiss* is set, FAISS
    runs win ties. Returns ``(top_n table, recommendation dict for row #1)``.

    Raises:
        ValueError: if no hit/ndcg columns exist for *k*, or if no rows match
            *dataset* (previously crashed with an opaque IndexError on
            ``top.iloc[0]``).
    """
    df = df.copy()
    if "dataset" in df.columns:
        df = df[df["dataset"] == dataset]
    if df.empty:
        # Fail early with a clear message instead of IndexError below.
        raise ValueError(f"No metric rows found for dataset={dataset!r}")

    cols = _metric_cols(df, k)
    hitc, ndcgc = cols["hit"], cols["ndcg"]
    if not ndcgc and not hitc:
        raise ValueError(f"No hit/ndcg columns found for k={k}. Available: {list(df.columns)}")

    # Sort keys: ndcg desc, then hit desc; optional FAISS preference on ties.
    sort_cols = [c for c in (ndcgc, hitc) if c]
    df["_faiss"] = df.get("faiss", df["run_name"].str.contains("faiss", case=False, na=False)).astype(int)
    by = sort_cols + (["_faiss"] if prefer_faiss else [])
    df_sorted = df.sort_values(by=by, ascending=[False] * len(by))

    # Compact table for the report body.
    keep_cols = ["run_name", "dataset", "fusion", "w_text", "w_image", "w_meta"]
    keep_cols += [c for c in (hitc, ndcgc) if c]
    top = df_sorted[keep_cols].head(top_n).reset_index(drop=True)

    def _as_float(v) -> Optional[float]:
        # NaN-safe conversion so the JSON recommendation never contains NaN
        # (json.dumps would otherwise emit invalid "NaN" literals).
        return float(v) if v is not None and pd.notna(v) else None

    # Recommendation = the first (best-ranked) row.
    rec_row = top.iloc[0].to_dict()
    rec = {
        "dataset": dataset,
        "k": k,
        "recommended_run": rec_row["run_name"],
        "fusion": rec_row.get("fusion"),
        "weights": {
            "w_text": _as_float(rec_row.get("w_text")),
            "w_image": _as_float(rec_row.get("w_image")),
            "w_meta": _as_float(rec_row.get("w_meta")),
        },
        "metrics": {
            (hitc or "hit"): _as_float(rec_row.get(hitc)) if hitc else None,
            (ndcgc or "ndcg"): _as_float(rec_row.get(ndcgc)) if ndcgc else None,
        },
    }
    return top, rec
|
| 92 |
+
|
| 93 |
+
def _md_table(df: pd.DataFrame) -> str:
|
| 94 |
+
"""
|
| 95 |
+
Return a markdown-ish table. Falls back to a preformatted text block if
|
| 96 |
+
pandas' to_markdown requires 'tabulate' and it's not installed.
|
| 97 |
+
"""
|
| 98 |
+
try:
|
| 99 |
+
return df.to_markdown(index=False)
|
| 100 |
+
except Exception:
|
| 101 |
+
# Fallback: plain text inside code fences so the report still renders.
|
| 102 |
+
return "```\n" + df.to_string(index=False) + "\n```"
|
| 103 |
+
def _copy_plots_into(out_dir: Path, dataset: str) -> list[str]:
    """Copy the dataset's known plot files from logs/plots into *out_dir*.

    Returns the filenames that were actually copied. Files missing from
    logs/plots are skipped, and individual copy failures are ignored so the
    report stays best-effort.
    """
    import shutil  # hoisted: previously re-imported on every loop iteration

    wanted = [
        f"{dataset}_k10_quality.png",
        f"{dataset}_k10_quality_trend.png",
        f"{dataset}_k10_latency.png",
        f"{dataset}_w_meta_ndcg@10.png",
        f"{dataset}_w_meta_hit@10.png",
        f"{dataset}_k_ndcg@10.png",
    ]
    copied: list[str] = []
    for name in wanted:
        src = PLOTS / name
        if not src.exists():
            continue
        try:
            shutil.copy2(src, out_dir / name)
            copied.append(name)
        except Exception:
            # Best-effort: one unreadable plot must not kill the whole report.
            pass
    return copied
|
| 128 |
+
|
| 129 |
+
def _baseline_quadrant(df: pd.DataFrame, dataset: str, k: int) -> Optional[pd.DataFrame]:
    """Build a compact 2x2 best-run comparison: (No-FAISS/FAISS) x (concat/weighted).

    For each quadrant the best row is chosen by ndcg desc, then hit desc.
    Returns None when the required columns are missing or no quadrant has any
    metric value at all.
    """
    cols = _metric_cols(df, k)
    hitc, ndcgc = cols["hit"], cols["ndcg"]
    if not ndcgc and not hitc:
        return None

    d = df.copy()
    if "dataset" in d.columns:
        d = d[d["dataset"] == dataset]
    if "fusion" not in d.columns:
        return None
    if "faiss" not in d.columns:
        d["faiss"] = d["run_name"].str.contains("faiss", case=False, na=False).astype(bool)

    # Single combined sort per quadrant. The original sorted twice when both
    # metrics existed (the first sort was dead work); the final ordering here
    # is identical: ndcg desc, then hit desc, falling back to whichever exists.
    sort_keys = [c for c in (ndcgc, hitc) if c]

    rows = []
    for fa in (False, True):
        for fu in ("concat", "weighted"):
            sub = d[(d["fusion"].str.lower() == fu) & (d["faiss"] == fa)]
            sub = sub.sort_values(sort_keys, ascending=[False] * len(sort_keys))
            if sub.empty:
                # Placeholder row so the quadrant stays visible in the table.
                rows.append({"faiss": "Yes" if fa else "No", "fusion": fu,
                             "run_name": "—", "hit@k": None, "ndcg@k": None})
            else:
                r = sub.iloc[0]
                rows.append({
                    "faiss": "Yes" if fa else "No",
                    "fusion": fu,
                    "run_name": r.get("run_name", ""),
                    "hit@k": (float(r[hitc]) if hitc else None),
                    "ndcg@k": (float(r[ndcgc]) if ndcgc else None),
                })
    out = pd.DataFrame(rows, columns=["faiss", "fusion", "run_name", "hit@k", "ndcg@k"])
    # Return None if literally no metrics were found in any quadrant.
    if out[["hit@k", "ndcg@k"]].isna().all().all():
        return None
    return out
|
| 172 |
+
|
| 173 |
+
def _write_report(
    out_dir: Path,
    tag: str,
    dataset: str,
    k: Optional[int],
    include_compare: bool,
    top_n: int,
    prefer_faiss: bool,
    metrics_csv: Path,
) -> None:
    """Assemble a self-contained report under *out_dir*.

    Produces: copied plot PNGs, index.md, index.html, a filtered metrics.csv
    snapshot, and summary.json. When *include_compare* is set (and *k* given),
    a Top-runs table, an auto recommendation, and a 4-way FAISS×fusion
    comparison are embedded. All failures in optional sections degrade to an
    inline note rather than aborting the report.
    """
    _ensure_dir(out_dir)

    # Self-contained: copy plots into the report directory
    copied_plots = _copy_plots_into(out_dir, dataset)

    # optional compare section + recommendation
    compare_md = ""
    summary_json: Dict[str, Any] = {}
    if include_compare and k is not None:
        df_all = _load_metrics(metrics_csv)
        try:
            top, rec = _top_n_table(df_all, dataset=dataset, k=k, top_n=top_n, prefer_faiss=prefer_faiss)
            # Rename the @k columns to generic names so the table header is stable.
            compare_md = (
                "## Top runs (auto)\n\n"
                + _md_table(top.rename(columns={
                    f"hit@{k}": "hit@k", f"ndcg@{k}": "ndcg@k"
                })) + "\n\n"
                "### Recommendation (auto)\n\n"
                "```json\n" + json.dumps(rec, indent=2) + "\n```\n"
            )
            summary_json["recommendation"] = rec
            # Round-trip through to_json to get plain JSON-serializable records.
            summary_json["top_runs"] = json.loads(top.to_json(orient="records"))

            # Add a 4-way baseline quadrant if possible
            quad = _baseline_quadrant(df_all, dataset=dataset, k=k)
            if quad is not None:
                compare_md += "\n### Baseline 4-way comparison (FAISS × Fusion)\n\n"
                compare_md += _md_table(quad) + "\n"
                summary_json["baseline_quadrant"] = json.loads(quad.to_json(orient="records"))
        except Exception as e:
            # Comparison is optional: surface the failure inside the report.
            compare_md = f"> Could not compute comparison for k={k}: {e}\n"

    # build markdown
    md_parts = [f"# {dataset} — {tag}\n"]
    if include_compare and k is not None:
        md_parts.append(compare_md)

    if copied_plots:
        md_parts.append("## Plots\n")
        for name in copied_plots:
            # NOTE(review): this f-string appears truncated by extraction — it
            # likely embedded a markdown image link per plot (e.g.
            # f"![{name}]({name})\n"); confirm against the repository source.
            md_parts.append(f"\n")

    # metrics snapshot (also save a filtered CSV into the report for grading)
    try:
        dfm = _load_metrics(metrics_csv)
        snap = dfm[dfm["dataset"] == dataset] if "dataset" in dfm.columns else dfm
        md_parts.append("## Metrics snapshot\n\n")
        # Show only well-known columns when present; otherwise the first 10.
        show_cols = [c for c in ["run_name","dataset","fusion","w_text","w_image","w_meta",
                                 "k","hit","ndcg","hit@5","ndcg@5","hit@10","ndcg@10","hit@20","ndcg@20","p50_ms","p95_ms"]
                     if c in snap.columns]
        if not show_cols:
            show_cols = list(snap.columns)[:10]
        md_parts.append(_md_table(snap[show_cols].tail(20)) + "\n")
        # Save a compact CSV snapshot into the report folder
        snap.to_csv(out_dir / "metrics.csv", index=False)
    except Exception as e:
        md_parts.append(f"> Could not include metrics snapshot: {e}\n")

    # write index.md
    md_path = out_dir / "index.md"
    md_path.write_text("\n".join(md_parts), encoding="utf-8")

    # render HTML (pretty if markdown package available; otherwise fallback)
    html_path = out_dir / "index.html"
    try:
        import markdown  # type: ignore
        html = markdown.markdown(md_path.read_text(encoding="utf-8"), extensions=["tables"])
        html_full = [
            "<html><head><meta charset='utf-8'><title>Report</title>",
            "<style>body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,Roboto;max-width:900px;margin:40px auto;padding:0 16px} table{border-collapse:collapse} th,td{border:1px solid #ddd;padding:6px 8px}</style>",
            "</head><body>",
            html,
            "</body></html>",
        ]
        html_path.write_text("\n".join(html_full), encoding="utf-8")
    except Exception:
        # simple fallback: serve the raw markdown inside <pre> so something renders
        html = [
            "<html><head><meta charset='utf-8'><title>Report</title></head><body>",
            f"<pre style='font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace'>{md_path.read_text(encoding='utf-8')}</pre>",
            "</body></html>",
        ]
        html_path.write_text("\n".join(html), encoding="utf-8")

    # write summary.json (report metadata + any comparison results gathered above)
    (out_dir / "summary.json").write_text(json.dumps({
        "dataset": dataset,
        "tag": tag,
        "k": k,
        "include_compare": include_compare,
        **summary_json
    }, indent=2), encoding="utf-8")
|
| 275 |
+
|
| 276 |
+
def main():
    """CLI entry point: parse flags, assemble the report, optionally zip it."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True)
    ap.add_argument("--tag", default="report")
    ap.add_argument("--k", type=int, default=10, help="k to use for comparison tables")
    ap.add_argument("--include-compare", action="store_true", help="Include Top runs + Recommendation section")
    ap.add_argument("--top-n", type=int, default=3, help="How many runs to show in the Top table")
    ap.add_argument("--prefer-faiss", action="store_true", help="Prefer FAISS runs when metrics tie")
    ap.add_argument("--metrics_csv", default=str(LOGS / "metrics.csv"))
    ap.add_argument("--plots_dir", default=str(PLOTS))
    ap.add_argument("--out", default="", help="Optional explicit out path (file or directory)")
    ap.add_argument("--no-plots", action="store_true", help="(kept for back-compat; plots are referenced if present)")
    ap.add_argument("--zip", action="store_true", help="Zip the report folder")
    args = ap.parse_args()

    dataset, tag = args.dataset, args.tag
    # Default output: reports/<dataset>/<timestamp> <tag>/ unless --out is given.
    if args.out:
        out_dir = Path(args.out)
    else:
        out_dir = REPORTS_ROOT / dataset / f"{pd.Timestamp.now():%Y%m%d_%H%M%S} {tag}"
    _ensure_dir(out_dir)

    # Assemble the report; k only matters when the compare section is requested.
    _write_report(
        out_dir=out_dir,
        tag=tag,
        dataset=dataset,
        k=args.k if args.include_compare else None,
        include_compare=args.include_compare,
        top_n=args.top_n,
        prefer_faiss=args.prefer_faiss,
        metrics_csv=Path(args.metrics_csv),
    )

    print(f"→ Assembling report at {out_dir}")
    print(f"✓ Report ready: {out_dir}")

    if args.zip:
        import shutil
        zpath = out_dir.with_suffix(".zip")
        shutil.make_archive(str(zpath.with_suffix("")), "zip", out_dir.parent, out_dir.name)
        print(f"📦 Zipped → {zpath}")

if __name__ == "__main__":
    main()
|
src/agents/run_agent.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# scripts/run_agent.py
import argparse
from agents.data_agent import DataAgent
from agents.index_agent import IndexAgent

if __name__ == "__main__":
    # Tiny dispatcher: a single --intent flag routes to the matching agent.
    parser = argparse.ArgumentParser()
    parser.add_argument("--intent", required=True, choices=["prepare", "index"], help="What the agent should do")
    parser.add_argument("--dataset", default="beauty", help="Dataset name")
    parser.add_argument("--fusion", choices=["concat", "weighted"], help="Fusion mode (for indexing)")
    parser.add_argument("--w_text", type=float, default=1.0, help="Weight for text embeddings")
    parser.add_argument("--w_image", type=float, default=1.0, help="Weight for image embeddings")
    parser.add_argument("--w_meta", type=float, default=1.0, help="Weight for meta embeddings")
    parser.add_argument("--faiss_name", default="default_index", help="Name for the FAISS index output")
    args = parser.parse_args()

    if args.intent == "prepare":
        DataAgent().prepare(args.dataset)
    else:
        # "index" — the only other value argparse's choices= permits.
        IndexAgent().index(
            dataset=args.dataset,
            fusion=args.fusion,
            w_text=args.w_text,
            w_image=args.w_image,
            w_meta=args.w_meta,
            faiss_name=args.faiss_name,
        )
|
src/cove/.ipynb_checkpoints/__init__-checkpoint.py
ADDED
|
File without changes
|
src/cove/.ipynb_checkpoints/fuse_index-checkpoint.py
ADDED
|
File without changes
|
src/cove/.ipynb_checkpoints/io-checkpoint.py
ADDED
|
File without changes
|
src/cove/__init__.py
ADDED
|
File without changes
|
src/cove/fuse_index.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/cove/fuse_index.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Tuple, Optional
|
| 5 |
+
import numpy as np
|
| 6 |
+
import faiss # pip install faiss-cpu
|
| 7 |
+
from .io import read_item_parquet, align_by_ids
|
| 8 |
+
|
| 9 |
+
def l2norm_rows(M: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalization; the epsilon keeps zero rows from dividing by zero."""
    norms = np.linalg.norm(M, axis=1, keepdims=True)
    return M / (norms + 1e-12)
|
| 11 |
+
|
| 12 |
+
def concat_fuse(parts: Tuple[np.ndarray, ...], weights: Tuple[float, ...]) -> np.ndarray:
    """Horizontally concatenate the weighted, non-empty parts as float32.

    Parts that are None, empty, or carry a zero weight are dropped entirely.

    Raises:
        ValueError: when every part is dropped.
    """
    scaled = [w * X for X, w in zip(parts, weights)
              if X is not None and X.size != 0 and w != 0.0]
    if not scaled:
        raise ValueError("Nothing to fuse.")
    return np.concatenate(scaled, axis=1).astype(np.float32)
|
| 21 |
+
|
| 22 |
+
def weighted_sum(Vt: np.ndarray,
                 Vi: Optional[np.ndarray],
                 Vm: Optional[np.ndarray],
                 wt=1.0, wi=0.0, wm=0.0) -> np.ndarray:
    """Element-wise weighted sum of the modality matrices.

    Vi/Vm are skipped when absent or zero-weighted; every contributing matrix
    must share Vt's width.

    Raises:
        ValueError: if a contributing matrix's width differs from Vt's.
    """
    dim = Vt.shape[1]
    acc = wt * Vt
    for V, w in ((Vi, wi), (Vm, wm)):
        if V is None or w == 0.0:
            continue
        if V.shape[1] != dim:
            raise ValueError("Weighted-sum requires equal dims.")
        acc = acc + w * V
    return acc.astype(np.float32)
|
| 37 |
+
|
| 38 |
+
def build_ivfpq(V: np.ndarray, nlist=2048, m=32, nbits=8, use_opq: bool=False):
    """Train an IVF-PQ FAISS index over the L2-normalized rows of V.

    Uses the inner-product metric (cosine similarity after normalization).
    Returns (index, opq) where opq is the trained OPQ rotation, or None when
    use_opq is False.

    NOTE(review): nlist=2048 needs at least that many training vectors for a
    clean k-means — confirm callers pass enough rows.
    """
    dim = V.shape[1]
    # Normalize rows so inner product behaves like cosine similarity.
    Vn = l2norm_rows(V)

    # optional OPQ: orthogonal rotation before PQ
    opq = None
    if use_opq:
        opq = faiss.OPQMatrix(dim, m)
        opq.train(Vn)
        # Rotate BEFORE index.train/add so the index only ever sees rotated space.
        Vn = opq.apply_py(Vn)

    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, nbits, faiss.METRIC_INNER_PRODUCT)
    # Order matters: IVF-PQ must be trained before vectors can be added.
    index.train(Vn)
    index.add(Vn)
    return index, opq
|
| 54 |
+
|
| 55 |
+
def build_pq(V: np.ndarray, m=32, nbits=8, use_opq: bool=False):
    """Train a flat PQ FAISS index (no IVF partitioning) over normalized rows of V.

    Inner-product metric; returns (index, opq) where opq is the trained OPQ
    rotation or None when use_opq is False.
    """
    dim = V.shape[1]
    # Normalize rows so inner product behaves like cosine similarity.
    Vn = l2norm_rows(V)
    opq = None
    if use_opq:
        opq = faiss.OPQMatrix(dim, m)
        opq.train(Vn)
        # Rotate BEFORE index.train/add so the index only ever sees rotated space.
        Vn = opq.apply_py(Vn)
    index = faiss.IndexPQ(dim, m, nbits, faiss.METRIC_INNER_PRODUCT)
    # Order matters: PQ codebooks must be trained before vectors can be added.
    index.train(Vn)
    index.add(Vn)
    return index, opq
|
| 67 |
+
|
| 68 |
+
def save_index(out_dir: Path, base: str, index, item_ids, opq=None):
    """Persist index artifacts under *out_dir*: <base>.faiss (the index),
    <base>.npy (row-aligned item ids), and <base>.opq when an OPQ rotation
    was used at build time."""
    out_dir.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(out_dir / f"{base}.faiss"))
    # Object dtype so string item ids round-trip through np.save unchanged.
    np.save(out_dir / f"{base}.npy", np.array(item_ids, dtype=object))
    if opq is not None:
        faiss.write_VectorTransform(opq, str(out_dir / f"{base}.opq"))
|
| 74 |
+
|
| 75 |
+
def fuse_mm_with_cove(
    proc_dir: Path,
    cove_fp: Path,
    fusion: str = "concat",
    w_text=1.0, w_image=0.0, w_meta=0.0, w_cove=1.0,
) -> Tuple[np.ndarray, np.ndarray]:
    """Fuse the multimodal item embeddings with pre-trained CoVE vectors.

    Row order follows the ids in item_text_emb.parquet; image/meta/CoVE
    matrices are realigned to that order (missing items become zero rows via
    align_by_ids).

    Returns:
        (item_ids, fused matrix V) with one row per text-file item id.

    Raises:
        ValueError: for an unknown *fusion* mode.
    """
    # base item ids & text vectors (your standard files)
    I_text_ids, Vt = read_item_parquet(proc_dir / "item_text_emb.parquet")
    item_ids = I_text_ids  # master order

    # align optional parts (absent files simply leave Vi/Vm as None)
    Vi = Vm = None
    if (proc_dir / "item_image_emb.parquet").exists():
        I_img_ids, Vi_raw = read_item_parquet(proc_dir / "item_image_emb.parquet")
        Vi = align_by_ids(item_ids, I_img_ids, Vi_raw)
    if (proc_dir / "item_meta_emb.parquet").exists():
        I_met_ids, Vm_raw = read_item_parquet(proc_dir / "item_meta_emb.parquet")
        Vm = align_by_ids(item_ids, I_met_ids, Vm_raw)

    # CoVE item vectors (already trained elsewhere)
    C_ids, Vc_raw = read_item_parquet(cove_fp)
    Vc = align_by_ids(item_ids, C_ids, Vc_raw)

    if fusion == "concat":
        V = concat_fuse((Vt, Vi, Vm, Vc), (w_text, w_image, w_meta, w_cove))
    elif fusion == "weighted":
        # NOTE(review): despite the name, this branch also CONCATENATES
        # (weights are applied as scalars before concat); it never calls
        # weighted_sum — confirm this is the intended semantics.
        # weighted-sum MM first (requires same dim), then concat CoVE (or sum if same dim)
        Vmm = concat_fuse((Vt, Vi, Vm), (w_text, w_image, w_meta))  # safe concat
        V = concat_fuse((Vmm, Vc), (1.0, w_cove))
    else:
        raise ValueError("fusion must be 'concat' or 'weighted'")
    return item_ids, V
|
src/cove/io.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/cove/io.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Tuple, Optional, List, Dict
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
def read_item_parquet(fp: Path, id_col="item_id", vec_col="vector") -> Tuple[np.ndarray, np.ndarray]:
    """Load (ids, vectors) from a parquet file holding one embedding per row.

    The vector column (list/array-valued) is stacked into a dense float32 matrix.
    """
    frame = pd.read_parquet(fp)
    ids = frame[id_col].to_numpy()
    matrix = np.stack(frame[vec_col].to_numpy()).astype(np.float32)
    return ids, matrix
|
| 13 |
+
|
| 14 |
+
def align_by_ids(base_ids: np.ndarray,
                 other_ids: np.ndarray,
                 other_vecs: np.ndarray,
                 dim: Optional[int] = None) -> np.ndarray:
    """Re-order *other_vecs* to follow *base_ids*; ids missing there become zero rows.

    Ids are compared as strings on both sides. When *dim* is not given it is
    inferred from the first available vector (0 when there are none).
    """
    lookup: Dict[str, np.ndarray] = {str(key): vec for key, vec in zip(other_ids, other_vecs)}
    if dim is None:
        sample = next(iter(lookup.values()), None)
        dim = 0 if sample is None else len(sample)
    aligned = np.zeros((len(base_ids), dim), dtype=np.float32)
    for row, key in enumerate(base_ids):
        vec = lookup.get(str(key))
        if vec is not None:
            aligned[row] = vec
    return aligned
|
src/data/.ipynb_checkpoints/init-checkpoint.py
ADDED
|
File without changes
|
src/data/.ipynb_checkpoints/loader-checkpoint.py
ADDED
|
File without changes
|
src/data/.ipynb_checkpoints/registry-checkpoint.py
ADDED
|
File without changes
|
src/data/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Keep this light to avoid pulling heavy modules at import time
|
| 2 |
+
__all__ = []
|
src/data/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (166 Bytes). View file
|
|
|
src/data/__pycache__/loader.cpython-311.pyc
ADDED
|
Binary file (1.1 kB). View file
|
|
|
src/data/__pycache__/registry.cpython-311.pyc
ADDED
|
Binary file (2.67 kB). View file
|
|
|
src/data/loader.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/data/loader.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
def load_dataset(dataset: str):
    """Load a processed dataset's user sequences and candidate items.

    Reads data/processed/<dataset>/seq.json (the full {user_id: [item_id, ...]}
    mapping) and candidate_items.json, returning them as a tuple.
    """
    base_path = Path("data/processed") / dataset
    user_seqs = json.loads((base_path / "seq.json").read_text())
    candidate_items = json.loads((base_path / "candidate_items.json").read_text())
    return user_seqs, candidate_items
|
src/data/registry.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/data/registry.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict
|
| 6 |
+
|
| 7 |
+
# Canonical path helpers live in utils.paths
|
| 8 |
+
from utils.paths import (
|
| 9 |
+
RAW_DIR,
|
| 10 |
+
PROCESSED_DIR,
|
| 11 |
+
get_dataset_paths as _get_dataset_paths, # returns dict[str, Path]
|
| 12 |
+
get_raw_path,
|
| 13 |
+
get_processed_path,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_paths(dataset: str) -> Dict[str, Path]:
    """
    Return raw and processed directories for a dataset name (as Path objects),
    creating both if they do not exist. The name is lowercased; a None/empty
    dataset maps to the bare raw/processed roots.

    Example:
        d = get_paths("beauty")
        d["raw_dir"]       -> Path(.../data/raw/beauty)
        d["processed_dir"] -> Path(.../data/processed/beauty)
    """
    name = (dataset or "").lower()
    dirs = {"raw_dir": RAW_DIR / name, "processed_dir": PROCESSED_DIR / name}
    for directory in dirs.values():
        directory.mkdir(parents=True, exist_ok=True)
    return dirs
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def raw_file(dataset: str, filename: str) -> Path:
    """Convenience: Path to *filename* inside data/raw/<dataset>/ (dir is created)."""
    raw_dir = get_paths(dataset)["raw_dir"]
    return raw_dir / filename
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def processed_file(dataset: str, filename: str) -> Path:
    """Convenience: Path to *filename* inside data/processed/<dataset>/ (dir is created)."""
    processed_dir = get_paths(dataset)["processed_dir"]
    return processed_dir / filename
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------
|
| 46 |
+
# Compatibility shim used by older code/tests:
|
| 47 |
+
# This now returns Path objects instead of strings.
|
| 48 |
+
# ---------------------------------------------------------------------
|
| 49 |
+
def get_dataset_paths(dataset: str) -> Dict[str, Path]:
    """
    Compatibility shim: delegate straight to utils.paths.get_dataset_paths so
    older code/tests importing from this module keep working.

    Returns absolute paths (as Path objects) for the given dataset:
    {
        "raw": Path(.../data/raw/<dataset>),
        "processed": Path(.../data/processed/<dataset>),
        "cache": Path(.../data/cache/<dataset>),
        "logs": Path(.../logs),
        "meta_features_path": Path(.../meta_features.npy),
        "text_features_path": Path(.../text_features.npy),
        "image_features_path": Path(.../image_features.npy),
        "labels_path": Path(.../labels.json)
    }
    """
    # Pure delegation — any behavior change belongs in utils.paths, not here.
    return _get_dataset_paths(dataset)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
__all__ = [
|
| 67 |
+
"get_paths",
|
| 68 |
+
"raw_file",
|
| 69 |
+
"processed_file",
|
| 70 |
+
"get_dataset_paths", # keep public for tests
|
| 71 |
+
"get_raw_path",
|
| 72 |
+
"get_processed_path",
|
| 73 |
+
]
|
src/models/.ipynb_checkpoints/fusion-checkpoint.py
ADDED
|
File without changes
|