kamp0010 committed on
Commit
872bcb0
·
verified ·
1 Parent(s): 2e4865a

Rename app.py to main.py

Browse files
Files changed (2) hide show
  1. app.py +0 -464
  2. main.py +252 -0
app.py DELETED
@@ -1,464 +0,0 @@
1
- import os
2
- import builtins
3
-
4
- _real_input = builtins.input
5
- def _auto_yes(prompt=""):
6
- if any(kw in str(prompt).lower() for kw in ("custom code", "trust", "wish to run")):
7
- return "y"
8
- return _real_input(prompt)
9
- builtins.input = _auto_yes
10
-
11
- os.environ["TRUST_REMOTE_CODE"] = "1"
12
- os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
13
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
14
- os.environ["HF_HUB_VERBOSITY"] = "error"
15
-
16
- import streamlit as st
17
- import numpy as np
18
- import torch
19
- import re
20
- from transformers import AutoModel, AutoTokenizer
21
-
22
- # ─────────────────────────── Page config ──────────────────────────────────────
23
- st.set_page_config(
24
- page_title="pplx-embed · Semantic Search",
25
- page_icon="◈",
26
- layout="wide",
27
- initial_sidebar_state="expanded",
28
- )
29
-
30
- # ─────────────────────────── Global CSS ───────────────────────────────────────
31
- st.markdown("""
32
- <style>
33
- @import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=JetBrains+Mono:wght@300;400;500&display=swap');
34
-
35
- /* ── Base ── */
36
- html, body, [data-testid="stAppViewContainer"] {
37
- background: #0c0e14 !important;
38
- color: #e8e4d9 !important;
39
- font-family: 'JetBrains Mono', monospace !important;
40
- }
41
- [data-testid="stSidebar"] {
42
- background: #10121a !important;
43
- border-right: 1px solid #1e2235 !important;
44
- }
45
- [data-testid="stSidebar"] * { color: #e8e4d9 !important; }
46
-
47
- /* ── Hide default Streamlit chrome ── */
48
- #MainMenu, footer, header { visibility: hidden; }
49
- .block-container { padding: 2rem 2.5rem 3rem !important; max-width: 1100px !important; }
50
-
51
- /* ── Hero header ── */
52
- .hero {
53
- display: flex;
54
- align-items: flex-end;
55
- gap: 1rem;
56
- margin-bottom: 0.25rem;
57
- }
58
- .hero-icon {
59
- font-size: 2.8rem;
60
- line-height: 1;
61
- color: #f5a623;
62
- font-family: 'Syne', sans-serif;
63
- }
64
- .hero-title {
65
- font-family: 'Syne', sans-serif;
66
- font-weight: 800;
67
- font-size: 2.4rem;
68
- letter-spacing: -0.04em;
69
- color: #f0ede6;
70
- line-height: 1;
71
- }
72
- .hero-title span { color: #f5a623; }
73
- .hero-sub {
74
- font-family: 'JetBrains Mono', monospace;
75
- font-size: 0.72rem;
76
- color: #5a6080;
77
- letter-spacing: 0.12em;
78
- text-transform: uppercase;
79
- margin-bottom: 2rem;
80
- margin-top: 0.3rem;
81
- }
82
- .divider {
83
- height: 1px;
84
- background: linear-gradient(90deg, #f5a623 0%, #f5a62322 40%, transparent 100%);
85
- margin-bottom: 2rem;
86
- }
87
-
88
- /* ── Upload zone ── */
89
- [data-testid="stFileUploader"] {
90
- background: #13161f !important;
91
- border: 1px dashed #2a2e42 !important;
92
- border-radius: 8px !important;
93
- transition: border-color 0.2s;
94
- }
95
- [data-testid="stFileUploader"]:hover {
96
- border-color: #f5a623 !important;
97
- }
98
- [data-testid="stFileUploader"] * { color: #7a80a0 !important; }
99
- [data-testid="stFileUploader"] label { color: #e8e4d9 !important; }
100
-
101
- /* ── Text input ── */
102
- [data-testid="stTextInput"] input {
103
- background: #13161f !important;
104
- border: 1px solid #2a2e42 !important;
105
- border-radius: 6px !important;
106
- color: #f0ede6 !important;
107
- font-family: 'JetBrains Mono', monospace !important;
108
- font-size: 0.9rem !important;
109
- padding: 0.75rem 1rem !important;
110
- transition: border-color 0.2s, box-shadow 0.2s;
111
- }
112
- [data-testid="stTextInput"] input:focus {
113
- border-color: #f5a623 !important;
114
- box-shadow: 0 0 0 3px #f5a62318 !important;
115
- outline: none !important;
116
- }
117
- [data-testid="stTextInput"] label {
118
- color: #7a80a0 !important;
119
- font-size: 0.7rem !important;
120
- letter-spacing: 0.1em !important;
121
- text-transform: uppercase !important;
122
- font-family: 'JetBrains Mono', monospace !important;
123
- }
124
-
125
- /* ── Button ── */
126
- [data-testid="stButton"] button {
127
- background: #f5a623 !important;
128
- color: #0c0e14 !important;
129
- font-family: 'Syne', sans-serif !important;
130
- font-weight: 700 !important;
131
- font-size: 0.85rem !important;
132
- letter-spacing: 0.08em !important;
133
- text-transform: uppercase !important;
134
- border: none !important;
135
- border-radius: 6px !important;
136
- padding: 0.6rem 1.8rem !important;
137
- cursor: pointer !important;
138
- transition: background 0.15s, transform 0.1s !important;
139
- }
140
- [data-testid="stButton"] button:hover {
141
- background: #ffc048 !important;
142
- transform: translateY(-1px) !important;
143
- }
144
- [data-testid="stButton"] button:active { transform: translateY(0) !important; }
145
- [data-testid="stButton"] button:disabled {
146
- background: #1e2235 !important;
147
- color: #3a3f55 !important;
148
- cursor: not-allowed !important;
149
- transform: none !important;
150
- }
151
-
152
- /* ── Sliders ── */
153
- [data-testid="stSlider"] > div > div > div > div {
154
- background: #f5a623 !important;
155
- }
156
- [data-testid="stSlider"] label {
157
- color: #7a80a0 !important;
158
- font-size: 0.7rem !important;
159
- letter-spacing: 0.08em !important;
160
- text-transform: uppercase !important;
161
- }
162
-
163
- /* ── Expander ── */
164
- [data-testid="stExpander"] {
165
- background: #13161f !important;
166
- border: 1px solid #1e2235 !important;
167
- border-radius: 6px !important;
168
- }
169
- [data-testid="stExpander"] summary {
170
- color: #7a80a0 !important;
171
- font-size: 0.75rem !important;
172
- letter-spacing: 0.08em !important;
173
- }
174
-
175
- /* ── Alerts / info ── */
176
- [data-testid="stAlert"] {
177
- background: #13161f !important;
178
- border-radius: 6px !important;
179
- border-left: 3px solid #f5a623 !important;
180
- font-family: 'JetBrains Mono', monospace !important;
181
- font-size: 0.82rem !important;
182
- }
183
-
184
- /* ── Spinner text ── */
185
- [data-testid="stSpinner"] p { color: #7a80a0 !important; font-size: 0.8rem !important; }
186
-
187
- /* ── Sidebar labels ── */
188
- .sidebar-label {
189
- font-size: 0.65rem;
190
- letter-spacing: 0.15em;
191
- text-transform: uppercase;
192
- color: #f5a623;
193
- font-family: 'Syne', sans-serif;
194
- font-weight: 700;
195
- margin-bottom: 1rem;
196
- margin-top: 0.5rem;
197
- }
198
- .sidebar-how {
199
- font-size: 0.72rem;
200
- color: #5a6080;
201
- line-height: 1.8;
202
- border-left: 2px solid #1e2235;
203
- padding-left: 0.8rem;
204
- margin-top: 0.5rem;
205
- }
206
- .sidebar-step { color: #f5a623; font-weight: 500; }
207
-
208
- /* ── Result cards ── */
209
- @keyframes fadeSlideIn {
210
- from { opacity: 0; transform: translateY(10px); }
211
- to { opacity: 1; transform: translateY(0); }
212
- }
213
- .result-card {
214
- background: #13161f;
215
- border: 1px solid #1e2235;
216
- border-radius: 8px;
217
- padding: 1.1rem 1.3rem;
218
- margin-bottom: 0.75rem;
219
- animation: fadeSlideIn 0.3s ease both;
220
- position: relative;
221
- overflow: hidden;
222
- transition: border-color 0.2s, transform 0.15s;
223
- }
224
- .result-card:hover {
225
- border-color: #f5a62355;
226
- transform: translateX(3px);
227
- }
228
- .result-card::before {
229
- content: '';
230
- position: absolute;
231
- left: 0; top: 0; bottom: 0;
232
- width: 3px;
233
- border-radius: 8px 0 0 8px;
234
- }
235
- .card-high::before { background: #4ade80; }
236
- .card-mid::before { background: #f5a623; }
237
- .card-low::before { background: #f87171; }
238
- .card-meta {
239
- display: flex;
240
- align-items: center;
241
- gap: 0.75rem;
242
- margin-bottom: 0.6rem;
243
- }
244
- .card-rank {
245
- font-family: 'Syne', sans-serif;
246
- font-weight: 800;
247
- font-size: 0.7rem;
248
- color: #3a3f55;
249
- letter-spacing: 0.1em;
250
- }
251
- .card-score-bar {
252
- flex: 1;
253
- height: 3px;
254
- background: #1e2235;
255
- border-radius: 99px;
256
- overflow: hidden;
257
- }
258
- .card-score-fill {
259
- height: 100%;
260
- border-radius: 99px;
261
- transition: width 0.6s cubic-bezier(.16,1,.3,1);
262
- }
263
- .card-score-num {
264
- font-family: 'JetBrains Mono', monospace;
265
- font-size: 0.7rem;
266
- font-weight: 500;
267
- letter-spacing: 0.05em;
268
- }
269
- .card-text {
270
- font-family: 'JetBrains Mono', monospace;
271
- font-size: 0.82rem;
272
- line-height: 1.75;
273
- color: #c8c4b8;
274
- }
275
- .results-header {
276
- font-family: 'Syne', sans-serif;
277
- font-weight: 700;
278
- font-size: 0.7rem;
279
- letter-spacing: 0.18em;
280
- text-transform: uppercase;
281
- color: #5a6080;
282
- margin-bottom: 1rem;
283
- margin-top: 1.5rem;
284
- }
285
- .index-badge {
286
- display: inline-flex;
287
- align-items: center;
288
- gap: 0.4rem;
289
- background: #13161f;
290
- border: 1px solid #1e2235;
291
- border-radius: 4px;
292
- padding: 0.3rem 0.7rem;
293
- font-size: 0.72rem;
294
- color: #7a80a0;
295
- margin-bottom: 1rem;
296
- }
297
- .index-badge span { color: #f5a623; font-weight: 600; }
298
- </style>
299
- """, unsafe_allow_html=True)
300
-
301
- # ─────────────────────────── Model loading ────────────────────────────────────
302
- @st.cache_resource(show_spinner="◈ Loading models…")
303
- def load_models():
304
- ctx_model = AutoModel.from_pretrained("perplexity-ai/pplx-embed-context-v1-0.6B", trust_remote_code=True)
305
- query_model = AutoModel.from_pretrained("perplexity-ai/pplx-embed-v1-0.6B", trust_remote_code=True)
306
- tokenizer = AutoTokenizer.from_pretrained("perplexity-ai/pplx-embed-v1-0.6B", trust_remote_code=True)
307
- ctx_model.eval(); query_model.eval()
308
- return ctx_model, query_model, tokenizer
309
-
310
- ctx_model, query_model, tokenizer = load_models()
311
-
312
- # ─────────────────────────── Encoding helpers ─────────────────────────────────
313
- def mean_pool(token_embeddings, attention_mask):
314
- mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
315
- return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
316
-
317
- def _encode(model, texts):
318
- if hasattr(model, "encode"):
319
- result = model.encode(texts)
320
- if isinstance(result, (list, tuple)):
321
- return np.vstack([np.array(r).flatten() for r in result])
322
- return np.array(result)
323
- encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
324
- with torch.no_grad():
325
- out = model(**encoded)
326
- return mean_pool(out.last_hidden_state, encoded["attention_mask"]).cpu().numpy()
327
-
328
- def embed_document_chunks(chunks):
329
- if hasattr(ctx_model, "encode"):
330
- return np.array(ctx_model.encode([chunks])[0])
331
- return _encode(ctx_model, chunks)
332
-
333
- def embed_query(query):
334
- return _encode(query_model, [query])[0].flatten()
335
-
336
- def chunk_text(text, chunk_size=3, overlap=1):
337
- sentences = re.split(r'(?<=[.!?])\s+', text.strip())
338
- sentences = [s.strip() for s in sentences if s.strip()]
339
- chunks, i = [], 0
340
- while i < len(sentences):
341
- chunks.append(" ".join(sentences[i : i + chunk_size]))
342
- i += max(1, chunk_size - overlap)
343
- return chunks
344
-
345
- def cosine_sim(a, b):
346
- na, nb = np.linalg.norm(a), np.linalg.norm(b)
347
- return float(np.dot(a, b) / (na * nb)) if na and nb else 0.0
348
-
349
- def search(query, chunks, embeddings, top_k=5):
350
- q = embed_query(query)
351
- scores = [cosine_sim(q, embeddings[i]) for i in range(len(chunks))]
352
- ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
353
- return [(chunks[idx], score) for idx, score in ranked[:top_k]]
354
-
355
- # ─────────────────────────── Sidebar ──────────────────────────────────────────
356
- with st.sidebar:
357
- st.markdown('<div class="sidebar-label">◈ Configuration</div>', unsafe_allow_html=True)
358
- chunk_size = st.slider("Sentences per chunk", 1, 8, 3)
359
- overlap = st.slider("Sentence overlap", 0, 4, 1)
360
- top_k = st.slider("Results to show", 1, 10, 5)
361
- st.markdown("---")
362
- st.markdown('<div class="sidebar-label">How it works</div>', unsafe_allow_html=True)
363
- st.markdown("""
364
- <div class="sidebar-how">
365
- <div><span class="sidebar-step">01 ·</span> File split into overlapping sentence chunks</div>
366
- <div><span class="sidebar-step">02 ·</span> Chunks embedded as one document — each chunk sees its neighbours</div>
367
- <div><span class="sidebar-step">03 ·</span> Query embedded with the standalone model</div>
368
- <div><span class="sidebar-step">04 ·</span> Cosine similarity ranks results</div>
369
- </div>
370
- """, unsafe_allow_html=True)
371
- st.markdown("---")
372
- st.markdown("""
373
- <div style="font-size:0.65rem;color:#3a3f55;line-height:1.6;">
374
- context model · pplx-embed-context-v1-0.6B<br>
375
- query model &nbsp;· pplx-embed-v1-0.6B<br>
376
- dim · 1024 · int8 · cosine
377
- </div>
378
- """, unsafe_allow_html=True)
379
-
380
- # ─────────────────────────── Main UI ──────────────────────────────────────────
381
- st.markdown("""
382
- <div class="hero">
383
- <div class="hero-icon">◈</div>
384
- <div class="hero-title">pplx<span>·</span>search</div>
385
- </div>
386
- <div class="hero-sub">contextual semantic search · perplexity embed v1</div>
387
- <div class="divider"></div>
388
- """, unsafe_allow_html=True)
389
-
390
- uploaded = st.file_uploader("Drop a document to index", type=["txt", "md"], label_visibility="visible")
391
-
392
- if uploaded:
393
- raw_text = uploaded.read().decode("utf-8", errors="replace")
394
-
395
- with st.expander(f"Preview · {uploaded.name}", expanded=False):
396
- st.code(raw_text[:4000] + ("…" if len(raw_text) > 4000 else ""), language=None)
397
-
398
- cache_key = (uploaded.name, uploaded.size, chunk_size, overlap)
399
- if st.session_state.get("cache_key") != cache_key:
400
- with st.spinner("Embedding document chunks…"):
401
- chunks = chunk_text(raw_text, chunk_size=chunk_size, overlap=overlap)
402
- embeddings = embed_document_chunks(chunks)
403
- st.session_state.update(cache_key=cache_key, chunks=chunks, embeddings=embeddings)
404
- else:
405
- chunks = st.session_state["chunks"]
406
- embeddings = st.session_state["embeddings"]
407
-
408
- chunk_count = len(chunks)
409
- st.markdown(
410
- f'<div class="index-badge">◈ indexed &nbsp;<span>{chunk_count} chunks</span>&nbsp; from &nbsp;<span>{uploaded.name}</span></div>',
411
- unsafe_allow_html=True,
412
- )
413
-
414
- col1, col2 = st.columns([4, 1])
415
- with col1:
416
- query = st.text_input("query", placeholder="Ask anything about the document…", label_visibility="collapsed")
417
- with col2:
418
- search_btn = st.button("Search ↗", disabled=not (query or "").strip(), use_container_width=True)
419
-
420
- if search_btn and query.strip():
421
- with st.spinner("Searching…"):
422
- results = search(query, chunks, embeddings, top_k=top_k)
423
-
424
- st.markdown('<div class="results-header">— Results</div>', unsafe_allow_html=True)
425
-
426
- for rank, (chunk_txt, score) in enumerate(results, 1):
427
- pct = score * 100
428
- if pct >= 60:
429
- card_cls, fill_color, score_color = "card-high", "#4ade80", "#4ade80"
430
- elif pct >= 35:
431
- card_cls, fill_color, score_color = "card-mid", "#f5a623", "#f5a623"
432
- else:
433
- card_cls, fill_color, score_color = "card-low", "#f87171", "#f87171"
434
-
435
- delay = (rank - 1) * 0.07
436
- st.markdown(f"""
437
- <div class="result-card {card_cls}" style="animation-delay:{delay}s">
438
- <div class="card-meta">
439
- <div class="card-rank">#{rank:02d}</div>
440
- <div class="card-score-bar">
441
- <div class="card-score-fill" style="width:{min(pct,100):.1f}%;background:{fill_color};"></div>
442
- </div>
443
- <div class="card-score-num" style="color:{score_color}">{pct:.1f}%</div>
444
- </div>
445
- <div class="card-text">{chunk_txt}</div>
446
- </div>
447
- """, unsafe_allow_html=True)
448
-
449
- else:
450
- st.markdown("""
451
- <div style="
452
- margin-top: 3rem;
453
- border: 1px dashed #1e2235;
454
- border-radius: 10px;
455
- padding: 3rem 2rem;
456
- text-align: center;
457
- color: #3a3f55;
458
- font-size: 0.8rem;
459
- letter-spacing: 0.08em;
460
- ">
461
- <div style="font-size:2.5rem;margin-bottom:1rem;opacity:0.3">◈</div>
462
- Upload a <code style="color:#f5a62366">.txt</code> or <code style="color:#f5a62366">.md</code> file to begin indexing
463
- </div>
464
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import builtins
3
+
4
+ # ── Auto-answer transformers custom code prompt ────────────────────────────────
5
+ _real_input = builtins.input
6
+ def _auto_yes(prompt=""):
7
+ if any(kw in str(prompt).lower() for kw in ("custom code", "trust", "wish to run")):
8
+ return "y"
9
+ return _real_input(prompt)
10
+ builtins.input = _auto_yes
11
+
12
+ os.environ["TRUST_REMOTE_CODE"] = "1"
13
+ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
14
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
+ os.environ["HF_HUB_VERBOSITY"] = "error"
16
+
17
+ import re
18
+ import numpy as np
19
+ import torch
20
+ from contextlib import asynccontextmanager
21
+ from typing import Annotated
22
+
23
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form
24
+ from fastapi.middleware.cors import CORSMiddleware
25
+ from pydantic import BaseModel, Field
26
+ from transformers import AutoModel, AutoTokenizer
27
+
28
+
29
+ # ─────────────────────────── Models (loaded once at startup) ──────────────────
30
+ models: dict = {}
31
+
32
+ @asynccontextmanager
33
+ async def lifespan(app: FastAPI):
34
+ print("Loading embedding models…")
35
+ ctx_model = AutoModel.from_pretrained("perplexity-ai/pplx-embed-context-v1-0.6B", trust_remote_code=True)
36
+ query_model = AutoModel.from_pretrained("perplexity-ai/pplx-embed-v1-0.6B", trust_remote_code=True)
37
+ tokenizer = AutoTokenizer.from_pretrained("perplexity-ai/pplx-embed-v1-0.6B", trust_remote_code=True)
38
+ ctx_model.eval()
39
+ query_model.eval()
40
+ models["ctx"] = ctx_model
41
+ models["query"] = query_model
42
+ models["tokenizer"] = tokenizer
43
+ print("Models ready.")
44
+ yield
45
+ models.clear()
46
+
47
+
48
+ # ─────────────────────────── App ──────────────────────────────────────────────
49
+ app = FastAPI(
50
+ title="pplx-embed Semantic Search API",
51
+ description=(
52
+ "Upload a document and search it semantically using "
53
+ "perplexity-ai/pplx-embed-context-v1-0.6B + pplx-embed-v1-0.6B."
54
+ ),
55
+ version="1.0.0",
56
+ lifespan=lifespan,
57
+ )
58
+
59
+ app.add_middleware(
60
+ CORSMiddleware,
61
+ allow_origins=["*"],
62
+ allow_methods=["*"],
63
+ allow_headers=["*"],
64
+ )
65
+
66
+
67
+ # ─────────────────────────── Helpers ──────────────────────────────────────────
68
+ def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
69
+ mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
70
+ return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
71
+
72
+
73
+ def _encode(model, texts: list[str]) -> np.ndarray:
74
+ if hasattr(model, "encode"):
75
+ result = model.encode(texts)
76
+ if isinstance(result, (list, tuple)):
77
+ return np.vstack([np.array(r).flatten() for r in result])
78
+ return np.array(result)
79
+ tokenizer = models["tokenizer"]
80
+ encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
81
+ with torch.no_grad():
82
+ out = model(**encoded)
83
+ return mean_pool(out.last_hidden_state, encoded["attention_mask"]).cpu().numpy()
84
+
85
+
86
+ def embed_chunks(chunks: list[str]) -> np.ndarray:
87
+ ctx = models["ctx"]
88
+ if hasattr(ctx, "encode"):
89
+ return np.array(ctx.encode([chunks])[0])
90
+ return _encode(ctx, chunks)
91
+
92
+
93
+ def embed_query_text(query: str) -> np.ndarray:
94
+ return _encode(models["query"], [query])[0].flatten()
95
+
96
+
97
+ def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
98
+ sentences = re.split(r'(?<=[.!?])\s+', text.strip())
99
+ sentences = [s.strip() for s in sentences if s.strip()]
100
+ chunks, i = [], 0
101
+ while i < len(sentences):
102
+ chunks.append(" ".join(sentences[i : i + chunk_size]))
103
+ i += max(1, chunk_size - overlap)
104
+ return chunks
105
+
106
+
107
+ def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
108
+ na, nb = np.linalg.norm(a), np.linalg.norm(b)
109
+ return float(np.dot(a, b) / (na * nb)) if na and nb else 0.0
110
+
111
+
112
+ # ─────────────────────────── In-memory document store ─────────────────────────
113
+ # Maps doc_id → { chunks: list[str], embeddings: np.ndarray }
114
+ store: dict[str, dict] = {}
115
+
116
+
117
+ # ─────────────────────────── Schemas ──────────────────────────────────────────
118
+ class IndexResponse(BaseModel):
119
+ doc_id: str
120
+ chunks_indexed: int
121
+ message: str
122
+
123
+ class SearchRequest(BaseModel):
124
+ doc_id: str = Field(..., description="ID returned by /index")
125
+ query: str = Field(..., description="Natural language question")
126
+ top_k: int = Field(5, ge=1, le=20)
127
+
128
+ class SearchResult(BaseModel):
129
+ rank: int
130
+ score: float
131
+ text: str
132
+
133
+ class SearchResponse(BaseModel):
134
+ doc_id: str
135
+ query: str
136
+ results: list[SearchResult]
137
+
138
+ class EmbedRequest(BaseModel):
139
+ texts: list[str] = Field(..., description="List of strings to embed independently")
140
+
141
+ class EmbedResponse(BaseModel):
142
+ embeddings: list[list[float]]
143
+ dimensions: int
144
+
145
+
146
+ # ─────────────────────────── Routes ───────────────────────────────────────────
147
+ @app.get("/", tags=["health"])
148
+ def root():
149
+ return {"status": "ok", "docs": "/docs"}
150
+
151
+
152
+ @app.get("/health", tags=["health"])
153
+ def health():
154
+ return {"status": "ok", "models_loaded": bool(models)}
155
+
156
+
157
+ @app.post("/index", response_model=IndexResponse, tags=["search"])
158
+ async def index_document(
159
+ file: Annotated[UploadFile, File(description=".txt or .md file to index")],
160
+ doc_id: Annotated[str, Form(description="Unique ID for this document")] = "",
161
+ chunk_size: Annotated[int, Form()] = 3,
162
+ overlap: Annotated[int, Form()] = 1,
163
+ ):
164
+ """
165
+ Upload a .txt or .md file and embed it. Returns a doc_id you use in /search.
166
+ If doc_id is empty, the filename (without extension) is used.
167
+ """
168
+ if not models:
169
+ raise HTTPException(503, "Models not loaded yet — please retry in a few seconds.")
170
+
171
+ content = await file.read()
172
+ try:
173
+ text = content.decode("utf-8")
174
+ except UnicodeDecodeError:
175
+ text = content.decode("latin-1")
176
+
177
+ resolved_id = doc_id.strip() or os.path.splitext(file.filename or "doc")[0]
178
+
179
+ chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
180
+ if not chunks:
181
+ raise HTTPException(400, "Document produced no text chunks. Check the file contents.")
182
+
183
+ embeddings = embed_chunks(chunks)
184
+ store[resolved_id] = {"chunks": chunks, "embeddings": embeddings}
185
+
186
+ return IndexResponse(
187
+ doc_id=resolved_id,
188
+ chunks_indexed=len(chunks),
189
+ message=f"Document '{resolved_id}' indexed successfully.",
190
+ )
191
+
192
+
193
+ @app.post("/search", response_model=SearchResponse, tags=["search"])
194
+ def search_document(req: SearchRequest):
195
+ """
196
+ Search a previously indexed document by doc_id.
197
+ """
198
+ if req.doc_id not in store:
199
+ raise HTTPException(404, f"doc_id '{req.doc_id}' not found. Call /index first.")
200
+
201
+ doc = store[req.doc_id]
202
+ chunks = doc["chunks"]
203
+ embs = doc["embeddings"]
204
+ q = embed_query_text(req.query)
205
+ scores = [cosine_sim(q, embs[i]) for i in range(len(chunks))]
206
+ ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[: req.top_k]
207
+
208
+ return SearchResponse(
209
+ doc_id=req.doc_id,
210
+ query=req.query,
211
+ results=[
212
+ SearchResult(rank=i + 1, score=round(score, 4), text=chunks[idx])
213
+ for i, (idx, score) in enumerate(ranked)
214
+ ],
215
+ )
216
+
217
+
218
+ @app.post("/embed", response_model=EmbedResponse, tags=["embeddings"])
219
+ def embed_texts(req: EmbedRequest):
220
+ """
221
+ Embed arbitrary texts with the query model. Returns raw float embeddings.
222
+ """
223
+ if not models:
224
+ raise HTTPException(503, "Models not loaded yet.")
225
+ if len(req.texts) > 64:
226
+ raise HTTPException(400, "Maximum 64 texts per request.")
227
+
228
+ embs = _encode(models["query"], req.texts)
229
+ return EmbedResponse(
230
+ embeddings=embs.tolist(),
231
+ dimensions=embs.shape[1],
232
+ )
233
+
234
+
235
+ @app.get("/documents", tags=["search"])
236
+ def list_documents():
237
+ """List all currently indexed document IDs."""
238
+ return {
239
+ "documents": [
240
+ {"doc_id": k, "chunks": len(v["chunks"])}
241
+ for k, v in store.items()
242
+ ]
243
+ }
244
+
245
+
246
+ @app.delete("/documents/{doc_id}", tags=["search"])
247
+ def delete_document(doc_id: str):
248
+ """Remove a document from the index."""
249
+ if doc_id not in store:
250
+ raise HTTPException(404, f"doc_id '{doc_id}' not found.")
251
+ del store[doc_id]
252
+ return {"deleted": doc_id}