slxhere commited on
Commit
5c9f0d9
·
0 Parent(s):

Add audio generation

Browse files
.gitignore ADDED
Binary file (22 Bytes). View file
 
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Doc Alive - RAG to Image
3
+ emoji: 📦🎨
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: "5.44.1"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # 📦→🧠→🎨 Doc Alive: RAG-to-Image with OpenAI
13
+
14
+ This project turns documents into **illustrations** with the help of RAG (Retrieval-Augmented Generation), LLM prompt engineering, and OpenAI’s image generation.
15
+
16
+ Upload a `.txt`, `.md`, or `.pdf` file, describe your goal, and the app will:
17
+ 1. **Extract text** from your file
18
+ 2. **Retrieve key excerpts** using embeddings
19
+ 3. **Ask an LLM** to craft a structured image generation spec
20
+ 4. **Generate an illustration** with OpenAI’s image model
21
+
22
+ ---
23
+
24
+ ## 🚀 Demo
25
+
26
+ This app runs on **Hugging Face Spaces** using **Gradio**.
27
+
28
+ ---
29
+
30
+ ## 🔑 API Key
31
+
32
+ You must provide your own **OpenAI API key** to use this demo.
33
+ - Enter your key in the input box (starts with `sk-...`)
34
+ - The key is **not stored** — it is only used in memory for your current session
35
+
36
+ ---
37
+
38
+ ## 📂 Project Structure
39
+
40
+
41
+ ├─ app.py # Gradio UI entry
42
+ ├─ requirements.txt # Dependencies
43
+ ├─ rag/ # Text extraction + retrieval
44
+ ├─ llm/ # Structured LLM call helper
45
+ ├─ generation/ # Image generation helper
46
+
47
+
48
+ ---
49
+
50
+ ## 🛠 Tech Stack
51
+
52
+ - [Gradio](https://www.gradio.app/) – UI framework
53
+ - [OpenAI](https://platform.openai.com/) – LLM + image generation
54
+ - [RAG (text-embedding-3-small)](https://platform.openai.com/docs/guides/embeddings) – semantic retrieval
55
+
56
+ ---
57
+
58
+ ## ⚠️ Notes
59
+
60
+ - The OpenAI API key is required for both embeddings and image generation
61
+ - We do **not** log or save your key
62
+ - API calls are billed to your OpenAI account according to your usage
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from pathlib import Path
3
+ from typing import Dict, Any
4
+
5
+ import gradio as gr
6
+
7
+ from rag.extract_text import extract_text
8
+ from rag.rag import OpenAIEmbedRAG
9
+ from llm.call_llm import call_llm_structured
10
+ from generation.gen_img import generate_image_with_openai_from_llm_spec
11
+ from generation.gen_audio import generate_audio_with_openai_from_llm_spec
12
+
13
+ # ---------- Output directory ----------
14
+ OUT_DIR = Path("outputs")
15
+ OUT_DIR.mkdir(parents=True, exist_ok=True)
16
+
17
+
18
+ # ---------- Helpers ----------
19
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Serialize retrieved chunks into a compact JSON list, tolerating bad input.

    Each usable hit becomes ``{"id": ..., "excerpt": first `limit` chars}``.
    Non-dict entries and entries without text are skipped; if nothing usable
    remains, the head of the raw document text is used as a single fallback
    snippet. Always returns a JSON string (possibly ``"[]"``).
    """
    snippets = [
        {"id": hit.get("id", pos), "excerpt": hit.get("text")[:limit]}
        for pos, hit in enumerate(hits or [])
        if isinstance(hit, dict) and hit.get("text")
    ]
    if not snippets:
        head = (raw_fallback or "")[:limit]
        if head:
            snippets = [{"id": 0, "excerpt": head}]
    return json.dumps(snippets, ensure_ascii=False)
36
+
37
+
38
+ # ---------- Core pipeline ----------
39
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.

    Steps: extract text -> embed & retrieve top-k chunks -> LLM writes a
    structured image/audio spec -> generate the image and narration audio.

    Args:
        file_path: path to the uploaded .txt/.md/.pdf file.
        openai_api_key: user-supplied key; used for embeddings, the LLM call,
            and image/audio generation.
        user_goal: natural-language instruction that doubles as the retrieval
            query and the LLM goal.
        topk: number of retrieved chunks handed to the LLM.
        llm_model: model name passed to call_llm_structured.

    Returns:
        Tuple of (image object, audio bytes, pretty metadata dict, raw JSON string)
        matching the four Gradio output components.

    Raises:
        ValueError: if no API key was provided.

    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """

    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search) — index is rebuilt from scratch per request
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)

    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.

Return a JSON object strictly matching this schema:

{
"image": {
"prompt": "string, a detailed description of what the image should show",
"negative_prompt": "string, optional description of what to avoid",
"style": ["string", ...], // optional styles like "cinematic", "oil painting"
"width": int, // optional, default 1024
"height": int // optional, default 1024
},
"audio": {
"text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
"voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
"speed": float, // optional, default 1.0
},
"debug": {
"reasoning": "string, brief reasoning why you designed the prompts this way"
}
}

Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()

    user_prompt = f"""
Goal: {user_goal}

Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}

Now produce the JSON object strictly following the schema.
""".strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )

    # NOTE(review): prints the full spec (incl. narration text) to server logs —
    # confirm this is intended outside of debugging.
    print(spec)

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
137
+
138
+
139
+ # ---------- Gradio UI ----------
140
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio callback: validate the upload, run the pipeline, and surface any
    failure as an error dict in the JSON output (never raising into the UI)."""
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        outputs = run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as exc:
        return None, None, {"error": str(exc)}, ""
    return outputs
153
+
154
+
155
# Gradio UI: one page wiring the upload, goal, key, retrieval and model
# controls into ui_pipeline, and rendering the four pipeline outputs.
with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# 📦→🧠→🎨+🔊 Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )

    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )

    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )

    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    # NOTE(review): the pipeline returns raw mp3 bytes for this slot, but
    # type="numpy" expects a (sample_rate, ndarray) tuple — confirm playback
    # actually works, or switch the return/format accordingly.
    out_audio = gr.Audio(label="Generated audio", type="numpy")  # changed to numpy
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")

    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()
generation/__pycache__/gen_audio.cpython-310.pyc ADDED
Binary file (3.23 kB). View file
 
generation/__pycache__/gen_img.cpython-310.pyc ADDED
Binary file (3.52 kB). View file
 
generation/gen_audio.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, uuid
2
+ from pathlib import Path
3
+ from typing import Dict, Optional
4
+ from openai import OpenAI
5
+
6
+
7
+ # ============ Utility Functions ============
8
+
9
+ def _build_openai_tts_prompt(text: str,
10
+ style: Optional[str] = None,
11
+ speed: Optional[float] = None) -> str:
12
+ """Merge text, style, and other options into a single TTS input string."""
13
+ parts = [text.strip()]
14
+ if style:
15
+ parts.append(f"Style: {style.strip()}")
16
+ if speed:
17
+ parts.append(f"Speaking speed: {speed}")
18
+ return " ".join([p for p in parts if p])
19
+
20
+
21
+ # ============ Generator Wrapper ============
22
+
23
class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).
    """

    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        # Falls back to an env-configured OpenAI() client when none is injected.
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech for one LLM audio spec.

        audio_spec example:
            {
                "text": "Hello, world!",
                "voice": "alloy",
                "speed": 1.0,
            }

        Returns:
            {"audio_bytes": <encoded audio bytes>, "meta": {...}}

        Raises:
            RuntimeError: if the TTS call (or reading the result back) fails;
                the original exception is chained as __cause__.
        """
        # NOTE(review): _build_openai_tts_prompt embeds style/speed into the
        # spoken text itself — the voice reads them aloud; confirm intent.
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed")
        )

        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        # Unique temp file so concurrent requests never collide.
        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
            ) as response:
                response.stream_to_file(file_path)

            # Read the bytes back, then delete the temp file.
            audio_bytes = file_path.read_bytes()
            os.remove(file_path)

        except Exception as e:
            # FIX: chain the original exception so the real cause is not lost.
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }

        if save_meta:
            # The audio file itself was deleted above; only the .json remains.
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}
87
+
88
+
89
+ # ============ Integration with Main Pipeline (Example) ============
90
+
91
def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key=None) -> Dict:
    """
    Convenience wrapper: feed the full spec returned by call_llm_structured,
    i.e. {"image": {...}, "audio": {...}, "debug": {...}}, and synthesize the
    narration described by its "audio" section.
    """
    generator = OpenAIAudioGenerator(
        out_dir=out_dir,
        client=OpenAI(api_key=openai_key),
    )
    return generator.generate_from_spec(spec["audio"], filename_prefix="gptaudio")
generation/gen_img.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, base64, json, uuid, math, io
2
+ from pathlib import Path
3
+ from typing import Dict, List, Tuple, Optional
4
+ from openai import OpenAI
5
+ from datetime import datetime
6
+ from PIL import Image
7
+
8
+
9
+
10
+ # ============ Utility Functions ============
11
+
12
+ def _build_openai_prompt(prompt: str,
13
+ styles: List[str] | None,
14
+ negative_prompt: str | None) -> str:
15
+ """Merge positive prompt / style / negative prompt into one natural language prompt suitable for gpt-image-1."""
16
+ parts = [prompt.strip()]
17
+ if styles:
18
+ parts.append(", ".join([s.strip() for s in styles if s.strip()]))
19
+ # OpenAI does not have a separate parameter for negative prompts; phrasing in natural language is safer
20
+ if negative_prompt and negative_prompt.strip():
21
+ parts.append(f"\nAvoid: {negative_prompt.strip()}.")
22
+ return " ".join([p for p in parts if p])
23
+
24
+
25
+ # ============ Generator Wrapper ============
26
+
27
class OpenAIImageGenerator:
    """
    Generate images using the OpenAI Images API (gpt-image-1).
    Reference: Official Image generation docs, help center, and Python SDK usage.
    """
    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        # Falls back to an env-configured OpenAI() client when none is injected.
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self, image_spec: Dict,
                           transparent_bg: bool = False,
                           filename_prefix: str = "img",
                           save_meta=False) -> Dict:
        """
        Generate one image from an LLM-produced image spec.

        image_spec should follow the JSON output from your LLM, e.g.:
            {
                "prompt": "...",
                "negative_prompt": "...",
                "style": ["photorealistic","cinematic"],
                "width": 1024, "height": 1024
                ...
            }

        Returns:
            {"image": <PIL RGB image>, "meta": {...}, "path": None}
        """
        prompt_text = _build_openai_prompt(
            image_spec.get("prompt", ""),
            image_spec.get("style", []),
            image_spec.get("negative_prompt", ""),
        )

        # NOTE(review): the spec's width/height are currently ignored in favor
        # of a fixed size — confirm whether they should be honored.
        size = "1024x1024"

        # Assemble parameters
        params = dict(
            model="gpt-image-1",
            prompt=prompt_text,
            n=1,
            size=size,
        )
        # Transparent background: may or may not be supported; retried without
        # the parameter below if the SDK rejects it.
        if transparent_bg:
            params["background"] = "transparent"

        try:
            resp = self.client.images.generate(**params)  # Official images.generate call
        except Exception:
            # Retry once without the background parameter if it was the culprit
            if transparent_bg:
                params.pop("background", None)
                resp = self.client.images.generate(**params)
            else:
                raise

        # Parse response: gpt-image-1 returns Base64-encoded image data.
        b64_data = resp.data[0].b64_json
        image_bytes = base64.b64decode(b64_data)

        # Metadata (for reproducibility/auditing)
        meta = {
            "model": "gpt-image-1",
            "size": size,
            "prompt_sent": prompt_text,
            "transparent_bg": transparent_bg,
            "llm_image_spec": image_spec,
        }

        # FIX: save_meta was accepted but silently ignored; persist the
        # metadata to out_dir when requested.
        if save_meta:
            meta_path = self.out_dir / f"{filename_prefix}_{uuid.uuid4().hex[:8]}.json"
            meta_path.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        return {"image": img, "meta": meta, "path": None}
97
+
98
+ # ============ Integration with Main Pipeline (Example) ============
99
+
100
def generate_image_with_openai_from_llm_spec(spec: Dict, out_dir: str = "outputs", openai_key=None) -> Dict:
    """
    Convenience wrapper: feed the full spec returned by call_llm_structured,
    i.e. {"image": {...}, "audio": {...}, "debug": {...}}, and render the
    illustration described by its "image" section.
    """
    generator = OpenAIImageGenerator(
        out_dir=out_dir,
        client=OpenAI(api_key=openai_key),
    )
    return generator.generate_from_spec(spec["image"], transparent_bg=False, filename_prefix="gptimg")
llm/__pycache__/call_llm.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
llm/call_llm.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ from openai import OpenAI
3
+ from pydantic import BaseModel
4
+
5
+
6
+ from typing import List, Optional, Union, Dict
7
+ from pydantic import BaseModel, Field, ValidationError
8
+ import json
9
+ import logging
10
+
11
+
12
+
13
+ # --- Pydantic schemas ---
14
class ImageSpec(BaseModel):
    """Image-generation parameters produced by the LLM."""
    prompt: str = Field(..., description="Positive prompts")
    negative_prompt: str = Field(default="", description="Negative prompts")
    style: List[str] = Field(default_factory=list)
    width: int = 1024
    height: int = 1024
    # NOTE(review): steps/cfg_scale/sampler/seed look like Stable-Diffusion
    # knobs; the OpenAI image backend in this repo does not read them — confirm
    # whether they are intentionally kept for schema compatibility.
    steps: int = 30
    cfg_scale: float = 6.5
    sampler: Optional[str] = "DPM++ 2M Karras"
    seed: Union[str, int] = "random"
24
+
25
class AudioSpec(BaseModel):
    """Narration parameters produced by the LLM."""
    text: str  # the exact narration script to read aloud
    voice: str  # OpenAI TTS voice name (e.g. "alloy")
    speed: float = 1.0  # speaking-speed multiplier
29
+
30
class UsedChunk(BaseModel):
    """A retrieved document chunk the LLM reports having used."""
    id: Union[int, str]  # chunk id as provided in the retrieval snippets
    excerpt: str  # excerpt text shown to the LLM
33
+
34
class DebugInfo(BaseModel):
    """Optional diagnostics from the LLM.

    NOTE(review): the system prompt in app.py asks for a `debug.reasoning`
    string, which this schema does not define — verify the prompt and schema
    are meant to match.
    """
    used_chunks: List[UsedChunk] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
37
+
38
class GenerationSpec(BaseModel):
    """Top-level structured LLM output: one image spec, one audio spec, debug info."""
    image: ImageSpec
    audio: AudioSpec
    debug: DebugInfo = Field(default_factory=DebugInfo)
42
+
43
+ # --- Single-path structured call (no fallback) ---
44
def call_llm_structured(
    system_prompt: str,
    user_prompt: str,
    model: str = "gpt-5-nano",
    openai_key=None
) -> Dict:
    """
    Call the OpenAI Responses API and parse the answer straight into the
    GenerationSpec schema. No fallback path: a RuntimeError is raised when
    nothing could be parsed.
    """
    client = OpenAI(api_key=openai_key)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.responses.parse(
        model=model,
        input=messages,
        text_format=GenerationSpec,  # enforce schema at the API level
    )

    parsed = getattr(response, "output_parsed", None)
    if parsed is None:
        # Optionally include response for easier debugging
        raise RuntimeError("LLM did not return a parsed result (output_parsed=None).")

    # Pydantic v2 exposes model_dump(); v1 only has dict()
    if hasattr(parsed, "model_dump"):
        return parsed.model_dump()
    return parsed.dict()
rag/__pycache__/extract_text.cpython-310.pyc ADDED
Binary file (695 Bytes). View file
 
rag/__pycache__/rag.cpython-310.pyc ADDED
Binary file (8 kB). View file
 
rag/extract_text.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pdfplumber
3
+
4
+
5
def extract_text(path: str) -> str:
    """Return the plain-text contents of a .txt/.md/.pdf file.

    Text files are read as UTF-8 (undecodable bytes ignored); PDFs are joined
    page-by-page via pdfplumber. Any other extension raises ValueError.
    """
    src = Path(path)
    ext = src.suffix.lower()
    if ext in (".txt", ".md"):
        return src.read_text(encoding="utf-8", errors="ignore")
    if ext == ".pdf":
        pages = []
        with pdfplumber.open(str(src)) as pdf:
            for page in pdf.pages:
                pages.append(page.extract_text() or "")
        return "\n".join(pages)
    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {src.suffix}")
rag/rag.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pip install openai faiss-cpu tiktoken numpy
2
+
3
+ from __future__ import annotations
4
+ import os, time, math
5
+ from typing import List, Dict, Any
6
+ from dataclasses import dataclass
7
+ import numpy as np
8
+ import faiss
9
+ import tiktoken
10
+ from openai import OpenAI
11
+ import re
12
+
13
+
14
+
15
+ # ========= Basic Utilities =========
16
+
17
def l2_normalize(mat: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalization so inner products become cosine similarities.

    A tiny epsilon keeps all-zero rows from dividing by zero.
    """
    denom = np.sqrt((mat * mat).sum(axis=1, keepdims=True)) + 1e-12
    return mat / denom
21
+
22
def batch(iterable, n=128):
    """Yield successive lists of at most *n* items from *iterable*.

    The final list may be shorter; an empty iterable yields nothing.
    """
    bucket = []
    for item in iterable:
        bucket.append(item)
        if len(bucket) >= n:
            yield bucket
            bucket = []
    if bucket:  # flush the shorter tail, if any
        yield bucket
32
+
33
+
34
+ # ========= OpenAI Embeddings RAG =========
35
+
36
@dataclass
class Chunk:
    """A single chunk of the document, with token offsets for traceability."""
    id: int  # sequential chunk id (0-based, assigned in document order)
    text: str  # decoded chunk text; always a str, never None
    start_token: int  # inclusive token offset into the full document
    end_token: int  # exclusive token offset into the full document
43
+
44
class OpenAIEmbedRAG:
    """
    Retrieval module using OpenAI Embeddings + FAISS (IP over L2-normalized vectors = cosine).
    Design notes:
    - Single-pass tokenization for the whole document (no repeated encode/decode).
    - Chunk.text is ALWAYS a string (never None) to avoid downstream NoneType errors.
    - Graceful degradation: empty input => no index; search() returns [].
    - Optional MMR re-ranking (diversity) via mmr_search().
    """
    def __init__(self,
                 model: str = "text-embedding-3-small",
                 chunk_size_tokens: int = 800,
                 overlap_tokens: int = 100,
                 batch_size: int = 256,
                 openai_key=None):
        self.client = OpenAI(api_key=openai_key)
        self.model = model
        self.batch_size = batch_size
        self.enc = tiktoken.get_encoding("cl100k_base")  # Tokenizer for embedding-3 models
        self.chunk_size = max(1, int(chunk_size_tokens))
        self.overlap = max(0, int(overlap_tokens))
        if self.overlap >= self.chunk_size:
            # Ensure forward progress: overlap must be smaller than chunk size
            self.overlap = max(0, self.chunk_size // 4)

        self._doc_token_ids: List[int] | None = None
        self.chunks: List[Chunk] = []
        self.index: faiss.IndexFlatIP | None = None
        self._emb_dim: int | None = None
        self._emb_matrix: np.ndarray | None = None  # store chunk embeddings for MMR / analysis

    # ---- Text cleaning ----
    def _clean_text(self, text: str) -> str:
        """
        Light normalization:
        - Collapse consecutive whitespace to a single space.
        - Remove non-printable control chars (keep \n and \t).
        - Trim leading/trailing spaces.
        """
        text = re.sub(r"\s+", " ", text or "")
        text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\t")
        return text.strip()

    # ---- Tokenization helpers ----
    def _tokenize(self, text: str) -> List[int]:
        # Encode to cl100k_base token ids
        return self.enc.encode(text)

    def _detokenize(self, ids: List[int]) -> str:
        # Decode token ids back to text
        return self.enc.decode(ids)

    # ---- Chunking (by tokens) ----
    # It is possible to use dynamic chunking, however to constraint cost, we use fixed size chunking


    def chunk_text(self, text: str) -> List[Chunk]:
        """
        Tokenize once and create overlapping windows of token ids.
        Each Chunk stores its decoded text and token offsets.
        """
        self._doc_token_ids = self._tokenize(text)
        total = len(self._doc_token_ids)
        chunks: List[Chunk] = []
        if total == 0:
            return chunks

        print(f"[RAG] Total tokens: {total}. Chunk size: {self.chunk_size}, overlap: {self.overlap}")

        # stride > 0 is guaranteed by the overlap clamp in __init__
        stride = self.chunk_size - self.overlap
        i, cid = 0, 0
        while i < total:
            j = min(i + self.chunk_size, total)
            ids_slice = self._doc_token_ids[i:j]
            txt = self._detokenize(ids_slice)
            chunks.append(Chunk(id=cid, text=txt, start_token=i, end_token=j))
            cid += 1
            if j == total:
                break
            i += stride  # always moves forward
        return chunks

    # ---- OpenAI Embeddings (batched) ----
    def _embed_texts(self, texts: List[str], max_retries=3) -> np.ndarray:
        """
        Call OpenAI Embeddings with encoding_format='float'.
        Returns a float32 matrix with rows aligned to input order.

        NOTE(review): implicitly returns None if max_retries <= 0 — confirm
        callers never pass a non-positive value.
        """
        for attempt in range(max_retries):
            try:
                resp = self.client.embeddings.create(
                    model=self.model,
                    input=texts,
                    encoding_format="float",
                )
                # Re-order by item.index so rows align with the input order
                vecs = [None] * len(resp.data)
                for item in resp.data:
                    vecs[item.index] = np.array(item.embedding, dtype=np.float32)
                arr = np.vstack(vecs)
                if self._emb_dim is None:
                    self._emb_dim = arr.shape[1]
                return arr
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                # simple exponential backoff
                time.sleep(0.8 * (attempt + 1))

    # ---- Build FAISS index ----
    def build(self, text: str):
        """
        Clean -> chunk -> embed -> build an IP index on normalized vectors.
        Graceful if text is empty: index remains None and chunks empty.
        """
        text = self._clean_text(text)
        self.chunks = self.chunk_text(text)
        if not self.chunks:
            self.index = None
            self._emb_matrix = None
            return

        all_vecs = []
        # Embed chunk texts in batches
        for chunk_batch in batch([c.text for c in self.chunks], n=self.batch_size):
            arr = self._embed_texts(chunk_batch)
            all_vecs.append(arr)

        mat = np.vstack(all_vecs).astype(np.float32)
        mat = l2_normalize(mat)
        self._emb_matrix = mat  # keep for MMR / diagnostics

        # Inner-product index over normalized rows == cosine similarity
        self.index = faiss.IndexFlatIP(mat.shape[1])
        self.index.add(mat)

    # ---- Plain vector search ----
    def search(self, query: str, topk: int = 6) -> List[Dict[str, Any]]:
        """
        Return top-k chunks by cosine similarity (via IP on normalized vectors).
        If the index hasn't been built or the doc is empty, returns [].
        """
        if not self.index or not self.chunks:
            return []

        q = self._clean_text(query)
        if not q:
            return []

        qv = self._embed_texts([q])
        qv = l2_normalize(qv)
        D, I = self.index.search(qv.astype(np.float32), max(1, int(topk)))
        results = []
        for rank, idx in enumerate(I[0]):
            # FAISS pads missing results with -1 when topk exceeds the index size
            if idx == -1:
                continue
            ch = self.chunks[int(idx)]
            results.append({
                "id": ch.id,
                "score": float(D[0][rank]),
                "text": ch.text,
                "start_token": ch.start_token,
                "end_token": ch.end_token
            })
        return results

    # ---- Optional: MMR search (diversified) ----
    def mmr_search(self, query: str, topk: int = 6, fetch_k: int | None = None, lambda_mult: float = 0.5) -> List[Dict[str, Any]]:
        """
        Maximal Marginal Relevance.
        - fetch_k: number of initial candidates to consider (defaults to 4*topk).
        - lambda_mult in [0,1]: 1 emphasizes relevance; 0 emphasizes diversity.
        """
        if self._emb_matrix is None or not self.chunks:
            return []

        q = self._clean_text(query)
        if not q:
            return []

        qv = l2_normalize(self._embed_texts([q]))[0]  # (d,)
        # Precompute query-to-chunk relevance
        rel = self._emb_matrix @ qv  # (N,)

        N = len(self.chunks)
        k = max(1, int(topk))
        m = min(N, int(fetch_k) if fetch_k else min(N, 4 * k))

        # Get top-m by relevance
        cand_idx = np.argpartition(-rel, m-1)[:m]
        cand_idx = cand_idx[np.argsort(-rel[cand_idx])]  # sort by relevance

        selected: List[int] = []
        selected_set = set()

        for _ in range(min(k, m)):
            # Seed with the single most relevant candidate
            if not selected:
                best = int(cand_idx[0])
                selected.append(best)
                selected_set.add(best)
                continue

            # Diversity term: max similarity to items already selected
            S = self._emb_matrix[selected]  # (s, d)
            # compute max cosine sim to the selected set for each candidate
            # (S @ cand.T) => for each candidate's vector v, max over s rows
            cand_vecs = self._emb_matrix[cand_idx]  # (m, d)
            sims = cand_vecs @ S.T  # (m, s)
            max_sims = sims.max(axis=1)  # (m,)

            # MMR objective
            scores = lambda_mult * rel[cand_idx] - (1 - lambda_mult) * max_sims
            # pick best candidate not yet selected
            order = np.argsort(-scores)
            for j in order:
                idx_j = int(cand_idx[j])
                if idx_j not in selected_set:
                    selected.append(idx_j)
                    selected_set.add(idx_j)
                    break

        # Format results in the same structure as search()
        out = []
        for idx in selected:
            ch = self.chunks[idx]
            out.append({
                "id": ch.id,
                "score": float(rel[idx]),
                "text": ch.text,
                "start_token": ch.start_token,
                "end_token": ch.end_token
            })
        return out
273
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ faiss_cpu
2
+ gradio==5.44.1
3
+ numpy<2.0
4
+ openai
5
+ pdfplumber==0.11.7
6
+ Pillow
7
+ pydantic
8
+ tiktoken==0.11.0