Commit ·
5c9f0d9
0
Parent(s):
Add audio generation
Browse files- .gitignore +0 -0
- README.md +62 -0
- __init__.py +0 -0
- app.py +198 -0
- generation/__pycache__/gen_audio.cpython-310.pyc +0 -0
- generation/__pycache__/gen_img.cpython-310.pyc +0 -0
- generation/gen_audio.py +104 -0
- generation/gen_img.py +113 -0
- llm/__pycache__/call_llm.cpython-310.pyc +0 -0
- llm/call_llm.py +72 -0
- rag/__pycache__/extract_text.cpython-310.pyc +0 -0
- rag/__pycache__/rag.cpython-310.pyc +0 -0
- rag/extract_text.py +16 -0
- rag/rag.py +273 -0
- requirements.txt +8 -0
.gitignore
ADDED
|
Binary file (22 Bytes). View file
|
|
|
README.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Doc Alive - RAG to Image
|
| 3 |
+
emoji: 📦🎨
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "5.44.1"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 📦→🧠→🎨 Doc Alive: RAG-to-Image with OpenAI
|
| 13 |
+
|
| 14 |
+
This project turns documents into **illustrations** with the help of RAG (Retrieval-Augmented Generation), LLM prompt engineering, and OpenAI’s image generation.
|
| 15 |
+
|
| 16 |
+
Upload a `.txt`, `.md`, or `.pdf` file, describe your goal, and the app will:
|
| 17 |
+
1. **Extract text** from your file
|
| 18 |
+
2. **Retrieve key excerpts** using embeddings
|
| 19 |
+
3. **Ask an LLM** to craft a structured image generation spec
|
| 20 |
+
4. **Generate an illustration** with OpenAI’s image model
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## 🚀 Demo
|
| 25 |
+
|
| 26 |
+
This app runs on **Hugging Face Spaces** using **Gradio**.
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## 🔑 API Key
|
| 31 |
+
|
| 32 |
+
You must provide your own **OpenAI API key** to use this demo.
|
| 33 |
+
- Enter your key in the input box (starts with `sk-...`)
|
| 34 |
+
- The key is **not stored** — it is only used in memory for your current session
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## 📂 Project Structure
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
├─ app.py # Gradio UI entry
|
| 42 |
+
├─ requirements.txt # Dependencies
|
| 43 |
+
├─ rag/ # Text extraction + retrieval
|
| 44 |
+
├─ llm/ # Structured LLM call helper
|
| 45 |
+
├─ generation/ # Image generation helper
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
---
|
| 49 |
+
|
| 50 |
+
## 🛠 Tech Stack
|
| 51 |
+
|
| 52 |
+
- [Gradio](https://www.gradio.app/) – UI framework
|
| 53 |
+
- [OpenAI](https://platform.openai.com/) – LLM + image generation
|
| 54 |
+
- [RAG (text-embedding-3-small)](https://platform.openai.com/docs/guides/embeddings) – semantic retrieval
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## ⚠️ Notes
|
| 59 |
+
|
| 60 |
+
- The OpenAI API key is required for both embeddings and image generation
|
| 61 |
+
- We do **not** log or save your key
|
| 62 |
+
- API calls made with your key are billed to your OpenAI account according to your usage
|
__init__.py
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Dict, Any
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
from rag.extract_text import extract_text
|
| 8 |
+
from rag.rag import OpenAIEmbedRAG
|
| 9 |
+
from llm.call_llm import call_llm_structured
|
| 10 |
+
from generation.gen_img import generate_image_with_openai_from_llm_spec
|
| 11 |
+
from generation.gen_audio import generate_audio_with_openai_from_llm_spec
|
| 12 |
+
|
| 13 |
+
# ---------- Output directory ----------
# Destination for generated artifacts (image/audio metadata dumps).
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # idempotent; safe on repeated app restarts
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ---------- Helpers ----------
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Serialize retrieved chunks into a compact JSON array of {id, excerpt}.

    Non-dict entries and entries without a non-empty "text" are skipped.
    If nothing usable is retrieved, falls back to the first `limit` characters
    of the raw document so the LLM always receives some context.
    """
    snippets = [
        {"id": h.get("id", idx), "excerpt": h["text"][:limit]}
        for idx, h in enumerate(hits or [])
        if isinstance(h, dict) and h.get("text")
    ]
    if not snippets:
        fallback = (raw_fallback or "")[:limit]
        if fallback:
            snippets = [{"id": 0, "excerpt": fallback}]
    return json.dumps(snippets, ensure_ascii=False)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------- Core pipeline ----------
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.

    Steps: extract text -> embed & retrieve (RAG) -> LLM crafts a structured
    image/audio spec -> generate an image and a narration audio.

    Returns a 4-tuple consumed by the Gradio outputs:
    (image object, raw audio bytes, pretty dict, raw JSON string of that dict).

    Raises ValueError when the key is missing/blank; propagates any error from
    extraction, embedding, the LLM call, or generation.

    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """

    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search)
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)

    # Falls back to the head of the raw document when retrieval returns nothing.
    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    # NOTE: the schema described below is enforced server-side via
    # call_llm_structured (responses.parse with a pydantic model).
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.

Return a JSON object strictly matching this schema:

{
  "image": {
    "prompt": "string, a detailed description of what the image should show",
    "negative_prompt": "string, optional description of what to avoid",
    "style": ["string", ...], // optional styles like "cinematic", "oil painting"
    "width": int, // optional, default 1024
    "height": int // optional, default 1024
  },
  "audio": {
    "text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
    "voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
    "speed": float, // optional, default 1.0
  },
  "debug": {
    "reasoning": "string, brief reasoning why you designed the prompts this way"
  }
}

Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
  not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()

    user_prompt = f"""
Goal: {user_goal}

Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}

Now produce the JSON object strictly following the schema.
""".strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )

    print(spec)  # NOTE(review): debug print of the full spec; consider logging instead

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ---------- Gradio UI ----------
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio click handler: validate the upload, then delegate to run_pipeline.

    Any failure is surfaced as an {"error": ...} dict in the JSON output slot
    rather than crashing the UI; image/audio slots are left empty.
    """
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        result = run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as exc:
        return None, None, {"error": str(exc)}, ""
    return result
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# Build the Gradio UI. Layout: two rows of inputs, a run button, then the
# four output widgets wired to ui_pipeline's 4-tuple return.
with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# 📦→🧠→🎨+🔊 Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )

    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )

    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",  # masked in the UI; the value is only held in memory
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )

    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    # Original comment (translated): "changed to numpy".
    # NOTE(review): type="numpy" expects (sample_rate, ndarray), but
    # run_pipeline returns raw mp3 bytes — confirm this renders correctly.
    out_audio = gr.Audio(label="Generated audio", type="numpy")
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")

    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()
|
generation/__pycache__/gen_audio.cpython-310.pyc
ADDED
|
Binary file (3.23 kB). View file
|
|
|
generation/__pycache__/gen_img.cpython-310.pyc
ADDED
|
Binary file (3.52 kB). View file
|
|
|
generation/gen_audio.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json, uuid
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Dict, Optional
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# ============ Utility Functions ============
|
| 8 |
+
|
| 9 |
+
def _build_openai_tts_prompt(text: str,
|
| 10 |
+
style: Optional[str] = None,
|
| 11 |
+
speed: Optional[float] = None) -> str:
|
| 12 |
+
"""Merge text, style, and other options into a single TTS input string."""
|
| 13 |
+
parts = [text.strip()]
|
| 14 |
+
if style:
|
| 15 |
+
parts.append(f"Style: {style.strip()}")
|
| 16 |
+
if speed:
|
| 17 |
+
parts.append(f"Speaking speed: {speed}")
|
| 18 |
+
return " ".join([p for p in parts if p])
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ============ Generator Wrapper ============

class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).

    The audio is streamed to a temp file under out_dir, read back into memory,
    and the file is deleted; callers receive raw bytes plus a metadata dict.
    """

    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        # Falls back to an env-configured client when none is injected.
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech for one LLM audio spec.

        audio_spec example:
        {
            "text": "Hello, world!",
            "voice": "alloy",
            "speed": 1.0,
        }

        Returns {"audio_bytes": bytes, "meta": dict}.
        Raises RuntimeError wrapping any SDK failure.
        """

        # NOTE(review): style/speed are folded into the text prompt rather than
        # passed as API parameters (the speech endpoint has a `speed` param) —
        # confirm this is intentional for gpt-4o-mini-tts.
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed")
        )

        voice = audio_spec.get("voice", "alloy")
        # NOTE(review): fmt only affects the file extension; it is not sent as
        # response_format, so non-mp3 values would mislabel the output — confirm.
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        # Random suffix avoids collisions between concurrent requests.
        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
            ) as response:
                response.stream_to_file(file_path)

            # Read the bytes back, then delete the temp file.
            audio_bytes = file_path.read_bytes()
            os.remove(file_path)

        except Exception as e:
            raise RuntimeError(f"OpenAI Audio generation failed: {e}")

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }

        if save_meta:
            # Metadata sidecar shares the audio file's stem (audio itself was deleted above).
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ============ Integration with Main Pipeline (Example) ============

def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key=None) -> Dict:
    """
    Convenience wrapper: pull the "audio" section out of the structured spec
    returned by call_llm_structured and synthesize it.

    spec = {"image": {...}, "audio": {...}, "debug": {...}}
    """
    generator = OpenAIAudioGenerator(
        out_dir=out_dir,
        client=OpenAI(api_key=openai_key),
    )
    return generator.generate_from_spec(spec["audio"], filename_prefix="gptaudio")
|
generation/gen_img.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, base64, json, uuid, math, io
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Dict, List, Tuple, Optional
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from PIL import Image
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# ============ Utility Functions ============
|
| 11 |
+
|
| 12 |
+
def _build_openai_prompt(prompt: str,
|
| 13 |
+
styles: List[str] | None,
|
| 14 |
+
negative_prompt: str | None) -> str:
|
| 15 |
+
"""Merge positive prompt / style / negative prompt into one natural language prompt suitable for gpt-image-1."""
|
| 16 |
+
parts = [prompt.strip()]
|
| 17 |
+
if styles:
|
| 18 |
+
parts.append(", ".join([s.strip() for s in styles if s.strip()]))
|
| 19 |
+
# OpenAI does not have a separate parameter for negative prompts; phrasing in natural language is safer
|
| 20 |
+
if negative_prompt and negative_prompt.strip():
|
| 21 |
+
parts.append(f"\nAvoid: {negative_prompt.strip()}.")
|
| 22 |
+
return " ".join([p for p in parts if p])
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ============ Generator Wrapper ============

class OpenAIImageGenerator:
    """
    Generate images using the OpenAI Images API (gpt-image-1).

    The API returns the image base64-encoded (data[0].b64_json); it is decoded
    and handed back as an in-memory PIL image. Nothing touches disk unless
    ``save_meta`` is requested, in which case a JSON metadata sidecar is
    written under ``out_dir``.
    """
    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        # Falls back to an env-configured client when none is injected.
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)  # only used for optional metadata dumps
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self, image_spec: Dict,
                           transparent_bg: bool = False,
                           filename_prefix: str = "img",
                           save_meta=False) -> Dict:
        """
        Generate one image from the LLM's image spec.

        image_spec should follow the JSON output from your LLM, e.g.:
        {
            "prompt": "...",
            "negative_prompt": "...",
            "style": ["photorealistic","cinematic"],
            "width": 1024, "height": 1024
        }

        Returns {"image": PIL image, "meta": dict, "path": None}.
        Raises whatever the OpenAI SDK raises on failure.
        """
        prompt_text = _build_openai_prompt(
            image_spec.get("prompt", ""),
            image_spec.get("style", []),
            image_spec.get("negative_prompt", ""),
        )

        # Fixed size for cost predictability; spec width/height are ignored here.
        size = "1024x1024"

        params = dict(
            model="gpt-image-1",
            prompt=prompt_text,
            n=1,
            size=size,
        )
        # Transparent background may be rejected by some model/SDK versions;
        # if so, the except branch below retries without it.
        if transparent_bg:
            params["background"] = "transparent"

        try:
            resp = self.client.images.generate(**params)
        except Exception:
            # Retry once without the background parameter if it was the culprit.
            if transparent_bg:
                params.pop("background", None)
                resp = self.client.images.generate(**params)
            else:
                raise

        # gpt-image-1 responses carry base64 JSON; decode into raw PNG bytes.
        image_bytes = base64.b64decode(resp.data[0].b64_json)

        meta = {
            "model": "gpt-image-1",
            "size": size,
            "prompt_sent": prompt_text,
            "transparent_bg": transparent_bg,
            "llm_image_spec": image_spec,
        }

        # BUGFIX: save_meta (and filename_prefix) were previously accepted but
        # never acted on — meta was computed and then silently discarded.
        if save_meta:
            meta_path = self.out_dir / f"{filename_prefix}_{uuid.uuid4().hex[:8]}.json"
            meta_path.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        return {"image": img, "meta": meta, "path": None}
|
| 97 |
+
|
| 98 |
+
# ============ Integration with Main Pipeline (Example) ============

def generate_image_with_openai_from_llm_spec(spec: Dict, out_dir: str = "outputs", openai_key=None) -> Dict:
    """
    Convenience wrapper: pull the "image" section out of the structured spec
    returned by call_llm_structured and render it.

    spec = {"image": {...}, "audio": {...}, "debug": {...}}
    """
    generator = OpenAIImageGenerator(
        out_dir=out_dir,
        client=OpenAI(api_key=openai_key),
    )
    return generator.generate_from_spec(spec["image"], transparent_bg=False, filename_prefix="gptimg")
|
llm/__pycache__/call_llm.cpython-310.pyc
ADDED
|
Binary file (2.75 kB). View file
|
|
|
llm/call_llm.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
|
| 2 |
+
from openai import OpenAI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional, Union, Dict
|
| 7 |
+
from pydantic import BaseModel, Field, ValidationError
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# --- Pydantic schemas ---
class ImageSpec(BaseModel):
    """Image half of the generation spec returned by the LLM."""
    prompt: str = Field(..., description="Positive prompts")
    negative_prompt: str = Field(default="", description="Negative prompts")
    style: List[str] = Field(default_factory=list)
    width: int = 1024
    height: int = 1024
    # Diffusion-style knobs kept for forward compatibility;
    # NOTE(review): the OpenAI image backend does not read them — confirm.
    steps: int = 30
    cfg_scale: float = 6.5
    sampler: Optional[str] = "DPM++ 2M Karras"
    seed: Union[str, int] = "random"

class AudioSpec(BaseModel):
    """Narration half of the generation spec."""
    text: str           # narration script to be read aloud
    voice: str          # OpenAI TTS voice name (e.g. "alloy")
    speed: float = 1.0  # requested speaking-speed multiplier

class UsedChunk(BaseModel):
    """One retrieved excerpt the LLM reports having used."""
    id: Union[int, str]
    excerpt: str

class DebugInfo(BaseModel):
    """Optional diagnostics emitted alongside the spec."""
    used_chunks: List[UsedChunk] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    # BUGFIX: the system prompt in app.py asks for a `debug.reasoning` string,
    # but this enforced schema had no such field, so the value was silently
    # dropped by responses.parse. Default keeps the change backward-compatible.
    reasoning: str = ""

class GenerationSpec(BaseModel):
    """Top-level structured output parsed via client.responses.parse."""
    image: ImageSpec
    audio: AudioSpec
    debug: DebugInfo = Field(default_factory=DebugInfo)
|
| 42 |
+
|
| 43 |
+
# --- Single-path structured call (no fallback) ---
def call_llm_structured(
    system_prompt: str,
    user_prompt: str,
    model: str = "gpt-5-nano",
    openai_key=None
) -> Dict:
    """
    Call the OpenAI Responses API and parse the reply directly into the
    GenerationSpec schema.

    Raises RuntimeError when the API returns no parsed object; no fallback
    paths are attempted.
    """
    client = OpenAI(api_key=openai_key)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.responses.parse(
        model=model,
        input=messages,
        text_format=GenerationSpec,  # enforce schema at the API level
    )

    parsed = getattr(response, "output_parsed", None)
    if parsed is None:
        # Optionally include response for easier debugging
        raise RuntimeError("LLM did not return a parsed result (output_parsed=None).")

    # Pydantic v2 exposes model_dump(); v1 exposes dict().
    dump = getattr(parsed, "model_dump", None)
    return dump() if dump is not None else parsed.dict()
|
rag/__pycache__/extract_text.cpython-310.pyc
ADDED
|
Binary file (695 Bytes). View file
|
|
|
rag/__pycache__/rag.cpython-310.pyc
ADDED
|
Binary file (8 kB). View file
|
|
|
rag/extract_text.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import pdfplumber
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def extract_text(path: str) -> str:
    """Return the plain-text content of a .txt/.md/.pdf file.

    Text files are read as UTF-8 with undecodable bytes ignored; PDFs are
    extracted page by page (blank pages become empty strings) and joined
    with newlines. Raises ValueError for any other extension.
    """
    source = Path(path)
    suffix = source.suffix.lower()
    if suffix in (".txt", ".md"):
        return source.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        pages = []
        with pdfplumber.open(str(source)) as pdf:
            for page in pdf.pages:
                pages.append(page.extract_text() or "")
        return "\n".join(pages)
    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {source.suffix}")
|
rag/rag.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pip install openai faiss-cpu tiktoken numpy
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
import os, time, math
|
| 5 |
+
from typing import List, Dict, Any
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
import numpy as np
|
| 8 |
+
import faiss
|
| 9 |
+
import tiktoken
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
import re
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ========= Basic Utilities =========

def l2_normalize(mat: np.ndarray) -> np.ndarray:
    """Scale every row of *mat* to (near) unit L2 length.

    A tiny epsilon keeps all-zero rows from dividing by zero, so inner
    products over the result behave like cosine similarity.
    """
    row_norms = np.sqrt((mat * mat).sum(axis=1, keepdims=True)) + 1e-12
    return mat / row_norms
|
| 21 |
+
|
| 22 |
+
def batch(iterable, n=128):
    """Yield successive lists of up to *n* items from *iterable*.

    The final list may be shorter; an empty iterable yields nothing.
    """
    pending = []
    for item in iterable:
        pending.append(item)
        # ">=" (not "==") preserves the original behavior for n <= 0.
        if len(pending) >= n:
            yield pending
            pending = []
    if pending:
        yield pending
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ========= OpenAI Embeddings RAG =========

@dataclass
class Chunk:
    """A single chunk of the document, with token offsets for traceability."""
    id: int           # sequential chunk id, assigned in document order
    text: str         # decoded chunk text (always a str, never None)
    start_token: int  # inclusive start offset into the document token ids
    end_token: int    # exclusive end offset into the document token ids
|
| 43 |
+
|
| 44 |
+
class OpenAIEmbedRAG:
|
| 45 |
+
"""
|
| 46 |
+
Retrieval module using OpenAI Embeddings + FAISS (IP over L2-normalized vectors = cosine).
|
| 47 |
+
Design notes:
|
| 48 |
+
- Single-pass tokenization for the whole document (no repeated encode/decode).
|
| 49 |
+
- Chunk.text is ALWAYS a string (never None) to avoid downstream NoneType errors.
|
| 50 |
+
- Graceful degradation: empty input => no index; search() returns [].
|
| 51 |
+
- Optional MMR re-ranking (diversity) via mmr_search().
|
| 52 |
+
"""
|
| 53 |
+
def __init__(self,
             model: str = "text-embedding-3-small",
             chunk_size_tokens: int = 800,
             overlap_tokens: int = 100,
             batch_size: int = 256,
             openai_key=None):
    """Configure the embedding client, tokenizer, and chunking parameters.

    chunk_size_tokens is clamped to >= 1 and overlap_tokens to >= 0; if
    overlap would be >= chunk size it is reduced so chunking always advances.
    """
    self.client = OpenAI(api_key=openai_key)
    self.model = model
    self.batch_size = batch_size
    self.enc = tiktoken.get_encoding("cl100k_base")  # Tokenizer for embedding-3 models
    self.chunk_size = max(1, int(chunk_size_tokens))
    self.overlap = max(0, int(overlap_tokens))
    if self.overlap >= self.chunk_size:
        # Ensure forward progress: overlap must be smaller than chunk size
        self.overlap = max(0, self.chunk_size // 4)

    # Lazily populated by chunk_text()/build(); None/empty until then.
    self._doc_token_ids: List[int] | None = None
    self.chunks: List[Chunk] = []
    self.index: faiss.IndexFlatIP | None = None
    self._emb_dim: int | None = None
    self._emb_matrix: np.ndarray | None = None  # store chunk embeddings for MMR / analysis
|
| 74 |
+
|
| 75 |
+
# ---- Text cleaning ----
|
| 76 |
+
def _clean_text(self, text: str) -> str:
|
| 77 |
+
"""
|
| 78 |
+
Light normalization:
|
| 79 |
+
- Collapse consecutive whitespace to a single space.
|
| 80 |
+
- Remove non-printable control chars (keep \n and \t).
|
| 81 |
+
- Trim leading/trailing spaces.
|
| 82 |
+
"""
|
| 83 |
+
text = re.sub(r"\s+", " ", text or "")
|
| 84 |
+
text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\t")
|
| 85 |
+
return text.strip()
|
| 86 |
+
|
| 87 |
+
# ---- Tokenization helpers ----
def _tokenize(self, text: str) -> List[int]:
    # Encode text into cl100k_base token ids (tokenizer chosen in __init__).
    return self.enc.encode(text)

def _detokenize(self, ids: List[int]) -> str:
    # Inverse of _tokenize: decode token ids back into text.
    return self.enc.decode(ids)
| 93 |
+
|
| 94 |
+
# ---- Chunking (by tokens) ----
# It is possible to use dynamic chunking, however to constraint cost, we use fixed size chunking


def chunk_text(self, text: str) -> List[Chunk]:
    """
    Tokenize the document once, then slide an overlapping token window over it.

    Each Chunk carries its decoded text plus [start_token, end_token) offsets
    so retrieved excerpts can be traced back to the source. Empty input
    yields an empty list.
    """
    self._doc_token_ids = self._tokenize(text)
    total = len(self._doc_token_ids)
    windows: List[Chunk] = []
    if total == 0:
        return windows

    print(f"[RAG] Total tokens: {total}. Chunk size: {self.chunk_size}, overlap: {self.overlap}")

    step = self.chunk_size - self.overlap  # > 0 is guaranteed by __init__
    start, next_id = 0, 0
    while start < total:
        stop = min(start + self.chunk_size, total)
        window_ids = self._doc_token_ids[start:stop]
        windows.append(Chunk(id=next_id,
                             text=self._detokenize(window_ids),
                             start_token=start,
                             end_token=stop))
        next_id += 1
        if stop == total:
            break
        start += step  # always moves forward
    return windows
|
| 123 |
+
|
| 124 |
+
# ---- OpenAI Embeddings (batched) ----
|
| 125 |
+
def _embed_texts(self, texts: List[str], max_retries=3) -> np.ndarray:
|
| 126 |
+
"""
|
| 127 |
+
Call OpenAI Embeddings with encoding_format='float'.
|
| 128 |
+
Returns a float32 matrix with rows aligned to input order.
|
| 129 |
+
"""
|
| 130 |
+
for attempt in range(max_retries):
|
| 131 |
+
try:
|
| 132 |
+
resp = self.client.embeddings.create(
|
| 133 |
+
model=self.model,
|
| 134 |
+
input=texts,
|
| 135 |
+
encoding_format="float",
|
| 136 |
+
)
|
| 137 |
+
vecs = [None] * len(resp.data)
|
| 138 |
+
for item in resp.data:
|
| 139 |
+
vecs[item.index] = np.array(item.embedding, dtype=np.float32)
|
| 140 |
+
arr = np.vstack(vecs)
|
| 141 |
+
if self._emb_dim is None:
|
| 142 |
+
self._emb_dim = arr.shape[1]
|
| 143 |
+
return arr
|
| 144 |
+
except Exception as e:
|
| 145 |
+
if attempt == max_retries - 1:
|
| 146 |
+
raise
|
| 147 |
+
# simple exponential backoff
|
| 148 |
+
time.sleep(0.8 * (attempt + 1))
|
| 149 |
+
|
| 150 |
+
# ---- Build FAISS index ----
|
| 151 |
+
def build(self, text: str):
|
| 152 |
+
"""
|
| 153 |
+
Clean -> chunk -> embed -> build an IP index on normalized vectors.
|
| 154 |
+
Graceful if text is empty: index remains None and chunks empty.
|
| 155 |
+
"""
|
| 156 |
+
text = self._clean_text(text)
|
| 157 |
+
self.chunks = self.chunk_text(text)
|
| 158 |
+
if not self.chunks:
|
| 159 |
+
self.index = None
|
| 160 |
+
self._emb_matrix = None
|
| 161 |
+
return
|
| 162 |
+
|
| 163 |
+
all_vecs = []
|
| 164 |
+
# Embed chunk texts in batches
|
| 165 |
+
for chunk_batch in batch([c.text for c in self.chunks], n=self.batch_size):
|
| 166 |
+
arr = self._embed_texts(chunk_batch)
|
| 167 |
+
all_vecs.append(arr)
|
| 168 |
+
|
| 169 |
+
mat = np.vstack(all_vecs).astype(np.float32)
|
| 170 |
+
mat = l2_normalize(mat)
|
| 171 |
+
self._emb_matrix = mat # keep for MMR / diagnostics
|
| 172 |
+
|
| 173 |
+
self.index = faiss.IndexFlatIP(mat.shape[1])
|
| 174 |
+
self.index.add(mat)
|
| 175 |
+
|
| 176 |
+
# ---- Plain vector search ----
|
| 177 |
+
def search(self, query: str, topk: int = 6) -> List[Dict[str, Any]]:
|
| 178 |
+
"""
|
| 179 |
+
Return top-k chunks by cosine similarity (via IP on normalized vectors).
|
| 180 |
+
If the index hasn't been built or the doc is empty, returns [].
|
| 181 |
+
"""
|
| 182 |
+
if not self.index or not self.chunks:
|
| 183 |
+
return []
|
| 184 |
+
|
| 185 |
+
q = self._clean_text(query)
|
| 186 |
+
if not q:
|
| 187 |
+
return []
|
| 188 |
+
|
| 189 |
+
qv = self._embed_texts([q])
|
| 190 |
+
qv = l2_normalize(qv)
|
| 191 |
+
D, I = self.index.search(qv.astype(np.float32), max(1, int(topk)))
|
| 192 |
+
results = []
|
| 193 |
+
for rank, idx in enumerate(I[0]):
|
| 194 |
+
if idx == -1:
|
| 195 |
+
continue
|
| 196 |
+
ch = self.chunks[int(idx)]
|
| 197 |
+
results.append({
|
| 198 |
+
"id": ch.id,
|
| 199 |
+
"score": float(D[0][rank]),
|
| 200 |
+
"text": ch.text,
|
| 201 |
+
"start_token": ch.start_token,
|
| 202 |
+
"end_token": ch.end_token
|
| 203 |
+
})
|
| 204 |
+
return results
|
| 205 |
+
|
| 206 |
+
# ---- Optional: MMR search (diversified) ----
|
| 207 |
+
    def mmr_search(self, query: str, topk: int = 6, fetch_k: int | None = None, lambda_mult: float = 0.5) -> List[Dict[str, Any]]:
        """
        Maximal Marginal Relevance retrieval: top-k chunks that balance
        relevance to the query against diversity among the results.

        - fetch_k: number of initial candidates to consider (defaults to 4*topk).
        - lambda_mult in [0,1]: 1 emphasizes relevance; 0 emphasizes diversity.

        Returns dicts shaped like search() results ("id", "score", "text",
        "start_token", "end_token"); "score" is the query relevance, not the
        MMR objective. Returns [] if build() has not populated embeddings or
        the query cleans down to nothing.
        """
        if self._emb_matrix is None or not self.chunks:
            return []

        q = self._clean_text(query)
        if not q:
            return []

        qv = l2_normalize(self._embed_texts([q]))[0]  # (d,)
        # Precompute query-to-chunk relevance (cosine via dot product on
        # normalized vectors)
        rel = self._emb_matrix @ qv  # (N,)

        N = len(self.chunks)
        k = max(1, int(topk))
        m = min(N, int(fetch_k) if fetch_k else min(N, 4 * k))

        # Get top-m by relevance (argpartition is O(N); only the m candidates
        # are then fully sorted)
        cand_idx = np.argpartition(-rel, m-1)[:m]
        cand_idx = cand_idx[np.argsort(-rel[cand_idx])]  # sort by relevance

        selected: List[int] = []
        selected_set = set()  # mirrors `selected` for O(1) membership tests

        for _ in range(min(k, m)):
            if not selected:
                # First pick is simply the most relevant candidate
                best = int(cand_idx[0])
                selected.append(best)
                selected_set.add(best)
                continue

            # Diversity term: max similarity to items already selected
            S = self._emb_matrix[selected]  # (s, d)
            # compute max cosine sim to the selected set for each candidate
            # (S @ cand.T) => for each candidate's vector v, max over s rows
            cand_vecs = self._emb_matrix[cand_idx]  # (m, d)
            sims = cand_vecs @ S.T  # (m, s)
            max_sims = sims.max(axis=1)  # (m,)

            # MMR objective: trade relevance against redundancy
            scores = lambda_mult * rel[cand_idx] - (1 - lambda_mult) * max_sims
            # pick best candidate not yet selected
            order = np.argsort(-scores)
            for j in order:
                idx_j = int(cand_idx[j])
                if idx_j not in selected_set:
                    selected.append(idx_j)
                    selected_set.add(idx_j)
                    break

        # Format results in the same structure as search()
        out = []
        for idx in selected:
            ch = self.chunks[idx]
            out.append({
                "id": ch.id,
                "score": float(rel[idx]),
                "text": ch.text,
                "start_token": ch.start_token,
                "end_token": ch.end_token
            })
        return out
|
| 273 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
faiss_cpu
|
| 2 |
+
gradio==5.44.1
|
| 3 |
+
numpy<2.0
|
| 4 |
+
openai
|
| 5 |
+
pdfplumber==0.11.7
|
| 6 |
+
Pillow
|
| 7 |
+
pydantic
|
| 8 |
+
tiktoken==0.11.0
|