slxhere commited on
Commit
5c9f0d9
·
0 Parent(s):

Add audio generation

Browse files
.gitignore ADDED
Binary file (22 Bytes). View file
 
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Doc Alive - RAG to Image
3
+ emoji: 📦🎨
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: "5.44.1"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # 📦→🧠→🎨 Doc Alive: RAG-to-Image with OpenAI
13
+
14
+ This project turns documents into **illustrations** with the help of RAG (Retrieval-Augmented Generation), LLM prompt engineering, and OpenAI’s image generation.
15
+
16
+ Upload a `.txt`, `.md`, or `.pdf` file, describe your goal, and the app will:
17
+ 1. **Extract text** from your file
18
+ 2. **Retrieve key excerpts** using embeddings
19
+ 3. **Ask an LLM** to craft a structured image generation spec
20
+ 4. **Generate an illustration** with OpenAI’s image model
21
+
22
+ ---
23
+
24
+ ## 🚀 Demo
25
+
26
+ This app runs on **Hugging Face Spaces** using **Gradio**.
27
+
28
+ ---
29
+
30
+ ## 🔑 API Key
31
+
32
+ You must provide your own **OpenAI API key** to use this demo.
33
+ - Enter your key in the input box (starts with `sk-...`)
34
+ - The key is **not stored** — it is only used in memory for your current session
35
+
36
+ ---
37
+
38
+ ## 📂 Project Structure
39
+
40
+
41
+ ├─ app.py # Gradio UI entry
42
+ ├─ requirements.txt # Dependencies
43
+ ├─ rag/ # Text extraction + retrieval
44
+ ├─ llm/ # Structured LLM call helper
45
+ ├─ generation/ # Image generation helper
46
+
47
+
48
+ ---
49
+
50
+ ## 🛠 Tech Stack
51
+
52
+ - [Gradio](https://www.gradio.app/) – UI framework
53
+ - [OpenAI](https://platform.openai.com/) – LLM + image generation
54
+ - [RAG (text-embedding-3-small)](https://platform.openai.com/docs/guides/embeddings) – semantic retrieval
55
+
56
+ ---
57
+
58
+ ## ⚠️ Notes
59
+
60
+ - The OpenAI API key is required for both embeddings and image generation
61
+ - We do **not** log or save your key
62
+ - API calls are billed to your OpenAI account according to your usage
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from pathlib import Path
3
+ from typing import Dict, Any
4
+
5
+ import gradio as gr
6
+
7
+ from rag.extract_text import extract_text
8
+ from rag.rag import OpenAIEmbedRAG
9
+ from llm.call_llm import call_llm_structured
10
+ from generation.gen_img import generate_image_with_openai_from_llm_spec
11
+ from generation.gen_audio import generate_audio_with_openai_from_llm_spec
12
+
13
+ # ---------- Output directory ----------
14
+ OUT_DIR = Path("outputs")
15
+ OUT_DIR.mkdir(parents=True, exist_ok=True)
16
+
17
+
18
+ # ---------- Helpers ----------
19
def hits_to_snippets_json(hits, raw_fallback: str, limit: int = 500) -> str:
    """Serialize retrieved chunks into a compact JSON list, tolerating bad input.

    Each usable hit becomes ``{"id": ..., "excerpt": first `limit` chars}``.
    Non-dict entries and entries without text are skipped; if nothing usable
    remains, the head of the raw document text is used as a single fallback
    snippet. Always returns a JSON string (possibly ``"[]"``).
    """
    snippets = [
        {"id": hit.get("id", pos), "excerpt": hit.get("text")[:limit]}
        for pos, hit in enumerate(hits or [])
        if isinstance(hit, dict) and hit.get("text")
    ]
    if not snippets:
        head = (raw_fallback or "")[:limit]
        if head:
            snippets = [{"id": 0, "excerpt": head}]
    return json.dumps(snippets, ensure_ascii=False)
36
+
37
+
38
+ # ---------- Core pipeline ----------
39
def run_pipeline(
    file_path: str,
    openai_api_key: str,
    user_goal: str = "Generate an image and a narration audio that reflects the essence of the text",
    topk: int = 6,
    llm_model: str = "gpt-5-nano",
):
    """
    Runs the full pipeline using the provided API key.

    Steps: extract text -> embed & retrieve top-k chunks -> LLM writes a
    structured image/audio spec -> generate the image and narration audio.

    Args:
        file_path: path to the uploaded .txt/.md/.pdf file.
        openai_api_key: user-supplied key; used for embeddings, the LLM call,
            and image/audio generation.
        user_goal: natural-language instruction that doubles as the retrieval
            query and the LLM goal.
        topk: number of retrieved chunks handed to the LLM.
        llm_model: model name passed to call_llm_structured.

    Returns:
        Tuple of (image object, audio bytes, pretty metadata dict, raw JSON string)
        matching the four Gradio output components.

    Raises:
        ValueError: if no API key was provided.

    SECURITY NOTE: We do not write the API key to disk or include it in any outputs.
    """

    if not openai_api_key or not openai_api_key.strip():
        raise ValueError("OpenAI API key is required.")

    # 1) Extract text
    raw = extract_text(file_path)

    # 2) RAG (embeddings + search) — index is rebuilt from scratch per request
    rag = OpenAIEmbedRAG(model="text-embedding-3-small", openai_key=openai_api_key)
    rag.build(raw)
    hits = rag.search(user_goal, topk=topk)

    snippets_json = hits_to_snippets_json(hits, raw)

    # 3) LLM → structured JSON (image/audio/debug)
    system_prompt = """
You are a prompt engineer for **visual** and **audio** generation.

Return a JSON object strictly matching this schema:

{
"image": {
"prompt": "string, a detailed description of what the image should show",
"negative_prompt": "string, optional description of what to avoid",
"style": ["string", ...], // optional styles like "cinematic", "oil painting"
"width": int, // optional, default 1024
"height": int // optional, default 1024
},
"audio": {
"text": "string, the exact narration script to be read aloud, written in natural spoken language, only contains the script itself without extra beginning",
"voice": "string, choose from [alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse], default 'alloy', should match the atmosphere of the text",
"speed": float, // optional, default 1.0
},
"debug": {
"reasoning": "string, brief reasoning why you designed the prompts this way"
}
}

Rules:
- Always output valid JSON only, no explanations outside JSON.
- The narration text (`audio.text`) must be **rewritten into a smooth script** that could be directly read aloud,
not raw excerpts.
- All places in JSON must be filled with valid content, do not leave any entry empty!
- Keep the image prompt concise but vivid.
""".strip()

    user_prompt = f"""
Goal: {user_goal}

Below are the most relevant retrieved excerpts (with source IDs):
{snippets_json}

Now produce the JSON object strictly following the schema.
""".strip()

    spec = call_llm_structured(
        system_prompt,
        user_prompt,
        model=llm_model,
        openai_key=openai_api_key,
    )

    # NOTE(review): prints the full spec (incl. narration text) to server logs —
    # confirm this is intended outside of debugging.
    print(spec)

    # 4) Image generation
    result_img = generate_image_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    img_obj = result_img["image"]

    # 5) Audio generation
    result_audio = generate_audio_with_openai_from_llm_spec(
        spec,
        out_dir=str(OUT_DIR),
        openai_key=openai_api_key,
    )
    audio_bytes = result_audio["audio_bytes"]

    # 6) Pretty meta for UI
    pretty = {
        "spec": spec,
        "used_chunks_preview": json.loads(snippets_json),
    }
    raw_json = json.dumps(pretty, ensure_ascii=False, indent=2)
    return img_obj, audio_bytes, pretty, raw_json
137
+
138
+
139
+ # ---------- Gradio UI ----------
140
def ui_pipeline(file, api_key, goal, topk, model_name):
    """Gradio callback: validate the upload, run the pipeline, and surface any
    failure as an error dict in the JSON output (never raising into the UI)."""
    if file is None:
        return None, None, {"error": "Please upload a file (.txt/.md/.pdf)."}, ""
    try:
        outputs = run_pipeline(
            file_path=file.name,
            openai_api_key=api_key,
            user_goal=goal,
            topk=int(topk),
            llm_model=model_name,
        )
    except Exception as exc:
        return None, None, {"error": str(exc)}, ""
    return outputs
153
+
154
+
155
# Gradio UI: one page wiring the upload, goal, key, retrieval and model
# controls into ui_pipeline, and rendering the four pipeline outputs.
with gr.Blocks(title="File → (RAG + LLM) → Prompts → Image+Audio") as demo:
    gr.Markdown(
        "# 📦→🧠→🎨+🔊 Generate illustration and narration for your documents\n"
        "**Bring your own OpenAI API key. We do not store your key; it is only used in memory for this run.**\n\n"
        "- Upload a text/PDF, retrieve with embeddings, let the LLM craft prompts.\n"
        "- Generate both an image **and** an audio narration.\n"
    )

    with gr.Row():
        file_in = gr.File(label="Upload file (.txt/.md/.pdf)")
        goal = gr.Textbox(
            label="Your goal (more detail → better results)",
            value="Generate an illustration and a narration that matches the spirit of this text",
            lines=2,
        )

    with gr.Row():
        api_key_in = gr.Textbox(
            label="OpenAI API key",
            placeholder="sk-...",
            type="password",
        )
        topk = gr.Slider(1, 12, value=6, step=1, label="Top-K retrieved chunks")
        model_llm = gr.Dropdown(
            choices=["gpt-5-nano"],
            value="gpt-5-nano",
            label="LLM model",
        )

    run_btn = gr.Button("Run", variant="primary")

    out_img = gr.Image(label="Generated image")
    # NOTE(review): the pipeline returns raw mp3 bytes for this slot, but
    # type="numpy" expects a (sample_rate, ndarray) tuple — confirm playback
    # actually works, or switch the return/format accordingly.
    out_audio = gr.Audio(label="Generated audio", type="numpy")  # changed to numpy
    out_json = gr.JSON(label="Spec & Metadata")
    out_raw = gr.Code(label="Raw JSON (debug)", language="json")

    run_btn.click(
        ui_pipeline,
        inputs=[file_in, api_key_in, goal, topk, model_llm],
        outputs=[out_img, out_audio, out_json, out_raw],
    )

if __name__ == "__main__":
    demo.launch()
generation/__pycache__/gen_audio.cpython-310.pyc ADDED
Binary file (3.23 kB). View file
 
generation/__pycache__/gen_img.cpython-310.pyc ADDED
Binary file (3.52 kB). View file
 
generation/gen_audio.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, uuid
2
+ from pathlib import Path
3
+ from typing import Dict, Optional
4
+ from openai import OpenAI
5
+
6
+
7
+ # ============ Utility Functions ============
8
+
9
+ def _build_openai_tts_prompt(text: str,
10
+ style: Optional[str] = None,
11
+ speed: Optional[float] = None) -> str:
12
+ """Merge text, style, and other options into a single TTS input string."""
13
+ parts = [text.strip()]
14
+ if style:
15
+ parts.append(f"Style: {style.strip()}")
16
+ if speed:
17
+ parts.append(f"Speaking speed: {speed}")
18
+ return " ".join([p for p in parts if p])
19
+
20
+
21
+ # ============ Generator Wrapper ============
22
+
23
class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).
    """

    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        # Falls back to an env-configured OpenAI() client when none is injected.
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech for one LLM audio spec.

        audio_spec example:
            {
                "text": "Hello, world!",
                "voice": "alloy",
                "speed": 1.0,
            }

        Returns:
            {"audio_bytes": <encoded audio bytes>, "meta": {...}}

        Raises:
            RuntimeError: if the TTS call (or reading the result back) fails;
                the original exception is chained as __cause__.
        """
        # NOTE(review): _build_openai_tts_prompt embeds style/speed into the
        # spoken text itself — the voice reads them aloud; confirm intent.
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed")
        )

        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        # Unique temp file so concurrent requests never collide.
        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
            ) as response:
                response.stream_to_file(file_path)

            # Read the bytes back, then delete the temp file.
            audio_bytes = file_path.read_bytes()
            os.remove(file_path)

        except Exception as e:
            # FIX: chain the original exception so the real cause is not lost.
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }

        if save_meta:
            # The audio file itself was deleted above; only the .json remains.
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}
87
+
88
+
89
+ # ============ Integration with Main Pipeline (Example) ============
90
+
91
def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key=None) -> Dict:
    """
    Convenience wrapper: feed the full spec returned by call_llm_structured,
    i.e. {"image": {...}, "audio": {...}, "debug": {...}}, and synthesize the
    narration described by its "audio" section.
    """
    generator = OpenAIAudioGenerator(
        out_dir=out_dir,
        client=OpenAI(api_key=openai_key),
    )
    return generator.generate_from_spec(spec["audio"], filename_prefix="gptaudio")
generation/gen_img.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, base64, json, uuid, math, io
2
+ from pathlib import Path
3
+ from typing import Dict, List, Tuple, Optional
4
+ from openai import OpenAI
5
+ from datetime import datetime
6
+ from PIL import Image
7
+
8
+
9
+
10
+ # ============ Utility Functions ============
11
+
12
+ def _build_openai_prompt(prompt: str,
13
+ styles: List[str] | None,
14
+ negative_prompt: str | None) -> str:
15
+ """Merge positive prompt / style / negative prompt into one natural language prompt suitable for gpt-image-1."""
16
+ parts = [prompt.strip()]
17
+ if styles:
18
+ parts.append(", ".join([s.strip() for s in styles if s.strip()]))
19
+ # OpenAI does not have a separate parameter for negative prompts; phrasing in natural language is safer
20
+ if negative_prompt and negative_prompt.strip():
21
+ parts.append(f"\nAvoid: {negative_prompt.strip()}.")
22
+ return " ".join([p for p in parts if p])
23
+
24
+
25
+ # ============ Generator Wrapper ============
26
+
27
class OpenAIImageGenerator:
    """
    Generate images using the OpenAI Images API (gpt-image-1).
    Reference: Official Image generation docs, help center, and Python SDK usage.
    """
    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        # Falls back to an env-configured OpenAI() client when none is injected.
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self, image_spec: Dict,
                           transparent_bg: bool = False,
                           filename_prefix: str = "img",
                           save_meta=False) -> Dict:
        """
        Generate one image from an LLM-produced image spec.

        image_spec should follow the JSON output from your LLM, e.g.:
            {
                "prompt": "...",
                "negative_prompt": "...",
                "style": ["photorealistic","cinematic"],
                "width": 1024, "height": 1024
                ...
            }

        Returns:
            {"image": <PIL RGB image>, "meta": {...}, "path": None}
        """
        prompt_text = _build_openai_prompt(
            image_spec.get("prompt", ""),
            image_spec.get("style", []),
            image_spec.get("negative_prompt", ""),
        )

        # NOTE(review): the spec's width/height are currently ignored in favor
        # of a fixed size — confirm whether they should be honored.
        size = "1024x1024"

        # Assemble parameters
        params = dict(
            model="gpt-image-1",
            prompt=prompt_text,
            n=1,
            size=size,
        )
        # Transparent background: may or may not be supported; retried without
        # the parameter below if the SDK rejects it.
        if transparent_bg:
            params["background"] = "transparent"

        try:
            resp = self.client.images.generate(**params)  # Official images.generate call
        except Exception:
            # Retry once without the background parameter if it was the culprit
            if transparent_bg:
                params.pop("background", None)
                resp = self.client.images.generate(**params)
            else:
                raise

        # Parse response: gpt-image-1 returns Base64-encoded image data.
        b64_data = resp.data[0].b64_json
        image_bytes = base64.b64decode(b64_data)

        # Metadata (for reproducibility/auditing)
        meta = {
            "model": "gpt-image-1",
            "size": size,
            "prompt_sent": prompt_text,
            "transparent_bg": transparent_bg,
            "llm_image_spec": image_spec,
        }

        # FIX: save_meta was accepted but silently ignored; persist the
        # metadata to out_dir when requested.
        if save_meta:
            meta_path = self.out_dir / f"{filename_prefix}_{uuid.uuid4().hex[:8]}.json"
            meta_path.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        return {"image": img, "meta": meta, "path": None}
97
+
98
+ # ============ Integration with Main Pipeline (Example) ============
99
+
100
def generate_image_with_openai_from_llm_spec(spec: Dict, out_dir: str = "outputs", openai_key=None) -> Dict:
    """
    Convenience wrapper: feed the full spec returned by call_llm_structured,
    i.e. {"image": {...}, "audio": {...}, "debug": {...}}, and render the
    illustration described by its "image" section.
    """
    generator = OpenAIImageGenerator(
        out_dir=out_dir,
        client=OpenAI(api_key=openai_key),
    )
    return generator.generate_from_spec(spec["image"], transparent_bg=False, filename_prefix="gptimg")
llm/__pycache__/call_llm.cpython-310.pyc ADDED
Binary file (2.75 kB). View file
 
llm/call_llm.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ from openai import OpenAI
3
+ from pydantic import BaseModel
4
+
5
+
6
+ from typing import List, Optional, Union, Dict
7
+ from pydantic import BaseModel, Field, ValidationError
8
+ import json
9
+ import logging
10
+
11
+
12
+
13
+ # --- Pydantic schemas ---
14
class ImageSpec(BaseModel):
    """Image-generation parameters produced by the LLM."""
    prompt: str = Field(..., description="Positive prompts")
    negative_prompt: str = Field(default="", description="Negative prompts")
    style: List[str] = Field(default_factory=list)
    width: int = 1024
    height: int = 1024
    # NOTE(review): steps/cfg_scale/sampler/seed look like Stable-Diffusion
    # knobs; the OpenAI image backend in this repo does not read them — confirm
    # whether they are intentionally kept for schema compatibility.
    steps: int = 30
    cfg_scale: float = 6.5
    sampler: Optional[str] = "DPM++ 2M Karras"
    seed: Union[str, int] = "random"
24
+
25
class AudioSpec(BaseModel):
    """Narration parameters produced by the LLM."""
    text: str  # the exact narration script to read aloud
    voice: str  # OpenAI TTS voice name (e.g. "alloy")
    speed: float = 1.0  # speaking-speed multiplier
29
+
30
class UsedChunk(BaseModel):
    """A retrieved document chunk the LLM reports having used."""
    id: Union[int, str]  # chunk id as provided in the retrieval snippets
    excerpt: str  # excerpt text shown to the LLM
33
+
34
class DebugInfo(BaseModel):
    """Optional diagnostics from the LLM.

    NOTE(review): the system prompt in app.py asks for a `debug.reasoning`
    string, which this schema does not define — verify the prompt and schema
    are meant to match.
    """
    used_chunks: List[UsedChunk] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
37
+
38
class GenerationSpec(BaseModel):
    """Top-level structured LLM output: one image spec, one audio spec, debug info."""
    image: ImageSpec
    audio: AudioSpec
    debug: DebugInfo = Field(default_factory=DebugInfo)
42
+
43
+ # --- Single-path structured call (no fallback) ---
44
def call_llm_structured(
    system_prompt: str,
    user_prompt: str,
    model: str = "gpt-5-nano",
    openai_key=None
) -> Dict:
    """
    Call the OpenAI Responses API and parse the answer straight into the
    GenerationSpec schema. No fallback path: a RuntimeError is raised when
    nothing could be parsed.
    """
    client = OpenAI(api_key=openai_key)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.responses.parse(
        model=model,
        input=messages,
        text_format=GenerationSpec,  # enforce schema at the API level
    )

    parsed = getattr(response, "output_parsed", None)
    if parsed is None:
        # Optionally include response for easier debugging
        raise RuntimeError("LLM did not return a parsed result (output_parsed=None).")

    # Pydantic v2 exposes model_dump(); v1 only has dict()
    if hasattr(parsed, "model_dump"):
        return parsed.model_dump()
    return parsed.dict()
rag/__pycache__/extract_text.cpython-310.pyc ADDED
Binary file (695 Bytes). View file
 
rag/__pycache__/rag.cpython-310.pyc ADDED
Binary file (8 kB). View file
 
rag/extract_text.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pdfplumber
3
+
4
+
5
def extract_text(path: str) -> str:
    """Return the plain-text contents of a .txt/.md/.pdf file.

    Text files are read as UTF-8 (undecodable bytes ignored); PDFs are joined
    page-by-page via pdfplumber. Any other extension raises ValueError.
    """
    src = Path(path)
    ext = src.suffix.lower()
    if ext in (".txt", ".md"):
        return src.read_text(encoding="utf-8", errors="ignore")
    if ext == ".pdf":
        pages = []
        with pdfplumber.open(str(src)) as pdf:
            for page in pdf.pages:
                pages.append(page.extract_text() or "")
        return "\n".join(pages)
    # TODO: docx, html, image(OCR), audio(ASR)
    raise ValueError(f"Unsupported file type: {src.suffix}")
rag/rag.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pip install openai faiss-cpu tiktoken numpy
2
+
3
+ from __future__ import annotations
4
+ import os, time, math
5
+ from typing import List, Dict, Any
6
+ from dataclasses import dataclass
7
+ import numpy as np
8
+ import faiss
9
+ import tiktoken
10
+ from openai import OpenAI
11
+ import re
12
+
13
+
14
+
15
+ # ========= Basic Utilities =========
16
+
17
def l2_normalize(mat: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalization so inner products become cosine similarities.

    A tiny epsilon keeps all-zero rows from dividing by zero.
    """
    denom = np.sqrt((mat * mat).sum(axis=1, keepdims=True)) + 1e-12
    return mat / denom
21
+
22
def batch(iterable, n=128):
    """Yield successive lists of at most *n* items from *iterable*.

    The final list may be shorter; an empty iterable yields nothing.
    """
    bucket = []
    for item in iterable:
        bucket.append(item)
        if len(bucket) >= n:
            yield bucket
            bucket = []
    if bucket:  # flush the shorter tail, if any
        yield bucket
32
+
33
+
34
+ # ========= OpenAI Embeddings RAG =========
35
+
36
@dataclass
class Chunk:
    """A single chunk of the document, with token offsets for traceability."""
    id: int  # sequential chunk id (0-based, assigned in document order)
    text: str  # decoded chunk text; always a str, never None
    start_token: int  # inclusive token offset into the full document
    end_token: int  # exclusive token offset into the full document
43
+
44
class OpenAIEmbedRAG:
    """
    Retrieval module using OpenAI Embeddings + FAISS (IP over L2-normalized vectors = cosine).
    Design notes:
    - Single-pass tokenization for the whole document (no repeated encode/decode).
    - Chunk.text is ALWAYS a string (never None) to avoid downstream NoneType errors.
    - Graceful degradation: empty input => no index; search() returns [].
    - Optional MMR re-ranking (diversity) via mmr_search().
    """
    def __init__(self,
                 model: str = "text-embedding-3-small",
                 chunk_size_tokens: int = 800,
                 overlap_tokens: int = 100,
                 batch_size: int = 256,
                 openai_key=None):
        self.client = OpenAI(api_key=openai_key)
        self.model = model
        self.batch_size = batch_size
        self.enc = tiktoken.get_encoding("cl100k_base")  # Tokenizer for embedding-3 models
        self.chunk_size = max(1, int(chunk_size_tokens))
        self.overlap = max(0, int(overlap_tokens))
        if self.overlap >= self.chunk_size:
            # Ensure forward progress: overlap must be smaller than chunk size
            self.overlap = max(0, self.chunk_size // 4)

        self._doc_token_ids: List[int] | None = None
        self.chunks: List[Chunk] = []
        self.index: faiss.IndexFlatIP | None = None
        self._emb_dim: int | None = None
        self._emb_matrix: np.ndarray | None = None  # store chunk embeddings for MMR / analysis

    # ---- Text cleaning ----
    def _clean_text(self, text: str) -> str:
        """
        Light normalization:
        - Collapse consecutive whitespace to a single space.
        - Remove non-printable control chars (keep \n and \t).
        - Trim leading/trailing spaces.
        """
        text = re.sub(r"\s+", " ", text or "")
        text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\t")
        return text.strip()

    # ---- Tokenization helpers ----
    def _tokenize(self, text: str) -> List[int]:
        # Encode to cl100k_base token ids
        return self.enc.encode(text)

    def _detokenize(self, ids: List[int]) -> str:
        # Decode token ids back to text
        return self.enc.decode(ids)

    # ---- Chunking (by tokens) ----
    # It is possible to use dynamic chunking, however to constraint cost, we use fixed size chunking


    def chunk_text(self, text: str) -> List[Chunk]:
        """
        Tokenize once and create overlapping windows of token ids.
        Each Chunk stores its decoded text and token offsets.
        """
        self._doc_token_ids = self._tokenize(text)
        total = len(self._doc_token_ids)
        chunks: List[Chunk] = []
        if total == 0:
            return chunks

        print(f"[RAG] Total tokens: {total}. Chunk size: {self.chunk_size}, overlap: {self.overlap}")

        # stride > 0 is guaranteed by the overlap clamp in __init__
        stride = self.chunk_size - self.overlap
        i, cid = 0, 0
        while i < total:
            j = min(i + self.chunk_size, total)
            ids_slice = self._doc_token_ids[i:j]
            txt = self._detokenize(ids_slice)
            chunks.append(Chunk(id=cid, text=txt, start_token=i, end_token=j))
            cid += 1
            if j == total:
                break
            i += stride  # always moves forward
        return chunks

    # ---- OpenAI Embeddings (batched) ----
    def _embed_texts(self, texts: List[str], max_retries=3) -> np.ndarray:
        """
        Call OpenAI Embeddings with encoding_format='float'.
        Returns a float32 matrix with rows aligned to input order.

        NOTE(review): implicitly returns None if max_retries <= 0 — confirm
        callers never pass a non-positive value.
        """
        for attempt in range(max_retries):
            try:
                resp = self.client.embeddings.create(
                    model=self.model,
                    input=texts,
                    encoding_format="float",
                )
                # Re-order by item.index so rows align with the input order
                vecs = [None] * len(resp.data)
                for item in resp.data:
                    vecs[item.index] = np.array(item.embedding, dtype=np.float32)
                arr = np.vstack(vecs)
                if self._emb_dim is None:
                    self._emb_dim = arr.shape[1]
                return arr
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                # simple exponential backoff
                time.sleep(0.8 * (attempt + 1))

    # ---- Build FAISS index ----
    def build(self, text: str):
        """
        Clean -> chunk -> embed -> build an IP index on normalized vectors.
        Graceful if text is empty: index remains None and chunks empty.
        """
        text = self._clean_text(text)
        self.chunks = self.chunk_text(text)
        if not self.chunks:
            self.index = None
            self._emb_matrix = None
            return

        all_vecs = []
        # Embed chunk texts in batches
        for chunk_batch in batch([c.text for c in self.chunks], n=self.batch_size):
            arr = self._embed_texts(chunk_batch)
            all_vecs.append(arr)

        mat = np.vstack(all_vecs).astype(np.float32)
        mat = l2_normalize(mat)
        self._emb_matrix = mat  # keep for MMR / diagnostics

        # Inner-product index over normalized rows == cosine similarity
        self.index = faiss.IndexFlatIP(mat.shape[1])
        self.index.add(mat)

    # ---- Plain vector search ----
    def search(self, query: str, topk: int = 6) -> List[Dict[str, Any]]:
        """
        Return top-k chunks by cosine similarity (via IP on normalized vectors).
        If the index hasn't been built or the doc is empty, returns [].
        """
        if not self.index or not self.chunks:
            return []

        q = self._clean_text(query)
        if not q:
            return []

        qv = self._embed_texts([q])
        qv = l2_normalize(qv)
        D, I = self.index.search(qv.astype(np.float32), max(1, int(topk)))
        results = []
        for rank, idx in enumerate(I[0]):
            # FAISS pads missing results with -1 when topk exceeds the index size
            if idx == -1:
                continue
            ch = self.chunks[int(idx)]
            results.append({
                "id": ch.id,
                "score": float(D[0][rank]),
                "text": ch.text,
                "start_token": ch.start_token,
                "end_token": ch.end_token
            })
        return results

    # ---- Optional: MMR search (diversified) ----
    def mmr_search(self, query: str, topk: int = 6, fetch_k: int | None = None, lambda_mult: float = 0.5) -> List[Dict[str, Any]]:
        """
        Maximal Marginal Relevance.
        - fetch_k: number of initial candidates to consider (defaults to 4*topk).
        - lambda_mult in [0,1]: 1 emphasizes relevance; 0 emphasizes diversity.
        """
        if self._emb_matrix is None or not self.chunks:
            return []

        q = self._clean_text(query)
        if not q:
            return []

        qv = l2_normalize(self._embed_texts([q]))[0]  # (d,)
        # Precompute query-to-chunk relevance
        rel = self._emb_matrix @ qv  # (N,)

        N = len(self.chunks)
        k = max(1, int(topk))
        m = min(N, int(fetch_k) if fetch_k else min(N, 4 * k))

        # Get top-m by relevance
        cand_idx = np.argpartition(-rel, m-1)[:m]
        cand_idx = cand_idx[np.argsort(-rel[cand_idx])]  # sort by relevance

        selected: List[int] = []
        selected_set = set()

        for _ in range(min(k, m)):
            # Seed with the single most relevant candidate
            if not selected:
                best = int(cand_idx[0])
                selected.append(best)
                selected_set.add(best)
                continue

            # Diversity term: max similarity to items already selected
            S = self._emb_matrix[selected]  # (s, d)
            # compute max cosine sim to the selected set for each candidate
            # (S @ cand.T) => for each candidate's vector v, max over s rows
            cand_vecs = self._emb_matrix[cand_idx]  # (m, d)
            sims = cand_vecs @ S.T  # (m, s)
            max_sims = sims.max(axis=1)  # (m,)

            # MMR objective
            scores = lambda_mult * rel[cand_idx] - (1 - lambda_mult) * max_sims
            # pick best candidate not yet selected
            order = np.argsort(-scores)
            for j in order:
                idx_j = int(cand_idx[j])
                if idx_j not in selected_set:
                    selected.append(idx_j)
                    selected_set.add(idx_j)
                    break

        # Format results in the same structure as search()
        out = []
        for idx in selected:
            ch = self.chunks[idx]
            out.append({
                "id": ch.id,
                "score": float(rel[idx]),
                "text": ch.text,
                "start_token": ch.start_token,
                "end_token": ch.end_token
            })
        return out
273
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ faiss_cpu
2
+ gradio==5.44.1
3
+ numpy<2.0
4
+ openai
5
+ pdfplumber==0.11.7
6
+ Pillow
7
+ pydantic
8
+ tiktoken==0.11.0