Spaces:
Runtime error
Runtime error
Upload 14 files
Browse files- src/README.md +75 -0
- src/agent.py +42 -0
- src/app.py +169 -0
- src/callback.py +241 -0
- src/config.py +34 -0
- src/extract_pdf.py +12 -0
- src/handbook_export.md +30 -0
- src/handbook_generator.py +143 -0
- src/pdf_processor.py +92 -0
- src/prompt.py +21 -0
- src/rag.py +162 -0
- src/rag_tools.py +24 -0
- src/runner_app.py +97 -0
- src/streamlit_app.py +167 -35
src/README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Handbook Generator (AI Engineering Assignment)
|
| 2 |
+
|
| 3 |
+
Same technology as the **policy** project: **Google ADK**, **LiteLLM** (OpenAI), and **Streamlit**. The UI talks to the ADK agent **directly** (no API). The agent uses a **RAG tool** (ChromaDB) to answer from uploaded PDFs.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **PDF upload** β Upload PDFs; text is extracted, chunked, embedded (OpenAI), stored in **ChromaDB** (local).
|
| 8 |
+
- **Chat** β ADK agent runs in the same process; agent calls **RAG tool** `query_uploaded_documents` and answers with the LLM (OpenAI via LiteLLM).
|
| 9 |
+
- **Handbook generation** β Request a 20,000+ word handbook; generation uses RAG and runs section-by-section.
|
| 10 |
+
- **Export** β Download the handbook as Markdown.
|
| 11 |
+
|
| 12 |
+
## Architecture
|
| 13 |
+
|
| 14 |
+
- **Streamlit** (`streamlit_app.py`) β UI; imports **runner_app** to run the agent directly.
|
| 15 |
+
- **runner_app.py** β ADK Runner + session; `run_chat(message)` runs the agent (sync wrapper around `runner.run_async`).
|
| 16 |
+
- **agent.py** β ADK Agent (LiteLLM/OpenAI), tools = `[query_uploaded_documents]`.
|
| 17 |
+
- **RAG** (`rag.py` + `rag_tools.py`) β ChromaDB + OpenAI embeddings.
|
| 18 |
+
|
| 19 |
+
## Setup
|
| 20 |
+
|
| 21 |
+
### 1. Python
|
| 22 |
+
|
| 23 |
+
Use Python 3.10+.
|
| 24 |
+
|
| 25 |
+
### 2. Install dependencies
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
cd ass2
|
| 29 |
+
pip install -r requirements.txt
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### 3. Environment
|
| 33 |
+
|
| 34 |
+
Create a `.env` file in `ass2` (see `.env.example`):
|
| 35 |
+
|
| 36 |
+
```
|
| 37 |
+
OPENAI_API_KEY=sk-your-openai-api-key-here
|
| 38 |
+
MODEL=openai/gpt-4o
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Run (single command)
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
cd ass2
|
| 45 |
+
streamlit run streamlit_app.py
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
Open **http://localhost:8501**. No separate API server.
|
| 49 |
+
|
| 50 |
+
## How to use
|
| 51 |
+
|
| 52 |
+
1. **Upload PDFs** β In "Upload PDFs", select PDFs and click **Index PDFs**.
|
| 53 |
+
2. **Chat** β In "Chat", ask questions; the ADK agent uses the RAG tool and answers from your documents.
|
| 54 |
+
3. **Generate handbook** β In "Generate Handbook", enter a topic and click **Generate handbook**, then download as Markdown.
|
| 55 |
+
|
| 56 |
+
## Project structure (all in `ass2`)
|
| 57 |
+
|
| 58 |
+
| File | Purpose |
|
| 59 |
+
|------|--------|
|
| 60 |
+
| `streamlit_app.py` | Streamlit UI (upload, chat, handbook) |
|
| 61 |
+
| `runner_app.py` | ADK Runner + session; `run_chat(message)` for Streamlit |
|
| 62 |
+
| `agent.py` | ADK agent (LiteLLM/OpenAI) + RAG tool |
|
| 63 |
+
| `prompt.py` | Agent name, description, instruction |
|
| 64 |
+
| `rag_tools.py` | ADK tool: `query_uploaded_documents` |
|
| 65 |
+
| `callback.py` | ADK callbacks |
|
| 66 |
+
| `rag.py` | ChromaDB + OpenAI embeddings |
|
| 67 |
+
| `pdf_processor.py` | PDF text extraction and chunking |
|
| 68 |
+
| `handbook_generator.py` | 20k-word handbook generation |
|
| 69 |
+
| `config.py` | Settings and paths |
|
| 70 |
+
|
| 71 |
+
## Tech stack
|
| 72 |
+
|
| 73 |
+
- **Agent:** Google ADK, LiteLLM (OpenAI)
|
| 74 |
+
- **RAG:** OpenAI embeddings, ChromaDB (local)
|
| 75 |
+
- **UI:** Streamlit (agent runs in-process, no FastAPI)
|
src/agent.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ADK agent for the Handbook Generator.
|
| 3 |
+
Uses Google ADK + LiteLLM (OpenAI) + RAG tools.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from google.adk.agents.llm_agent import Agent
|
| 8 |
+
from google.adk.models.lite_llm import LiteLlm
|
| 9 |
+
|
| 10 |
+
import prompt as prmpt
|
| 11 |
+
import callback as cb
|
| 12 |
+
import rag_tools as tls
|
| 13 |
+
|
| 14 |
+
# π Load .env file
|
| 15 |
+
load_dotenv()
|
| 16 |
+
|
| 17 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 18 |
+
model = os.getenv("MODEL", "openai/gpt-4o")
|
| 19 |
+
|
| 20 |
+
if not openai_api_key:
|
| 21 |
+
raise ValueError("OPENAI_API_KEY is not set in .env")
|
| 22 |
+
|
| 23 |
+
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 24 |
+
|
| 25 |
+
# Create the agent (same pattern as Policy)
|
| 26 |
+
root_agent = Agent(
|
| 27 |
+
model=LiteLlm(
|
| 28 |
+
model=model,
|
| 29 |
+
),
|
| 30 |
+
name=prmpt.AGENT_CONFIG["name"],
|
| 31 |
+
description=prmpt.AGENT_CONFIG["description"],
|
| 32 |
+
instruction=prmpt.AGENT_CONFIG["instruction"],
|
| 33 |
+
tools=[tls.query_uploaded_documents],
|
| 34 |
+
before_agent_callback=cb.before_agent_callback,
|
| 35 |
+
after_agent_callback=cb.after_agent_callback,
|
| 36 |
+
before_model_callback=cb.before_model_callback,
|
| 37 |
+
after_model_callback=cb.after_model_callback,
|
| 38 |
+
before_tool_callback=cb.before_tool_callback,
|
| 39 |
+
after_tool_callback=cb.after_tool_callback,
|
| 40 |
+
on_model_error_callback=cb.on_model_error_callback,
|
| 41 |
+
on_tool_error_callback=cb.on_tool_error_callback,
|
| 42 |
+
)
|
src/app.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Handbook Generator β Gradio UI (legacy fallback).
|
| 3 |
+
Primary UI is streamlit_app.py.
|
| 4 |
+
Run: python app.py
|
| 5 |
+
"""
|
| 6 |
+
import asyncio
|
| 7 |
+
import shutil
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import gradio as gr
|
| 11 |
+
|
| 12 |
+
from config import GROK_API_KEY, UPLOADS_DIR, BASE_DIR
|
| 13 |
+
from handbook_generator import build_handbook
|
| 14 |
+
from rag import get_context_for_query, index_pdf, reset_index
|
| 15 |
+
|
| 16 |
+
HANDBOOK_EXPORT_PATH = BASE_DIR / "handbook_export.md"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _run_async(coro):
|
| 20 |
+
"""Run an async coroutine from sync Gradio code."""
|
| 21 |
+
try:
|
| 22 |
+
return asyncio.run(coro)
|
| 23 |
+
except RuntimeError:
|
| 24 |
+
import concurrent.futures
|
| 25 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
| 26 |
+
return pool.submit(asyncio.run, coro).result(timeout=300)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def ensure_api_key():
    """Raise a user-facing Gradio error when no Grok API key is configured."""
    if GROK_API_KEY:
        return
    raise gr.Error(
        "GROK_API_KEY is not set. Create a .env file in the ass2 folder with: GROK_API_KEY=your-key"
    )
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _file_path(f):
    """Get path from Gradio file input (path string or object with .name)."""
    if f is None:
        return None
    if not isinstance(f, (str, Path)):
        # Gradio file objects expose the temp-file path via .name.
        f = getattr(f, "name", str(f))
    return Path(f)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def upload_and_index(files):
    """Save uploaded PDF(s) into UPLOADS_DIR and (re)index them in the RAG store.

    Accepts a single Gradio file object or a list of them.
    Returns a one-line-per-file status string.
    """
    if not files:
        return "No files selected."
    ensure_api_key()
    # Rebuild the index from scratch so stale chunks from a previous upload
    # do not leak into answers.
    reset_index()
    saved = []
    for f in (files if isinstance(files, list) else [files]):
        path = _file_path(f)
        if path is None or not path.exists():
            continue
        dest = UPLOADS_DIR / path.name
        try:
            shutil.copy(str(path), str(dest))
        except Exception:
            # Copy is best-effort; index straight from the temp path instead.
            dest = path
        try:
            _run_async(index_pdf(dest, source_name=path.name))
            saved.append(f"{path.name}: indexed")
        except Exception as e:
            saved.append(f"{path.name}: Error - {e}")
    return "\n".join(saved) if saved else "No PDFs processed."
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def chat(message, history):
    """RAG chat: retrieve context and answer using Grok via LiteLLM."""
    ensure_api_key()
    from litellm import completion
    from config import CHAT_MODEL

    # Pull the most relevant chunks for this question out of the index.
    context = _run_async(get_context_for_query(message))
    if not (context and context.strip()):
        context = "No documents have been uploaded yet. Ask the user to upload PDFs first."

    system_prompt = (
        "You are a helpful assistant. Answer based ONLY on the following context "
        "from the user's uploaded documents. If the answer is not in the context, say so clearly."
    )
    question = f"Context from uploaded documents:\n\n{context}\n\n---\n\nUser question: {message}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    response = completion(
        model=CHAT_MODEL,
        messages=messages,
        api_key=GROK_API_KEY,
        max_tokens=1500,
        temperature=0.3,
    )
    answer = response.choices[0].message.content or ""
    return answer.strip()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def run_handbook_simple(topic):
    """Generate handbook and return (status, markdown)."""
    ensure_api_key()
    topic = (topic or "").strip()
    if not topic:
        return "Enter a topic first.", ""
    progress = []
    try:
        # build_handbook reports progress via the callback; collect the lines.
        markdown = _run_async(build_handbook(topic, on_progress=progress.append))
    except Exception as e:
        return f"Error: {e}", ""
    status = "\n".join(progress) if progress else "Done."
    return status, markdown
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
with gr.Blocks(title="Handbook Generator", theme=gr.themes.Soft()) as demo:
|
| 113 |
+
gr.Markdown("# Handbook Generator\nUpload PDFs, chat about them, and generate a 20,000+ word handbook.")
|
| 114 |
+
|
| 115 |
+
with gr.Tab("Upload PDFs"):
|
| 116 |
+
file_input = gr.File(
|
| 117 |
+
file_count="multiple",
|
| 118 |
+
file_types=[".pdf"],
|
| 119 |
+
label="Upload one or more PDFs",
|
| 120 |
+
)
|
| 121 |
+
index_btn = gr.Button("Index PDFs")
|
| 122 |
+
index_out = gr.Textbox(label="Index result", lines=4)
|
| 123 |
+
|
| 124 |
+
with gr.Tab("Chat"):
|
| 125 |
+
chatbot = gr.ChatInterface(
|
| 126 |
+
fn=chat,
|
| 127 |
+
type="messages",
|
| 128 |
+
title="Ask questions about your uploaded documents",
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
with gr.Tab("Generate Handbook"):
|
| 132 |
+
gr.Markdown(
|
| 133 |
+
"Enter a topic (e.g. *Create a handbook on Retrieval-Augmented Generation*). "
|
| 134 |
+
"Generation may take several minutes."
|
| 135 |
+
)
|
| 136 |
+
topic_in = gr.Textbox(
|
| 137 |
+
label="Handbook topic",
|
| 138 |
+
placeholder="e.g. Retrieval-Augmented Generation",
|
| 139 |
+
lines=1,
|
| 140 |
+
)
|
| 141 |
+
gen_btn = gr.Button("Generate 20k-word handbook")
|
| 142 |
+
status_out = gr.Textbox(label="Status", lines=4, interactive=False)
|
| 143 |
+
handbook_out = gr.Markdown(label="Handbook (Markdown)")
|
| 144 |
+
export_btn = gr.DownloadButton("Export as Markdown", visible=False)
|
| 145 |
+
|
| 146 |
+
index_btn.click(
|
| 147 |
+
fn=lambda files: upload_and_index(files) if files else "No files selected.",
|
| 148 |
+
inputs=[file_input],
|
| 149 |
+
outputs=[index_out],
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
def do_handbook(topic):
|
| 153 |
+
status, md = run_handbook_simple(topic)
|
| 154 |
+
if md:
|
| 155 |
+
HANDBOOK_EXPORT_PATH.write_text(md, encoding="utf-8")
|
| 156 |
+
return (
|
| 157 |
+
status,
|
| 158 |
+
md,
|
| 159 |
+
gr.update(visible=bool(md), value=str(HANDBOOK_EXPORT_PATH) if md else None),
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
gen_btn.click(
|
| 163 |
+
fn=do_handbook,
|
| 164 |
+
inputs=[topic_in],
|
| 165 |
+
outputs=[status_out, handbook_out, export_btn],
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
if __name__ == "__main__":
|
| 169 |
+
demo.launch(server_name="127.0.0.1", server_port=7860)
|
src/callback.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ADK callbacks for logging and optional audit (same as policy/callback.py).
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Any, Optional
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
_audit_sink: Optional[Any] = None
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def set_audit_sink(sink: Any) -> None:
    """Register an object with a .store(event) method to receive audit events."""
    global _audit_sink
    _audit_sink = sink


def _get_session_id(context: Any) -> Optional[str]:
    """Best-effort session id from a callback context; None when unavailable."""
    try:
        session = getattr(context, "session", None)
        if session is not None:
            # Different ADK versions expose either .id or .session_id.
            return getattr(session, "id", None) or getattr(session, "session_id", None)
    except Exception:
        pass
    return None


def _get_message_preview(content: Any, max_len: int = 500) -> Optional[str]:
    """Return up to *max_len* characters of a best-effort text rendering of *content*."""
    if content is None:
        return None
    try:
        parts = getattr(content, "parts", None)
        if parts:
            # ADK Content objects carry their text in parts[0].text.
            text = getattr(parts[0], "text", None) or str(parts[0])[:max_len]
            if not text:
                return None
            return text[:max_len]
        if isinstance(content, str):
            return content[:max_len]
        return str(content)[:max_len]
    except Exception:
        return None


def _emit(event: dict) -> None:
    """Log the event and forward it to the audit sink when one is registered."""
    logger.debug("[ADK callback] %s", event.get("event_type"), extra=event)
    sink = _audit_sink
    if sink is None or not hasattr(sink, "store"):
        return
    try:
        sink.store(event)
    except Exception as e:
        logger.warning("[ADK callback] audit sink store failed: %s", e)


def _context_from_args(*args: Any, **kwargs: Any) -> Any:
    """Resolve the callback context: keyword 'callback_context' wins, else first positional."""
    context = kwargs.get("callback_context")
    if context:
        return context
    return args[0] if args else None
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _base_event(context: Any, event_type: str) -> dict:
    """Build the common audit-event envelope shared by every callback.

    Callers fill in tool_name / message_preview / has_error / details as
    appropriate for their event type.
    """
    return {
        "event_type": event_type,
        "agent_name": getattr(context, "agent_name", None),
        "invocation_id": getattr(context, "invocation_id", None),
        "user_id": getattr(context, "user_id", None),
        "session_id": _get_session_id(context),
        "tool_name": None,
        "message_preview": None,
        "has_error": False,
        "details": {},
    }


def before_agent_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log the start of an agent invocation. Returning None lets ADK proceed."""
    context = _context_from_args(*args, **kwargs)
    if context is None:
        return None
    try:
        event = _base_event(context, "before_agent")
        event["message_preview"] = _get_message_preview(getattr(context, "user_content", None))
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] before_agent failed: %s", e)
    return None


def after_agent_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log the end of an agent invocation."""
    context = _context_from_args(*args, **kwargs)
    if context is None:
        return None
    try:
        event = _base_event(context, "after_agent")
        event["message_preview"] = _get_message_preview(getattr(context, "user_content", None))
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] after_agent failed: %s", e)
    return None


def before_model_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log an outgoing LLM request; previews the last content in the request."""
    context = _context_from_args(*args, **kwargs)
    llm_request = kwargs.get("llm_request")
    if context is None:
        return None
    try:
        event = _base_event(context, "before_model")
        if llm_request is not None and getattr(llm_request, "contents", None):
            event["message_preview"] = _get_message_preview(llm_request.contents[-1])
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] before_model failed: %s", e)
    return None


def after_model_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log completion of an LLM request."""
    context = _context_from_args(*args, **kwargs)
    if context is None:
        return None
    try:
        event = _base_event(context, "after_model")
        event["message_preview"] = _get_message_preview(getattr(context, "user_content", None))
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] after_model failed: %s", e)
    return None


def before_tool_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log a tool call about to run, including a preview of its arguments."""
    context = _context_from_args(*args, **kwargs)
    tool_name = kwargs.get("tool_name")
    # ADK versions differ on the kwarg name for the tool arguments.
    tool_args = kwargs.get("tool_input") or kwargs.get("tool_args")
    if context is None:
        return None
    try:
        event = _base_event(context, "before_tool")
        event["tool_name"] = tool_name
        if tool_args is not None:
            event["message_preview"] = str(tool_args)[:500]
            event["details"] = {"tool_args": tool_args}
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] before_tool failed: %s", e)
    return None


def after_tool_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log a finished tool call, including a preview of its result."""
    context = _context_from_args(*args, **kwargs)
    tool_name = kwargs.get("tool_name")
    tool_result = kwargs.get("tool_result") or kwargs.get("result")
    if context is None:
        return None
    try:
        event = _base_event(context, "after_tool")
        event["tool_name"] = tool_name
        if tool_result is not None:
            event["message_preview"] = str(tool_result)[:500]
            event["details"] = {"tool_result": tool_result}
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] after_tool failed: %s", e)
    return None


def on_model_error_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log an LLM-call failure. Returning None lets ADK's default handling run."""
    context = _context_from_args(*args, **kwargs)
    error = kwargs.get("error")
    if context is None:
        return None
    try:
        event = _base_event(context, "on_model_error")
        event["has_error"] = True
        if error is not None:
            event["message_preview"] = str(error)[:500]
        event["details"] = {"error": str(error)}
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] on_model_error failed: %s", e)
    return None


def on_tool_error_callback(*args: Any, **kwargs: Any) -> Optional[Any]:
    """Log a tool-call failure. Returning None lets ADK's default handling run."""
    context = _context_from_args(*args, **kwargs)
    tool_name = kwargs.get("tool_name")
    error = kwargs.get("error")
    if context is None:
        return None
    try:
        event = _base_event(context, "on_tool_error")
        event["tool_name"] = tool_name
        event["has_error"] = True
        if error is not None:
            event["message_preview"] = str(error)[:500]
        event["details"] = {"error": str(error)}
        _emit(event)
    except Exception as e:
        logger.warning("[ADK callback] on_tool_error failed: %s", e)
    return None
|
src/config.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration for the Handbook Generator app."""
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
# Load from .env if present
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
# ββ API Keys ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 10 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 11 |
+
CHAT_MODEL = os.getenv("MODEL", "openai/gpt-4o").strip() or "openai/gpt-4o"
|
| 12 |
+
EMBEDDING_MODEL = "text-embedding-3-small"
|
| 13 |
+
|
| 14 |
+
# Supabase
|
| 15 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL", "")
|
| 16 |
+
SUPABASE_KEY = os.getenv("SUPABASE_KEY", "")
|
| 17 |
+
|
| 18 |
+
# ββ Paths βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 20 |
+
UPLOADS_DIR = BASE_DIR / "uploads"
|
| 21 |
+
UPLOADS_DIR.mkdir(exist_ok=True)
|
| 22 |
+
|
| 23 |
+
# LightRAG working directory
|
| 24 |
+
WORKING_DIR = BASE_DIR / "lightrag_working"
|
| 25 |
+
WORKING_DIR.mkdir(exist_ok=True)
|
| 26 |
+
|
| 27 |
+
# ββ Chunking (used by pdf_processor) βββββββββββββββββββββββββββββββββ
|
| 28 |
+
CHUNK_SIZE = 1000
|
| 29 |
+
CHUNK_OVERLAP = 200
|
| 30 |
+
|
| 31 |
+
# ββ Handbook generation βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
TARGET_WORD_COUNT = 20000
|
| 33 |
+
SECTION_WORD_TARGET = 1200
|
| 34 |
+
MAX_SECTIONS = 25
|
src/extract_pdf.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Extract assignment PDF text to a file for reading."""
|
| 2 |
+
from pypdf import PdfReader
|
| 3 |
+
|
| 4 |
+
reader = PdfReader("AI-Engineering-Assignment.pdf")
|
| 5 |
+
with open("assignment_text.txt", "w", encoding="utf-8") as f:
|
| 6 |
+
for i, page in enumerate(reader.pages):
|
| 7 |
+
text = page.extract_text()
|
| 8 |
+
if text:
|
| 9 |
+
f.write(f"--- Page {i+1} ---\n")
|
| 10 |
+
f.write(text)
|
| 11 |
+
f.write("\n\n")
|
| 12 |
+
print("Done. Written to assignment_text.txt")
|
src/handbook_export.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# The Jungle Book
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
"The Jungle Book," a timeless classic penned by Rudyard Kipling, is an exemplary piece of literature that continues to captivate readers of all ages. Originally published in 1894, this collection of stories has been celebrated for its vibrant depiction of the natural world and its profound exploration of human nature through the lens of animal allegories. The tales unfold in the lush, mysterious jungles of India, where anthropomorphized animals and human characters coexist, revealing insights into society, morality, and the intrinsic link between man and nature.
|
| 6 |
+
|
| 7 |
+
### Overview
|
| 8 |
+
|
| 9 |
+
At its core, "The Jungle Book" is a compilation of stories and poems, with the most renowned being the tales of Mowgli, a young boy raised by wolves in the Indian jungle. These narratives are not merely stories of adventure and wildlife; they are rich tapestries woven with themes of identity, belonging, and the struggle between civilization and the wild. Kipling's masterful storytelling is complemented by his intricate descriptions of the jungle environment, which becomes a character in its own right, embodying both beauty and peril.
|
| 10 |
+
|
| 11 |
+
The structure of "The Jungle Book" is unique, as it oscillates between the adventures of Mowgli and other standalone tales featuring a diverse array of characters, such as Rikki-Tikki-Tavi, the valiant mongoose, and Kotick, the white seal. This collection is a mosaic of narratives that, while distinct, collectively paint a vivid picture of Kipling's imagined world. Each story serves as a parable, imparting lessons about courage, loyalty, and the consequences of human actions on the natural world.
|
| 12 |
+
|
| 13 |
+
### Importance
|
| 14 |
+
|
| 15 |
+
"The Jungle Book" holds a significant place in both literary and cultural history. Its importance is multifaceted, spanning from its contributions to children's literature to its impact on popular culture. The book is celebrated for its ability to transcend the boundaries of age and time, offering insights that resonate with both young readers and adults.
|
| 16 |
+
|
| 17 |
+
One of the key reasons for its enduring importance is Kipling's ability to convey complex themes through the guise of seemingly simple stories. The character of Mowgli, for instance, is a powerful allegory for the journey of self-discovery and the quest for identity. Through Mowgli's interactions with the jungle's inhabitants, Kipling explores the idea of what it means to be human and the innate tension between nature and nurture.
|
| 18 |
+
|
| 19 |
+
Furthermore, "The Jungle Book" is an important work in the context of colonial literature. Written during a time when the British Empire was at its zenith, the book reflects the complexities of colonial attitudes towards India. Kipling, an Anglo-Indian, infuses his stories with a nuanced understanding of Indian culture, even as he grapples with the ideological biases of his time. This duality provides a rich ground for analysis and discussion, making "The Jungle Book" a valuable resource for examining the interplay between literature and historical context.
|
| 20 |
+
|
| 21 |
+
### Scope
|
| 22 |
+
|
| 23 |
+
The scope of "The Jungle Book" extends beyond its pages, influencing a wide range of adaptations and interpretations across different media. The stories have been adapted into numerous films, television series, stage productions, and even operas, each bringing a new perspective to Kipling's work. These adaptations have contributed to the book's lasting legacy, ensuring its relevance in contemporary culture.
|
| 24 |
+
|
| 25 |
+
In literature, "The Jungle Book" has inspired countless authors and storytellers, influencing the way animals and nature are depicted in fiction. Kipling's anthropomorphic portrayal of animals set a precedent for future works, encouraging writers to explore the moral and philosophical implications of the animal kingdom as a reflection of human society.
|
| 26 |
+
|
| 27 |
+
Academically, "The Jungle Book" is a rich subject for study, offering insights into narrative structure, character development, and thematic exploration. It serves as a foundational text in the study of children's literature, colonial literature, and ecological narratives. Scholars analyze its themes of belonging and identity, the ethical dilemmas posed by its characters, and its portrayal of the natural world, which continues to resonate in today's discussions about environmental conservation and human impact on nature.
|
| 28 |
+
|
| 29 |
+
In conclusion, "The Jungle Book" is more than just a collection of stories; it is a cultural artifact that has shaped the literary landscape and continues to inspire and provoke thought. Its blend of adventure, moral lessons, and richly drawn characters ensures its place as a beloved classic, while its themes of identity, nature, and society provide fertile ground for continued exploration and appreciation.
|
| 30 |
+
|
src/handbook_generator.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate 20,000+ word handbooks using the LongWriter technique and OpenAI."""
|
| 2 |
+
from typing import Callable, List, Optional
|
| 3 |
+
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
from config import (
|
| 6 |
+
OPENAI_API_KEY,
|
| 7 |
+
CHAT_MODEL,
|
| 8 |
+
SECTION_WORD_TARGET,
|
| 9 |
+
TARGET_WORD_COUNT,
|
| 10 |
+
)
|
| 11 |
+
from rag import get_context_for_query
|
| 12 |
+
|
| 13 |
+
_client: Optional[OpenAI] = None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _get_client() -> OpenAI:
    """Create the shared OpenAI client on first use and reuse it afterwards."""
    global _client
    if _client is not None:
        return _client
    _client = OpenAI(api_key=OPENAI_API_KEY)
    return _client
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def word_count(text: str) -> int:
    """Count whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def llm_call(prompt: str, system_prompt: str = "You are a professional writer.") -> str:
    """Synchronous OpenAI chat call; returns the first choice's text ('' if empty)."""
    # Strip any "provider/" prefix (e.g. "openai/gpt-4o" -> "gpt-4o").
    model = CHAT_MODEL.rsplit("/", 1)[-1]
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = _get_client().chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.7,
    )
    return completion.choices[0].message.content or ""
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# βββ LongWriter Phase 1: Outline βββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
def generate_outline(topic: str, context: str) -> List[dict]:
|
| 45 |
+
"""Generate a detailed structure for the handbook."""
|
| 46 |
+
prompt = f"""You are planning a 20,000-word handbook on "{topic}".
|
| 47 |
+
Based on the reference material, create a highly detailed outline.
|
| 48 |
+
For each major section, provide a title and 3-5 sub-points (keywords) to cover.
|
| 49 |
+
Aim for 15-20 sections to ensure we can hit the 20k word target.
|
| 50 |
+
|
| 51 |
+
Reference: {context[:10000] if context else 'No reference material available.'}
|
| 52 |
+
|
| 53 |
+
Output as a list of sections (one per line):
|
| 54 |
+
Section Title | point 1, point 2, point 3
|
| 55 |
+
..."""
|
| 56 |
+
|
| 57 |
+
content = llm_call(prompt)
|
| 58 |
+
sections = []
|
| 59 |
+
for line in content.splitlines():
|
| 60 |
+
if "|" in line:
|
| 61 |
+
title, points = line.split("|", 1)
|
| 62 |
+
sections.append({"title": title.strip(), "points": points.strip()})
|
| 63 |
+
return sections
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# βββ LongWriter Phase 2: Generate long sections ββββββββββββββββββββββ
|
| 67 |
+
def generate_long_section(
    section_title: str,
    points: str,
    topic: str,
    context: str,
    previous_context: str,
) -> str:
    """Generate a single section following LongWriter length instructions.

    Feeds the model the section title, its key points, trimmed reference
    material, and a tail of the previously written text for continuity.
    """
    if previous_context:
        prev_snippet = previous_context[-2000:]
    else:
        prev_snippet = "This is the first section."
    if context:
        ctx_snippet = context[:5000]
    else:
        ctx_snippet = "No reference material."

    prompt = f"""Write a comprehensive section for a handbook on "{topic}".
Section Title: {section_title}
Key points to cover: {points}

Reference Material: {ctx_snippet}

LongWriter Instructions:
1. This section MUST be at least 1,500 words long.
2. Be extremely descriptive. Explain concepts in depth.
3. Use examples, analogies, and detailed breakdowns.
4. Maintain continuity with previous sections: {prev_snippet}

Write only the content for "{section_title}". Start with ## {section_title}."""

    return llm_call(prompt)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# βββ Main builder ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 96 |
+
# ─── Main builder ────────────────────────────────────────────────────
def build_handbook(
    topic: str,
    on_progress: Optional[Callable[[str], None]] = None,
) -> str:
    """Build a 20k+ word handbook using the LongWriter technique.

    Gathers RAG context, generates an outline, then writes sections one by
    one (each re-querying the index) until TARGET_WORD_COUNT is reached or
    the outline is exhausted.  ``on_progress`` receives status strings.
    """

    def _notify(msg: str) -> None:
        # Progress reporting is optional; swallow nothing, just skip if unset.
        if on_progress:
            on_progress(msg)

    _notify("Gathering context for outline...")
    outline_context = get_context_for_query(topic)

    _notify("Generating detailed outline (LongWriter phase 1)...")
    sections = generate_outline(topic, outline_context)
    if not sections:
        # Fallback so generation can proceed even if outline parsing yields nothing.
        sections = [{"title": "Introduction", "points": "Overview, Importance, Scope"}]

    _notify(f"Outline created with {len(sections)} sections. Starting generation...")

    parts = [f"# {topic}\n\n"]
    accumulated = ""
    running_words = 0
    section_total = len(sections)

    for number, section in enumerate(sections, start=1):
        title = section["title"]
        points = section["points"]
        _notify(f"Writing Section {number}/{section_total}: {title} (Target: 1500+ words)...")

        # Fresh retrieval per section keeps each one grounded in relevant chunks.
        section_context = get_context_for_query(f"{topic}: {title} {points}")

        body = generate_long_section(title, points, topic, section_context, accumulated)

        parts.append(body + "\n\n")
        accumulated += body
        running_words += word_count(body)

        _notify(f"Section completed. Current word count: {running_words}")

        if running_words >= TARGET_WORD_COUNT:
            _notify("Target length reached.")
            break

    handbook_md = "".join(parts)
    _notify(f"Handbook generated. Final word count: {word_count(handbook_md)}")
    return handbook_md
|
src/pdf_processor.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF text extraction and chunking for RAG."""
|
| 2 |
+
import re
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
import pdfplumber
|
| 7 |
+
from pypdf import PdfReader
|
| 8 |
+
|
| 9 |
+
from config import CHUNK_OVERLAP, CHUNK_SIZE
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def extract_text_from_pdf(pdf_path: str | Path) -> str:
    """Extract text from a PDF using pdfplumber (better for tables) with pypdf fallback.

    Raises FileNotFoundError if the path does not exist.  Whitespace is
    normalized to single spaces in the returned string.
    """
    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF not found: {path}")

    pages: List[str] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                extracted = page.extract_text()
                if extracted:
                    pages.append(extracted)
    except Exception:
        # pdfplumber failed (possibly mid-file); fall back to pypdf while
        # keeping whatever pdfplumber already extracted.
        for page in PdfReader(path).pages:
            extracted = page.extract_text()
            if extracted:
                pages.append(extracted)

    raw_text = "\n\n".join(pages)
    # Collapse all whitespace runs to single spaces.
    return re.sub(r"\s+", " ", raw_text).strip()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[dict]:
    """
    Split text into overlapping chunks for embedding.

    Returns a list of dicts with 'text' and 'metadata' (chunk_index; the
    'source' key is added later by process_pdf).

    Fix: the loop now always advances by at least one character.  Previously,
    when a sentence/newline boundary shortened a chunk, ``end - overlap``
    could be <= ``start`` (for large ``overlap``), causing an infinite loop.
    """
    if not text or not text.strip():
        return []

    chunks: List[dict] = []
    start = 0
    index = 0
    text = text.strip()

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # Prefer to break at a sentence end or newline in the second half of
        # the window, so chunks do not cut sentences mid-way.
        if end < len(text):
            last_period = chunk.rfind(". ")
            last_newline = chunk.rfind("\n")
            break_at = max(last_period, last_newline)
            if break_at > chunk_size // 2:
                chunk = chunk[: break_at + 1]
                end = start + break_at + 1

        chunk = chunk.strip()
        if chunk:
            chunks.append({
                "text": chunk,
                "metadata": {"chunk_index": index},
            })
            index += 1

        if end >= len(text):
            break
        # Step back by `overlap` for continuity, but guarantee forward
        # progress regardless of how the boundary adjustment moved `end`.
        start = max(end - overlap, start + 1)

    return chunks
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def process_pdf(pdf_path: str | Path, source_name: str | None = None) -> List[dict]:
    """
    Extract text from PDF and return chunks with source metadata.
    source_name: optional label (e.g. filename) for metadata.
    """
    path = Path(pdf_path)
    label = source_name or path.name
    pieces = chunk_text(extract_text_from_pdf(path))
    # Tag every chunk with where it came from so retrieval can cite sources.
    for piece in pieces:
        piece["metadata"]["source"] = label
    return pieces
|
src/prompt.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agent prompt configuration for the Handbook Generator."""
|
| 2 |
+
|
| 3 |
+
# Agent metadata and system instruction — presumably consumed by agent.py
# when constructing the ADK root agent (agent.py is not visible here; verify).
# The "description" and "instruction" strings are sent to the model verbatim,
# so they are behavior-bearing and must not be reworded casually.
AGENT_CONFIG = {
    "name": "handbook_assistant",
    "description": "An AI assistant that answers questions from uploaded PDFs and helps generate handbooks. Uses RAG (retrieval from uploaded documents) via tools.",
    "instruction": """
You are the Handbook Assistant, an intelligent assistant that helps users understand their uploaded PDF documents and generate long-form handbooks.

CRITICAL RULES:
1. ALWAYS call the query_uploaded_documents tool FIRST before answering ANY question about documents, assignments, PDFs, or their content. NEVER assume documents are not uploaded β always check by calling the tool.
2. When the user says they uploaded a PDF or asks about "my document", "my assignment", "my PDF", etc., you MUST call query_uploaded_documents with their question.
3. Only say "no documents uploaded" if the tool explicitly returns that message.
4. After getting tool results, summarize the retrieved content in your answer. Do not dump raw text.

How to behave:
- Use the query_uploaded_documents tool for ANY question that could relate to uploaded content.
- Be professional and helpful.
- For handbook generation, tell the user to use the "Generate Handbook" tab.
- Maintain context across messages.
"""
}
|
src/rag.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG System β Hybrid approach:
|
| 3 |
+
1. Simple vector search (OpenAI embeddings + local storage) for reliable chat retrieval
|
| 4 |
+
2. LightRAG knowledge graph for enriched context (optional, non-blocking)
|
| 5 |
+
|
| 6 |
+
This avoids LightRAG's internal async worker issues with Streamlit.
|
| 7 |
+
"""
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import List, Optional
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
from openai import OpenAI
|
| 15 |
+
|
| 16 |
+
from config import (
|
| 17 |
+
OPENAI_API_KEY,
|
| 18 |
+
CHAT_MODEL,
|
| 19 |
+
WORKING_DIR,
|
| 20 |
+
EMBEDDING_MODEL,
|
| 21 |
+
CHUNK_SIZE,
|
| 22 |
+
CHUNK_OVERLAP,
|
| 23 |
+
)
|
| 24 |
+
from pdf_processor import extract_text_from_pdf, chunk_text
|
| 25 |
+
|
| 26 |
+
# βββ Vector store file βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
+
VECTORS_FILE = WORKING_DIR / "vectors.json"
|
| 28 |
+
|
| 29 |
+
_client: Optional[OpenAI] = None
|
| 30 |
+
_chunks_db: list[dict] = [] # {"text": ..., "embedding": [...]}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _get_client() -> OpenAI:
    """Create the shared OpenAI client on first use and reuse it afterwards."""
    global _client
    if _client is not None:
        return _client
    _client = OpenAI(api_key=OPENAI_API_KEY)
    return _client
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _embed_texts(texts: list[str]) -> list[list[float]]:
    """Get embeddings from OpenAI (sync, reliable); one vector per input text."""
    result = _get_client().embeddings.create(
        model=EMBEDDING_MODEL,
        input=texts,
    )
    return [record.embedding for record in result.data]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
| 51 |
+
a_np = np.array(a)
|
| 52 |
+
b_np = np.array(b)
|
| 53 |
+
dot = np.dot(a_np, b_np)
|
| 54 |
+
norm = np.linalg.norm(a_np) * np.linalg.norm(b_np)
|
| 55 |
+
return float(dot / norm) if norm > 0 else 0.0
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _load_db():
    """Load vector DB from disk (empty list when no file exists yet)."""
    global _chunks_db
    if not VECTORS_FILE.exists():
        _chunks_db = []
        return
    with VECTORS_FILE.open("r", encoding="utf-8") as fh:
        _chunks_db = json.load(fh)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _save_db():
    """Persist the in-memory vector DB to disk, creating the directory if needed."""
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    VECTORS_FILE.write_text(json.dumps(_chunks_db), encoding="utf-8")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# βββ Public API (all synchronous β no event loop issues) βββββββββββββ
|
| 76 |
+
|
| 77 |
+
def index_pdf(pdf_path: str | Path, source_name: str | None = None) -> int:
    """Extract text from PDF, chunk, embed, and store. Returns number of chunks."""
    global _chunks_db

    text = extract_text_from_pdf(pdf_path)
    if not text:
        return 0

    label = source_name or Path(pdf_path).name
    chunks = chunk_text(text)
    if not chunks:
        return 0

    chunk_texts = [c["text"] for c in chunks]

    # Embed in batches of 20 to avoid token limits
    embeddings: list[list[float]] = []
    for offset in range(0, len(chunk_texts), 20):
        embeddings.extend(_embed_texts(chunk_texts[offset:offset + 20]))

    # Pair each chunk with its vector and append to the in-memory store.
    _chunks_db.extend(
        {"text": c["text"], "source": label, "embedding": vec}
        for c, vec in zip(chunks, embeddings)
    )

    _save_db()
    return len(chunks)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def index_pdfs(pdf_paths: List[str | Path]) -> int:
    """Index multiple PDFs; returns the total number of chunks indexed."""
    return sum(index_pdf(path) for path in pdf_paths)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def get_context_for_query(query: str, top_k: int = 5) -> str:
    """Retrieve relevant chunks using cosine similarity.

    Reloads the on-disk index, embeds the query once, ranks every stored
    chunk, and returns the top_k hits formatted with source + relevance so
    the agent can cite provenance.  Returns "" when nothing is indexed.

    Cleanup: removed the unused counter from ``enumerate(results, 1)`` and
    replaced manual append loops with comprehensions.
    """
    _load_db()

    if not _chunks_db:
        return ""

    # Embed the query once; scoring is then a pure in-memory pass.
    query_embedding = _embed_texts([query])[0]

    scored = [
        (
            _cosine_similarity(query_embedding, chunk["embedding"]),
            chunk["text"],
            chunk.get("source", "unknown"),
        )
        for chunk in _chunks_db
    ]

    # Highest similarity first, then keep the top_k hits.
    scored.sort(key=lambda item: item[0], reverse=True)
    results = scored[:top_k]

    if not results:
        return ""

    # Format each hit with provenance for citation in answers.
    context_parts = [
        f"[Source: {source} | Relevance: {score:.2f}]\n{text}"
        for score, text, source in results
    ]
    return "\n\n---\n\n".join(context_parts)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def reset_index():
    """Clear all indexed data: in-memory chunks and the on-disk working dir."""
    import shutil

    global _chunks_db
    _chunks_db = []
    # Wipe and recreate the working directory so stale vectors cannot leak in.
    if WORKING_DIR.exists():
        shutil.rmtree(WORKING_DIR)
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
|
src/rag_tools.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG tools for the ADK agent.
|
| 3 |
+
The agent calls these tools to get context from uploaded PDFs.
|
| 4 |
+
"""
|
| 5 |
+
from rag import get_context_for_query
|
| 6 |
+
|
| 7 |
+
RAG_TOP_K = 8
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def query_uploaded_documents(query: str) -> str:
    """
    Retrieve relevant passages from the user's uploaded PDF documents
    for a given question or topic.
    Call this whenever the user asks about the content of their documents.
    """
    cleaned = str(query).strip() if query else ""
    if not cleaned:
        return "Please provide a non-empty question or topic to search the documents."
    context = get_context_for_query(cleaned, top_k=RAG_TOP_K)
    if context and context.strip():
        return context
    return (
        "No documents have been indexed yet, or no relevant passages were found. "
        "Ask the user to upload and index PDFs first, or try a different query."
    )
|
src/runner_app.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ADK Runner setup β used by Streamlit to run the agent directly (no API).
|
| 3 |
+
"""
|
| 4 |
+
import asyncio
|
| 5 |
+
import logging
|
| 6 |
+
from google.adk.sessions import InMemorySessionService
|
| 7 |
+
from google.adk.runners import Runner
|
| 8 |
+
from google.genai import types
|
| 9 |
+
|
| 10 |
+
from agent import root_agent
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
APP_NAME = "handbook_app"
|
| 15 |
+
session_service = InMemorySessionService()
|
| 16 |
+
|
| 17 |
+
runner = Runner(
|
| 18 |
+
agent=root_agent,
|
| 19 |
+
app_name=APP_NAME,
|
| 20 |
+
session_service=session_service,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _text_from_content(content) -> str | None:
|
| 25 |
+
if content is None:
|
| 26 |
+
return None
|
| 27 |
+
try:
|
| 28 |
+
parts = getattr(content, "parts", None)
|
| 29 |
+
if not parts:
|
| 30 |
+
return None
|
| 31 |
+
for part in parts:
|
| 32 |
+
text = getattr(part, "text", None)
|
| 33 |
+
if text is not None and str(text).strip():
|
| 34 |
+
return str(text).strip()
|
| 35 |
+
except (AttributeError, TypeError, IndexError):
|
| 36 |
+
pass
|
| 37 |
+
return None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
async def _run_chat_async(user_id: str, session_id: str, message: str) -> str:
    """Run one chat turn through the ADK Runner and return the reply text.

    Creates the session on first use, streams events from ``runner.run_async``,
    and returns the final response text.  If no usable final response arrives,
    the last non-empty intermediate text is returned instead; as a last
    resort, a fixed "no response" message.
    """
    # Lazily create the session the first time this (user, session) pair chats.
    session = await session_service.get_session(
        app_name=APP_NAME,
        user_id=user_id,
        session_id=session_id,
    )
    if not session:
        await session_service.create_session(
            app_name=APP_NAME,
            user_id=user_id,
            session_id=session_id,
        )

    user_content = types.Content(
        role="user",
        parts=[types.Part(text=message)],
    )

    final_response = None
    async for event in runner.run_async(
        user_id=user_id,
        session_id=session_id,
        new_message=user_content,
    ):
        try:
            # Prefer the final response and stop streaming once we have it.
            # getattr with a lambda default tolerates events lacking the method.
            if getattr(event, "is_final_response", lambda: False)() and getattr(event, "content", None):
                text = _text_from_content(event.content)
                if text:
                    final_response = text
                    break
            # Otherwise keep the latest non-empty text as a fallback answer.
            if getattr(event, "content", None):
                text = _text_from_content(event.content)
                if text:
                    final_response = text
        except (AttributeError, TypeError, KeyError):
            # Malformed event — skip it rather than abort the whole turn.
            continue

    return final_response or "No response from agent. Please try again."
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def run_chat(message: str, user_id: str = "default_user") -> str:
    """Run the ADK agent with the given message. Sync wrapper for Streamlit.

    Normally drives the coroutine with ``asyncio.run``; if an event loop is
    already running (as under some Streamlit setups), re-runs it on a worker
    thread.  Any failure is logged and returned as an "Error: ..." string.
    """
    session_id = f"{user_id}_session"
    try:
        try:
            return asyncio.run(_run_chat_async(user_id, session_id, message))
        except RuntimeError as runtime_err:
            reason = str(runtime_err).lower()
            if "event loop" not in reason and "already running" not in reason:
                raise
            # A loop is already running here — hand the work to a fresh
            # thread where asyncio.run can own its own loop.
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                job = pool.submit(
                    asyncio.run, _run_chat_async(user_id, session_id, message)
                )
                return job.result(timeout=120)
    except Exception as err:
        logger.exception("Chat failed: %s", err)
        return f"Error: {err}"
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,172 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
|
| 13 |
-
|
| 14 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
"""
Handbook Generator β Streamlit UI with ADK agent.
All RAG operations are synchronous β no event loop issues.

Three tabs: (1) upload + index PDFs, (2) chat with the ADK agent whose RAG
tool searches the index, (3) generate and download a long-form handbook.
"""
import asyncio
import time  # NOTE(review): appears unused in this file β confirm before removing
from pathlib import Path

import streamlit as st
from google.adk.sessions import InMemorySessionService
from google.adk.runners import Runner
from google.genai import types

from config import OPENAI_API_KEY, UPLOADS_DIR, BASE_DIR
from rag import index_pdf, reset_index
from handbook_generator import build_handbook
from agent import root_agent

# Where the generated handbook markdown is persisted for later reuse.
HANDBOOK_EXPORT_PATH = BASE_DIR / "handbook_export.md"

# ββββββββββββββββββββββββββββββββββββββββββββββββ
# ADK runner wired directly into the Streamlit process (no API layer).
APP_NAME = "handbook_app"
session_service = InMemorySessionService()
runner = Runner(
    agent=root_agent,
    app_name=APP_NAME,
    session_service=session_service,
)

# ββββββββββββββββββββββββββββββββββββββββββββββββ
st.set_page_config(page_title="Handbook Generator", page_icon="π", layout="wide")

st.markdown("""
<style>
.stChatMessage { margin-bottom: 1.1rem !important; border-radius: 16px !important; }
.stChatInput > div > div { border-radius: 24px !important; padding: 0.5rem 1rem; }
</style>
""", unsafe_allow_html=True)


def ensure_api_key():
    # Show a setup error and return False when the OpenAI key is missing.
    if not OPENAI_API_KEY:
        st.error("OPENAI_API_KEY is not set. Create a .env with OPENAI_API_KEY=sk-...")
        return False
    return True


if not ensure_api_key():
    st.stop()

st.title("π Handbook Generator")
st.caption("Upload PDFs β Chat with ADK agent (RAG) β Generate 20k-word handbook")

# Initialize session-state defaults once per browser session.
for key, value in {"messages": [], "user_id": "default_user"}.items():
    if key not in st.session_state:
        st.session_state[key] = value

tab1, tab2, tab3 = st.tabs(["Upload PDFs", "Chat", "Generate Handbook"])

# ββ Tab 1: Upload (synchronous β no async needed) ββββββββββββββββ
with tab1:
    st.subheader("Upload and index PDFs")
    files = st.file_uploader("Choose PDF files", type=["pdf"], accept_multiple_files=True)
    if st.button("Index PDFs"):
        if not files:
            st.warning("Select at least one PDF.")
        else:
            # Re-indexing starts from a clean slate every time.
            reset_index()
            results = []
            for f in files:
                # Persist the upload to disk so the indexer can read it.
                dest = UPLOADS_DIR / f.name
                dest.write_bytes(f.getvalue())
                try:
                    n = index_pdf(dest, source_name=f.name)
                    results.append(f"β {f.name}: {n} chunks indexed")
                except Exception as e:
                    results.append(f"β {f.name}: Error β {e}")
            st.success("\n".join(results))

# ββ Tab 2: Chat βββββββββββββββββββββββββββββββββββββββββββββββββββ
with tab2:
    st.subheader("Chat (ADK agent + RAG tool)")

    # Replay the conversation so far.
    for msg in st.session_state.messages:
        role = "user" if msg["role"] == "user" else "assistant"
        avatar = "π€" if role == "user" else "π€"
        with st.chat_message(role, avatar=avatar):
            st.markdown(msg["content"])

    user_input = st.chat_input("Ask about your uploaded documents...")

    if user_input:
        st.session_state.messages.append({"role": "user", "content": user_input})
        with st.chat_message("user", avatar="π€"):
            st.markdown(user_input)

        with st.chat_message("assistant", avatar="π€"):
            placeholder = st.empty()
            placeholder.markdown("β Thinkingβ¦")

            user_id = st.session_state.user_id
            session_id = f"{user_id}_session"

            # ADK agent is async, run it properly
            async def run_agent():
                try:
                    # Create the session lazily on first message.
                    session = await session_service.get_session(
                        app_name=APP_NAME, user_id=user_id, session_id=session_id,
                    )
                    if not session:
                        await session_service.create_session(
                            app_name=APP_NAME, user_id=user_id, session_id=session_id,
                        )

                    user_content = types.Content(
                        role="user",
                        parts=[types.Part.from_text(text=user_input)],
                    )

                    response_text = ""
                    async for event in runner.run_async(
                        user_id=user_id, session_id=session_id, new_message=user_content,
                    ):
                        # Only the final response carries the answer we display.
                        if event.is_final_response():
                            if event.content and event.content.parts:
                                response_text = event.content.parts[0].text
                            break

                    return response_text or "(No response generated)"
                except Exception as exc:
                    return f"**Error occurred:** {str(exc)}"

            try:
                response = asyncio.run(run_agent())
            except RuntimeError:
                # Fallback if event loop already running
                import concurrent.futures
                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                    response = pool.submit(asyncio.run, run_agent()).result(timeout=120)

            placeholder.markdown(response)
            st.session_state.messages.append({"role": "assistant", "content": response})
            st.rerun()

# ββ Tab 3: Handbook (synchronous) βββββββββββββββββββββββββββββββββ
with tab3:
    st.subheader("Generate 20k-word handbook")
    topic = st.text_input(
        "Handbook topic", placeholder="e.g. Retrieval-Augmented Generation",
    )
    if st.button("Generate handbook"):
        if not (topic and topic.strip()):
            st.warning("Enter a topic.")
        else:
            status_placeholder = st.empty()
            progress_msgs: list[str] = []

            # Stream generator progress into a single status area.
            def on_progress(msg):
                progress_msgs.append(msg)
                status_placeholder.text("\n".join(progress_msgs))

            try:
                full_md = build_handbook(topic.strip(), on_progress=on_progress)
                status_placeholder.success("Generation complete.")
                st.markdown(full_md)
                # Persist a copy on disk, then offer an in-browser download.
                HANDBOOK_EXPORT_PATH.write_text(full_md, encoding="utf-8")
                st.download_button(
                    "Download as Markdown", data=full_md,
                    file_name="handbook.md", mime="text/markdown",
                )
            except Exception as e:
                status_placeholder.error(str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|