Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
| 6 |
import gradio as gr
|
| 7 |
from PyPDF2 import PdfReader # pip install PyPDF2
|
| 8 |
|
| 9 |
-
from helper import get_openai_api_key, get_llama_cloud_api_key
|
| 10 |
from llama_parse import LlamaParse
|
| 11 |
from llama_index.core import (
|
| 12 |
Settings, VectorStoreIndex, StorageContext, load_index_from_storage
|
|
@@ -24,12 +23,12 @@ Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-large")
|
|
| 24 |
Settings.chunk_size = 512
|
| 25 |
Settings.chunk_overlap = 64
|
| 26 |
|
| 27 |
-
os.
|
| 28 |
-
os.
|
| 29 |
|
| 30 |
# ---- 2. Parser Setup ----
|
| 31 |
parser = LlamaParse(
|
| 32 |
-
api_key =
|
| 33 |
base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
|
| 34 |
result_type = "markdown",
|
| 35 |
content_guideline_instruction = (
|
|
@@ -40,22 +39,20 @@ parser = LlamaParse(
|
|
| 40 |
verbose=True
|
| 41 |
)
|
| 42 |
|
| 43 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
| 45 |
-
# Validate uploads
|
| 46 |
if not uploaded_files:
|
| 47 |
return "❗ Please upload at least one PDF."
|
| 48 |
if len(uploaded_files) > 5:
|
| 49 |
return "❗ You can upload up to 5 PDF files."
|
| 50 |
|
| 51 |
-
# Ensure user_data directory
|
| 52 |
-
user_dir = Path("./user_data")
|
| 53 |
-
user_dir.mkdir(exist_ok=True)
|
| 54 |
-
|
| 55 |
-
# Prepare list of QueryEngineTools
|
| 56 |
tools = []
|
| 57 |
for file_obj in uploaded_files:
|
| 58 |
-
#
|
| 59 |
try:
|
| 60 |
reader = PdfReader(file_obj.name)
|
| 61 |
except Exception as e:
|
|
@@ -63,35 +60,36 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
| 63 |
if len(reader.pages) > 20:
|
| 64 |
return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
|
| 65 |
|
| 66 |
-
# Copy
|
| 67 |
-
dest =
|
| 68 |
-
shutil.copyfile(file_obj.name, dest)
|
| 69 |
|
| 70 |
-
# Parse
|
| 71 |
docs = parser.load_data(dest)
|
| 72 |
|
| 73 |
-
# Index folder
|
| 74 |
-
stem
|
| 75 |
idx_dir = Path(f"./index_data/{stem}")
|
| 76 |
|
| 77 |
-
# Load or build index
|
| 78 |
if idx_dir.exists() and any(idx_dir.iterdir()):
|
| 79 |
-
sc
|
| 80 |
idx = load_index_from_storage(sc)
|
| 81 |
else:
|
| 82 |
sc = StorageContext.from_defaults()
|
| 83 |
idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
|
| 84 |
-
sc.persist(persist_dir=str(idx_dir))
|
| 85 |
-
|
| 86 |
-
#
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
)
|
| 92 |
-
tools.append(qe_tool)
|
| 93 |
|
| 94 |
-
# Combine into SubQuestionQueryEngine + Agent
|
| 95 |
subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
|
| 96 |
tools.append(
|
| 97 |
QueryEngineTool.from_defaults(
|
|
@@ -103,27 +101,82 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
| 103 |
agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
|
| 104 |
ctx = Context(agent)
|
| 105 |
|
| 106 |
-
# Run agent
|
| 107 |
-
|
| 108 |
-
return str(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# ---- 4. Gradio UI ----
|
| 111 |
with gr.Blocks() as demo:
|
| 112 |
gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
| 123 |
-
output = gr.Textbox(label="Answer")
|
| 124 |
-
submit = gr.Button("Ask")
|
| 125 |
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
if __name__ == "__main__":
|
| 129 |
demo.launch()
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from PyPDF2 import PdfReader # pip install PyPDF2
|
| 8 |
|
|
|
|
| 9 |
from llama_parse import LlamaParse
|
| 10 |
from llama_index.core import (
|
| 11 |
Settings, VectorStoreIndex, StorageContext, load_index_from_storage
|
|
|
|
| 23 |
Settings.chunk_size = 512
|
| 24 |
Settings.chunk_overlap = 64
|
| 25 |
|
| 26 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 27 |
+
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
| 28 |
|
| 29 |
# ---- 2. Parser Setup ----
|
| 30 |
parser = LlamaParse(
|
| 31 |
+
api_key = LLAMA_CLOUD_API_KEY,
|
| 32 |
base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
|
| 33 |
result_type = "markdown",
|
| 34 |
content_guideline_instruction = (
|
|
|
|
| 39 |
verbose=True
|
| 40 |
)
|
| 41 |
|
| 42 |
+
# Ensure directories exist
|
| 43 |
+
Path("./user_data").mkdir(exist_ok=True)
|
| 44 |
+
Path("./index_data").mkdir(exist_ok=True)
|
| 45 |
+
|
| 46 |
+
# ---- 3a. Upload + Answer Logic ----
|
| 47 |
async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
|
|
| 48 |
if not uploaded_files:
|
| 49 |
return "❗ Please upload at least one PDF."
|
| 50 |
if len(uploaded_files) > 5:
|
| 51 |
return "❗ You can upload up to 5 PDF files."
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
tools = []
|
| 54 |
for file_obj in uploaded_files:
|
| 55 |
+
# 1) Page-count check
|
| 56 |
try:
|
| 57 |
reader = PdfReader(file_obj.name)
|
| 58 |
except Exception as e:
|
|
|
|
| 60 |
if len(reader.pages) > 20:
|
| 61 |
return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
|
| 62 |
|
| 63 |
+
# 2) Copy PDF into user_data
|
| 64 |
+
dest = Path("./user_data") / Path(file_obj.name).name
|
| 65 |
+
shutil.copyfile(file_obj.name, dest)
|
| 66 |
|
| 67 |
+
# 3) Parse via LlamaParse
|
| 68 |
docs = parser.load_data(dest)
|
| 69 |
|
| 70 |
+
# 4) Index folder per file stem
|
| 71 |
+
stem = dest.stem
|
| 72 |
idx_dir = Path(f"./index_data/{stem}")
|
| 73 |
|
| 74 |
+
# 5) Load or build index
|
| 75 |
if idx_dir.exists() and any(idx_dir.iterdir()):
|
| 76 |
+
sc = StorageContext.from_defaults(persist_dir=str(idx_dir))
|
| 77 |
idx = load_index_from_storage(sc)
|
| 78 |
else:
|
| 79 |
sc = StorageContext.from_defaults()
|
| 80 |
idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
|
| 81 |
+
sc.persist(persist_dir=str(idx_dir))
|
| 82 |
+
|
| 83 |
+
# 6) Wrap in QueryEngineTool
|
| 84 |
+
tools.append(
|
| 85 |
+
QueryEngineTool.from_defaults(
|
| 86 |
+
query_engine=idx.as_query_engine(),
|
| 87 |
+
name=f"vector_index_{stem}",
|
| 88 |
+
description=f"Query engine for {stem}.pdf"
|
| 89 |
+
)
|
| 90 |
)
|
|
|
|
| 91 |
|
| 92 |
+
# 7) Combine tools into SubQuestionQueryEngine + Agent
|
| 93 |
subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
|
| 94 |
tools.append(
|
| 95 |
QueryEngineTool.from_defaults(
|
|
|
|
| 101 |
agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
|
| 102 |
ctx = Context(agent)
|
| 103 |
|
| 104 |
+
# 8) Run agent
|
| 105 |
+
resp = await agent.run(question, ctx=ctx)
|
| 106 |
+
return str(resp)
|
| 107 |
+
|
| 108 |
+
# ---- 3b. Remove Documents Logic ----
|
| 109 |
+
def remove_docs(filenames: str) -> str:
    """Delete uploaded PDFs and their persisted vector indexes.

    Args:
        filenames: Comma-separated list of exact PDF filenames
            (including the ``.pdf`` extension), e.g. ``"Q1.pdf, Q2.pdf"``.

    Returns:
        A status message listing which files were removed and which were
        not found.  A file counts as "removed" only when BOTH its copy in
        ``./user_data/`` and its index folder under ``./index_data/``
        existed and were deleted; any partially-present file is reported
        under "not found" (matching the original all-or-nothing report,
        even though whichever artifact did exist is still deleted).
    """
    if not filenames.strip():
        return "❗ Enter at least one filename to remove."

    removed, not_found = [], []
    for name in (f.strip() for f in filenames.split(",")):
        # Skip empty tokens so trailing commas ("a.pdf,") don't produce
        # a bogus '⚠️ Not found: ' entry for an empty name.
        if not name:
            continue

        pdf_path = Path("./user_data") / name
        idx_path = Path("./index_data") / Path(name).stem

        found_pdf = pdf_path.exists()
        found_idx = idx_path.exists()
        if found_pdf:
            pdf_path.unlink()
        if found_idx:
            shutil.rmtree(idx_path)

        # Previously written as `ok = ok and False`, i.e. both artifacts
        # must have been present for the removal to count as complete.
        if found_pdf and found_idx:
            removed.append(name)
        else:
            not_found.append(name)

    msg = ""
    if removed:
        msg += f"✅ Removed: {', '.join(removed)}.\n"
    if not_found:
        msg += f"⚠️ Not found: {', '.join(not_found)}."
    return msg.strip()
|
| 144 |
|
| 145 |
# ---- 4. Gradio UI ----
# Two tabs: one for asking questions about uploaded decks, one for
# removing previously uploaded decks and their indexes.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")

    with gr.Tab("Ask Questions"):
        # NOTE(review): exact nesting of the question box relative to the
        # Row was lost in the diff rendering — confirm against the repo.
        with gr.Row():
            file_input = gr.UploadButton(
                "Upload up to 5 PDFs",
                file_types=[".pdf"],
                file_count="multiple"
            )
        question = gr.Textbox(
            lines=2,
            placeholder="Ask your question about the uploaded slide decks..."
        )
        output = gr.Textbox(label="Answer")
        ask_btn = gr.Button("Ask")
        ask_btn.click(
            fn=answer,
            inputs=[file_input, question],
            outputs=output
        )

    with gr.Tab("Remove Documents"):
        remove_input = gr.Textbox(
            lines=1,
            placeholder="e.g. Q1-Slides.pdf, Q2-Slides.pdf"
        )
        remove_output = gr.Textbox(label="Removal Status")
        remove_btn = gr.Button("Remove Docs")
        remove_btn.click(
            fn=remove_docs,
            inputs=remove_input,
            outputs=remove_output
        )

if __name__ == "__main__":
    demo.launch()
|