Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
| 6 |
import gradio as gr
|
| 7 |
from PyPDF2 import PdfReader # pip install PyPDF2
|
| 8 |
|
| 9 |
-
from helper import get_openai_api_key, get_llama_cloud_api_key
|
| 10 |
from llama_parse import LlamaParse
|
| 11 |
from llama_index.core import (
|
| 12 |
Settings, VectorStoreIndex, StorageContext, load_index_from_storage
|
|
@@ -24,12 +23,12 @@ Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-large")
|
|
| 24 |
Settings.chunk_size = 512
|
| 25 |
Settings.chunk_overlap = 64
|
| 26 |
|
| 27 |
-
os.
|
| 28 |
-
os.
|
| 29 |
|
| 30 |
# ---- 2. Parser Setup ----
|
| 31 |
parser = LlamaParse(
|
| 32 |
-
api_key =
|
| 33 |
base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
|
| 34 |
result_type = "markdown",
|
| 35 |
content_guideline_instruction = (
|
|
@@ -40,22 +39,20 @@ parser = LlamaParse(
|
|
| 40 |
verbose=True
|
| 41 |
)
|
| 42 |
|
| 43 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
| 45 |
-
# Validate uploads
|
| 46 |
if not uploaded_files:
|
| 47 |
return "❗ Please upload at least one PDF."
|
| 48 |
if len(uploaded_files) > 5:
|
| 49 |
return "❗ You can upload up to 5 PDF files."
|
| 50 |
|
| 51 |
-
# Ensure user_data directory
|
| 52 |
-
user_dir = Path("./user_data")
|
| 53 |
-
user_dir.mkdir(exist_ok=True)
|
| 54 |
-
|
| 55 |
-
# Prepare list of QueryEngineTools
|
| 56 |
tools = []
|
| 57 |
for file_obj in uploaded_files:
|
| 58 |
-
#
|
| 59 |
try:
|
| 60 |
reader = PdfReader(file_obj.name)
|
| 61 |
except Exception as e:
|
|
@@ -63,35 +60,36 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
| 63 |
if len(reader.pages) > 20:
|
| 64 |
return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
|
| 65 |
|
| 66 |
-
# Copy
|
| 67 |
-
dest =
|
| 68 |
-
shutil.copyfile(file_obj.name, dest)
|
| 69 |
|
| 70 |
-
# Parse
|
| 71 |
docs = parser.load_data(dest)
|
| 72 |
|
| 73 |
-
# Index folder
|
| 74 |
-
stem
|
| 75 |
idx_dir = Path(f"./index_data/{stem}")
|
| 76 |
|
| 77 |
-
# Load or build index
|
| 78 |
if idx_dir.exists() and any(idx_dir.iterdir()):
|
| 79 |
-
sc
|
| 80 |
idx = load_index_from_storage(sc)
|
| 81 |
else:
|
| 82 |
sc = StorageContext.from_defaults()
|
| 83 |
idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
|
| 84 |
-
sc.persist(persist_dir=str(idx_dir))
|
| 85 |
-
|
| 86 |
-
#
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
)
|
| 92 |
-
tools.append(qe_tool)
|
| 93 |
|
| 94 |
-
# Combine into SubQuestionQueryEngine + Agent
|
| 95 |
subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
|
| 96 |
tools.append(
|
| 97 |
QueryEngineTool.from_defaults(
|
|
@@ -103,27 +101,82 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
| 103 |
agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
|
| 104 |
ctx = Context(agent)
|
| 105 |
|
| 106 |
-
# Run agent
|
| 107 |
-
|
| 108 |
-
return str(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# ---- 4. Gradio UI ----
|
| 111 |
with gr.Blocks() as demo:
|
| 112 |
gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
| 123 |
-
output = gr.Textbox(label="Answer")
|
| 124 |
-
submit = gr.Button("Ask")
|
| 125 |
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
if __name__ == "__main__":
|
| 129 |
demo.launch()
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from PyPDF2 import PdfReader # pip install PyPDF2
|
| 8 |
|
|
|
|
| 9 |
from llama_parse import LlamaParse
|
| 10 |
from llama_index.core import (
|
| 11 |
Settings, VectorStoreIndex, StorageContext, load_index_from_storage
|
|
|
|
| 23 |
Settings.chunk_size = 512
|
| 24 |
Settings.chunk_overlap = 64
|
| 25 |
|
| 26 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 27 |
+
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
| 28 |
|
| 29 |
# ---- 2. Parser Setup ----
|
| 30 |
parser = LlamaParse(
|
| 31 |
+
api_key = LLAMA_CLOUD_API_KEY,
|
| 32 |
base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
|
| 33 |
result_type = "markdown",
|
| 34 |
content_guideline_instruction = (
|
|
|
|
| 39 |
verbose=True
|
| 40 |
)
|
| 41 |
|
| 42 |
+
# Ensure directories exist
|
| 43 |
+
Path("./user_data").mkdir(exist_ok=True)
|
| 44 |
+
Path("./index_data").mkdir(exist_ok=True)
|
| 45 |
+
|
| 46 |
+
# ---- 3a. Upload + Answer Logic ----
|
| 47 |
async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
|
|
| 48 |
if not uploaded_files:
|
| 49 |
return "❗ Please upload at least one PDF."
|
| 50 |
if len(uploaded_files) > 5:
|
| 51 |
return "❗ You can upload up to 5 PDF files."
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
tools = []
|
| 54 |
for file_obj in uploaded_files:
|
| 55 |
+
# 1) Page-count check
|
| 56 |
try:
|
| 57 |
reader = PdfReader(file_obj.name)
|
| 58 |
except Exception as e:
|
|
|
|
| 60 |
if len(reader.pages) > 20:
|
| 61 |
return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
|
| 62 |
|
| 63 |
+
# 2) Copy PDF into user_data
|
| 64 |
+
dest = Path("./user_data") / Path(file_obj.name).name
|
| 65 |
+
shutil.copyfile(file_obj.name, dest)
|
| 66 |
|
| 67 |
+
# 3) Parse via LlamaParse
|
| 68 |
docs = parser.load_data(dest)
|
| 69 |
|
| 70 |
+
# 4) Index folder per file stem
|
| 71 |
+
stem = dest.stem
|
| 72 |
idx_dir = Path(f"./index_data/{stem}")
|
| 73 |
|
| 74 |
+
# 5) Load or build index
|
| 75 |
if idx_dir.exists() and any(idx_dir.iterdir()):
|
| 76 |
+
sc = StorageContext.from_defaults(persist_dir=str(idx_dir))
|
| 77 |
idx = load_index_from_storage(sc)
|
| 78 |
else:
|
| 79 |
sc = StorageContext.from_defaults()
|
| 80 |
idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
|
| 81 |
+
sc.persist(persist_dir=str(idx_dir))
|
| 82 |
+
|
| 83 |
+
# 6) Wrap in QueryEngineTool
|
| 84 |
+
tools.append(
|
| 85 |
+
QueryEngineTool.from_defaults(
|
| 86 |
+
query_engine=idx.as_query_engine(),
|
| 87 |
+
name=f"vector_index_{stem}",
|
| 88 |
+
description=f"Query engine for {stem}.pdf"
|
| 89 |
+
)
|
| 90 |
)
|
|
|
|
| 91 |
|
| 92 |
+
# 7) Combine tools into SubQuestionQueryEngine + Agent
|
| 93 |
subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
|
| 94 |
tools.append(
|
| 95 |
QueryEngineTool.from_defaults(
|
|
|
|
| 101 |
agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
|
| 102 |
ctx = Context(agent)
|
| 103 |
|
| 104 |
+
# 8) Run agent
|
| 105 |
+
resp = await agent.run(question, ctx=ctx)
|
| 106 |
+
return str(resp)
|
| 107 |
+
|
| 108 |
+
# ---- 3b. Remove Documents Logic ----
|
| 109 |
+
def remove_docs(filenames: str) -> str:
    """Delete uploaded PDFs and their persisted vector indexes.

    Args:
        filenames: Comma-separated list of exact PDF filenames
            (including the ``.pdf`` extension), e.g. ``"Q1.pdf, Q2.pdf"``.

    Returns:
        A status message listing which files were removed and which were
        not found.  A file counts as "removed" only when BOTH its copy in
        ``./user_data/`` and its index folder under ``./index_data/``
        existed and were deleted; any partially-present file is reported
        under "not found" (matching the original all-or-nothing report,
        even though whichever artifact did exist is still deleted).
    """
    if not filenames.strip():
        return "❗ Enter at least one filename to remove."

    removed, not_found = [], []
    for name in (f.strip() for f in filenames.split(",")):
        # Skip empty tokens so trailing commas ("a.pdf,") don't produce
        # a bogus '⚠️ Not found: ' entry for an empty name.
        if not name:
            continue

        pdf_path = Path("./user_data") / name
        idx_path = Path("./index_data") / Path(name).stem

        found_pdf = pdf_path.exists()
        found_idx = idx_path.exists()
        if found_pdf:
            pdf_path.unlink()
        if found_idx:
            shutil.rmtree(idx_path)

        # Previously written as `ok = ok and False`, i.e. both artifacts
        # must have been present for the removal to count as complete.
        if found_pdf and found_idx:
            removed.append(name)
        else:
            not_found.append(name)

    msg = ""
    if removed:
        msg += f"✅ Removed: {', '.join(removed)}.\n"
    if not_found:
        msg += f"⚠️ Not found: {', '.join(not_found)}."
    return msg.strip()
|
| 144 |
|
| 145 |
# ---- 4. Gradio UI ----
# Two tabs: one for asking questions about uploaded decks, one for
# removing previously uploaded decks and their indexes.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")

    with gr.Tab("Ask Questions"):
        # NOTE(review): exact nesting of the question box relative to the
        # Row was lost in the diff rendering — confirm against the repo.
        with gr.Row():
            file_input = gr.UploadButton(
                "Upload up to 5 PDFs",
                file_types=[".pdf"],
                file_count="multiple"
            )
        question = gr.Textbox(
            lines=2,
            placeholder="Ask your question about the uploaded slide decks..."
        )
        output = gr.Textbox(label="Answer")
        ask_btn = gr.Button("Ask")
        ask_btn.click(
            fn=answer,
            inputs=[file_input, question],
            outputs=output
        )

    with gr.Tab("Remove Documents"):
        remove_input = gr.Textbox(
            lines=1,
            placeholder="e.g. Q1-Slides.pdf, Q2-Slides.pdf"
        )
        remove_output = gr.Textbox(label="Removal Status")
        remove_btn = gr.Button("Remove Docs")
        remove_btn.click(
            fn=remove_docs,
            inputs=remove_input,
            outputs=remove_output
        )

if __name__ == "__main__":
    demo.launch()
|