Update app.py
app.py CHANGED
@@ -10,6 +10,8 @@ from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
+from pqdm.processes import pqdm
+
 from colpali_engine.models import ColQwen2, ColQwen2Processor
 
 
@@ -30,42 +32,50 @@ def encode_image_to_base64(image):
     return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
-def query_gpt4o_mini(query, images, api_key):
+DEFAULT_SYSTEM_PROMPT = """
+You are a smart assistant designed to answer questions about a PDF document.
+You are given relevant information in the form of PDF pages preceded by their metadata (PDF title, page number, surrounding context).
+Use them to construct a short response to the question, and cite your sources (page number, pdf title).
+If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
+Give detailed and extensive answers, only containing info in the pages you are given.
+You can answer using information contained in plots and figures if necessary.
+Answer in the same language as the query.
+"""
+
+def query_gpt4o_mini(query, images, api_key, system_prompt=DEFAULT_SYSTEM_PROMPT):
     """Calls OpenAI's GPT-4o-mini with the query and image data."""
 
     if api_key and api_key.startswith("sk"):
         try:
             from openai import OpenAI
-
-            base64_images = [encode_image_to_base64(image[0]) for image in images]
+
             client = OpenAI(api_key=api_key.strip())
-            PROMPT = """
-            You are a smart assistant designed to answer questions about a PDF document.
-            You are given relevant information in the form of PDF pages. Use them to construct a short response to the question, and cite your sources (page numbers).
-            If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
-            Give detailed and extensive answers, only containing info in the pages you are given.
-            You can answer using information contained in plots and figures if necessary.
-            Answer in the same language as the query.
-
-            Query: {query}
-            PDF pages:
+            prompt = f"""
+            {system_prompt}
+            Query: {query}
+            PDF pages:
             """
-
+
+            messages = [{"type": "text", "text": prompt}]
+            for im, capt in images:
+                if capt is not None:
+                    messages.append({
+                        "type": "text",
+                        "text": capt
+                    })
+                messages.append({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{encode_image_to_base64(im)}"
+                    },
+                })
+
             response = client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
                     {
                         "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": PROMPT.format(query=query)
-                            }] + [{
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{im}"
-                            },
-                        } for im in base64_images]
+                        "content": messages
                     }
                 ],
                 max_tokens=500,
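Note: the reworked query_gpt4o_mini expects images as a list of (image, caption) pairs and interleaves each caption (a text part) with its base64-encoded page (an image_url part) inside a single user turn, which is the multimodal content-list format of OpenAI's Chat Completions API. A minimal sketch of a call, with a placeholder page and caption that are not part of the commit:

from PIL import Image

# Placeholder input: a blank page standing in for a rendered PDF page.
page = Image.new("RGB", (612, 792), "white")
pairs = [(page, "Document: report.pdf, Page: 1, Context: quarterly revenue summary")]

# Builds one user message: prompt text, then caption text, then the image part.
answer = query_gpt4o_mini("What was Q3 revenue?", pairs, api_key="sk-...")  # requires a real key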
@@ -77,7 +87,7 @@ def query_gpt4o_mini(query, images, api_key):
     return "Enter your OpenAI API key to get a custom response"
 
 
-def search(query: str, ds, images, k, api_key):
+def search(query: str, ds, images, metadatas, k, api_key):
     k = min(k, len(ds))
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     if device != model.device:
@@ -95,7 +105,9 @@ def search(query: str, ds, images, k, api_key):
 
     results = []
     for idx in top_k_indices:
-        results.append((images[idx], f"Page {idx}"))
+        img = images[idx]
+        meta = metadatas[idx]
+        results.append((img, f"Document: {meta['title']}, Page: {meta['page']}, Context: {meta['context']}"))
 
     # Generate response from GPT-4o-mini
     ai_response = query_gpt4o_mini(query, results, api_key)
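gr.Gallery accepts (image, caption) tuples, so the same results list doubles as the gallery payload and as the images argument to query_gpt4o_mini; the caption now carries the per-page metadata. A sketch of the caption built from a hypothetical metadata entry:

meta = {"title": "report.pdf", "page": 3, "context": "Overview of Q3 financials."}
caption = f"Document: {meta['title']}, Page: {meta['page']}, Context: {meta['context']}"
# -> "Document: report.pdf, Page: 3, Context: Overview of Q3 financials."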
@@ -103,22 +115,62 @@ def search(query: str, ds, images, k, api_key):
     return results, ai_response
 
 
-def index(files, ds):
+def index(files, ds, api_key):
     print("Converting files")
-    images = convert_files(files)
+    images, metadatas = convert_files(files, api_key)
     print(f"Files converted with {len(images)} images.")
-    index_gpu(images, ds)
-    return f"Uploaded and converted {len(images)} pages", ds, images
-
-
-def convert_files(files):
+    ds = index_gpu(images, ds)
+    print(f"Indexed {len(ds)} images.")
+    return f"Uploaded and converted {len(images)} pages", ds, images, metadatas
+
+DEFAULT_CONTEXT_PROMPT = """
+You are a smart assistant designed to extract context of PDF pages.
+Give detailed and extensive answers, only containing info in the pages you are given.
+You can answer using information contained in plots and figures if necessary.
+Answer in the same language as the query.
+"""
+
+def extract_context(images, api_key, window=10):
+    """Extracts context from images."""
+    prompt = "Give the general context about these pages."
+    window_contexts = []
+
+    args = [(prompt, (images[max(i-window+1, 0):i+1], None), api_key, DEFAULT_CONTEXT_PROMPT)
+            for i in range(0, len(images), window)]
+    window_contexts = pqdm(args, query_gpt4o_mini, n_jobs=8)
+
+    # for i in tqdm(range(0, len(images), window), desc="Extracting context", total=len(images)//window):
+    #     window_images = images[max(i-window+1, 0):i+1]
+    #     window_images = [(image, None) for image in window_images]
+    #     window_contexts.append(query_gpt4o_mini(prompt, window_images, api_key, system_prompt=DEFAULT_CONTEXT_PROMPT))
+
+    contexts = []
+    for i in range(len(images)):
+        context = window_contexts[i//window]
+        contexts.append(context)
+
+    assert len(contexts) == len(images)
+    return contexts
+
+def extract_metadata(file, images, api_key, window=10):
+    """Extracts metadata from pdfs. Extract page number, file name, and authors."""
+    title = file.split("/")[-1]
+    contexts = extract_context(images, api_key, window=window)
+    return [{"title": title, "page": i+1, "context": contexts[i]} for i in range(len(images))]
+
+def convert_files(files, api_key):
     images = []
+    metadatas = []
+
     for f in files:
-        images.extend(convert_from_path(f, thread_count=4))
+        file_images = convert_from_path(f, thread_count=4)
+        file_metadatas = extract_metadata(f, file_images, api_key)
+        images.extend(file_images)
+        metadatas.extend(file_metadatas)
 
     if len(images) >= 500:
         raise gr.Error("The number of images in the dataset should be less than 500.")
-    return images
+    return images, metadatas
 
 
 def index_gpu(images, ds):
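Note on the pqdm fan-out: pqdm.processes.pqdm maps a function over work items in a process pool, but it only unpacks a tuple of positional arguments when called with argument_type="args"; as written, each tuple is handed to query_gpt4o_mini as a single argument. query_gpt4o_mini also iterates its images argument as (image, caption) pairs, which is what the commented-out sequential loop builds. A sketch of the windowed call under those assumptions, using a plain i:i+window chunking slice in place of the commit's max(i-window+1, 0):i+1:

from pqdm.processes import pqdm

prompt = "Give the general context about these pages."
# One work item per window of pages: the positional args of query_gpt4o_mini.
args = [
    (prompt, [(im, None) for im in images[i:i + window]], api_key, DEFAULT_CONTEXT_PROMPT)
    for i in range(0, len(images), window)
]
# argument_type="args" makes pqdm unpack each tuple as *args.
window_contexts = pqdm(args, query_gpt4o_mini, n_jobs=8, argument_type="args")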
@@ -141,7 +193,7 @@ def index_gpu(images, ds):
             batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
             embeddings_doc = model(**batch_doc)
         ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
-    return
+    return ds
 
 
 
@@ -166,6 +218,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             api_key = gr.Textbox(placeholder="Enter your OpenAI KEY here (optional)", label="API key")
             embeds = gr.State(value=[])
             imgs = gr.State(value=[])
+            metadatas = gr.State(value=[])
 
         with gr.Column(scale=3):
             gr.Markdown("## 2️⃣ Search")
@@ -178,8 +231,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     output_gallery = gr.Gallery(label="Retrieved Documents", height=600, show_label=True)
     output_text = gr.Textbox(label="AI Response", placeholder="Generated response based on retrieved documents")
 
-    convert_button.click(index, inputs=[file, embeds], outputs=[message, embeds, imgs])
-    search_button.click(search, inputs=[query, embeds, imgs, k, api_key], outputs=[output_gallery, output_text])
+    convert_button.click(index, inputs=[file, embeds, api_key], outputs=[message, embeds, imgs, metadatas])
+    search_button.click(search, inputs=[query, embeds, imgs, metadatas, k, api_key], outputs=[output_gallery, output_text])
 
 if __name__ == "__main__":
-    demo.queue(max_size=5).launch(debug=True)
+    demo.queue(max_size=5).launch(debug=True, server_name="0.0.0.0")
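server_name="0.0.0.0" binds Gradio to all network interfaces so the app is reachable from outside a container (e.g., Docker or Spaces). A minimal sketch, assuming Gradio's default port:

# Listens on 0.0.0.0:7860; debug=True keeps server tracebacks in the console.
demo.queue(max_size=5).launch(debug=True, server_name="0.0.0.0", server_port=7860)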