freddyaboulton HF Staff committed on
Commit
8ed3105
·
verified ·
1 Parent(s): fa52aff

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. Lorem_ipsum.pdf +0 -0
  2. README.md +7 -7
  3. requirements.txt +2 -0
  4. run.ipynb +1 -0
  5. run.py +52 -0
Lorem_ipsum.pdf ADDED
Binary file (42.3 kB). View file
 
README.md CHANGED
@@ -1,12 +1,12 @@
 
1
  ---
2
- title: Highlight Pdf
3
- emoji: 🌖
4
- colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
- app_file: app.py
9
  pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+
2
  ---
3
+ title: highlight_pdf
4
+ emoji: 🔥
5
+ colorFrom: indigo
6
+ colorTo: indigo
7
  sdk: gradio
8
  sdk_version: 5.20.1
9
+ app_file: run.py
10
  pinned: false
11
+ hf_oauth: true
12
  ---
 
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio_pdf>=0.0.22
2
+ pymupdf>=1.25.3
run.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: highlight_pdf"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio gradio_pdf>=0.0.22 pymupdf>=1.25.3"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/highlight_pdf/Lorem_ipsum.pdf"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from gradio_pdf import PDF\n", "import pymupdf\n", "import os\n", "from pathlib import Path\n", "\n", "current_dir = Path(os.path.abspath(''))\n", "\n", "def highlight_text_in_pdf(pdf_file: Path, highlight_text: str):\n", " page_number = 0\n", " doc = pymupdf.open(pdf_file)\n", " for page in doc:\n", " text_instances = page.search_for(highlight_text)\n", " if len(text_instances) > 0:\n", " page_number = page.number\n", " for inst in text_instances:\n", " page.add_highlight_annot(inst)\n", "\n", " new_pdf_file = str(pdf_file.parents[0]) + \"/new_\" + pdf_file.name\n", " doc.save(new_pdf_file)\n", "\n", " if page_number is None:\n", " page_number = 0\n", " \n", " return new_pdf_file, page_number + 1\n", "\n", "def ask(query): \n", " result = f\"Something about : {query}\"\n", " sources = \"Document 1\"\n", " pdf_path = current_dir / \"Lorem_ipsum.pdf\"\n", " pdf_name = \"Document 1\"\n", " context_to_highlight = \"Ut velit mauris\"\n", "\n", " pdf, page_number = highlight_text_in_pdf(pdf_path, context_to_highlight)\n", " return result, sources + f\" - Page {page_number}\", PDF(pdf, label=pdf_name, starting_page=page_number, interactive=True)\n", "\n", "\n", "if __name__ == 
\"__main__\":\n", " with gr.Blocks() as demo:\n", " title = gr.HTML(f\"<center><h1>Bot</h1></center>\")\n", " with gr.Row():\n", " with gr.Column(scale=2):\n", " input = gr.Textbox(label=\"Question\", autofocus=True, interactive=True)\n", " btn = gr.Button(\"Ask\", variant=\"primary\")\n", " output = gr.Markdown(label=\"Anwser\")\n", " with gr.Column(scale=2):\n", " srcs = gr.Textbox(label=\"Sources\", interactive=False)\n", " pdf = PDF(label=\"Document\")\n", " \n", " btn.click(fn=ask, inputs=input, outputs=[output, srcs, pdf])\n", "\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
run.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_pdf import PDF
3
+ import pymupdf
4
+ import os
5
+ from pathlib import Path
6
+
7
+ current_dir = Path(os.path.abspath(''))
8
+
9
def highlight_text_in_pdf(pdf_file: Path, highlight_text: str):
    """Highlight every occurrence of ``highlight_text`` and save an annotated copy.

    The annotated document is written next to the original as
    ``new_<original file name>``; the result is saved to that new path
    rather than back onto ``pdf_file``.

    Args:
        pdf_file: Path of the PDF to search.
        highlight_text: Text to look for on each page.

    Returns:
        A ``(new_pdf_file, page_number)`` tuple: the path of the annotated
        copy as a string, and the number (offset by one from ``page.number``)
        of the last page on which a match was found. When nothing matches,
        the page number is ``1``.
    """
    page_number = 0
    # Use the document as a context manager so its handle is released even
    # when searching or saving raises.
    with pymupdf.open(pdf_file) as doc:
        for page in doc:
            text_instances = page.search_for(highlight_text)
            if text_instances:
                page_number = page.number
                for inst in text_instances:
                    page.add_highlight_annot(inst)

        # Build the sibling path with pathlib instead of gluing strings with
        # a hard-coded "/" separator.
        new_pdf_file = str(pdf_file.with_name("new_" + pdf_file.name))
        doc.save(new_pdf_file)

    # The viewer is given a page number one greater than ``page.number``.
    return new_pdf_file, page_number + 1
26
+
27
def ask(query):
    """Produce the demo answer, its source line and a PDF viewer for a question.

    The answer is a placeholder string that echoes ``query``. The source
    document is always ``Lorem_ipsum.pdf`` in ``current_dir``, and a fixed
    phrase is highlighted in it.

    Args:
        query: The question text entered by the user.

    Returns:
        A 3-tuple of the answer text, the source description including the
        page number, and a ``PDF`` component showing the highlighted copy
        opened at that page.
    """
    answer = f"Something about : {query}"
    document_label = "Document 1"
    source_pdf = current_dir / "Lorem_ipsum.pdf"

    highlighted_pdf, page_number = highlight_text_in_pdf(source_pdf, "Ut velit mauris")

    source_line = "Document 1" + f" - Page {page_number}"
    viewer = PDF(
        highlighted_pdf,
        label=document_label,
        starting_page=page_number,
        interactive=True,
    )
    return answer, source_line, viewer
36
+
37
+
38
if __name__ == "__main__":
    # Two equal-width columns: the question, button and answer in the first,
    # the cited source and the PDF viewer in the second.
    with gr.Blocks() as demo:
        gr.HTML("<center><h1>Bot</h1></center>")
        with gr.Row():
            with gr.Column(scale=2):
                # Named ``question_box`` so the builtin ``input`` is not shadowed.
                question_box = gr.Textbox(label="Question", autofocus=True, interactive=True)
                btn = gr.Button("Ask", variant="primary")
                output = gr.Markdown(label="Answer")
            with gr.Column(scale=2):
                srcs = gr.Textbox(label="Sources", interactive=False)
                pdf = PDF(label="Document")

        # Clicking the button runs ``ask`` and fills the answer, source and viewer.
        btn.click(fn=ask, inputs=question_box, outputs=[output, srcs, pdf])

    demo.launch()